From e1827807e8d9d5b67c21aa565697211df3f23d25 Mon Sep 17 00:00:00 2001
From: Lauri Kasanen <cand@gmx.com>
Date: Fri, 15 Jan 2021 16:01:02 +0200
Subject: video: fbdev: simplefb: Add "r5g5b5a1" mode

Add "r5g5b5a1" which is used on the N64.

Signed-off-by: Lauri Kasanen <cand@gmx.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20210115160102.cf4c85db9f815758716f086f@gmx.com
---
 include/linux/platform_data/simplefb.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/simplefb.h b/include/linux/platform_data/simplefb.h
index ca8337695c2a..27ea99af6e1d 100644
--- a/include/linux/platform_data/simplefb.h
+++ b/include/linux/platform_data/simplefb.h
@@ -16,6 +16,7 @@
 #define SIMPLEFB_FORMATS \
 { \
 	{ "r5g6b5", 16, {11, 5}, {5, 6}, {0, 5}, {0, 0}, DRM_FORMAT_RGB565 }, \
+	{ "r5g5b5a1", 16, {11, 5}, {6, 5}, {1, 5}, {0, 1}, DRM_FORMAT_RGBA5551 }, \
 	{ "x1r5g5b5", 16, {10, 5}, {5, 5}, {0, 5}, {0, 0}, DRM_FORMAT_XRGB1555 }, \
 	{ "a1r5g5b5", 16, {10, 5}, {5, 5}, {0, 5}, {15, 1}, DRM_FORMAT_ARGB1555 }, \
 	{ "r8g8b8", 24, {16, 8}, {8, 8}, {0, 8}, {0, 0}, DRM_FORMAT_RGB888 }, \
-- 
cgit v1.2.3


From 7621350c6bb20fb6ab7eb988833ab96eac3dcbef Mon Sep 17 00:00:00 2001
From: Christian König <christian.koenig@amd.com>
Date: Fri, 15 Jan 2021 14:32:39 +0100
Subject: drm/syncobj: make lockdep complain on WAIT_FOR_SUBMIT v3
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT can't be used when we hold locks
since we are basically waiting for userspace to do something.

Holding a lock while doing so can trivially deadlock with page faults
etc...

So make lockdep complain when a driver tries to do this.

v2: Add lockdep_assert_none_held() macro.
v3: Add might_sleep() and also use lockdep_assert_none_held() in the
    IOCTL path.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patchwork.freedesktop.org/patch/414944/
---
 drivers/gpu/drm/drm_syncobj.c | 12 ++++++++++++
 include/linux/lockdep.h       |  5 +++++
 2 files changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c
index 6e74e6745eca..9ecfec1e5567 100644
--- a/drivers/gpu/drm/drm_syncobj.c
+++ b/drivers/gpu/drm/drm_syncobj.c
@@ -387,6 +387,15 @@ int drm_syncobj_find_fence(struct drm_file *file_private,
 	if (!syncobj)
 		return -ENOENT;
 
+	/* Waiting for userspace with locks held is illegal because that can
+	 * trivially deadlock with page faults, for example. Make lockdep
+	 * complain about it early on.
+	 */
+	if (flags & DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT) {
+		might_sleep();
+		lockdep_assert_none_held_once();
+	}
+
 	*fence = drm_syncobj_fence_get(syncobj);
 	drm_syncobj_put(syncobj);
 
@@ -940,6 +949,9 @@ static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
 	uint64_t *points;
 	uint32_t signaled_count, i;
 
+	if (flags & DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT)
+		lockdep_assert_none_held_once();
+
 	points = kmalloc_array(count, sizeof(*points), GFP_KERNEL);
 	if (points == NULL)
 		return -ENOMEM;
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index b9e9adec73e8..6eb117c0d0f3 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -310,6 +310,10 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie);
 		WARN_ON_ONCE(debug_locks && !lockdep_is_held(l));	\
 	} while (0)
 
+#define lockdep_assert_none_held_once()	do {				\
+		WARN_ON_ONCE(debug_locks && current->lockdep_depth);	\
+	} while (0)
+
 #define lockdep_recursing(tsk)	((tsk)->lockdep_recursion)
 
 #define lockdep_pin_lock(l)	lock_pin_lock(&(l)->dep_map)
@@ -387,6 +391,7 @@ extern int lockdep_is_held(const void *);
 #define lockdep_assert_held_write(l)	do { (void)(l); } while (0)
 #define lockdep_assert_held_read(l)		do { (void)(l); } while (0)
 #define lockdep_assert_held_once(l)		do { (void)(l); } while (0)
+#define lockdep_assert_none_held_once()	do { } while (0)
 
 #define lockdep_recursing(tsk)			(0)
 
-- 
cgit v1.2.3


From c7f59e3dd60313071a989227dcb69094f499d310 Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Tue, 19 Jan 2021 20:45:08 +0000
Subject: dma-buf: heaps: Rework heap allocation hooks to return struct dma_buf
 instead of fd
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Every heap needs to create a dmabuf and then export it to a fd
via dma_buf_fd(), so to consolidate things a bit, have the heaps
just return a struct dma_buf * and let the top level
dma_heap_buffer_alloc() call handle creating the fd via
dma_buf_fd().

Cc: Sumit Semwal <sumit.semwal@linaro.org>
Cc: Liam Mark <lmark@codeaurora.org>
Cc: Laura Abbott <labbott@kernel.org>
Cc: Brian Starkey <Brian.Starkey@arm.com>
Cc: Hridya Valsaraju <hridya@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sandeep Patil <sspatil@google.com>
Cc: Daniel Mentz <danielmentz@google.com>
Cc: Chris Goldsworthy <cgoldswo@codeaurora.org>
Cc: Ørjan Eide <orjan.eide@arm.com>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Ezequiel Garcia <ezequiel@collabora.com>
Cc: Simon Ser <contact@emersion.fr>
Cc: James Jones <jajones@nvidia.com>
Cc: linux-media@vger.kernel.org
Cc: dri-devel@lists.freedesktop.org
Signed-off-by: John Stultz <john.stultz@linaro.org>
Signed-off-by: Sumit Semwal <sumit.semwal@linaro.org>
 [sumits: minor reword of commit message]

Link: https://patchwork.freedesktop.org/patch/msgid/20210119204508.9256-3-john.stultz@linaro.org
---
 drivers/dma-buf/dma-heap.c          | 14 +++++++++++++-
 drivers/dma-buf/heaps/cma_heap.c    | 22 +++++++---------------
 drivers/dma-buf/heaps/system_heap.c | 21 +++++++--------------
 include/linux/dma-heap.h            | 12 ++++++------
 4 files changed, 33 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma-buf/dma-heap.c b/drivers/dma-buf/dma-heap.c
index afd22c9dbdcf..6b5db954569f 100644
--- a/drivers/dma-buf/dma-heap.c
+++ b/drivers/dma-buf/dma-heap.c
@@ -52,6 +52,9 @@ static int dma_heap_buffer_alloc(struct dma_heap *heap, size_t len,
 				 unsigned int fd_flags,
 				 unsigned int heap_flags)
 {
+	struct dma_buf *dmabuf;
+	int fd;
+
 	/*
 	 * Allocations from all heaps have to begin
 	 * and end on page boundaries.
@@ -60,7 +63,16 @@ static int dma_heap_buffer_alloc(struct dma_heap *heap, size_t len,
 	if (!len)
 		return -EINVAL;
 
-	return heap->ops->allocate(heap, len, fd_flags, heap_flags);
+	dmabuf = heap->ops->allocate(heap, len, fd_flags, heap_flags);
+	if (IS_ERR(dmabuf))
+		return PTR_ERR(dmabuf);
+
+	fd = dma_buf_fd(dmabuf, fd_flags);
+	if (fd < 0) {
+		dma_buf_put(dmabuf);
+		/* just return, as put will call release and that will free */
+	}
+	return fd;
 }
 
 static int dma_heap_open(struct inode *inode, struct file *file)
diff --git a/drivers/dma-buf/heaps/cma_heap.c b/drivers/dma-buf/heaps/cma_heap.c
index 3c4e34301172..84c6cd1b96a9 100644
--- a/drivers/dma-buf/heaps/cma_heap.c
+++ b/drivers/dma-buf/heaps/cma_heap.c
@@ -268,10 +268,10 @@ static const struct dma_buf_ops cma_heap_buf_ops = {
 	.release = cma_heap_dma_buf_release,
 };
 
-static int cma_heap_allocate(struct dma_heap *heap,
-				  unsigned long len,
-				  unsigned long fd_flags,
-				  unsigned long heap_flags)
+static struct dma_buf *cma_heap_allocate(struct dma_heap *heap,
+					 unsigned long len,
+					 unsigned long fd_flags,
+					 unsigned long heap_flags)
 {
 	struct cma_heap *cma_heap = dma_heap_get_drvdata(heap);
 	struct cma_heap_buffer *buffer;
@@ -286,7 +286,7 @@ static int cma_heap_allocate(struct dma_heap *heap,
 
 	buffer = kzalloc(sizeof(*buffer), GFP_KERNEL);
 	if (!buffer)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
 	INIT_LIST_HEAD(&buffer->attachments);
 	mutex_init(&buffer->lock);
@@ -345,15 +345,7 @@ static int cma_heap_allocate(struct dma_heap *heap,
 		ret = PTR_ERR(dmabuf);
 		goto free_pages;
 	}
-
-	ret = dma_buf_fd(dmabuf, fd_flags);
-	if (ret < 0) {
-		dma_buf_put(dmabuf);
-		/* just return, as put will call release and that will free */
-		return ret;
-	}
-
-	return ret;
+	return dmabuf;
 
 free_pages:
 	kfree(buffer->pages);
@@ -362,7 +354,7 @@ free_cma:
 free_buffer:
 	kfree(buffer);
 
-	return ret;
+	return ERR_PTR(ret);
 }
 
 static const struct dma_heap_ops cma_heap_ops = {
diff --git a/drivers/dma-buf/heaps/system_heap.c b/drivers/dma-buf/heaps/system_heap.c
index 405351aad2a8..29e49ac17251 100644
--- a/drivers/dma-buf/heaps/system_heap.c
+++ b/drivers/dma-buf/heaps/system_heap.c
@@ -331,10 +331,10 @@ static struct page *alloc_largest_available(unsigned long size,
 	return NULL;
 }
 
-static int system_heap_allocate(struct dma_heap *heap,
-				unsigned long len,
-				unsigned long fd_flags,
-				unsigned long heap_flags)
+static struct dma_buf *system_heap_allocate(struct dma_heap *heap,
+					    unsigned long len,
+					    unsigned long fd_flags,
+					    unsigned long heap_flags)
 {
 	struct system_heap_buffer *buffer;
 	DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
@@ -349,7 +349,7 @@ static int system_heap_allocate(struct dma_heap *heap,
 
 	buffer = kzalloc(sizeof(*buffer), GFP_KERNEL);
 	if (!buffer)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
 	INIT_LIST_HEAD(&buffer->attachments);
 	mutex_init(&buffer->lock);
@@ -399,14 +399,7 @@ static int system_heap_allocate(struct dma_heap *heap,
 		ret = PTR_ERR(dmabuf);
 		goto free_pages;
 	}
-
-	ret = dma_buf_fd(dmabuf, fd_flags);
-	if (ret < 0) {
-		dma_buf_put(dmabuf);
-		/* just return, as put will call release and that will free */
-		return ret;
-	}
-	return ret;
+	return dmabuf;
 
 free_pages:
 	for_each_sgtable_sg(table, sg, i) {
@@ -420,7 +413,7 @@ free_buffer:
 		__free_pages(page, compound_order(page));
 	kfree(buffer);
 
-	return ret;
+	return ERR_PTR(ret);
 }
 
 static const struct dma_heap_ops system_heap_ops = {
diff --git a/include/linux/dma-heap.h b/include/linux/dma-heap.h
index 454e354d1ffb..5bc5c946af58 100644
--- a/include/linux/dma-heap.h
+++ b/include/linux/dma-heap.h
@@ -16,15 +16,15 @@ struct dma_heap;
 
 /**
  * struct dma_heap_ops - ops to operate on a given heap
- * @allocate:		allocate dmabuf and return fd
+ * @allocate:		allocate dmabuf and return struct dma_buf ptr
  *
- * allocate returns dmabuf fd  on success, -errno on error.
+ * allocate returns dmabuf on success, ERR_PTR(-errno) on error.
  */
 struct dma_heap_ops {
-	int (*allocate)(struct dma_heap *heap,
-			unsigned long len,
-			unsigned long fd_flags,
-			unsigned long heap_flags);
+	struct dma_buf *(*allocate)(struct dma_heap *heap,
+				    unsigned long len,
+				    unsigned long fd_flags,
+				    unsigned long heap_flags);
 };
 
 /**
-- 
cgit v1.2.3


From 5a164ac4dbd21b82bcdc03186d40e455ff467fdc Mon Sep 17 00:00:00 2001
From: Veera Sundaram Sankaran <veeras@codeaurora.org>
Date: Fri, 15 Jan 2021 16:31:46 -0800
Subject: dma-fence: allow signaling drivers to set fence timestamp

Some drivers have hardware capability to get the precise HW timestamp
of certain events based on which the fences are triggered. The delta
between the event HW timestamp & current HW reference timestamp can
be used to calculate the timestamp in kernel's CLOCK_MONOTONIC time
domain. This allows it to set accurate timestamp factoring out any
software and IRQ latencies. Add a timestamp variant of fence signal
function, dma_fence_signal_timestamp to allow drivers to update the
precise timestamp for fences.

Changes in v2:
- Add a new fence signal variant instead of modifying fence struct

Changes in v3:
- Add timestamp domain information to commit-text and
dma_fence_signal_timestamp documentation

Signed-off-by: Veera Sundaram Sankaran <veeras@codeaurora.org>
Reviewed-by: John Stultz <john.stultz@linaro.org>
Signed-off-by: Sumit Semwal <sumit.semwal@linaro.org>
 [sumits: minor parenthesis alignment]
Link: https://patchwork.freedesktop.org/patch/msgid/1610757107-11892-1-git-send-email-veeras@codeaurora.org
---
 drivers/dma-buf/dma-fence.c | 70 ++++++++++++++++++++++++++++++++++++++++-----
 include/linux/dma-fence.h   |  3 ++
 2 files changed, 66 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c
index 7475e09b0680..d64fc03929be 100644
--- a/drivers/dma-buf/dma-fence.c
+++ b/drivers/dma-buf/dma-fence.c
@@ -312,22 +312,25 @@ void __dma_fence_might_wait(void)
 
 
 /**
- * dma_fence_signal_locked - signal completion of a fence
+ * dma_fence_signal_timestamp_locked - signal completion of a fence
  * @fence: the fence to signal
+ * @timestamp: fence signal timestamp in kernel's CLOCK_MONOTONIC time domain
  *
  * Signal completion for software callbacks on a fence, this will unblock
  * dma_fence_wait() calls and run all the callbacks added with
  * dma_fence_add_callback(). Can be called multiple times, but since a fence
  * can only go from the unsignaled to the signaled state and not back, it will
- * only be effective the first time.
+ * only be effective the first time. Set the timestamp provided as the fence
+ * signal timestamp.
  *
- * Unlike dma_fence_signal(), this function must be called with &dma_fence.lock
- * held.
+ * Unlike dma_fence_signal_timestamp(), this function must be called with
+ * &dma_fence.lock held.
  *
  * Returns 0 on success and a negative error value when @fence has been
  * signalled already.
  */
-int dma_fence_signal_locked(struct dma_fence *fence)
+int dma_fence_signal_timestamp_locked(struct dma_fence *fence,
+				      ktime_t timestamp)
 {
 	struct dma_fence_cb *cur, *tmp;
 	struct list_head cb_list;
@@ -341,7 +344,7 @@ int dma_fence_signal_locked(struct dma_fence *fence)
 	/* Stash the cb_list before replacing it with the timestamp */
 	list_replace(&fence->cb_list, &cb_list);
 
-	fence->timestamp = ktime_get();
+	fence->timestamp = timestamp;
 	set_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &fence->flags);
 	trace_dma_fence_signaled(fence);
 
@@ -352,6 +355,59 @@ int dma_fence_signal_locked(struct dma_fence *fence)
 
 	return 0;
 }
+EXPORT_SYMBOL(dma_fence_signal_timestamp_locked);
+
+/**
+ * dma_fence_signal_timestamp - signal completion of a fence
+ * @fence: the fence to signal
+ * @timestamp: fence signal timestamp in kernel's CLOCK_MONOTONIC time domain
+ *
+ * Signal completion for software callbacks on a fence, this will unblock
+ * dma_fence_wait() calls and run all the callbacks added with
+ * dma_fence_add_callback(). Can be called multiple times, but since a fence
+ * can only go from the unsignaled to the signaled state and not back, it will
+ * only be effective the first time. Set the timestamp provided as the fence
+ * signal timestamp.
+ *
+ * Returns 0 on success and a negative error value when @fence has been
+ * signalled already.
+ */
+int dma_fence_signal_timestamp(struct dma_fence *fence, ktime_t timestamp)
+{
+	unsigned long flags;
+	int ret;
+
+	if (!fence)
+		return -EINVAL;
+
+	spin_lock_irqsave(fence->lock, flags);
+	ret = dma_fence_signal_timestamp_locked(fence, timestamp);
+	spin_unlock_irqrestore(fence->lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(dma_fence_signal_timestamp);
+
+/**
+ * dma_fence_signal_locked - signal completion of a fence
+ * @fence: the fence to signal
+ *
+ * Signal completion for software callbacks on a fence, this will unblock
+ * dma_fence_wait() calls and run all the callbacks added with
+ * dma_fence_add_callback(). Can be called multiple times, but since a fence
+ * can only go from the unsignaled to the signaled state and not back, it will
+ * only be effective the first time.
+ *
+ * Unlike dma_fence_signal(), this function must be called with &dma_fence.lock
+ * held.
+ *
+ * Returns 0 on success and a negative error value when @fence has been
+ * signalled already.
+ */
+int dma_fence_signal_locked(struct dma_fence *fence)
+{
+	return dma_fence_signal_timestamp_locked(fence, ktime_get());
+}
 EXPORT_SYMBOL(dma_fence_signal_locked);
 
 /**
@@ -379,7 +435,7 @@ int dma_fence_signal(struct dma_fence *fence)
 	tmp = dma_fence_begin_signalling();
 
 	spin_lock_irqsave(fence->lock, flags);
-	ret = dma_fence_signal_locked(fence);
+	ret = dma_fence_signal_timestamp_locked(fence, ktime_get());
 	spin_unlock_irqrestore(fence->lock, flags);
 
 	dma_fence_end_signalling(tmp);
diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h
index 09e23adb351d..9f12efaaa93a 100644
--- a/include/linux/dma-fence.h
+++ b/include/linux/dma-fence.h
@@ -372,6 +372,9 @@ static inline void __dma_fence_might_wait(void) {}
 
 int dma_fence_signal(struct dma_fence *fence);
 int dma_fence_signal_locked(struct dma_fence *fence);
+int dma_fence_signal_timestamp(struct dma_fence *fence, ktime_t timestamp);
+int dma_fence_signal_timestamp_locked(struct dma_fence *fence,
+				      ktime_t timestamp);
 signed long dma_fence_default_wait(struct dma_fence *fence,
 				   bool intr, signed long timeout);
 int dma_fence_add_callback(struct dma_fence *fence,
-- 
cgit v1.2.3


From 66922f850910056a2a3193388dd69b900e824717 Mon Sep 17 00:00:00 2001
From: Mario Kleiner <mario.kleiner.de@gmail.com>
Date: Sun, 24 Jan 2021 05:40:10 +0100
Subject: drm: Fix HDMI_STATIC_METADATA_TYPE1 constant
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

According to the CTA 861.G spec, HDMI_STATIC_METADATA_TYPE1 is
not 1, but zero, so fix this enum.

While this doesn't cause problems in the kernel yet, as the
constant isn't actively used by drivers yet, it did create
confusion while debugging HDR problems in yours truly, and
also potential bugs in userspace components, as the wrong
enum propagates to components, e.g., like it did already
into intel-gpu-tools (tests/kms_hdr.c) or is used as wrong
reference when writing future new userspace HDR components
like compositors.

Fixes: fbb5d0353c62 ("drm: Add HDR source metadata property")
Signed-off-by: Mario Kleiner <mario.kleiner.de@gmail.com>
Cc: Uma Shankar <uma.shankar@intel.com>
Cc: Shashank Sharma <shashank.sharma@intel.com>
Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
Reviewed-by: Simon Ser <contact@emersion.fr>
Signed-off-by: Simon Ser <contact@emersion.fr>
Link: https://patchwork.freedesktop.org/patch/msgid/20210124044010.18678-1-mario.kleiner.de@gmail.com
---
 include/linux/hdmi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/hdmi.h b/include/linux/hdmi.h
index 9850d59d6f1c..c8ec982ff498 100644
--- a/include/linux/hdmi.h
+++ b/include/linux/hdmi.h
@@ -156,7 +156,7 @@ enum hdmi_content_type {
 };
 
 enum hdmi_metadata_type {
-	HDMI_STATIC_METADATA_TYPE1 = 1,
+	HDMI_STATIC_METADATA_TYPE1 = 0,
 };
 
 enum hdmi_eotf {
-- 
cgit v1.2.3


From 2463e073497385ef63c220571013a2b89e9b95cc Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <alobakin@pm.me>
Date: Thu, 18 Feb 2021 20:49:41 +0000
Subject: netdevice: Add missing IFF_PHONY_HEADROOM self-definition

This is harmless for now, but can be fatal for future refactors.

Fixes: 871b642adebe3 ("netdev: introduce ndo_set_rx_headroom")
Signed-off-by: Alexander Lobakin <alobakin@pm.me>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20210218204908.5455-2-alobakin@pm.me
---
 include/linux/netdevice.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ddf4cfc12615..3b6f82c2c271 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1577,6 +1577,7 @@ enum netdev_priv_flags {
 #define IFF_L3MDEV_SLAVE		IFF_L3MDEV_SLAVE
 #define IFF_TEAM			IFF_TEAM
 #define IFF_RXFH_CONFIGURED		IFF_RXFH_CONFIGURED
+#define IFF_PHONY_HEADROOM		IFF_PHONY_HEADROOM
 #define IFF_MACSEC			IFF_MACSEC
 #define IFF_NO_RX_HANDLER		IFF_NO_RX_HANDLER
 #define IFF_FAILOVER			IFF_FAILOVER
-- 
cgit v1.2.3


From c2ff53d8049f30098153cd2d1299a44d7b124c57 Mon Sep 17 00:00:00 2001
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Date: Thu, 18 Feb 2021 20:50:02 +0000
Subject: net: Add priv_flags for allow tx skb without linear

In some cases, we hope to construct skb directly based on the existing
memory without copying data. In this case, the page will be placed
directly in the skb, and the linear space of skb is empty. But
unfortunately, many network cards do not support this operation.
For example Mellanox Technologies MT27710 Family [ConnectX-4 Lx] will
get the following error message:

    mlx5_core 0000:3b:00.1 eth1: Error cqe on cqn 0x817, ci 0x8,
    qn 0x1dbb, opcode 0xd, syndrome 0x1, vendor syndrome 0x68
    00000000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
    00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
    00000020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
    00000030: 00 00 00 00 60 10 68 01 0a 00 1d bb 00 0f 9f d2
    WQE DUMP: WQ size 1024 WQ cur size 0, WQE index 0xf, len: 64
    00000000: 00 00 0f 0a 00 1d bb 03 00 00 00 08 00 00 00 00
    00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
    00000020: 00 00 00 2b 00 08 00 00 00 00 00 05 9e e3 08 00
    00000030: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
    mlx5_core 0000:3b:00.1 eth1: ERR CQE on SQ: 0x1dbb

So a priv_flag is added here to indicate whether the network card
supports this feature.

Suggested-by: Alexander Lobakin <alobakin@pm.me>
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Signed-off-by: Alexander Lobakin <alobakin@pm.me>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20210218204908.5455-3-alobakin@pm.me
---
 include/linux/netdevice.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3b6f82c2c271..6cef47b76cc6 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1518,6 +1518,8 @@ struct net_device_ops {
  * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device
  * @IFF_L3MDEV_RX_HANDLER: only invoke the rx handler of L3 master device
  * @IFF_LIVE_RENAME_OK: rename is allowed while device is up and running
+ * @IFF_TX_SKB_NO_LINEAR: device/driver is capable of xmitting frames with
+ *	skb_headlen(skb) == 0 (data starts from frag0)
  */
 enum netdev_priv_flags {
 	IFF_802_1Q_VLAN			= 1<<0,
@@ -1551,6 +1553,7 @@ enum netdev_priv_flags {
 	IFF_FAILOVER_SLAVE		= 1<<28,
 	IFF_L3MDEV_RX_HANDLER		= 1<<29,
 	IFF_LIVE_RENAME_OK		= 1<<30,
+	IFF_TX_SKB_NO_LINEAR		= 1<<31,
 };
 
 #define IFF_802_1Q_VLAN			IFF_802_1Q_VLAN
@@ -1584,6 +1587,7 @@ enum netdev_priv_flags {
 #define IFF_FAILOVER_SLAVE		IFF_FAILOVER_SLAVE
 #define IFF_L3MDEV_RX_HANDLER		IFF_L3MDEV_RX_HANDLER
 #define IFF_LIVE_RENAME_OK		IFF_LIVE_RENAME_OK
+#define IFF_TX_SKB_NO_LINEAR		IFF_TX_SKB_NO_LINEAR
 
 /**
  *	struct net_device - The DEVICE structure.
-- 
cgit v1.2.3


From 98cd02da467cb4a319b7041d6242de8a4a33e833 Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Tue, 9 Feb 2021 19:48:17 +0000
Subject: dma-buf: dma-heap: Provide accessor to get heap name
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It can be useful to access the name for the heap,
so provide an accessor to do so.

Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Sumit Semwal <sumit.semwal@linaro.org>
Cc: Liam Mark <lmark@codeaurora.org>
Cc: Chris Goldsworthy <cgoldswo@codeaurora.org>
Cc: Laura Abbott <labbott@kernel.org>
Cc: Brian Starkey <Brian.Starkey@arm.com>
Cc: Hridya Valsaraju <hridya@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sandeep Patil <sspatil@google.com>
Cc: Daniel Mentz <danielmentz@google.com>
Cc: Ørjan Eide <orjan.eide@arm.com>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Ezequiel Garcia <ezequiel@collabora.com>
Cc: Simon Ser <contact@emersion.fr>
Cc: James Jones <jajones@nvidia.com>
Cc: linux-media@vger.kernel.org
Cc: dri-devel@lists.freedesktop.org
Signed-off-by: John Stultz <john.stultz@linaro.org>
Acked-by: Daniel Vetter <daniel@ffwll.ch>
Signed-off-by: Sumit Semwal <sumit.semwal@linaro.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20210209194818.2459062-1-john.stultz@linaro.org
---
 drivers/dma-buf/dma-heap.c | 12 ++++++++++++
 include/linux/dma-heap.h   |  9 +++++++++
 2 files changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/dma-buf/dma-heap.c b/drivers/dma-buf/dma-heap.c
index 6b5db954569f..56bf5ad01ad5 100644
--- a/drivers/dma-buf/dma-heap.c
+++ b/drivers/dma-buf/dma-heap.c
@@ -202,6 +202,18 @@ void *dma_heap_get_drvdata(struct dma_heap *heap)
 	return heap->priv;
 }
 
+/**
+ * dma_heap_get_name() - get heap name
+ * @heap: DMA-Heap to retrieve the name of
+ *
+ * Returns:
+ * The char* for the heap name.
+ */
+const char *dma_heap_get_name(struct dma_heap *heap)
+{
+	return heap->name;
+}
+
 struct dma_heap *dma_heap_add(const struct dma_heap_export_info *exp_info)
 {
 	struct dma_heap *heap, *h, *err_ret;
diff --git a/include/linux/dma-heap.h b/include/linux/dma-heap.h
index 5bc5c946af58..0c05561cad6e 100644
--- a/include/linux/dma-heap.h
+++ b/include/linux/dma-heap.h
@@ -50,6 +50,15 @@ struct dma_heap_export_info {
  */
 void *dma_heap_get_drvdata(struct dma_heap *heap);
 
+/**
+ * dma_heap_get_name() - get heap name
+ * @heap: DMA-Heap to retrieve the name of
+ *
+ * Returns:
+ * The char* for the heap name.
+ */
+const char *dma_heap_get_name(struct dma_heap *heap);
+
 /**
  * dma_heap_add - adds a heap to dmabuf heaps
  * @exp_info:		information needed to register this heap
-- 
cgit v1.2.3


From f00bdce0455233a0b76dae6364442dca717a574c Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Tue, 23 Feb 2021 14:19:04 +0800
Subject: vdpa: set the virtqueue num during register

This patch delays setting the number of virtqueues until vDPA device
registration. This allows us to probe the number of virtqueues between
device allocation and registration.

Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/20210223061905.422659-3-jasowang@redhat.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vdpa/ifcvf/ifcvf_main.c      |  5 ++---
 drivers/vdpa/mlx5/net/mlx5_vnet.c    |  4 ++--
 drivers/vdpa/vdpa.c                  | 18 ++++++++++--------
 drivers/vdpa/vdpa_sim/vdpa_sim.c     |  2 +-
 drivers/vdpa/vdpa_sim/vdpa_sim_net.c |  2 +-
 include/linux/vdpa.h                 | 10 +++++-----
 6 files changed, 21 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c
index 7c8bbfcf6c3e..d555a6a5d1ba 100644
--- a/drivers/vdpa/ifcvf/ifcvf_main.c
+++ b/drivers/vdpa/ifcvf/ifcvf_main.c
@@ -431,8 +431,7 @@ static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	}
 
 	adapter = vdpa_alloc_device(struct ifcvf_adapter, vdpa,
-				    dev, &ifc_vdpa_ops,
-				    IFCVF_MAX_QUEUE_PAIRS * 2, NULL);
+				    dev, &ifc_vdpa_ops, NULL);
 	if (adapter == NULL) {
 		IFCVF_ERR(pdev, "Failed to allocate vDPA structure");
 		return -ENOMEM;
@@ -456,7 +455,7 @@ static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	for (i = 0; i < IFCVF_MAX_QUEUE_PAIRS * 2; i++)
 		vf->vring[i].irq = -EINVAL;
 
-	ret = vdpa_register_device(&adapter->vdpa);
+	ret = vdpa_register_device(&adapter->vdpa, IFCVF_MAX_QUEUE_PAIRS * 2);
 	if (ret) {
 		IFCVF_ERR(pdev, "Failed to register ifcvf to vdpa bus");
 		goto err;
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 10e9b09932eb..71397fdafa6a 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1982,7 +1982,7 @@ static int mlx5v_probe(struct auxiliary_device *adev,
 	max_vqs = min_t(u32, max_vqs, MLX5_MAX_SUPPORTED_VQS);
 
 	ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mlx5_vdpa_ops,
-				 2 * mlx5_vdpa_max_qps(max_vqs), NULL);
+				 NULL);
 	if (IS_ERR(ndev))
 		return PTR_ERR(ndev);
 
@@ -2009,7 +2009,7 @@ static int mlx5v_probe(struct auxiliary_device *adev,
 	if (err)
 		goto err_res;
 
-	err = vdpa_register_device(&mvdev->vdev);
+	err = vdpa_register_device(&mvdev->vdev, 2 * mlx5_vdpa_max_qps(max_vqs));
 	if (err)
 		goto err_reg;
 
diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c
index da67f07e24fd..5cffce67cab0 100644
--- a/drivers/vdpa/vdpa.c
+++ b/drivers/vdpa/vdpa.c
@@ -69,7 +69,6 @@ static void vdpa_release_dev(struct device *d)
  * initialized but before registered.
  * @parent: the parent device
  * @config: the bus operations that is supported by this device
- * @nvqs: number of virtqueues supported by this device
  * @size: size of the parent structure that contains private data
  * @name: name of the vdpa device; optional.
  *
@@ -81,7 +80,7 @@ static void vdpa_release_dev(struct device *d)
  */
 struct vdpa_device *__vdpa_alloc_device(struct device *parent,
 					const struct vdpa_config_ops *config,
-					int nvqs, size_t size, const char *name)
+					size_t size, const char *name)
 {
 	struct vdpa_device *vdev;
 	int err = -EINVAL;
@@ -107,7 +106,6 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent,
 	vdev->index = err;
 	vdev->config = config;
 	vdev->features_valid = false;
-	vdev->nvqs = nvqs;
 
 	if (name)
 		err = dev_set_name(&vdev->dev, "%s", name);
@@ -136,10 +134,12 @@ static int vdpa_name_match(struct device *dev, const void *data)
 	return (strcmp(dev_name(&vdev->dev), data) == 0);
 }
 
-static int __vdpa_register_device(struct vdpa_device *vdev)
+static int __vdpa_register_device(struct vdpa_device *vdev, int nvqs)
 {
 	struct device *dev;
 
+	vdev->nvqs = nvqs;
+
 	lockdep_assert_held(&vdpa_dev_mutex);
 	dev = bus_find_device(&vdpa_bus, NULL, dev_name(&vdev->dev), vdpa_name_match);
 	if (dev) {
@@ -155,15 +155,16 @@ static int __vdpa_register_device(struct vdpa_device *vdev)
  * Caller must invoke this routine in the management device dev_add()
  * callback after setting up valid mgmtdev for this vdpa device.
  * @vdev: the vdpa device to be registered to vDPA bus
+ * @nvqs: number of virtqueues supported by this device
  *
  * Returns an error when fail to add device to vDPA bus
  */
-int _vdpa_register_device(struct vdpa_device *vdev)
+int _vdpa_register_device(struct vdpa_device *vdev, int nvqs)
 {
 	if (!vdev->mdev)
 		return -EINVAL;
 
-	return __vdpa_register_device(vdev);
+	return __vdpa_register_device(vdev, nvqs);
 }
 EXPORT_SYMBOL_GPL(_vdpa_register_device);
 
@@ -171,15 +172,16 @@ EXPORT_SYMBOL_GPL(_vdpa_register_device);
  * vdpa_register_device - register a vDPA device
  * Callers must have a succeed call of vdpa_alloc_device() before.
  * @vdev: the vdpa device to be registered to vDPA bus
+ * @nvqs: number of virtqueues supported by this device
  *
  * Returns an error when fail to add to vDPA bus
  */
-int vdpa_register_device(struct vdpa_device *vdev)
+int vdpa_register_device(struct vdpa_device *vdev, int nvqs)
 {
 	int err;
 
 	mutex_lock(&vdpa_dev_mutex);
-	err = __vdpa_register_device(vdev);
+	err = __vdpa_register_device(vdev, nvqs);
 	mutex_unlock(&vdpa_dev_mutex);
 	return err;
 }
diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index d5942842432d..5b6b2f87d40c 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -235,7 +235,7 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr)
 		ops = &vdpasim_config_ops;
 
 	vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops,
-				    dev_attr->nvqs, dev_attr->name);
+				    dev_attr->name);
 	if (!vdpasim)
 		goto err_alloc;
 
diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c
index d344c5b7c914..702be74877d2 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c
@@ -147,7 +147,7 @@ static int vdpasim_net_dev_add(struct vdpa_mgmt_dev *mdev, const char *name)
 	if (IS_ERR(simdev))
 		return PTR_ERR(simdev);
 
-	ret = _vdpa_register_device(&simdev->vdpa);
+	ret = _vdpa_register_device(&simdev->vdpa, VDPASIM_NET_VQ_NUM);
 	if (ret)
 		goto reg_err;
 
diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 4ab5494503a8..15fa085fab05 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -250,20 +250,20 @@ struct vdpa_config_ops {
 
 struct vdpa_device *__vdpa_alloc_device(struct device *parent,
 					const struct vdpa_config_ops *config,
-					int nvqs, size_t size, const char *name);
+					size_t size, const char *name);
 
-#define vdpa_alloc_device(dev_struct, member, parent, config, nvqs, name)   \
+#define vdpa_alloc_device(dev_struct, member, parent, config, name)   \
 			  container_of(__vdpa_alloc_device( \
-				       parent, config, nvqs, \
+				       parent, config, \
 				       sizeof(dev_struct) + \
 				       BUILD_BUG_ON_ZERO(offsetof( \
 				       dev_struct, member)), name), \
 				       dev_struct, member)
 
-int vdpa_register_device(struct vdpa_device *vdev);
+int vdpa_register_device(struct vdpa_device *vdev, int nvqs);
 void vdpa_unregister_device(struct vdpa_device *vdev);
 
-int _vdpa_register_device(struct vdpa_device *vdev);
+int _vdpa_register_device(struct vdpa_device *vdev, int nvqs);
 void _vdpa_unregister_device(struct vdpa_device *vdev);
 
 /**
-- 
cgit v1.2.3


From a10787e6d58c24b51e91c19c6d16c5da89fcaa4b Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Thu, 25 Feb 2021 15:43:14 -0800
Subject: bpf: Enable task local storage for tracing programs

To access per-task data, BPF programs usually create a hash table with
pid as the key. This is not ideal because:
 1. The user needs to estimate the proper size of the hash table, which may
    be inaccurate;
 2. Big hash tables are slow;
 3. To clean up the data properly during task termination, the user needs
    to write extra logic.

Task local storage overcomes these issues and offers a better option for
these per-task data. Task local storage is only available to BPF_LSM. Now
enable it for tracing programs.

Unlike LSM programs, tracing programs can be called in IRQ contexts.
Helpers that access task local storage are updated to use
raw_spin_lock_irqsave() instead of raw_spin_lock_bh().

Tracing programs can attach to functions on the task free path, e.g.
exit_creds(). To avoid allocating task local storage after
bpf_task_storage_free(), bpf_task_storage_get() is updated to not allocate
new storage when the task is not refcounted (task->usage == 0).

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: KP Singh <kpsingh@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20210225234319.336131-2-songliubraving@fb.com
---
 include/linux/bpf.h            |  7 +++++++
 include/linux/bpf_lsm.h        | 22 ---------------------
 include/linux/bpf_types.h      |  2 +-
 include/linux/sched.h          |  5 +++++
 kernel/bpf/Makefile            |  3 +--
 kernel/bpf/bpf_local_storage.c | 28 ++++++++++++++++-----------
 kernel/bpf/bpf_lsm.c           |  4 ----
 kernel/bpf/bpf_task_storage.c  | 43 +++++++++++-------------------------------
 kernel/fork.c                  |  5 +++++
 kernel/trace/bpf_trace.c       |  4 ++++
 10 files changed, 51 insertions(+), 72 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index cccaef1088ea..e2cfc4809219 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1499,6 +1499,7 @@ struct bpf_prog *bpf_prog_by_id(u32 id);
 struct bpf_link *bpf_link_by_id(u32 id);
 
 const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id);
+void bpf_task_storage_free(struct task_struct *task);
 #else /* !CONFIG_BPF_SYSCALL */
 static inline struct bpf_prog *bpf_prog_get(u32 ufd)
 {
@@ -1684,6 +1685,10 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 {
 	return NULL;
 }
+
+static inline void bpf_task_storage_free(struct task_struct *task)
+{
+}
 #endif /* CONFIG_BPF_SYSCALL */
 
 void __bpf_free_used_btfs(struct bpf_prog_aux *aux,
@@ -1886,6 +1891,8 @@ extern const struct bpf_func_proto bpf_this_cpu_ptr_proto;
 extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto;
 extern const struct bpf_func_proto bpf_sock_from_file_proto;
 extern const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto;
+extern const struct bpf_func_proto bpf_task_storage_get_proto;
+extern const struct bpf_func_proto bpf_task_storage_delete_proto;
 
 const struct bpf_func_proto *bpf_tracing_func_proto(
 	enum bpf_func_id func_id, const struct bpf_prog *prog);
diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h
index 0d1c33ace398..479c101546ad 100644
--- a/include/linux/bpf_lsm.h
+++ b/include/linux/bpf_lsm.h
@@ -38,21 +38,9 @@ static inline struct bpf_storage_blob *bpf_inode(
 	return inode->i_security + bpf_lsm_blob_sizes.lbs_inode;
 }
 
-static inline struct bpf_storage_blob *bpf_task(
-	const struct task_struct *task)
-{
-	if (unlikely(!task->security))
-		return NULL;
-
-	return task->security + bpf_lsm_blob_sizes.lbs_task;
-}
-
 extern const struct bpf_func_proto bpf_inode_storage_get_proto;
 extern const struct bpf_func_proto bpf_inode_storage_delete_proto;
-extern const struct bpf_func_proto bpf_task_storage_get_proto;
-extern const struct bpf_func_proto bpf_task_storage_delete_proto;
 void bpf_inode_storage_free(struct inode *inode);
-void bpf_task_storage_free(struct task_struct *task);
 
 #else /* !CONFIG_BPF_LSM */
 
@@ -73,20 +61,10 @@ static inline struct bpf_storage_blob *bpf_inode(
 	return NULL;
 }
 
-static inline struct bpf_storage_blob *bpf_task(
-	const struct task_struct *task)
-{
-	return NULL;
-}
-
 static inline void bpf_inode_storage_free(struct inode *inode)
 {
 }
 
-static inline void bpf_task_storage_free(struct task_struct *task)
-{
-}
-
 #endif /* CONFIG_BPF_LSM */
 
 #endif /* _LINUX_BPF_LSM_H */
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 99f7fd657d87..b9edee336d80 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -109,8 +109,8 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops)
 #endif
 #ifdef CONFIG_BPF_LSM
 BPF_MAP_TYPE(BPF_MAP_TYPE_INODE_STORAGE, inode_storage_map_ops)
-BPF_MAP_TYPE(BPF_MAP_TYPE_TASK_STORAGE, task_storage_map_ops)
 #endif
+BPF_MAP_TYPE(BPF_MAP_TYPE_TASK_STORAGE, task_storage_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
 #if defined(CONFIG_XDP_SOCKETS)
 BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4d568288abf9..e5fbf8e6952a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -42,6 +42,7 @@ struct audit_context;
 struct backing_dev_info;
 struct bio_list;
 struct blk_plug;
+struct bpf_local_storage;
 struct capture_control;
 struct cfs_rq;
 struct fs_struct;
@@ -1348,6 +1349,10 @@ struct task_struct {
 	/* Used by LSM modules for access restriction: */
 	void				*security;
 #endif
+#ifdef CONFIG_BPF_SYSCALL
+	/* Used by BPF task local storage */
+	struct bpf_local_storage __rcu	*bpf_storage;
+#endif
 
 #ifdef CONFIG_GCC_PLUGIN_STACKLEAK
 	unsigned long			lowest_stack;
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index d1249340fd6b..7f33098ca63f 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -9,8 +9,8 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
 obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
+obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
 obj-${CONFIG_BPF_LSM}	  += bpf_inode_storage.o
-obj-${CONFIG_BPF_LSM}	  += bpf_task_storage.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
 obj-$(CONFIG_BPF_JIT) += trampoline.o
 obj-$(CONFIG_BPF_SYSCALL) += btf.o
@@ -18,7 +18,6 @@ obj-$(CONFIG_BPF_JIT) += dispatcher.o
 ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_BPF_SYSCALL) += devmap.o
 obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
-obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o
 obj-$(CONFIG_BPF_SYSCALL) += offload.o
 obj-$(CONFIG_BPF_SYSCALL) += net_namespace.o
 endif
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index dd5aedee99e7..9bd47ad2b26f 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -140,17 +140,18 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
 {
 	struct bpf_local_storage *local_storage;
 	bool free_local_storage = false;
+	unsigned long flags;
 
 	if (unlikely(!selem_linked_to_storage(selem)))
 		/* selem has already been unlinked from sk */
 		return;
 
 	local_storage = rcu_dereference(selem->local_storage);
-	raw_spin_lock_bh(&local_storage->lock);
+	raw_spin_lock_irqsave(&local_storage->lock, flags);
 	if (likely(selem_linked_to_storage(selem)))
 		free_local_storage = bpf_selem_unlink_storage_nolock(
 			local_storage, selem, true);
-	raw_spin_unlock_bh(&local_storage->lock);
+	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 
 	if (free_local_storage)
 		kfree_rcu(local_storage, rcu);
@@ -167,6 +168,7 @@ void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
 {
 	struct bpf_local_storage_map *smap;
 	struct bpf_local_storage_map_bucket *b;
+	unsigned long flags;
 
 	if (unlikely(!selem_linked_to_map(selem)))
 		/* selem has already be unlinked from smap */
@@ -174,21 +176,22 @@ void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
 
 	smap = rcu_dereference(SDATA(selem)->smap);
 	b = select_bucket(smap, selem);
-	raw_spin_lock_bh(&b->lock);
+	raw_spin_lock_irqsave(&b->lock, flags);
 	if (likely(selem_linked_to_map(selem)))
 		hlist_del_init_rcu(&selem->map_node);
-	raw_spin_unlock_bh(&b->lock);
+	raw_spin_unlock_irqrestore(&b->lock, flags);
 }
 
 void bpf_selem_link_map(struct bpf_local_storage_map *smap,
 			struct bpf_local_storage_elem *selem)
 {
 	struct bpf_local_storage_map_bucket *b = select_bucket(smap, selem);
+	unsigned long flags;
 
-	raw_spin_lock_bh(&b->lock);
+	raw_spin_lock_irqsave(&b->lock, flags);
 	RCU_INIT_POINTER(SDATA(selem)->smap, smap);
 	hlist_add_head_rcu(&selem->map_node, &b->list);
-	raw_spin_unlock_bh(&b->lock);
+	raw_spin_unlock_irqrestore(&b->lock, flags);
 }
 
 void bpf_selem_unlink(struct bpf_local_storage_elem *selem)
@@ -224,16 +227,18 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
 
 	sdata = SDATA(selem);
 	if (cacheit_lockit) {
+		unsigned long flags;
+
 		/* spinlock is needed to avoid racing with the
 		 * parallel delete.  Otherwise, publishing an already
 		 * deleted sdata to the cache will become a use-after-free
 		 * problem in the next bpf_local_storage_lookup().
 		 */
-		raw_spin_lock_bh(&local_storage->lock);
+		raw_spin_lock_irqsave(&local_storage->lock, flags);
 		if (selem_linked_to_storage(selem))
 			rcu_assign_pointer(local_storage->cache[smap->cache_idx],
 					   sdata);
-		raw_spin_unlock_bh(&local_storage->lock);
+		raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 	}
 
 	return sdata;
@@ -327,6 +332,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 	struct bpf_local_storage_data *old_sdata = NULL;
 	struct bpf_local_storage_elem *selem;
 	struct bpf_local_storage *local_storage;
+	unsigned long flags;
 	int err;
 
 	/* BPF_EXIST and BPF_NOEXIST cannot be both set */
@@ -374,7 +380,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 		}
 	}
 
-	raw_spin_lock_bh(&local_storage->lock);
+	raw_spin_lock_irqsave(&local_storage->lock, flags);
 
 	/* Recheck local_storage->list under local_storage->lock */
 	if (unlikely(hlist_empty(&local_storage->list))) {
@@ -428,11 +434,11 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 	}
 
 unlock:
-	raw_spin_unlock_bh(&local_storage->lock);
+	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 	return SDATA(selem);
 
 unlock_err:
-	raw_spin_unlock_bh(&local_storage->lock);
+	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 	return ERR_PTR(err);
 }
 
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 1622a44d1617..9829f381b51c 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -115,10 +115,6 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_spin_lock_proto;
 	case BPF_FUNC_spin_unlock:
 		return &bpf_spin_unlock_proto;
-	case BPF_FUNC_task_storage_get:
-		return &bpf_task_storage_get_proto;
-	case BPF_FUNC_task_storage_delete:
-		return &bpf_task_storage_delete_proto;
 	case BPF_FUNC_bprm_opts_set:
 		return &bpf_bprm_opts_set_proto;
 	case BPF_FUNC_ima_inode_hash:
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index e0da0258b732..baf3566e2323 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -15,7 +15,6 @@
 #include <linux/bpf_local_storage.h>
 #include <linux/filter.h>
 #include <uapi/linux/btf.h>
-#include <linux/bpf_lsm.h>
 #include <linux/btf_ids.h>
 #include <linux/fdtable.h>
 
@@ -24,12 +23,8 @@ DEFINE_BPF_STORAGE_CACHE(task_cache);
 static struct bpf_local_storage __rcu **task_storage_ptr(void *owner)
 {
 	struct task_struct *task = owner;
-	struct bpf_storage_blob *bsb;
 
-	bsb = bpf_task(task);
-	if (!bsb)
-		return NULL;
-	return &bsb->storage;
+	return &task->bpf_storage;
 }
 
 static struct bpf_local_storage_data *
@@ -38,13 +33,8 @@ task_storage_lookup(struct task_struct *task, struct bpf_map *map,
 {
 	struct bpf_local_storage *task_storage;
 	struct bpf_local_storage_map *smap;
-	struct bpf_storage_blob *bsb;
-
-	bsb = bpf_task(task);
-	if (!bsb)
-		return NULL;
 
-	task_storage = rcu_dereference(bsb->storage);
+	task_storage = rcu_dereference(task->bpf_storage);
 	if (!task_storage)
 		return NULL;
 
@@ -57,16 +47,12 @@ void bpf_task_storage_free(struct task_struct *task)
 	struct bpf_local_storage_elem *selem;
 	struct bpf_local_storage *local_storage;
 	bool free_task_storage = false;
-	struct bpf_storage_blob *bsb;
 	struct hlist_node *n;
-
-	bsb = bpf_task(task);
-	if (!bsb)
-		return;
+	unsigned long flags;
 
 	rcu_read_lock();
 
-	local_storage = rcu_dereference(bsb->storage);
+	local_storage = rcu_dereference(task->bpf_storage);
 	if (!local_storage) {
 		rcu_read_unlock();
 		return;
@@ -81,7 +67,7 @@ void bpf_task_storage_free(struct task_struct *task)
 	 * when unlinking elem from the local_storage->list and
 	 * the map's bucket->list.
 	 */
-	raw_spin_lock_bh(&local_storage->lock);
+	raw_spin_lock_irqsave(&local_storage->lock, flags);
 	hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
 		/* Always unlink from map before unlinking from
 		 * local_storage.
@@ -90,7 +76,7 @@ void bpf_task_storage_free(struct task_struct *task)
 		free_task_storage = bpf_selem_unlink_storage_nolock(
 			local_storage, selem, false);
 	}
-	raw_spin_unlock_bh(&local_storage->lock);
+	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 	rcu_read_unlock();
 
 	/* free_task_storage should always be true as long as
@@ -150,7 +136,7 @@ static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
 	 */
 	WARN_ON_ONCE(!rcu_read_lock_held());
 	task = pid_task(pid, PIDTYPE_PID);
-	if (!task || !task_storage_ptr(task)) {
+	if (!task) {
 		err = -ENOENT;
 		goto out;
 	}
@@ -213,23 +199,16 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
 	if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
 		return (unsigned long)NULL;
 
-	/* explicitly check that the task_storage_ptr is not
-	 * NULL as task_storage_lookup returns NULL in this case and
-	 * bpf_local_storage_update expects the owner to have a
-	 * valid storage pointer.
-	 */
-	if (!task || !task_storage_ptr(task))
+	if (!task)
 		return (unsigned long)NULL;
 
 	sdata = task_storage_lookup(task, map, true);
 	if (sdata)
 		return (unsigned long)sdata->data;
 
-	/* This helper must only be called from places where the lifetime of the task
-	 * is guaranteed. Either by being refcounted or by being protected
-	 * by an RCU read-side critical section.
-	 */
-	if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
+	/* only allocate new storage, when the task is refcounted */
+	if (refcount_read(&task->usage) &&
+	    (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) {
 		sdata = bpf_local_storage_update(
 			task, (struct bpf_local_storage_map *)map, value,
 			BPF_NOEXIST);
diff --git a/kernel/fork.c b/kernel/fork.c
index d66cd1014211..181604db2d65 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -96,6 +96,7 @@
 #include <linux/kasan.h>
 #include <linux/scs.h>
 #include <linux/io_uring.h>
+#include <linux/bpf.h>
 
 #include <asm/pgalloc.h>
 #include <linux/uaccess.h>
@@ -734,6 +735,7 @@ void __put_task_struct(struct task_struct *tsk)
 	cgroup_free(tsk);
 	task_numa_free(tsk, true);
 	security_task_free(tsk);
+	bpf_task_storage_free(tsk);
 	exit_creds(tsk);
 	delayacct_tsk_free(tsk);
 	put_signal_struct(tsk->signal);
@@ -2062,6 +2064,9 @@ static __latent_entropy struct task_struct *copy_process(
 	p->sequential_io	= 0;
 	p->sequential_io_avg	= 0;
 #endif
+#ifdef CONFIG_BPF_SYSCALL
+	RCU_INIT_POINTER(p->bpf_storage, NULL);
+#endif
 
 	/* Perform scheduler related setup. Assign this task to a CPU. */
 	retval = sched_fork(clone_flags, p);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index b0c45d923f0f..e9701744d8e4 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1367,6 +1367,10 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_per_cpu_ptr_proto;
 	case BPF_FUNC_this_cpu_ptr:
 		return &bpf_this_cpu_ptr_proto;
+	case BPF_FUNC_task_storage_get:
+		return &bpf_task_storage_get_proto;
+	case BPF_FUNC_task_storage_delete:
+		return &bpf_task_storage_delete_proto;
 	default:
 		return NULL;
 	}
-- 
cgit v1.2.3


From bc235cdb423a2daed6f337676006a66557429cd1 Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Thu, 25 Feb 2021 15:43:15 -0800
Subject: bpf: Prevent deadlock from recursive bpf_task_storage_[get|delete]

BPF helpers bpf_task_storage_[get|delete] could hold two locks:
bpf_local_storage_map_bucket->lock and bpf_local_storage->lock. Calling
these helpers from fentry/fexit programs on functions in bpf_*_storage.c
may cause deadlock on either lock.

Prevent such deadlock with a per cpu counter, bpf_task_storage_busy. We
need this counter to be global, because the two locks here belong to two
different objects: bpf_local_storage_map and bpf_local_storage. If we
pick one of them as the owner of the counter, it is still possible to
trigger deadlock on the other lock. For example, if bpf_local_storage_map
owns the counters, it cannot prevent deadlock on bpf_local_storage->lock
when two maps are used.

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20210225234319.336131-3-songliubraving@fb.com
---
 include/linux/bpf_local_storage.h |  3 +-
 kernel/bpf/bpf_inode_storage.c    |  2 +-
 kernel/bpf/bpf_local_storage.c    | 11 +++++++-
 kernel/bpf/bpf_task_storage.c     | 59 +++++++++++++++++++++++++++++++++------
 net/core/bpf_sk_storage.c         |  2 +-
 5 files changed, 65 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index b2c9463f36a1..b902c580c48d 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -126,7 +126,8 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
 			 struct bpf_local_storage_map *smap,
 			 bool cacheit_lockit);
 
-void bpf_local_storage_map_free(struct bpf_local_storage_map *smap);
+void bpf_local_storage_map_free(struct bpf_local_storage_map *smap,
+				int __percpu *busy_counter);
 
 int bpf_local_storage_map_check_btf(const struct bpf_map *map,
 				    const struct btf *btf,
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index 6639640523c0..da753721457c 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -237,7 +237,7 @@ static void inode_storage_map_free(struct bpf_map *map)
 
 	smap = (struct bpf_local_storage_map *)map;
 	bpf_local_storage_cache_idx_free(&inode_cache, smap->cache_idx);
-	bpf_local_storage_map_free(smap);
+	bpf_local_storage_map_free(smap, NULL);
 }
 
 static int inode_storage_map_btf_id;
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 9bd47ad2b26f..b305270b7a4b 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -474,7 +474,8 @@ void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache,
 	spin_unlock(&cache->idx_lock);
 }
 
-void bpf_local_storage_map_free(struct bpf_local_storage_map *smap)
+void bpf_local_storage_map_free(struct bpf_local_storage_map *smap,
+				int __percpu *busy_counter)
 {
 	struct bpf_local_storage_elem *selem;
 	struct bpf_local_storage_map_bucket *b;
@@ -503,7 +504,15 @@ void bpf_local_storage_map_free(struct bpf_local_storage_map *smap)
 		while ((selem = hlist_entry_safe(
 				rcu_dereference_raw(hlist_first_rcu(&b->list)),
 				struct bpf_local_storage_elem, map_node))) {
+			if (busy_counter) {
+				migrate_disable();
+				__this_cpu_inc(*busy_counter);
+			}
 			bpf_selem_unlink(selem);
+			if (busy_counter) {
+				__this_cpu_dec(*busy_counter);
+				migrate_enable();
+			}
 			cond_resched_rcu();
 		}
 		rcu_read_unlock();
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index baf3566e2323..fd3c74ef608e 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -20,6 +20,31 @@
 
 DEFINE_BPF_STORAGE_CACHE(task_cache);
 
+DEFINE_PER_CPU(int, bpf_task_storage_busy);
+
+static void bpf_task_storage_lock(void)
+{
+	migrate_disable();
+	__this_cpu_inc(bpf_task_storage_busy);
+}
+
+static void bpf_task_storage_unlock(void)
+{
+	__this_cpu_dec(bpf_task_storage_busy);
+	migrate_enable();
+}
+
+static bool bpf_task_storage_trylock(void)
+{
+	migrate_disable();
+	if (unlikely(__this_cpu_inc_return(bpf_task_storage_busy) != 1)) {
+		__this_cpu_dec(bpf_task_storage_busy);
+		migrate_enable();
+		return false;
+	}
+	return true;
+}
+
 static struct bpf_local_storage __rcu **task_storage_ptr(void *owner)
 {
 	struct task_struct *task = owner;
@@ -67,6 +92,7 @@ void bpf_task_storage_free(struct task_struct *task)
 	 * when unlinking elem from the local_storage->list and
 	 * the map's bucket->list.
 	 */
+	bpf_task_storage_lock();
 	raw_spin_lock_irqsave(&local_storage->lock, flags);
 	hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
 		/* Always unlink from map before unlinking from
@@ -77,6 +103,7 @@ void bpf_task_storage_free(struct task_struct *task)
 			local_storage, selem, false);
 	}
 	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+	bpf_task_storage_unlock();
 	rcu_read_unlock();
 
 	/* free_task_storage should always be true as long as
@@ -109,7 +136,9 @@ static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key)
 		goto out;
 	}
 
+	bpf_task_storage_lock();
 	sdata = task_storage_lookup(task, map, true);
+	bpf_task_storage_unlock();
 	put_pid(pid);
 	return sdata ? sdata->data : NULL;
 out:
@@ -141,8 +170,10 @@ static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
 		goto out;
 	}
 
+	bpf_task_storage_lock();
 	sdata = bpf_local_storage_update(
 		task, (struct bpf_local_storage_map *)map, value, map_flags);
+	bpf_task_storage_unlock();
 
 	err = PTR_ERR_OR_ZERO(sdata);
 out:
@@ -185,7 +216,9 @@ static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
 		goto out;
 	}
 
+	bpf_task_storage_lock();
 	err = task_storage_delete(task, map);
+	bpf_task_storage_unlock();
 out:
 	put_pid(pid);
 	return err;
@@ -202,34 +235,44 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
 	if (!task)
 		return (unsigned long)NULL;
 
+	if (!bpf_task_storage_trylock())
+		return (unsigned long)NULL;
+
 	sdata = task_storage_lookup(task, map, true);
 	if (sdata)
-		return (unsigned long)sdata->data;
+		goto unlock;
 
 	/* only allocate new storage, when the task is refcounted */
 	if (refcount_read(&task->usage) &&
-	    (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) {
+	    (flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
 		sdata = bpf_local_storage_update(
 			task, (struct bpf_local_storage_map *)map, value,
 			BPF_NOEXIST);
-		return IS_ERR(sdata) ? (unsigned long)NULL :
-					     (unsigned long)sdata->data;
-	}
 
-	return (unsigned long)NULL;
+unlock:
+	bpf_task_storage_unlock();
+	return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL :
+		(unsigned long)sdata->data;
 }
 
 BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *,
 	   task)
 {
+	int ret;
+
 	if (!task)
 		return -EINVAL;
 
+	if (!bpf_task_storage_trylock())
+		return -EBUSY;
+
 	/* This helper must only be called from places where the lifetime of the task
 	 * is guaranteed. Either by being refcounted or by being protected
 	 * by an RCU read-side critical section.
 	 */
-	return task_storage_delete(task, map);
+	ret = task_storage_delete(task, map);
+	bpf_task_storage_unlock();
+	return ret;
 }
 
 static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
@@ -255,7 +298,7 @@ static void task_storage_map_free(struct bpf_map *map)
 
 	smap = (struct bpf_local_storage_map *)map;
 	bpf_local_storage_cache_idx_free(&task_cache, smap->cache_idx);
-	bpf_local_storage_map_free(smap);
+	bpf_local_storage_map_free(smap, &bpf_task_storage_busy);
 }
 
 static int task_storage_map_btf_id;
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 4edd033e899c..cc3712ad8716 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -89,7 +89,7 @@ static void bpf_sk_storage_map_free(struct bpf_map *map)
 
 	smap = (struct bpf_local_storage_map *)map;
 	bpf_local_storage_cache_idx_free(&sk_cache, smap->cache_idx);
-	bpf_local_storage_map_free(smap);
+	bpf_local_storage_map_free(smap, NULL);
 }
 
 static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
-- 
cgit v1.2.3


From 523a4cf491b3c9e2d546040d57250f1a0ca84f03 Mon Sep 17 00:00:00 2001
From: Dmitrii Banshchikov <me@ubique.spb.ru>
Date: Fri, 26 Feb 2021 00:26:29 +0400
Subject: bpf: Use MAX_BPF_FUNC_REG_ARGS macro

Instead of using an integer literal here and there, use a macro name for
better context.

Signed-off-by: Dmitrii Banshchikov <me@ubique.spb.ru>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20210225202629.585485-1-me@ubique.spb.ru
---
 include/linux/bpf.h   |  5 +++++
 kernel/bpf/btf.c      | 25 ++++++++++++++-----------
 kernel/bpf/verifier.c |  2 +-
 3 files changed, 20 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e2cfc4809219..ae2c35641619 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -506,6 +506,11 @@ enum bpf_cgroup_storage_type {
  */
 #define MAX_BPF_FUNC_ARGS 12
 
+/* The maximum number of arguments passed through registers
+ * a single function may have.
+ */
+#define MAX_BPF_FUNC_REG_ARGS 5
+
 struct btf_func_model {
 	u8 ret_size;
 	u8 nr_args;
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 2efeb5f4b343..16e8148a28e2 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -4594,8 +4594,10 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 	}
 	arg = off / 8;
 	args = (const struct btf_param *)(t + 1);
-	/* if (t == NULL) Fall back to default BPF prog with 5 u64 arguments */
-	nr_args = t ? btf_type_vlen(t) : 5;
+	/* if (t == NULL) Fall back to default BPF prog with
+	 * MAX_BPF_FUNC_REG_ARGS u64 arguments.
+	 */
+	nr_args = t ? btf_type_vlen(t) : MAX_BPF_FUNC_REG_ARGS;
 	if (prog->aux->attach_btf_trace) {
 		/* skip first 'void *__data' argument in btf_trace_##name typedef */
 		args++;
@@ -4651,7 +4653,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 		}
 	} else {
 		if (!t)
-			/* Default prog with 5 args */
+			/* Default prog with MAX_BPF_FUNC_REG_ARGS args */
 			return true;
 		t = btf_type_by_id(btf, args[arg].type);
 	}
@@ -5102,12 +5104,12 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
 
 	if (!func) {
 		/* BTF function prototype doesn't match the verifier types.
-		 * Fall back to 5 u64 args.
+		 * Fall back to MAX_BPF_FUNC_REG_ARGS u64 args.
 		 */
-		for (i = 0; i < 5; i++)
+		for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++)
 			m->arg_size[i] = 8;
 		m->ret_size = 8;
-		m->nr_args = 5;
+		m->nr_args = MAX_BPF_FUNC_REG_ARGS;
 		return 0;
 	}
 	args = (const struct btf_param *)(func + 1);
@@ -5330,8 +5332,9 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
 	}
 	args = (const struct btf_param *)(t + 1);
 	nargs = btf_type_vlen(t);
-	if (nargs > 5) {
-		bpf_log(log, "Function %s has %d > 5 args\n", tname, nargs);
+	if (nargs > MAX_BPF_FUNC_REG_ARGS) {
+		bpf_log(log, "Function %s has %d > %d args\n", tname, nargs,
+			MAX_BPF_FUNC_REG_ARGS);
 		goto out;
 	}
 
@@ -5460,9 +5463,9 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
 	}
 	args = (const struct btf_param *)(t + 1);
 	nargs = btf_type_vlen(t);
-	if (nargs > 5) {
-		bpf_log(log, "Global function %s() with %d > 5 args. Buggy compiler.\n",
-			tname, nargs);
+	if (nargs > MAX_BPF_FUNC_REG_ARGS) {
+		bpf_log(log, "Global function %s() with %d > %d args. Buggy compiler.\n",
+			tname, nargs, MAX_BPF_FUNC_REG_ARGS);
 		return -EINVAL;
 	}
 	/* check that function returns int */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1dda9d81f12c..9f7e35590fc6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5544,7 +5544,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 
 	meta.func_id = func_id;
 	/* check args */
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
 		err = check_func_arg(env, i, &meta, fn);
 		if (err)
 			return err;
-- 
cgit v1.2.3


From 887596095ec2a9ea39ffcf98f27bf2e77c5eb512 Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Tue, 23 Feb 2021 10:49:26 -0800
Subject: bpf: Clean up sockmap related Kconfigs

As suggested by John, clean up sockmap related Kconfigs:

Reduce the scope of CONFIG_BPF_STREAM_PARSER down to the TCP stream
parser, to reflect its name.

Make the rest of the sockmap code simply depend on CONFIG_BPF_SYSCALL
and CONFIG_INET; the latter is still needed at this point because
of the TCP/UDP proto update. And leave CONFIG_NET_SOCK_MSG untouched,
as it is used by non-sockmap cases.

Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Lorenz Bauer <lmb@cloudflare.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/bpf/20210223184934.6054-2-xiyou.wangcong@gmail.com
---
 include/linux/bpf.h       |  26 ++++-----
 include/linux/bpf_types.h |   6 +-
 include/linux/skmsg.h     |  18 ++++++
 include/net/tcp.h         |  16 ++---
 include/net/udp.h         |   4 +-
 init/Kconfig              |   1 +
 net/Kconfig               |   6 +-
 net/core/Makefile         |   6 +-
 net/core/skmsg.c          | 145 +++++++++++++++++++++++++---------------------
 net/core/sock_map.c       |   2 +
 net/ipv4/Makefile         |   2 +-
 net/ipv4/tcp_bpf.c        |   4 +-
 12 files changed, 133 insertions(+), 103 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index ae2c35641619..2be47ada5f2d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1778,7 +1778,7 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map)
 }
 #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
 
-#if defined(CONFIG_BPF_STREAM_PARSER)
+#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL)
 int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
 			 struct bpf_prog *old, u32 which);
 int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog);
@@ -1786,7 +1786,18 @@ int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype);
 int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags);
 void sock_map_unhash(struct sock *sk);
 void sock_map_close(struct sock *sk, long timeout);
+
+void bpf_sk_reuseport_detach(struct sock *sk);
+int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
+				       void *value);
+int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key,
+				       void *value, u64 map_flags);
 #else
+static inline void bpf_sk_reuseport_detach(struct sock *sk)
+{
+}
+
+#ifdef CONFIG_BPF_SYSCALL
 static inline int sock_map_prog_update(struct bpf_map *map,
 				       struct bpf_prog *prog,
 				       struct bpf_prog *old, u32 which)
@@ -1811,20 +1822,7 @@ static inline int sock_map_update_elem_sys(struct bpf_map *map, void *key, void
 {
 	return -EOPNOTSUPP;
 }
-#endif /* CONFIG_BPF_STREAM_PARSER */
 
-#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL)
-void bpf_sk_reuseport_detach(struct sock *sk);
-int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
-				       void *value);
-int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key,
-				       void *value, u64 map_flags);
-#else
-static inline void bpf_sk_reuseport_detach(struct sock *sk)
-{
-}
-
-#ifdef CONFIG_BPF_SYSCALL
 static inline int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map,
 						     void *key, void *value)
 {
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index b9edee336d80..f883f01a5061 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -103,10 +103,6 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP_HASH, dev_map_hash_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops)
-#if defined(CONFIG_BPF_STREAM_PARSER)
-BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
-BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops)
-#endif
 #ifdef CONFIG_BPF_LSM
 BPF_MAP_TYPE(BPF_MAP_TYPE_INODE_STORAGE, inode_storage_map_ops)
 #endif
@@ -116,6 +112,8 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops)
 #endif
 #ifdef CONFIG_INET
+BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, reuseport_array_ops)
 #endif
 #endif
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 8edbbf5f2f93..db7a08be4725 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -71,7 +71,9 @@ struct sk_psock_link {
 };
 
 struct sk_psock_parser {
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
 	struct strparser		strp;
+#endif
 	bool				enabled;
 	void (*saved_data_ready)(struct sock *sk);
 };
@@ -305,9 +307,25 @@ static inline void sk_psock_report_error(struct sk_psock *psock, int err)
 
 struct sk_psock *sk_psock_init(struct sock *sk, int node);
 
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
 int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock);
 void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock);
 void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock);
+#else
+static inline int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
+{
+}
+
+static inline void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock)
+{
+}
+#endif
+
 void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock);
 void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock);
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 963cd86d12dd..c00e125dcfb9 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2222,25 +2222,27 @@ void tcp_update_ulp(struct sock *sk, struct proto *p,
 	__MODULE_INFO(alias, alias_userspace, name);		\
 	__MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name)
 
+#ifdef CONFIG_NET_SOCK_MSG
 struct sk_msg;
 struct sk_psock;
 
-#ifdef CONFIG_BPF_STREAM_PARSER
+#ifdef CONFIG_BPF_SYSCALL
 struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock);
 void tcp_bpf_clone(const struct sock *sk, struct sock *newsk);
-#else
-static inline void tcp_bpf_clone(const struct sock *sk, struct sock *newsk)
-{
-}
-#endif /* CONFIG_BPF_STREAM_PARSER */
+#endif /* CONFIG_BPF_SYSCALL */
 
-#ifdef CONFIG_NET_SOCK_MSG
 int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes,
 			  int flags);
 int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
 		      struct msghdr *msg, int len, int flags);
 #endif /* CONFIG_NET_SOCK_MSG */
 
+#if !defined(CONFIG_BPF_SYSCALL) || !defined(CONFIG_NET_SOCK_MSG)
+static inline void tcp_bpf_clone(const struct sock *sk, struct sock *newsk)
+{
+}
+#endif
+
 #ifdef CONFIG_CGROUP_BPF
 static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops,
 				      struct sk_buff *skb,
diff --git a/include/net/udp.h b/include/net/udp.h
index a132a02b2f2c..d4d064c59232 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -515,9 +515,9 @@ static inline struct sk_buff *udp_rcv_segment(struct sock *sk,
 	return segs;
 }
 
-#ifdef CONFIG_BPF_STREAM_PARSER
+#ifdef CONFIG_BPF_SYSCALL
 struct sk_psock;
 struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock);
-#endif /* BPF_STREAM_PARSER */
+#endif
 
 #endif	/* _UDP_H */
diff --git a/init/Kconfig b/init/Kconfig
index 096e1af5c586..66cef5eac275 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1702,6 +1702,7 @@ config BPF_SYSCALL
 	select BPF
 	select IRQ_WORK
 	select TASKS_TRACE_RCU
+	select NET_SOCK_MSG if INET
 	default n
 	help
 	  Enable the bpf() system call that allows to manipulate eBPF
diff --git a/net/Kconfig b/net/Kconfig
index 8cea808ad9e8..0ead7ec0d2bd 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -317,13 +317,9 @@ config BPF_STREAM_PARSER
 	select STREAM_PARSER
 	select NET_SOCK_MSG
 	help
-	  Enabling this allows a stream parser to be used with
+	  Enabling this allows a TCP stream parser to be used with
 	  BPF_MAP_TYPE_SOCKMAP.
 
-	  BPF_MAP_TYPE_SOCKMAP provides a map type to use with network sockets.
-	  It can be used to enforce socket policy, implement socket redirects,
-	  etc.
-
 config NET_FLOW_LIMIT
 	bool
 	depends on RPS
diff --git a/net/core/Makefile b/net/core/Makefile
index 3e2c378e5f31..0c2233c826fd 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -16,7 +16,6 @@ obj-y		     += dev.o dev_addr_lists.o dst.o netevent.o \
 obj-y += net-sysfs.o
 obj-$(CONFIG_PAGE_POOL) += page_pool.o
 obj-$(CONFIG_PROC_FS) += net-procfs.o
-obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o
 obj-$(CONFIG_NET_PKTGEN) += pktgen.o
 obj-$(CONFIG_NETPOLL) += netpoll.o
 obj-$(CONFIG_FIB_RULES) += fib_rules.o
@@ -28,10 +27,13 @@ obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
 obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
 obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
 obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o
-obj-$(CONFIG_BPF_STREAM_PARSER) += sock_map.o
 obj-$(CONFIG_DST_CACHE) += dst_cache.o
 obj-$(CONFIG_HWBM) += hwbm.o
 obj-$(CONFIG_NET_DEVLINK) += devlink.o
 obj-$(CONFIG_GRO_CELLS) += gro_cells.o
 obj-$(CONFIG_FAILOVER) += failover.o
+ifeq ($(CONFIG_INET),y)
+obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o
+obj-$(CONFIG_BPF_SYSCALL) += sock_map.o
+endif
 obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 1261512d6807..e017744111e1 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -645,15 +645,15 @@ static void sk_psock_link_destroy(struct sk_psock *psock)
 	}
 }
 
+static void sk_psock_done_strp(struct sk_psock *psock);
+
 static void sk_psock_destroy_deferred(struct work_struct *gc)
 {
 	struct sk_psock *psock = container_of(gc, struct sk_psock, gc);
 
 	/* No sk_callback_lock since already detached. */
 
-	/* Parser has been stopped */
-	if (psock->progs.skb_parser)
-		strp_done(&psock->parser.strp);
+	sk_psock_done_strp(psock);
 
 	cancel_work_sync(&psock->work);
 
@@ -750,14 +750,6 @@ static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog,
 	return bpf_prog_run_pin_on_cpu(prog, skb);
 }
 
-static struct sk_psock *sk_psock_from_strp(struct strparser *strp)
-{
-	struct sk_psock_parser *parser;
-
-	parser = container_of(strp, struct sk_psock_parser, strp);
-	return container_of(parser, struct sk_psock, parser);
-}
-
 static void sk_psock_skb_redirect(struct sk_buff *skb)
 {
 	struct sk_psock *psock_other;
@@ -866,6 +858,24 @@ out_free:
 	}
 }
 
+static void sk_psock_write_space(struct sock *sk)
+{
+	struct sk_psock *psock;
+	void (*write_space)(struct sock *sk) = NULL;
+
+	rcu_read_lock();
+	psock = sk_psock(sk);
+	if (likely(psock)) {
+		if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
+			schedule_work(&psock->work);
+		write_space = psock->saved_write_space;
+	}
+	rcu_read_unlock();
+	if (write_space)
+		write_space(sk);
+}
+
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
 static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
 {
 	struct sk_psock *psock;
@@ -897,6 +907,14 @@ static int sk_psock_strp_read_done(struct strparser *strp, int err)
 	return err;
 }
 
+static struct sk_psock *sk_psock_from_strp(struct strparser *strp)
+{
+	struct sk_psock_parser *parser;
+
+	parser = container_of(strp, struct sk_psock_parser, strp);
+	return container_of(parser, struct sk_psock, parser);
+}
+
 static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb)
 {
 	struct sk_psock *psock = sk_psock_from_strp(strp);
@@ -933,6 +951,56 @@ static void sk_psock_strp_data_ready(struct sock *sk)
 	rcu_read_unlock();
 }
 
+int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
+{
+	static const struct strp_callbacks cb = {
+		.rcv_msg	= sk_psock_strp_read,
+		.read_sock_done	= sk_psock_strp_read_done,
+		.parse_msg	= sk_psock_strp_parse,
+	};
+
+	psock->parser.enabled = false;
+	return strp_init(&psock->parser.strp, sk, &cb);
+}
+
+void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
+{
+	struct sk_psock_parser *parser = &psock->parser;
+
+	if (parser->enabled)
+		return;
+
+	parser->saved_data_ready = sk->sk_data_ready;
+	sk->sk_data_ready = sk_psock_strp_data_ready;
+	sk->sk_write_space = sk_psock_write_space;
+	parser->enabled = true;
+}
+
+void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock)
+{
+	struct sk_psock_parser *parser = &psock->parser;
+
+	if (!parser->enabled)
+		return;
+
+	sk->sk_data_ready = parser->saved_data_ready;
+	parser->saved_data_ready = NULL;
+	strp_stop(&parser->strp);
+	parser->enabled = false;
+}
+
+static void sk_psock_done_strp(struct sk_psock *psock)
+{
+	/* Parser has been stopped */
+	if (psock->progs.skb_parser)
+		strp_done(&psock->parser.strp);
+}
+#else
+static void sk_psock_done_strp(struct sk_psock *psock)
+{
+}
+#endif /* CONFIG_BPF_STREAM_PARSER */
+
 static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb,
 				 unsigned int offset, size_t orig_len)
 {
@@ -984,35 +1052,6 @@ static void sk_psock_verdict_data_ready(struct sock *sk)
 	sock->ops->read_sock(sk, &desc, sk_psock_verdict_recv);
 }
 
-static void sk_psock_write_space(struct sock *sk)
-{
-	struct sk_psock *psock;
-	void (*write_space)(struct sock *sk) = NULL;
-
-	rcu_read_lock();
-	psock = sk_psock(sk);
-	if (likely(psock)) {
-		if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
-			schedule_work(&psock->work);
-		write_space = psock->saved_write_space;
-	}
-	rcu_read_unlock();
-	if (write_space)
-		write_space(sk);
-}
-
-int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
-{
-	static const struct strp_callbacks cb = {
-		.rcv_msg	= sk_psock_strp_read,
-		.read_sock_done	= sk_psock_strp_read_done,
-		.parse_msg	= sk_psock_strp_parse,
-	};
-
-	psock->parser.enabled = false;
-	return strp_init(&psock->parser.strp, sk, &cb);
-}
-
 void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock)
 {
 	struct sk_psock_parser *parser = &psock->parser;
@@ -1026,32 +1065,6 @@ void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock)
 	parser->enabled = true;
 }
 
-void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
-{
-	struct sk_psock_parser *parser = &psock->parser;
-
-	if (parser->enabled)
-		return;
-
-	parser->saved_data_ready = sk->sk_data_ready;
-	sk->sk_data_ready = sk_psock_strp_data_ready;
-	sk->sk_write_space = sk_psock_write_space;
-	parser->enabled = true;
-}
-
-void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock)
-{
-	struct sk_psock_parser *parser = &psock->parser;
-
-	if (!parser->enabled)
-		return;
-
-	sk->sk_data_ready = parser->saved_data_ready;
-	parser->saved_data_ready = NULL;
-	strp_stop(&parser->strp);
-	parser->enabled = false;
-}
-
 void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock)
 {
 	struct sk_psock_parser *parser = &psock->parser;
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index d758fb83c884..ee3334dd3a38 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -1461,9 +1461,11 @@ int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
 	case BPF_SK_MSG_VERDICT:
 		pprog = &progs->msg_parser;
 		break;
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
 	case BPF_SK_SKB_STREAM_PARSER:
 		pprog = &progs->skb_parser;
 		break;
+#endif
 	case BPF_SK_SKB_STREAM_VERDICT:
 		pprog = &progs->skb_verdict;
 		break;
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 5b77a46885b9..bbdd9c44f14e 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -62,7 +62,7 @@ obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
 obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
 obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
 obj-$(CONFIG_NET_SOCK_MSG) += tcp_bpf.o
-obj-$(CONFIG_BPF_STREAM_PARSER) += udp_bpf.o
+obj-$(CONFIG_BPF_SYSCALL) += udp_bpf.o
 obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
 
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index bc7d2a586e18..17c322b875fd 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -229,7 +229,7 @@ int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg,
 }
 EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir);
 
-#ifdef CONFIG_BPF_STREAM_PARSER
+#ifdef CONFIG_BPF_SYSCALL
 static bool tcp_bpf_stream_read(const struct sock *sk)
 {
 	struct sk_psock *psock;
@@ -629,4 +629,4 @@ void tcp_bpf_clone(const struct sock *sk, struct sock *newsk)
 	if (prot == &tcp_bpf_prots[family][TCP_BPF_BASE])
 		newsk->sk_prot = sk->sk_prot_creator;
 }
-#endif /* CONFIG_BPF_STREAM_PARSER */
+#endif /* CONFIG_BPF_SYSCALL */
-- 
cgit v1.2.3


From 5a685cd94b21a88efa6be77169eddef525368034 Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Tue, 23 Feb 2021 10:49:27 -0800
Subject: skmsg: Get rid of struct sk_psock_parser

struct sk_psock_parser is embedded in sk_psock; it is
unnecessary, as skb verdict also uses ->saved_data_ready.
We can simply fold these fields into sk_psock and get rid
of ->enabled, since a non-NULL ->saved_data_ready already
carries the same information.

Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/bpf/20210223184934.6054-3-xiyou.wangcong@gmail.com
---
 include/linux/skmsg.h | 19 +++++++-----------
 net/core/skmsg.c      | 53 ++++++++++++++++-----------------------------------
 net/core/sock_map.c   |  8 ++++----
 3 files changed, 27 insertions(+), 53 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index db7a08be4725..22e26f82de33 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -70,14 +70,6 @@ struct sk_psock_link {
 	void				*link_raw;
 };
 
-struct sk_psock_parser {
-#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
-	struct strparser		strp;
-#endif
-	bool				enabled;
-	void (*saved_data_ready)(struct sock *sk);
-};
-
 struct sk_psock_work_state {
 	struct sk_buff			*skb;
 	u32				len;
@@ -92,7 +84,9 @@ struct sk_psock {
 	u32				eval;
 	struct sk_msg			*cork;
 	struct sk_psock_progs		progs;
-	struct sk_psock_parser		parser;
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
+	struct strparser		strp;
+#endif
 	struct sk_buff_head		ingress_skb;
 	struct list_head		ingress_msg;
 	unsigned long			state;
@@ -102,6 +96,7 @@ struct sk_psock {
 	void (*saved_unhash)(struct sock *sk);
 	void (*saved_close)(struct sock *sk, long timeout);
 	void (*saved_write_space)(struct sock *sk);
+	void (*saved_data_ready)(struct sock *sk);
 	struct proto			*sk_proto;
 	struct sk_psock_work_state	work_state;
 	struct work_struct		work;
@@ -418,8 +413,8 @@ static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock)
 
 static inline void sk_psock_data_ready(struct sock *sk, struct sk_psock *psock)
 {
-	if (psock->parser.enabled)
-		psock->parser.saved_data_ready(sk);
+	if (psock->saved_data_ready)
+		psock->saved_data_ready(sk);
 	else
 		sk->sk_data_ready(sk);
 }
@@ -458,6 +453,6 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock)
 {
 	if (!psock)
 		return false;
-	return psock->parser.enabled;
+	return !!psock->saved_data_ready;
 }
 #endif /* _LINUX_SKMSG_H */
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index e017744111e1..d00c9a4b47e7 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -907,17 +907,9 @@ static int sk_psock_strp_read_done(struct strparser *strp, int err)
 	return err;
 }
 
-static struct sk_psock *sk_psock_from_strp(struct strparser *strp)
-{
-	struct sk_psock_parser *parser;
-
-	parser = container_of(strp, struct sk_psock_parser, strp);
-	return container_of(parser, struct sk_psock, parser);
-}
-
 static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb)
 {
-	struct sk_psock *psock = sk_psock_from_strp(strp);
+	struct sk_psock *psock = container_of(strp, struct sk_psock, strp);
 	struct bpf_prog *prog;
 	int ret = skb->len;
 
@@ -941,10 +933,10 @@ static void sk_psock_strp_data_ready(struct sock *sk)
 	psock = sk_psock(sk);
 	if (likely(psock)) {
 		if (tls_sw_has_ctx_rx(sk)) {
-			psock->parser.saved_data_ready(sk);
+			psock->saved_data_ready(sk);
 		} else {
 			write_lock_bh(&sk->sk_callback_lock);
-			strp_data_ready(&psock->parser.strp);
+			strp_data_ready(&psock->strp);
 			write_unlock_bh(&sk->sk_callback_lock);
 		}
 	}
@@ -959,41 +951,34 @@ int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
 		.parse_msg	= sk_psock_strp_parse,
 	};
 
-	psock->parser.enabled = false;
-	return strp_init(&psock->parser.strp, sk, &cb);
+	return strp_init(&psock->strp, sk, &cb);
 }
 
 void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
 {
-	struct sk_psock_parser *parser = &psock->parser;
-
-	if (parser->enabled)
+	if (psock->saved_data_ready)
 		return;
 
-	parser->saved_data_ready = sk->sk_data_ready;
+	psock->saved_data_ready = sk->sk_data_ready;
 	sk->sk_data_ready = sk_psock_strp_data_ready;
 	sk->sk_write_space = sk_psock_write_space;
-	parser->enabled = true;
 }
 
 void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock)
 {
-	struct sk_psock_parser *parser = &psock->parser;
-
-	if (!parser->enabled)
+	if (!psock->saved_data_ready)
 		return;
 
-	sk->sk_data_ready = parser->saved_data_ready;
-	parser->saved_data_ready = NULL;
-	strp_stop(&parser->strp);
-	parser->enabled = false;
+	sk->sk_data_ready = psock->saved_data_ready;
+	psock->saved_data_ready = NULL;
+	strp_stop(&psock->strp);
 }
 
 static void sk_psock_done_strp(struct sk_psock *psock)
 {
 	/* Parser has been stopped */
 	if (psock->progs.skb_parser)
-		strp_done(&psock->parser.strp);
+		strp_done(&psock->strp);
 }
 #else
 static void sk_psock_done_strp(struct sk_psock *psock)
@@ -1054,25 +1039,19 @@ static void sk_psock_verdict_data_ready(struct sock *sk)
 
 void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock)
 {
-	struct sk_psock_parser *parser = &psock->parser;
-
-	if (parser->enabled)
+	if (psock->saved_data_ready)
 		return;
 
-	parser->saved_data_ready = sk->sk_data_ready;
+	psock->saved_data_ready = sk->sk_data_ready;
 	sk->sk_data_ready = sk_psock_verdict_data_ready;
 	sk->sk_write_space = sk_psock_write_space;
-	parser->enabled = true;
 }
 
 void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock)
 {
-	struct sk_psock_parser *parser = &psock->parser;
-
-	if (!parser->enabled)
+	if (!psock->saved_data_ready)
 		return;
 
-	sk->sk_data_ready = parser->saved_data_ready;
-	parser->saved_data_ready = NULL;
-	parser->enabled = false;
+	sk->sk_data_ready = psock->saved_data_ready;
+	psock->saved_data_ready = NULL;
 }
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index ee3334dd3a38..1a28a5c2c61e 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -148,9 +148,9 @@ static void sock_map_del_link(struct sock *sk,
 			struct bpf_map *map = link->map;
 			struct bpf_stab *stab = container_of(map, struct bpf_stab,
 							     map);
-			if (psock->parser.enabled && stab->progs.skb_parser)
+			if (psock->saved_data_ready && stab->progs.skb_parser)
 				strp_stop = true;
-			if (psock->parser.enabled && stab->progs.skb_verdict)
+			if (psock->saved_data_ready && stab->progs.skb_verdict)
 				verdict_stop = true;
 			list_del(&link->list);
 			sk_psock_free_link(link);
@@ -283,14 +283,14 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
 		goto out_drop;
 
 	write_lock_bh(&sk->sk_callback_lock);
-	if (skb_parser && skb_verdict && !psock->parser.enabled) {
+	if (skb_parser && skb_verdict && !psock->saved_data_ready) {
 		ret = sk_psock_init_strp(sk, psock);
 		if (ret)
 			goto out_unlock_drop;
 		psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
 		psock_set_prog(&psock->progs.skb_parser, skb_parser);
 		sk_psock_start_strp(sk, psock);
-	} else if (!skb_parser && skb_verdict && !psock->parser.enabled) {
+	} else if (!skb_parser && skb_verdict && !psock->saved_data_ready) {
 		psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
 		sk_psock_start_verdict(sk,psock);
 	}
-- 
cgit v1.2.3


From e3526bb92a2084cdaec6cb2855bcec98b280426c Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Tue, 23 Feb 2021 10:49:29 -0800
Subject: skmsg: Move sk_redir from TCP_SKB_CB to skb

Currently TCP_SKB_CB() is hard-coded in the skmsg code, so it
certainly does not work for any non-TCP protocol. We could move the
redirect information to an skb extension, but that would introduce a
memory allocation on the fast path.

Fortunately, we only need a word-sized field to store all the
information, because the flags actually contain only 1 bit, which can
simply be packed into the lowest bit of the "pointer", which is stored
as an unsigned long.

Inside struct sk_buff, '_skb_refdst' can be reused because skb dst is
no longer needed after ->sk_data_ready() so we can just drop it.

Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/bpf/20210223184934.6054-5-xiyou.wangcong@gmail.com
---
 include/linux/skbuff.h |  3 +++
 include/linux/skmsg.h  | 38 ++++++++++++++++++++++++++++++++++++++
 include/net/tcp.h      | 19 -------------------
 net/core/skmsg.c       | 31 +++++++++++++++++++------------
 net/core/sock_map.c    |  8 ++------
 5 files changed, 62 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 6d0a33d1c0db..bd84f799c952 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -755,6 +755,9 @@ struct sk_buff {
 			void		(*destructor)(struct sk_buff *skb);
 		};
 		struct list_head	tcp_tsorted_anchor;
+#ifdef CONFIG_NET_SOCK_MSG
+		unsigned long		_sk_redir;
+#endif
 	};
 
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 22e26f82de33..e0de45527bb6 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -455,4 +455,42 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock)
 		return false;
 	return !!psock->saved_data_ready;
 }
+
+#if IS_ENABLED(CONFIG_NET_SOCK_MSG)
+
+/* We only have one bit so far. */
+#define BPF_F_PTR_MASK ~(BPF_F_INGRESS)
+
+static inline bool skb_bpf_ingress(const struct sk_buff *skb)
+{
+	unsigned long sk_redir = skb->_sk_redir;
+
+	return sk_redir & BPF_F_INGRESS;
+}
+
+static inline void skb_bpf_set_ingress(struct sk_buff *skb)
+{
+	skb->_sk_redir |= BPF_F_INGRESS;
+}
+
+static inline void skb_bpf_set_redir(struct sk_buff *skb, struct sock *sk_redir,
+				     bool ingress)
+{
+	skb->_sk_redir = (unsigned long)sk_redir;
+	if (ingress)
+		skb->_sk_redir |= BPF_F_INGRESS;
+}
+
+static inline struct sock *skb_bpf_redirect_fetch(const struct sk_buff *skb)
+{
+	unsigned long sk_redir = skb->_sk_redir;
+
+	return (struct sock *)(sk_redir & BPF_F_PTR_MASK);
+}
+
+static inline void skb_bpf_redirect_clear(struct sk_buff *skb)
+{
+	skb->_sk_redir = 0;
+}
+#endif /* CONFIG_NET_SOCK_MSG */
 #endif /* _LINUX_SKMSG_H */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 947ef5da6867..075de26f449d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -883,30 +883,11 @@ struct tcp_skb_cb {
 			struct inet6_skb_parm	h6;
 #endif
 		} header;	/* For incoming skbs */
-		struct {
-			__u32 flags;
-			struct sock *sk_redir;
-		} bpf;
 	};
 };
 
 #define TCP_SKB_CB(__skb)	((struct tcp_skb_cb *)&((__skb)->cb[0]))
 
-static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb)
-{
-	return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS;
-}
-
-static inline struct sock *tcp_skb_bpf_redirect_fetch(struct sk_buff *skb)
-{
-	return TCP_SKB_CB(skb)->bpf.sk_redir;
-}
-
-static inline void tcp_skb_bpf_redirect_clear(struct sk_buff *skb)
-{
-	TCP_SKB_CB(skb)->bpf.sk_redir = NULL;
-}
-
 extern const struct inet_connection_sock_af_ops ipv4_specific;
 
 #if IS_ENABLED(CONFIG_IPV6)
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 8822001ab3dc..409258367bea 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -525,7 +525,8 @@ static void sk_psock_backlog(struct work_struct *work)
 		len = skb->len;
 		off = 0;
 start:
-		ingress = tcp_skb_bpf_ingress(skb);
+		ingress = skb_bpf_ingress(skb);
+		skb_bpf_redirect_clear(skb);
 		do {
 			ret = -EIO;
 			if (likely(psock->sk->sk_socket))
@@ -631,7 +632,12 @@ void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
 
 static void sk_psock_zap_ingress(struct sk_psock *psock)
 {
-	__skb_queue_purge(&psock->ingress_skb);
+	struct sk_buff *skb;
+
+	while ((skb = __skb_dequeue(&psock->ingress_skb)) != NULL) {
+		skb_bpf_redirect_clear(skb);
+		kfree_skb(skb);
+	}
 	__sk_psock_purge_ingress_msg(psock);
 }
 
@@ -754,7 +760,7 @@ static void sk_psock_skb_redirect(struct sk_buff *skb)
 	struct sk_psock *psock_other;
 	struct sock *sk_other;
 
-	sk_other = tcp_skb_bpf_redirect_fetch(skb);
+	sk_other = skb_bpf_redirect_fetch(skb);
 	/* This error is a buggy BPF program, it returned a redirect
 	 * return code, but then didn't set a redirect interface.
 	 */
@@ -804,9 +810,10 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb)
 		 * TLS context.
 		 */
 		skb->sk = psock->sk;
-		tcp_skb_bpf_redirect_clear(skb);
+		skb_dst_drop(skb);
+		skb_bpf_redirect_clear(skb);
 		ret = sk_psock_bpf_run(psock, prog, skb);
-		ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
+		ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
 		skb->sk = NULL;
 	}
 	sk_psock_tls_verdict_apply(skb, psock->sk, ret);
@@ -818,7 +825,6 @@ EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read);
 static void sk_psock_verdict_apply(struct sk_psock *psock,
 				   struct sk_buff *skb, int verdict)
 {
-	struct tcp_skb_cb *tcp;
 	struct sock *sk_other;
 	int err = -EIO;
 
@@ -830,8 +836,7 @@ static void sk_psock_verdict_apply(struct sk_psock *psock,
 			goto out_free;
 		}
 
-		tcp = TCP_SKB_CB(skb);
-		tcp->bpf.flags |= BPF_F_INGRESS;
+		skb_bpf_set_ingress(skb);
 
 		/* If the queue is empty then we can submit directly
 		 * into the msg queue. If its not empty we have to
@@ -892,9 +897,10 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
 	skb_set_owner_r(skb, sk);
 	prog = READ_ONCE(psock->progs.skb_verdict);
 	if (likely(prog)) {
-		tcp_skb_bpf_redirect_clear(skb);
+		skb_dst_drop(skb);
+		skb_bpf_redirect_clear(skb);
 		ret = sk_psock_bpf_run(psock, prog, skb);
-		ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
+		ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
 	}
 	sk_psock_verdict_apply(psock, skb, ret);
 out:
@@ -1011,9 +1017,10 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb,
 	skb_set_owner_r(skb, sk);
 	prog = READ_ONCE(psock->progs.skb_verdict);
 	if (likely(prog)) {
-		tcp_skb_bpf_redirect_clear(skb);
+		skb_dst_drop(skb);
+		skb_bpf_redirect_clear(skb);
 		ret = sk_psock_bpf_run(psock, prog, skb);
-		ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
+		ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
 	}
 	sk_psock_verdict_apply(psock, skb, ret);
 out:
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 1a28a5c2c61e..dbfcd7006338 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -657,7 +657,6 @@ const struct bpf_func_proto bpf_sock_map_update_proto = {
 BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
 	   struct bpf_map *, map, u32, key, u64, flags)
 {
-	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
 	struct sock *sk;
 
 	if (unlikely(flags & ~(BPF_F_INGRESS)))
@@ -667,8 +666,7 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
 	if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
 		return SK_DROP;
 
-	tcb->bpf.flags = flags;
-	tcb->bpf.sk_redir = sk;
+	skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS);
 	return SK_PASS;
 }
 
@@ -1250,7 +1248,6 @@ const struct bpf_func_proto bpf_sock_hash_update_proto = {
 BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
 	   struct bpf_map *, map, void *, key, u64, flags)
 {
-	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
 	struct sock *sk;
 
 	if (unlikely(flags & ~(BPF_F_INGRESS)))
@@ -1260,8 +1257,7 @@ BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
 	if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
 		return SK_DROP;
 
-	tcb->bpf.flags = flags;
-	tcb->bpf.sk_redir = sk;
+	skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS);
 	return SK_PASS;
 }
 
-- 
cgit v1.2.3


From ae8b8332fbb512f53bf50ff6a7586dd0f90ed18a Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Tue, 23 Feb 2021 10:49:30 -0800
Subject: sock_map: Rename skb_parser and skb_verdict

These two eBPF programs are tied to BPF_SK_SKB_STREAM_PARSER
and BPF_SK_SKB_STREAM_VERDICT. Rename them to reflect the fact
that they are only used for TCP, and save the name 'skb_verdict'
for general use later.

Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Lorenz Bauer <lmb@cloudflare.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/bpf/20210223184934.6054-6-xiyou.wangcong@gmail.com
---
 include/linux/skmsg.h                              |  8 +--
 net/core/skmsg.c                                   | 14 ++---
 net/core/sock_map.c                                | 60 +++++++++++-----------
 .../selftests/bpf/prog_tests/sockmap_listen.c      |  8 +--
 .../selftests/bpf/progs/test_sockmap_listen.c      |  4 +-
 5 files changed, 47 insertions(+), 47 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index e0de45527bb6..d9f6ec4a9cf2 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -56,8 +56,8 @@ struct sk_msg {
 
 struct sk_psock_progs {
 	struct bpf_prog			*msg_parser;
-	struct bpf_prog			*skb_parser;
-	struct bpf_prog			*skb_verdict;
+	struct bpf_prog			*stream_parser;
+	struct bpf_prog			*stream_verdict;
 };
 
 enum sk_psock_state_bits {
@@ -443,8 +443,8 @@ static inline int psock_replace_prog(struct bpf_prog **pprog,
 static inline void psock_progs_drop(struct sk_psock_progs *progs)
 {
 	psock_set_prog(&progs->msg_parser, NULL);
-	psock_set_prog(&progs->skb_parser, NULL);
-	psock_set_prog(&progs->skb_verdict, NULL);
+	psock_set_prog(&progs->stream_parser, NULL);
+	psock_set_prog(&progs->stream_verdict, NULL);
 }
 
 int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb);
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 409258367bea..35f9caa3b125 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -691,9 +691,9 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
 	write_lock_bh(&sk->sk_callback_lock);
 	sk_psock_restore_proto(sk, psock);
 	rcu_assign_sk_user_data(sk, NULL);
-	if (psock->progs.skb_parser)
+	if (psock->progs.stream_parser)
 		sk_psock_stop_strp(sk, psock);
-	else if (psock->progs.skb_verdict)
+	else if (psock->progs.stream_verdict)
 		sk_psock_stop_verdict(sk, psock);
 	write_unlock_bh(&sk->sk_callback_lock);
 	sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
@@ -803,7 +803,7 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb)
 	int ret = __SK_PASS;
 
 	rcu_read_lock();
-	prog = READ_ONCE(psock->progs.skb_verdict);
+	prog = READ_ONCE(psock->progs.stream_verdict);
 	if (likely(prog)) {
 		/* We skip full set_owner_r here because if we do a SK_PASS
 		 * or SK_DROP we can skip skb memory accounting and use the
@@ -895,7 +895,7 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
 		goto out;
 	}
 	skb_set_owner_r(skb, sk);
-	prog = READ_ONCE(psock->progs.skb_verdict);
+	prog = READ_ONCE(psock->progs.stream_verdict);
 	if (likely(prog)) {
 		skb_dst_drop(skb);
 		skb_bpf_redirect_clear(skb);
@@ -919,7 +919,7 @@ static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb)
 	int ret = skb->len;
 
 	rcu_read_lock();
-	prog = READ_ONCE(psock->progs.skb_parser);
+	prog = READ_ONCE(psock->progs.stream_parser);
 	if (likely(prog)) {
 		skb->sk = psock->sk;
 		ret = sk_psock_bpf_run(psock, prog, skb);
@@ -982,7 +982,7 @@ void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock)
 static void sk_psock_done_strp(struct sk_psock *psock)
 {
 	/* Parser has been stopped */
-	if (psock->progs.skb_parser)
+	if (psock->progs.stream_parser)
 		strp_done(&psock->strp);
 }
 #else
@@ -1015,7 +1015,7 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb,
 		goto out;
 	}
 	skb_set_owner_r(skb, sk);
-	prog = READ_ONCE(psock->progs.skb_verdict);
+	prog = READ_ONCE(psock->progs.stream_verdict);
 	if (likely(prog)) {
 		skb_dst_drop(skb);
 		skb_bpf_redirect_clear(skb);
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index dbfcd7006338..69785070f02d 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -148,9 +148,9 @@ static void sock_map_del_link(struct sock *sk,
 			struct bpf_map *map = link->map;
 			struct bpf_stab *stab = container_of(map, struct bpf_stab,
 							     map);
-			if (psock->saved_data_ready && stab->progs.skb_parser)
+			if (psock->saved_data_ready && stab->progs.stream_parser)
 				strp_stop = true;
-			if (psock->saved_data_ready && stab->progs.skb_verdict)
+			if (psock->saved_data_ready && stab->progs.stream_verdict)
 				verdict_stop = true;
 			list_del(&link->list);
 			sk_psock_free_link(link);
@@ -224,23 +224,23 @@ out:
 static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
 			 struct sock *sk)
 {
-	struct bpf_prog *msg_parser, *skb_parser, *skb_verdict;
+	struct bpf_prog *msg_parser, *stream_parser, *stream_verdict;
 	struct sk_psock *psock;
 	int ret;
 
-	skb_verdict = READ_ONCE(progs->skb_verdict);
-	if (skb_verdict) {
-		skb_verdict = bpf_prog_inc_not_zero(skb_verdict);
-		if (IS_ERR(skb_verdict))
-			return PTR_ERR(skb_verdict);
+	stream_verdict = READ_ONCE(progs->stream_verdict);
+	if (stream_verdict) {
+		stream_verdict = bpf_prog_inc_not_zero(stream_verdict);
+		if (IS_ERR(stream_verdict))
+			return PTR_ERR(stream_verdict);
 	}
 
-	skb_parser = READ_ONCE(progs->skb_parser);
-	if (skb_parser) {
-		skb_parser = bpf_prog_inc_not_zero(skb_parser);
-		if (IS_ERR(skb_parser)) {
-			ret = PTR_ERR(skb_parser);
-			goto out_put_skb_verdict;
+	stream_parser = READ_ONCE(progs->stream_parser);
+	if (stream_parser) {
+		stream_parser = bpf_prog_inc_not_zero(stream_parser);
+		if (IS_ERR(stream_parser)) {
+			ret = PTR_ERR(stream_parser);
+			goto out_put_stream_verdict;
 		}
 	}
 
@@ -249,7 +249,7 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
 		msg_parser = bpf_prog_inc_not_zero(msg_parser);
 		if (IS_ERR(msg_parser)) {
 			ret = PTR_ERR(msg_parser);
-			goto out_put_skb_parser;
+			goto out_put_stream_parser;
 		}
 	}
 
@@ -261,8 +261,8 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
 
 	if (psock) {
 		if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) ||
-		    (skb_parser  && READ_ONCE(psock->progs.skb_parser)) ||
-		    (skb_verdict && READ_ONCE(psock->progs.skb_verdict))) {
+		    (stream_parser  && READ_ONCE(psock->progs.stream_parser)) ||
+		    (stream_verdict && READ_ONCE(psock->progs.stream_verdict))) {
 			sk_psock_put(sk, psock);
 			ret = -EBUSY;
 			goto out_progs;
@@ -283,15 +283,15 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
 		goto out_drop;
 
 	write_lock_bh(&sk->sk_callback_lock);
-	if (skb_parser && skb_verdict && !psock->saved_data_ready) {
+	if (stream_parser && stream_verdict && !psock->saved_data_ready) {
 		ret = sk_psock_init_strp(sk, psock);
 		if (ret)
 			goto out_unlock_drop;
-		psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
-		psock_set_prog(&psock->progs.skb_parser, skb_parser);
+		psock_set_prog(&psock->progs.stream_verdict, stream_verdict);
+		psock_set_prog(&psock->progs.stream_parser, stream_parser);
 		sk_psock_start_strp(sk, psock);
-	} else if (!skb_parser && skb_verdict && !psock->saved_data_ready) {
-		psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
+	} else if (!stream_parser && stream_verdict && !psock->saved_data_ready) {
+		psock_set_prog(&psock->progs.stream_verdict, stream_verdict);
 		sk_psock_start_verdict(sk,psock);
 	}
 	write_unlock_bh(&sk->sk_callback_lock);
@@ -303,12 +303,12 @@ out_drop:
 out_progs:
 	if (msg_parser)
 		bpf_prog_put(msg_parser);
-out_put_skb_parser:
-	if (skb_parser)
-		bpf_prog_put(skb_parser);
-out_put_skb_verdict:
-	if (skb_verdict)
-		bpf_prog_put(skb_verdict);
+out_put_stream_parser:
+	if (stream_parser)
+		bpf_prog_put(stream_parser);
+out_put_stream_verdict:
+	if (stream_verdict)
+		bpf_prog_put(stream_verdict);
 	return ret;
 }
 
@@ -1459,11 +1459,11 @@ int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
 		break;
 #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
 	case BPF_SK_SKB_STREAM_PARSER:
-		pprog = &progs->skb_parser;
+		pprog = &progs->stream_parser;
 		break;
 #endif
 	case BPF_SK_SKB_STREAM_VERDICT:
-		pprog = &progs->skb_verdict;
+		pprog = &progs->stream_verdict;
 		break;
 	default:
 		return -EOPNOTSUPP;
diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
index d7d65a700799..c26e6bf05e49 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
@@ -1014,8 +1014,8 @@ static void test_skb_redir_to_connected(struct test_sockmap_listen *skel,
 					struct bpf_map *inner_map, int family,
 					int sotype)
 {
-	int verdict = bpf_program__fd(skel->progs.prog_skb_verdict);
-	int parser = bpf_program__fd(skel->progs.prog_skb_parser);
+	int verdict = bpf_program__fd(skel->progs.prog_stream_verdict);
+	int parser = bpf_program__fd(skel->progs.prog_stream_parser);
 	int verdict_map = bpf_map__fd(skel->maps.verdict_map);
 	int sock_map = bpf_map__fd(inner_map);
 	int err;
@@ -1125,8 +1125,8 @@ static void test_skb_redir_to_listening(struct test_sockmap_listen *skel,
 					struct bpf_map *inner_map, int family,
 					int sotype)
 {
-	int verdict = bpf_program__fd(skel->progs.prog_skb_verdict);
-	int parser = bpf_program__fd(skel->progs.prog_skb_parser);
+	int verdict = bpf_program__fd(skel->progs.prog_stream_verdict);
+	int parser = bpf_program__fd(skel->progs.prog_stream_parser);
 	int verdict_map = bpf_map__fd(skel->maps.verdict_map);
 	int sock_map = bpf_map__fd(inner_map);
 	int err;
diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_listen.c b/tools/testing/selftests/bpf/progs/test_sockmap_listen.c
index a3a366c57ce1..fa221141e9c1 100644
--- a/tools/testing/selftests/bpf/progs/test_sockmap_listen.c
+++ b/tools/testing/selftests/bpf/progs/test_sockmap_listen.c
@@ -31,13 +31,13 @@ struct {
 static volatile bool test_sockmap; /* toggled by user-space */
 
 SEC("sk_skb/stream_parser")
-int prog_skb_parser(struct __sk_buff *skb)
+int prog_stream_parser(struct __sk_buff *skb)
 {
 	return skb->len;
 }
 
 SEC("sk_skb/stream_verdict")
-int prog_skb_verdict(struct __sk_buff *skb)
+int prog_stream_verdict(struct __sk_buff *skb)
 {
 	unsigned int *count;
 	__u32 zero = 0;
-- 
cgit v1.2.3


From 4675e234b9e15159894b90ead9340e1dc202b670 Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Tue, 23 Feb 2021 10:49:31 -0800
Subject: sock_map: Make sock_map_prog_update() static

It is only used within sock_map.c so can become static.

Suggested-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/bpf/20210223184934.6054-7-xiyou.wangcong@gmail.com
---
 include/linux/bpf.h | 9 ---------
 net/core/sock_map.c | 7 +++++--
 2 files changed, 5 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2be47ada5f2d..e1e4d2f60527 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1779,8 +1779,6 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map)
 #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
 
 #if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL)
-int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
-			 struct bpf_prog *old, u32 which);
 int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog);
 int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype);
 int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags);
@@ -1798,13 +1796,6 @@ static inline void bpf_sk_reuseport_detach(struct sock *sk)
 }
 
 #ifdef CONFIG_BPF_SYSCALL
-static inline int sock_map_prog_update(struct bpf_map *map,
-				       struct bpf_prog *prog,
-				       struct bpf_prog *old, u32 which)
-{
-	return -EOPNOTSUPP;
-}
-
 static inline int sock_map_get_from_fd(const union bpf_attr *attr,
 				       struct bpf_prog *prog)
 {
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 69785070f02d..dd53a7771d7e 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -24,6 +24,9 @@ struct bpf_stab {
 #define SOCK_CREATE_FLAG_MASK				\
 	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
 
+static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
+				struct bpf_prog *old, u32 which);
+
 static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 {
 	struct bpf_stab *stab;
@@ -1444,8 +1447,8 @@ static struct sk_psock_progs *sock_map_progs(struct bpf_map *map)
 	return NULL;
 }
 
-int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
-			 struct bpf_prog *old, u32 which)
+static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
+				struct bpf_prog *old, u32 which)
 {
 	struct sk_psock_progs *progs = sock_map_progs(map);
 	struct bpf_prog **pprog;
-- 
cgit v1.2.3


From cd81cefb1abc52bd164f4d9760cd22eadc0e4468 Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Tue, 23 Feb 2021 10:49:32 -0800
Subject: skmsg: Make __sk_psock_purge_ingress_msg() static

It is only used within skmsg.c so can become static.

Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/bpf/20210223184934.6054-8-xiyou.wangcong@gmail.com
---
 include/linux/skmsg.h | 2 --
 net/core/skmsg.c      | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index d9f6ec4a9cf2..676d48e08159 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -340,8 +340,6 @@ static inline void sk_psock_free_link(struct sk_psock_link *link)
 
 struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock);
 
-void __sk_psock_purge_ingress_msg(struct sk_psock *psock);
-
 static inline void sk_psock_cork_free(struct sk_psock *psock)
 {
 	if (psock->cork) {
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 35f9caa3b125..46e29d2c0c48 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -619,7 +619,7 @@ struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock)
 	return link;
 }
 
-void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
+static void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
 {
 	struct sk_msg *msg, *tmp;
 
-- 
cgit v1.2.3


From ff9614b81be65d648ec4615b593c6e4b2dac6375 Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Tue, 23 Feb 2021 10:49:34 -0800
Subject: skmsg: Remove unused sk_psock_stop() declaration

It is not defined or used anywhere.

Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/bpf/20210223184934.6054-10-xiyou.wangcong@gmail.com
---
 include/linux/skmsg.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 676d48e08159..6c09d94be2e9 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -400,7 +400,6 @@ static inline struct sk_psock *sk_psock_get(struct sock *sk)
 	return psock;
 }
 
-void sk_psock_stop(struct sock *sk, struct sk_psock *psock);
 void sk_psock_drop(struct sock *sk, struct sk_psock *psock);
 
 static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock)
-- 
cgit v1.2.3


From 69c087ba6225b574afb6e505b72cb75242a3d844 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Fri, 26 Feb 2021 12:49:25 -0800
Subject: bpf: Add bpf_for_each_map_elem() helper

The bpf_for_each_map_elem() helper is introduced which
iterates all map elements with a callback function. The
helper signature looks like
  long bpf_for_each_map_elem(map, callback_fn, callback_ctx, flags)
and for each map element, the callback_fn will be called. For example,
like hashmap, the callback signature may look like
  long callback_fn(map, key, val, callback_ctx)

There are two known use cases for this. One is from upstream ([1]) where
a for_each_map_elem helper may help implement a timeout mechanism
in a more generic way. Another is from our internal discussion
for a firewall use case where a map contains all the rules. The packet
data can be compared to all these rules to decide allow or deny
the packet.

For array maps, users can already use a bounded loop to traverse
elements. Using this helper can avoid using a bounded loop. For other
types of maps (e.g., hash maps) where a bounded loop is hard or
impossible to use, this helper provides a convenient way to
operate on all elements.

For callback_fn, besides map and map element, a callback_ctx,
allocated on caller stack, is also passed to the callback
function. This callback_ctx argument can provide additional
input and allows writing to the caller stack for output.

If the callback_fn returns 0, the helper will iterate through the next
element if available. If the callback_fn returns 1, the helper
will stop iterating and return to the bpf program. Other return
values are not used for now.

Currently, this helper is only available with jit. It is possible
to make it work with the interpreter with some effort, but I leave it
as future work.

[1]: https://lore.kernel.org/bpf/20210122205415.113822-1-xiyou.wangcong@gmail.com/

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210226204925.3884923-1-yhs@fb.com
---
 include/linux/bpf.h            |  13 +++
 include/linux/bpf_verifier.h   |   3 +
 include/uapi/linux/bpf.h       |  38 ++++++++
 kernel/bpf/bpf_iter.c          |  16 ++++
 kernel/bpf/helpers.c           |   2 +
 kernel/bpf/verifier.c          | 208 ++++++++++++++++++++++++++++++++++++++---
 kernel/trace/bpf_trace.c       |   2 +
 tools/include/uapi/linux/bpf.h |  38 ++++++++
 8 files changed, 307 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e1e4d2f60527..aeb1b93a4d75 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -39,6 +39,7 @@ struct bpf_local_storage;
 struct bpf_local_storage_map;
 struct kobject;
 struct mem_cgroup;
+struct bpf_func_state;
 
 extern struct idr btf_idr;
 extern spinlock_t btf_idr_lock;
@@ -129,6 +130,13 @@ struct bpf_map_ops {
 	bool (*map_meta_equal)(const struct bpf_map *meta0,
 			       const struct bpf_map *meta1);
 
+
+	int (*map_set_for_each_callback_args)(struct bpf_verifier_env *env,
+					      struct bpf_func_state *caller,
+					      struct bpf_func_state *callee);
+	int (*map_for_each_callback)(struct bpf_map *map, void *callback_fn,
+				     void *callback_ctx, u64 flags);
+
 	/* BTF name and id of struct allocated by map_alloc */
 	const char * const map_btf_name;
 	int *map_btf_id;
@@ -295,6 +303,8 @@ enum bpf_arg_type {
 	ARG_CONST_ALLOC_SIZE_OR_ZERO,	/* number of allocated bytes requested */
 	ARG_PTR_TO_BTF_ID_SOCK_COMMON,	/* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */
 	ARG_PTR_TO_PERCPU_BTF_ID,	/* pointer to in-kernel percpu type */
+	ARG_PTR_TO_FUNC,	/* pointer to a bpf program function */
+	ARG_PTR_TO_STACK_OR_NULL,	/* pointer to stack or NULL */
 	__BPF_ARG_TYPE_MAX,
 };
 
@@ -411,6 +421,8 @@ enum bpf_reg_type {
 	PTR_TO_RDWR_BUF,	 /* reg points to a read/write buffer */
 	PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */
 	PTR_TO_PERCPU_BTF_ID,	 /* reg points to a percpu kernel variable */
+	PTR_TO_FUNC,		 /* reg points to a bpf program function */
+	PTR_TO_MAP_KEY,		 /* reg points to a map element key */
 };
 
 /* The information passed from prog-specific *_is_valid_access
@@ -1887,6 +1899,7 @@ extern const struct bpf_func_proto bpf_sock_from_file_proto;
 extern const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto;
 extern const struct bpf_func_proto bpf_task_storage_get_proto;
 extern const struct bpf_func_proto bpf_task_storage_delete_proto;
+extern const struct bpf_func_proto bpf_for_each_map_elem_proto;
 
 const struct bpf_func_proto *bpf_tracing_func_proto(
 	enum bpf_func_id func_id, const struct bpf_prog *prog);
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 971b33aca13d..51c2ffa3d901 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -68,6 +68,8 @@ struct bpf_reg_state {
 			unsigned long raw1;
 			unsigned long raw2;
 		} raw;
+
+		u32 subprogno; /* for PTR_TO_FUNC */
 	};
 	/* For PTR_TO_PACKET, used to find other pointers with the same variable
 	 * offset, so they can share range knowledge.
@@ -204,6 +206,7 @@ struct bpf_func_state {
 	int acquired_refs;
 	struct bpf_reference_state *refs;
 	int allocated_stack;
+	bool in_callback_fn;
 	struct bpf_stack_state *stack;
 };
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 79c893310492..b89af20cfa19 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -393,6 +393,15 @@ enum bpf_link_type {
  *                   is struct/union.
  */
 #define BPF_PSEUDO_BTF_ID	3
+/* insn[0].src_reg:  BPF_PSEUDO_FUNC
+ * insn[0].imm:      insn offset to the func
+ * insn[1].imm:      0
+ * insn[0].off:      0
+ * insn[1].off:      0
+ * ldimm64 rewrite:  address of the function
+ * verifier type:    PTR_TO_FUNC.
+ */
+#define BPF_PSEUDO_FUNC		4
 
 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative
  * offset to another bpf function
@@ -3909,6 +3918,34 @@ union bpf_attr {
  *		* **BPF_MTU_CHK_RET_FRAG_NEEDED**
  *		* **BPF_MTU_CHK_RET_SEGS_TOOBIG**
  *
+ * long bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn, void *callback_ctx, u64 flags)
+ *	Description
+ *		For each element in **map**, call **callback_fn** function with
+ *		**map**, **callback_ctx** and other map-specific parameters.
+ *		The **callback_fn** should be a static function and
+ *		the **callback_ctx** should be a pointer to the stack.
+ *		The **flags** is used to control certain aspects of the helper.
+ *		Currently, the **flags** must be 0.
+ *
+ *		The following are a list of supported map types and their
+ *		respective expected callback signatures:
+ *
+ *		BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_PERCPU_HASH,
+ *		BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH,
+ *		BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PERCPU_ARRAY
+ *
+ *		long (\*callback_fn)(struct bpf_map \*map, const void \*key, void \*value, void \*ctx);
+ *
+ *		For per_cpu maps, the map_value is the value on the cpu where the
+ *		bpf_prog is running.
+ *
+ *		If **callback_fn** return 0, the helper will continue to the next
+ *		element. If return value is 1, the helper will skip the rest of
+ *		elements and return. Other return values are not used now.
+ *
+ *	Return
+ *		The number of traversed map elements for success, **-EINVAL** for
+ *		invalid **flags**.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -4075,6 +4112,7 @@ union bpf_attr {
 	FN(ima_inode_hash),		\
 	FN(sock_from_file),		\
 	FN(check_mtu),			\
+	FN(for_each_map_elem),		\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index a0d9eade9c80..931870f9cf56 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -675,3 +675,19 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
 	 */
 	return ret == 0 ? 0 : -EAGAIN;
 }
+
+BPF_CALL_4(bpf_for_each_map_elem, struct bpf_map *, map, void *, callback_fn,
+	   void *, callback_ctx, u64, flags)
+{
+	return map->ops->map_for_each_callback(map, callback_fn, callback_ctx, flags);
+}
+
+const struct bpf_func_proto bpf_for_each_map_elem_proto = {
+	.func		= bpf_for_each_map_elem,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_CONST_MAP_PTR,
+	.arg2_type	= ARG_PTR_TO_FUNC,
+	.arg3_type	= ARG_PTR_TO_STACK_OR_NULL,
+	.arg4_type	= ARG_ANYTHING,
+};
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 308427fe03a3..074800226327 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -708,6 +708,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 		return &bpf_ringbuf_discard_proto;
 	case BPF_FUNC_ringbuf_query:
 		return &bpf_ringbuf_query_proto;
+	case BPF_FUNC_for_each_map_elem:
+		return &bpf_for_each_map_elem_proto;
 	default:
 		break;
 	}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index dbdca49ac6cc..53afe9461b03 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -234,6 +234,12 @@ static bool bpf_pseudo_call(const struct bpf_insn *insn)
 	       insn->src_reg == BPF_PSEUDO_CALL;
 }
 
+static bool bpf_pseudo_func(const struct bpf_insn *insn)
+{
+	return insn->code == (BPF_LD | BPF_IMM | BPF_DW) &&
+	       insn->src_reg == BPF_PSEUDO_FUNC;
+}
+
 struct bpf_call_arg_meta {
 	struct bpf_map *map_ptr;
 	bool raw_mode;
@@ -248,6 +254,7 @@ struct bpf_call_arg_meta {
 	u32 btf_id;
 	struct btf *ret_btf;
 	u32 ret_btf_id;
+	u32 subprogno;
 };
 
 struct btf *btf_vmlinux;
@@ -427,6 +434,7 @@ static bool reg_type_not_null(enum bpf_reg_type type)
 	return type == PTR_TO_SOCKET ||
 		type == PTR_TO_TCP_SOCK ||
 		type == PTR_TO_MAP_VALUE ||
+		type == PTR_TO_MAP_KEY ||
 		type == PTR_TO_SOCK_COMMON;
 }
 
@@ -469,7 +477,8 @@ static bool arg_type_may_be_null(enum bpf_arg_type type)
 	       type == ARG_PTR_TO_MEM_OR_NULL ||
 	       type == ARG_PTR_TO_CTX_OR_NULL ||
 	       type == ARG_PTR_TO_SOCKET_OR_NULL ||
-	       type == ARG_PTR_TO_ALLOC_MEM_OR_NULL;
+	       type == ARG_PTR_TO_ALLOC_MEM_OR_NULL ||
+	       type == ARG_PTR_TO_STACK_OR_NULL;
 }
 
 /* Determine whether the function releases some resources allocated by another
@@ -552,6 +561,8 @@ static const char * const reg_type_str[] = {
 	[PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null",
 	[PTR_TO_RDWR_BUF]	= "rdwr_buf",
 	[PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null",
+	[PTR_TO_FUNC]		= "func",
+	[PTR_TO_MAP_KEY]	= "map_key",
 };
 
 static char slot_type_char[] = {
@@ -623,6 +634,7 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 			if (type_is_pkt_pointer(t))
 				verbose(env, ",r=%d", reg->range);
 			else if (t == CONST_PTR_TO_MAP ||
+				 t == PTR_TO_MAP_KEY ||
 				 t == PTR_TO_MAP_VALUE ||
 				 t == PTR_TO_MAP_VALUE_OR_NULL)
 				verbose(env, ",ks=%d,vs=%d",
@@ -1555,6 +1567,19 @@ static int check_subprogs(struct bpf_verifier_env *env)
 
 	/* determine subprog starts. The end is one before the next starts */
 	for (i = 0; i < insn_cnt; i++) {
+		if (bpf_pseudo_func(insn + i)) {
+			if (!env->bpf_capable) {
+				verbose(env,
+					"function pointers are allowed for CAP_BPF and CAP_SYS_ADMIN\n");
+				return -EPERM;
+			}
+			ret = add_subprog(env, i + insn[i].imm + 1);
+			if (ret < 0)
+				return ret;
+			/* remember subprog */
+			insn[i + 1].imm = ret;
+			continue;
+		}
 		if (!bpf_pseudo_call(insn + i))
 			continue;
 		if (!env->bpf_capable) {
@@ -2286,6 +2311,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
 	case PTR_TO_PERCPU_BTF_ID:
 	case PTR_TO_MEM:
 	case PTR_TO_MEM_OR_NULL:
+	case PTR_TO_FUNC:
+	case PTR_TO_MAP_KEY:
 		return true;
 	default:
 		return false;
@@ -2890,6 +2917,10 @@ static int __check_mem_access(struct bpf_verifier_env *env, int regno,
 
 	reg = &cur_regs(env)[regno];
 	switch (reg->type) {
+	case PTR_TO_MAP_KEY:
+		verbose(env, "invalid access to map key, key_size=%d off=%d size=%d\n",
+			mem_size, off, size);
+		break;
 	case PTR_TO_MAP_VALUE:
 		verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n",
 			mem_size, off, size);
@@ -3295,6 +3326,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
 	case PTR_TO_FLOW_KEYS:
 		pointer_desc = "flow keys ";
 		break;
+	case PTR_TO_MAP_KEY:
+		pointer_desc = "key ";
+		break;
 	case PTR_TO_MAP_VALUE:
 		pointer_desc = "value ";
 		break;
@@ -3396,7 +3430,7 @@ process_func:
 continue_func:
 	subprog_end = subprog[idx + 1].start;
 	for (; i < subprog_end; i++) {
-		if (!bpf_pseudo_call(insn + i))
+		if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i))
 			continue;
 		/* remember insn and function to return to */
 		ret_insn[frame] = i + 1;
@@ -3833,7 +3867,19 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 	/* for access checks, reg->off is just part of off */
 	off += reg->off;
 
-	if (reg->type == PTR_TO_MAP_VALUE) {
+	if (reg->type == PTR_TO_MAP_KEY) {
+		if (t == BPF_WRITE) {
+			verbose(env, "write to change key R%d not allowed\n", regno);
+			return -EACCES;
+		}
+
+		err = check_mem_region_access(env, regno, off, size,
+					      reg->map_ptr->key_size, false);
+		if (err)
+			return err;
+		if (value_regno >= 0)
+			mark_reg_unknown(env, regs, value_regno);
+	} else if (reg->type == PTR_TO_MAP_VALUE) {
 		if (t == BPF_WRITE && value_regno >= 0 &&
 		    is_pointer_value(env, value_regno)) {
 			verbose(env, "R%d leaks addr into map\n", value_regno);
@@ -4249,6 +4295,9 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
 	case PTR_TO_PACKET_META:
 		return check_packet_access(env, regno, reg->off, access_size,
 					   zero_size_allowed);
+	case PTR_TO_MAP_KEY:
+		return check_mem_region_access(env, regno, reg->off, access_size,
+					       reg->map_ptr->key_size, false);
 	case PTR_TO_MAP_VALUE:
 		if (check_map_access_type(env, regno, reg->off, access_size,
 					  meta && meta->raw_mode ? BPF_WRITE :
@@ -4465,6 +4514,7 @@ static const struct bpf_reg_types map_key_value_types = {
 		PTR_TO_STACK,
 		PTR_TO_PACKET,
 		PTR_TO_PACKET_META,
+		PTR_TO_MAP_KEY,
 		PTR_TO_MAP_VALUE,
 	},
 };
@@ -4496,6 +4546,7 @@ static const struct bpf_reg_types mem_types = {
 		PTR_TO_STACK,
 		PTR_TO_PACKET,
 		PTR_TO_PACKET_META,
+		PTR_TO_MAP_KEY,
 		PTR_TO_MAP_VALUE,
 		PTR_TO_MEM,
 		PTR_TO_RDONLY_BUF,
@@ -4508,6 +4559,7 @@ static const struct bpf_reg_types int_ptr_types = {
 		PTR_TO_STACK,
 		PTR_TO_PACKET,
 		PTR_TO_PACKET_META,
+		PTR_TO_MAP_KEY,
 		PTR_TO_MAP_VALUE,
 	},
 };
@@ -4520,6 +4572,8 @@ static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_T
 static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } };
 static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } };
 static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } };
+static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } };
+static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } };
 
 static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
 	[ARG_PTR_TO_MAP_KEY]		= &map_key_value_types,
@@ -4548,6 +4602,8 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
 	[ARG_PTR_TO_INT]		= &int_ptr_types,
 	[ARG_PTR_TO_LONG]		= &int_ptr_types,
 	[ARG_PTR_TO_PERCPU_BTF_ID]	= &percpu_btf_ptr_types,
+	[ARG_PTR_TO_FUNC]		= &func_ptr_types,
+	[ARG_PTR_TO_STACK_OR_NULL]	= &stack_ptr_types,
 };
 
 static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
@@ -4729,6 +4785,8 @@ skip_type_check:
 			verbose(env, "verifier internal error\n");
 			return -EFAULT;
 		}
+	} else if (arg_type == ARG_PTR_TO_FUNC) {
+		meta->subprogno = reg->subprogno;
 	} else if (arg_type_is_mem_ptr(arg_type)) {
 		/* The access to this pointer is only checked when we hit the
 		 * next is_mem_size argument below.
@@ -5375,6 +5433,35 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 	return __check_func_call(env, insn, insn_idx, subprog, set_callee_state);
 }
 
+static int set_map_elem_callback_state(struct bpf_verifier_env *env,
+				       struct bpf_func_state *caller,
+				       struct bpf_func_state *callee,
+				       int insn_idx)
+{
+	struct bpf_insn_aux_data *insn_aux = &env->insn_aux_data[insn_idx];
+	struct bpf_map *map;
+	int err;
+
+	if (bpf_map_ptr_poisoned(insn_aux)) {
+		verbose(env, "tail_call abusing map_ptr\n");
+		return -EINVAL;
+	}
+
+	map = BPF_MAP_PTR(insn_aux->map_ptr_state);
+	if (!map->ops->map_set_for_each_callback_args ||
+	    !map->ops->map_for_each_callback) {
+		verbose(env, "callback function not allowed for map\n");
+		return -ENOTSUPP;
+	}
+
+	err = map->ops->map_set_for_each_callback_args(env, caller, callee);
+	if (err)
+		return err;
+
+	callee->in_callback_fn = true;
+	return 0;
+}
+
 static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 {
 	struct bpf_verifier_state *state = env->cur_state;
@@ -5397,8 +5484,22 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 
 	state->curframe--;
 	caller = state->frame[state->curframe];
-	/* return to the caller whatever r0 had in the callee */
-	caller->regs[BPF_REG_0] = *r0;
+	if (callee->in_callback_fn) {
+		/* enforce R0 return value range [0, 1]. */
+		struct tnum range = tnum_range(0, 1);
+
+		if (r0->type != SCALAR_VALUE) {
+			verbose(env, "R0 not a scalar value\n");
+			return -EACCES;
+		}
+		if (!tnum_in(range, r0->var_off)) {
+			verbose_invalid_scalar(env, r0, &range, "callback return", "R0");
+			return -EINVAL;
+		}
+	} else {
+		/* return to the caller whatever r0 had in the callee */
+		caller->regs[BPF_REG_0] = *r0;
+	}
 
 	/* Transfer references to the caller */
 	err = transfer_reference_state(caller, callee);
@@ -5453,7 +5554,8 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
 	    func_id != BPF_FUNC_map_delete_elem &&
 	    func_id != BPF_FUNC_map_push_elem &&
 	    func_id != BPF_FUNC_map_pop_elem &&
-	    func_id != BPF_FUNC_map_peek_elem)
+	    func_id != BPF_FUNC_map_peek_elem &&
+	    func_id != BPF_FUNC_for_each_map_elem)
 		return 0;
 
 	if (map == NULL) {
@@ -5534,15 +5636,18 @@ static int check_reference_leak(struct bpf_verifier_env *env)
 	return state->acquired_refs ? -EINVAL : 0;
 }
 
-static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
+static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+			     int *insn_idx_p)
 {
 	const struct bpf_func_proto *fn = NULL;
 	struct bpf_reg_state *regs;
 	struct bpf_call_arg_meta meta;
+	int insn_idx = *insn_idx_p;
 	bool changes_data;
-	int i, err;
+	int i, err, func_id;
 
 	/* find function prototype */
+	func_id = insn->imm;
 	if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
 		verbose(env, "invalid func %s#%d\n", func_id_name(func_id),
 			func_id);
@@ -5638,6 +5743,13 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 		return -EINVAL;
 	}
 
+	if (func_id == BPF_FUNC_for_each_map_elem) {
+		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+					set_map_elem_callback_state);
+		if (err < 0)
+			return -EINVAL;
+	}
+
 	/* reset caller saved regs */
 	for (i = 0; i < CALLER_SAVED_REGS; i++) {
 		mark_reg_not_init(env, regs, caller_saved[i]);
@@ -5891,6 +6003,19 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
 		else
 			*ptr_limit = -off;
 		return 0;
+	case PTR_TO_MAP_KEY:
+		/* Currently, this code is not exercised as the only use
+		 * is bpf_for_each_map_elem() helper which requires
+		 * bpf_capable. The code has been tested manually for
+		 * future use.
+		 */
+		if (mask_to_left) {
+			*ptr_limit = ptr_reg->umax_value + ptr_reg->off;
+		} else {
+			off = ptr_reg->smin_value + ptr_reg->off;
+			*ptr_limit = ptr_reg->map_ptr->key_size - off;
+		}
+		return 0;
 	case PTR_TO_MAP_VALUE:
 		if (mask_to_left) {
 			*ptr_limit = ptr_reg->umax_value + ptr_reg->off;
@@ -6092,6 +6217,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		verbose(env, "R%d pointer arithmetic on %s prohibited\n",
 			dst, reg_type_str[ptr_reg->type]);
 		return -EACCES;
+	case PTR_TO_MAP_KEY:
 	case PTR_TO_MAP_VALUE:
 		if (!env->allow_ptr_leaks && !known && (smin_val < 0) != (smax_val < 0)) {
 			verbose(env, "R%d has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root\n",
@@ -8271,6 +8397,24 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
 		return 0;
 	}
 
+	if (insn->src_reg == BPF_PSEUDO_FUNC) {
+		struct bpf_prog_aux *aux = env->prog->aux;
+		u32 subprogno = insn[1].imm;
+
+		if (!aux->func_info) {
+			verbose(env, "missing btf func_info\n");
+			return -EINVAL;
+		}
+		if (aux->func_info_aux[subprogno].linkage != BTF_FUNC_STATIC) {
+			verbose(env, "callback function not static\n");
+			return -EINVAL;
+		}
+
+		dst_reg->type = PTR_TO_FUNC;
+		dst_reg->subprogno = subprogno;
+		return 0;
+	}
+
 	map = env->used_maps[aux->map_index];
 	mark_reg_known_zero(env, regs, insn->dst_reg);
 	dst_reg->map_ptr = map;
@@ -8657,6 +8801,9 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env)
 	struct bpf_insn *insns = env->prog->insnsi;
 	int ret;
 
+	if (bpf_pseudo_func(insns + t))
+		return visit_func_call_insn(t, insn_cnt, insns, env, true);
+
 	/* All non-branch instructions have a single fall-through edge. */
 	if (BPF_CLASS(insns[t].code) != BPF_JMP &&
 	    BPF_CLASS(insns[t].code) != BPF_JMP32)
@@ -9277,6 +9424,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
 			 */
 			return false;
 		}
+	case PTR_TO_MAP_KEY:
 	case PTR_TO_MAP_VALUE:
 		/* If the new min/max/var_off satisfy the old ones and
 		 * everything else matches, we are OK.
@@ -10123,10 +10271,9 @@ static int do_check(struct bpf_verifier_env *env)
 				if (insn->src_reg == BPF_PSEUDO_CALL)
 					err = check_func_call(env, insn, &env->insn_idx);
 				else
-					err = check_helper_call(env, insn->imm, env->insn_idx);
+					err = check_helper_call(env, insn, &env->insn_idx);
 				if (err)
 					return err;
-
 			} else if (opcode == BPF_JA) {
 				if (BPF_SRC(insn->code) != BPF_K ||
 				    insn->imm != 0 ||
@@ -10555,6 +10702,12 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
 				goto next_insn;
 			}
 
+			if (insn[0].src_reg == BPF_PSEUDO_FUNC) {
+				aux = &env->insn_aux_data[i];
+				aux->ptr_type = PTR_TO_FUNC;
+				goto next_insn;
+			}
+
 			/* In final convert_pseudo_ld_imm64() step, this is
 			 * converted into regular 64-bit imm load insn.
 			 */
@@ -10687,9 +10840,13 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env)
 	int insn_cnt = env->prog->len;
 	int i;
 
-	for (i = 0; i < insn_cnt; i++, insn++)
-		if (insn->code == (BPF_LD | BPF_IMM | BPF_DW))
-			insn->src_reg = 0;
+	for (i = 0; i < insn_cnt; i++, insn++) {
+		if (insn->code != (BPF_LD | BPF_IMM | BPF_DW))
+			continue;
+		if (insn->src_reg == BPF_PSEUDO_FUNC)
+			continue;
+		insn->src_reg = 0;
+	}
 }
 
 /* single env->prog->insni[off] instruction was replaced with the range
@@ -11330,6 +11487,12 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		return 0;
 
 	for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
+		if (bpf_pseudo_func(insn)) {
+			env->insn_aux_data[i].call_imm = insn->imm;
+			/* subprog is encoded in insn[1].imm */
+			continue;
+		}
+
 		if (!bpf_pseudo_call(insn))
 			continue;
 		/* Upon error here we cannot fall back to interpreter but
@@ -11459,6 +11622,12 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 	for (i = 0; i < env->subprog_cnt; i++) {
 		insn = func[i]->insnsi;
 		for (j = 0; j < func[i]->len; j++, insn++) {
+			if (bpf_pseudo_func(insn)) {
+				subprog = insn[1].imm;
+				insn[0].imm = (u32)(long)func[subprog]->bpf_func;
+				insn[1].imm = ((u64)(long)func[subprog]->bpf_func) >> 32;
+				continue;
+			}
 			if (!bpf_pseudo_call(insn))
 				continue;
 			subprog = insn->off;
@@ -11504,6 +11673,11 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 	 * later look the same as if they were interpreted only.
 	 */
 	for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
+		if (bpf_pseudo_func(insn)) {
+			insn[0].imm = env->insn_aux_data[i].call_imm;
+			insn[1].imm = find_subprog(env, i + insn[0].imm + 1);
+			continue;
+		}
 		if (!bpf_pseudo_call(insn))
 			continue;
 		insn->off = env->insn_aux_data[i].call_imm;
@@ -11568,6 +11742,14 @@ static int fixup_call_args(struct bpf_verifier_env *env)
 		return -EINVAL;
 	}
 	for (i = 0; i < prog->len; i++, insn++) {
+		if (bpf_pseudo_func(insn)) {
+			/* When JIT fails the progs with callback calls
+			 * have to be rejected, since interpreter doesn't support them yet.
+			 */
+			verbose(env, "callbacks are not allowed in non-JITed programs\n");
+			return -EINVAL;
+		}
+
 		if (!bpf_pseudo_call(insn))
 			continue;
 		depth = get_callee_stack_depth(env, insn, i);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index e9701744d8e4..0d23755c2747 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1371,6 +1371,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_task_storage_get_proto;
 	case BPF_FUNC_task_storage_delete:
 		return &bpf_task_storage_delete_proto;
+	case BPF_FUNC_for_each_map_elem:
+		return &bpf_for_each_map_elem_proto;
 	default:
 		return NULL;
 	}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 79c893310492..b89af20cfa19 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -393,6 +393,15 @@ enum bpf_link_type {
  *                   is struct/union.
  */
 #define BPF_PSEUDO_BTF_ID	3
+/* insn[0].src_reg:  BPF_PSEUDO_FUNC
+ * insn[0].imm:      insn offset to the func
+ * insn[1].imm:      0
+ * insn[0].off:      0
+ * insn[1].off:      0
+ * ldimm64 rewrite:  address of the function
+ * verifier type:    PTR_TO_FUNC.
+ */
+#define BPF_PSEUDO_FUNC		4
 
 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative
  * offset to another bpf function
@@ -3909,6 +3918,34 @@ union bpf_attr {
  *		* **BPF_MTU_CHK_RET_FRAG_NEEDED**
  *		* **BPF_MTU_CHK_RET_SEGS_TOOBIG**
  *
+ * long bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn, void *callback_ctx, u64 flags)
+ *	Description
+ *		For each element in **map**, call **callback_fn** function with
+ *		**map**, **callback_ctx** and other map-specific parameters.
+ *		The **callback_fn** should be a static function and
+ *		the **callback_ctx** should be a pointer to the stack.
+ *		The **flags** is used to control certain aspects of the helper.
+ *		Currently, the **flags** must be 0.
+ *
+ *		The following is a list of supported map types and their
+ *		respective expected callback signatures:
+ *
+ *		BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_PERCPU_HASH,
+ *		BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH,
+ *		BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PERCPU_ARRAY
+ *
+ *		long (\*callback_fn)(struct bpf_map \*map, const void \*key, void \*value, void \*ctx);
+ *
+ *		For per_cpu maps, the map_value is the value on the cpu where the
+ *		bpf_prog is running.
+ *
+ *		If **callback_fn** returns 0, the helper will continue to the next
+ *		element. If return value is 1, the helper will skip the rest of
+ *		elements and return. Other return values are not used now.
+ *
+ *	Return
+ *		The number of traversed map elements for success, **-EINVAL** for
+ *		invalid **flags**.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -4075,6 +4112,7 @@ union bpf_attr {
 	FN(ima_inode_hash),		\
 	FN(sock_from_file),		\
 	FN(check_mtu),			\
+	FN(for_each_map_elem),		\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
-- 
cgit v1.2.3


From 314ee05e2fc601a7bece14376547d2b7a04bab67 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Fri, 26 Feb 2021 12:49:27 -0800
Subject: bpf: Add hashtab support for bpf_for_each_map_elem() helper

This patch added support for hashmap, percpu hashmap,
lru hashmap and percpu lru hashmap.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210226204927.3885020-1-yhs@fb.com
---
 include/linux/bpf.h   |  4 ++++
 kernel/bpf/hashtab.c  | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c | 27 +++++++++++++++++++++
 3 files changed, 96 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index aeb1b93a4d75..4c730863fa77 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1397,6 +1397,10 @@ void bpf_iter_map_show_fdinfo(const struct bpf_iter_aux_info *aux,
 int bpf_iter_map_fill_link_info(const struct bpf_iter_aux_info *aux,
 				struct bpf_link_info *info);
 
+int map_set_for_each_callback_args(struct bpf_verifier_env *env,
+				   struct bpf_func_state *caller,
+				   struct bpf_func_state *callee);
+
 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
 int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
 int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index d63912e73ad9..330d721dd2af 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1869,6 +1869,63 @@ static const struct bpf_iter_seq_info iter_seq_info = {
 	.seq_priv_size		= sizeof(struct bpf_iter_seq_hash_map_info),
 };
 
+static int bpf_for_each_hash_elem(struct bpf_map *map, void *callback_fn,
+				  void *callback_ctx, u64 flags)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct hlist_nulls_head *head;
+	struct hlist_nulls_node *n;
+	struct htab_elem *elem;
+	u32 roundup_key_size;
+	int i, num_elems = 0;
+	void __percpu *pptr;
+	struct bucket *b;
+	void *key, *val;
+	bool is_percpu;
+	u64 ret = 0;
+
+	if (flags != 0)
+		return -EINVAL;
+
+	is_percpu = htab_is_percpu(htab);
+
+	roundup_key_size = round_up(map->key_size, 8);
+	/* disable migration so percpu value prepared here will be the
+	 * same as the one seen by the bpf program with bpf_map_lookup_elem().
+	 */
+	if (is_percpu)
+		migrate_disable();
+	for (i = 0; i < htab->n_buckets; i++) {
+		b = &htab->buckets[i];
+		rcu_read_lock();
+		head = &b->head;
+		hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) {
+			key = elem->key;
+			if (is_percpu) {
+				/* current cpu value for percpu map */
+				pptr = htab_elem_get_ptr(elem, map->key_size);
+				val = this_cpu_ptr(pptr);
+			} else {
+				val = elem->key + roundup_key_size;
+			}
+			num_elems++;
+			ret = BPF_CAST_CALL(callback_fn)((u64)(long)map,
+					(u64)(long)key, (u64)(long)val,
+					(u64)(long)callback_ctx, 0);
+			/* return value: 0 - continue, 1 - stop and return */
+			if (ret) {
+				rcu_read_unlock();
+				goto out;
+			}
+		}
+		rcu_read_unlock();
+	}
+out:
+	if (is_percpu)
+		migrate_enable();
+	return num_elems;
+}
+
 static int htab_map_btf_id;
 const struct bpf_map_ops htab_map_ops = {
 	.map_meta_equal = bpf_map_meta_equal,
@@ -1881,6 +1938,8 @@ const struct bpf_map_ops htab_map_ops = {
 	.map_delete_elem = htab_map_delete_elem,
 	.map_gen_lookup = htab_map_gen_lookup,
 	.map_seq_show_elem = htab_map_seq_show_elem,
+	.map_set_for_each_callback_args = map_set_for_each_callback_args,
+	.map_for_each_callback = bpf_for_each_hash_elem,
 	BATCH_OPS(htab),
 	.map_btf_name = "bpf_htab",
 	.map_btf_id = &htab_map_btf_id,
@@ -1900,6 +1959,8 @@ const struct bpf_map_ops htab_lru_map_ops = {
 	.map_delete_elem = htab_lru_map_delete_elem,
 	.map_gen_lookup = htab_lru_map_gen_lookup,
 	.map_seq_show_elem = htab_map_seq_show_elem,
+	.map_set_for_each_callback_args = map_set_for_each_callback_args,
+	.map_for_each_callback = bpf_for_each_hash_elem,
 	BATCH_OPS(htab_lru),
 	.map_btf_name = "bpf_htab",
 	.map_btf_id = &htab_lru_map_btf_id,
@@ -2019,6 +2080,8 @@ const struct bpf_map_ops htab_percpu_map_ops = {
 	.map_update_elem = htab_percpu_map_update_elem,
 	.map_delete_elem = htab_map_delete_elem,
 	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
+	.map_set_for_each_callback_args = map_set_for_each_callback_args,
+	.map_for_each_callback = bpf_for_each_hash_elem,
 	BATCH_OPS(htab_percpu),
 	.map_btf_name = "bpf_htab",
 	.map_btf_id = &htab_percpu_map_btf_id,
@@ -2036,6 +2099,8 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
 	.map_update_elem = htab_lru_percpu_map_update_elem,
 	.map_delete_elem = htab_lru_map_delete_elem,
 	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
+	.map_set_for_each_callback_args = map_set_for_each_callback_args,
+	.map_for_each_callback = bpf_for_each_hash_elem,
 	BATCH_OPS(htab_lru_percpu),
 	.map_btf_name = "bpf_htab",
 	.map_btf_id = &htab_lru_percpu_map_btf_id,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 53afe9461b03..9fe90ce52a65 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5403,6 +5403,33 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 	return 0;
 }
 
+int map_set_for_each_callback_args(struct bpf_verifier_env *env,
+				   struct bpf_func_state *caller,
+				   struct bpf_func_state *callee)
+{
+	/* bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn,
+	 *      void *callback_ctx, u64 flags);
+	 * callback_fn(struct bpf_map *map, void *key, void *value,
+	 *      void *callback_ctx);
+	 */
+	callee->regs[BPF_REG_1] = caller->regs[BPF_REG_1];
+
+	callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY;
+	__mark_reg_known_zero(&callee->regs[BPF_REG_2]);
+	callee->regs[BPF_REG_2].map_ptr = caller->regs[BPF_REG_1].map_ptr;
+
+	callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE;
+	__mark_reg_known_zero(&callee->regs[BPF_REG_3]);
+	callee->regs[BPF_REG_3].map_ptr = caller->regs[BPF_REG_1].map_ptr;
+
+	/* pointer to stack or null */
+	callee->regs[BPF_REG_4] = caller->regs[BPF_REG_3];
+
+	/* unused */
+	__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+	return 0;
+}
+
 static int set_callee_state(struct bpf_verifier_env *env,
 			    struct bpf_func_state *caller,
 			    struct bpf_func_state *callee, int insn_idx)
-- 
cgit v1.2.3


From e940e0895a82c6fbaa259f2615eb52b57ee91a7e Mon Sep 17 00:00:00 2001
From: Oleksij Rempel <o.rempel@pengutronix.de>
Date: Fri, 26 Feb 2021 10:24:56 +0100
Subject: can: skb: can_skb_set_owner(): fix ref counting if socket was closed
 before setting skb ownership

There are two ref count variables controlling the free()ing of a socket:
- struct sock::sk_refcnt - which is changed by sock_hold()/sock_put()
- struct sock::sk_wmem_alloc - which accounts the memory allocated by
  the skbs in the send path.

In case there are still TX skbs on the fly and the socket() is closed,
the struct sock::sk_refcnt reaches 0. In the TX-path the CAN stack
clones an "echo" skb, calls sock_hold() on the original socket and
references it. This produces the following back trace:

| WARNING: CPU: 0 PID: 280 at lib/refcount.c:25 refcount_warn_saturate+0x114/0x134
| refcount_t: addition on 0; use-after-free.
| Modules linked in: coda_vpu(E) v4l2_jpeg(E) videobuf2_vmalloc(E) imx_vdoa(E)
| CPU: 0 PID: 280 Comm: test_can.sh Tainted: G            E     5.11.0-04577-gf8ff6603c617 #203
| Hardware name: Freescale i.MX6 Quad/DualLite (Device Tree)
| Backtrace:
| [<80bafea4>] (dump_backtrace) from [<80bb0280>] (show_stack+0x20/0x24) r7:00000000 r6:600f0113 r5:00000000 r4:81441220
| [<80bb0260>] (show_stack) from [<80bb593c>] (dump_stack+0xa0/0xc8)
| [<80bb589c>] (dump_stack) from [<8012b268>] (__warn+0xd4/0x114) r9:00000019 r8:80f4a8c2 r7:83e4150c r6:00000000 r5:00000009 r4:80528f90
| [<8012b194>] (__warn) from [<80bb09c4>] (warn_slowpath_fmt+0x88/0xc8) r9:83f26400 r8:80f4a8d1 r7:00000009 r6:80528f90 r5:00000019 r4:80f4a8c2
| [<80bb0940>] (warn_slowpath_fmt) from [<80528f90>] (refcount_warn_saturate+0x114/0x134) r8:00000000 r7:00000000 r6:82b44000 r5:834e5600 r4:83f4d540
| [<80528e7c>] (refcount_warn_saturate) from [<8079a4c8>] (__refcount_add.constprop.0+0x4c/0x50)
| [<8079a47c>] (__refcount_add.constprop.0) from [<8079a57c>] (can_put_echo_skb+0xb0/0x13c)
| [<8079a4cc>] (can_put_echo_skb) from [<8079ba98>] (flexcan_start_xmit+0x1c4/0x230) r9:00000010 r8:83f48610 r7:0fdc0000 r6:0c080000 r5:82b44000 r4:834e5600
| [<8079b8d4>] (flexcan_start_xmit) from [<80969078>] (netdev_start_xmit+0x44/0x70) r9:814c0ba0 r8:80c8790c r7:00000000 r6:834e5600 r5:82b44000 r4:82ab1f00
| [<80969034>] (netdev_start_xmit) from [<809725a4>] (dev_hard_start_xmit+0x19c/0x318) r9:814c0ba0 r8:00000000 r7:82ab1f00 r6:82b44000 r5:00000000 r4:834e5600
| [<80972408>] (dev_hard_start_xmit) from [<809c6584>] (sch_direct_xmit+0xcc/0x264) r10:834e5600 r9:00000000 r8:00000000 r7:82b44000 r6:82ab1f00 r5:834e5600 r4:83f27400
| [<809c64b8>] (sch_direct_xmit) from [<809c6c0c>] (__qdisc_run+0x4f0/0x534)

To fix this problem, only set skb ownership to sockets which still have
a ref count > 0.

Fixes: 0ae89beb283a ("can: add destructor for self generated skbs")
Cc: Oliver Hartkopp <socketcan@hartkopp.net>
Cc: Andre Naujoks <nautsch2@gmail.com>
Link: https://lore.kernel.org/r/20210226092456.27126-1-o.rempel@pengutronix.de
Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Reviewed-by: Oliver Hartkopp <socketcan@hartkopp.net>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 include/linux/can/skb.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/can/skb.h b/include/linux/can/skb.h
index 685f34cfba20..d438eb058069 100644
--- a/include/linux/can/skb.h
+++ b/include/linux/can/skb.h
@@ -65,8 +65,12 @@ static inline void can_skb_reserve(struct sk_buff *skb)
 
 static inline void can_skb_set_owner(struct sk_buff *skb, struct sock *sk)
 {
-	if (sk) {
-		sock_hold(sk);
+	/* If the socket has already been closed by user space, the
+	 * refcount may already be 0 (and the socket will be freed
+	 * after the last TX skb has been freed). So only increase
+	 * socket refcount if the refcount is > 0.
+	 */
+	if (sk && refcount_inc_not_zero(&sk->sk_refcnt)) {
 		skb->destructor = sock_efree;
 		skb->sk = sk;
 	}
-- 
cgit v1.2.3


From 79bfe480a0a0b259ab9fddcd2fe52c03542b1196 Mon Sep 17 00:00:00 2001
From: Nobuhiro Iwamatsu <iwamatsu@nigauri.org>
Date: Tue, 16 Feb 2021 00:58:49 +0900
Subject: firmware: xilinx: Remove zynqmp_pm_get_eemi_ops() in
 IS_REACHABLE(CONFIG_ZYNQMP_FIRMWARE)

zynqmp_pm_get_eemi_ops() was removed in commit 4db8180ffe7c: "Firmware: xilinx:
Remove eemi ops for fpga related APIs", but not in IS_REACHABLE(CONFIG_ZYNQMP_FIRMWARE).
Any driver that wants to communicate with the PMC using EEMI APIs should use the
dedicated functions provided for each API.
This patch removes zynqmp_pm_get_eemi_ops() in IS_REACHABLE(CONFIG_ZYNQMP_FIRMWARE), and also
modifies the documentation for this driver.

Fixes: 4db8180ffe7c ("firmware: xilinx: Remove eemi ops for fpga related APIs")
Signed-off-by: Nobuhiro Iwamatsu <iwamatsu@nigauri.org>
Link: https://lore.kernel.org/r/20210215155849.2425846-1-iwamatsu@nigauri.org
Signed-off-by: Michal Simek <michal.simek@xilinx.com>
---
 Documentation/driver-api/xilinx/eemi.rst | 31 ++-----------------------------
 include/linux/firmware/xlnx-zynqmp.h     |  5 -----
 2 files changed, 2 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/driver-api/xilinx/eemi.rst b/Documentation/driver-api/xilinx/eemi.rst
index 9dcbc6f18d75..c1bc47b9000d 100644
--- a/Documentation/driver-api/xilinx/eemi.rst
+++ b/Documentation/driver-api/xilinx/eemi.rst
@@ -16,35 +16,8 @@ components running across different processing clusters on a chip or
 device to communicate with a power management controller (PMC) on a
 device to issue or respond to power management requests.
 
-EEMI ops is a structure containing all eemi APIs supported by Zynq MPSoC.
-The zynqmp-firmware driver maintain all EEMI APIs in zynqmp_eemi_ops
-structure. Any driver who want to communicate with PMC using EEMI APIs
-can call zynqmp_pm_get_eemi_ops().
-
-Example of EEMI ops::
-
-	/* zynqmp-firmware driver maintain all EEMI APIs */
-	struct zynqmp_eemi_ops {
-		int (*get_api_version)(u32 *version);
-		int (*query_data)(struct zynqmp_pm_query_data qdata, u32 *out);
-	};
-
-	static const struct zynqmp_eemi_ops eemi_ops = {
-		.get_api_version = zynqmp_pm_get_api_version,
-		.query_data = zynqmp_pm_query_data,
-	};
-
-Example of EEMI ops usage::
-
-	static const struct zynqmp_eemi_ops *eemi_ops;
-	u32 ret_payload[PAYLOAD_ARG_CNT];
-	int ret;
-
-	eemi_ops = zynqmp_pm_get_eemi_ops();
-	if (IS_ERR(eemi_ops))
-		return PTR_ERR(eemi_ops);
-
-	ret = eemi_ops->query_data(qdata, ret_payload);
+Any driver that wants to communicate with the PMC using EEMI APIs should use
+the functions provided for each API.
 
 IOCTL
 ------
diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index 71177b17eee5..66e2423d9feb 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -354,11 +354,6 @@ int zynqmp_pm_read_pggs(u32 index, u32 *value);
 int zynqmp_pm_system_shutdown(const u32 type, const u32 subtype);
 int zynqmp_pm_set_boot_health_status(u32 value);
 #else
-static inline struct zynqmp_eemi_ops *zynqmp_pm_get_eemi_ops(void)
-{
-	return ERR_PTR(-ENODEV);
-}
-
 static inline int zynqmp_pm_get_api_version(u32 *version)
 {
 	return -ENODEV;
-- 
cgit v1.2.3


From b228c9b058760500fda5edb3134527f629fc2dc3 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Mon, 1 Mar 2021 15:09:44 +0000
Subject: net: expand textsearch ts_state to fit skb_seq_state

The referenced commit expands the skb_seq_state used by
skb_find_text with a 4B frag_off field, growing it to 48B.

This exceeds container ts_state->cb, causing a stack corruption:

[   73.238353] Kernel panic - not syncing: stack-protector: Kernel stack
is corrupted in: skb_find_text+0xc5/0xd0
[   73.247384] CPU: 1 PID: 376 Comm: nping Not tainted 5.11.0+ #4
[   73.252613] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS 1.14.0-2 04/01/2014
[   73.260078] Call Trace:
[   73.264677]  dump_stack+0x57/0x6a
[   73.267866]  panic+0xf6/0x2b7
[   73.270578]  ? skb_find_text+0xc5/0xd0
[   73.273964]  __stack_chk_fail+0x10/0x10
[   73.277491]  skb_find_text+0xc5/0xd0
[   73.280727]  string_mt+0x1f/0x30
[   73.283639]  ipt_do_table+0x214/0x410

The struct is passed between skb_find_text and its callbacks
skb_prepare_seq_read, skb_seq_read and skb_abort_seq_read through
the textsearch interface using TS_SKB_CB.

I assumed that this mapped to skb->cb like other .._SKB_CB wrappers.
skb->cb is 48B. But it maps to ts_state->cb, which is only 40B.

skb->cb was increased from 40B to 48B after ts_state was introduced,
in commit 3e3850e989c5 ("[NETFILTER]: Fix xfrm lookup in
ip_route_me_harder/ip6_route_me_harder").

Increase ts_state.cb[] to 48 to fit the struct.

Also add a BUILD_BUG_ON to avoid a repeat.

The alternative is to directly add a dependency from textsearch onto
linux/skbuff.h, but I think the intent is textsearch to have no such
dependencies on its callers.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=211911
Fixes: 97550f6fa592 ("net: compound page support in skb_seq_read")
Reported-by: Kris Karas <bugs-a17@moonlit-rail.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/textsearch.h | 2 +-
 net/core/skbuff.c          | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/textsearch.h b/include/linux/textsearch.h
index 13770cfe33ad..6673e4d4ac2e 100644
--- a/include/linux/textsearch.h
+++ b/include/linux/textsearch.h
@@ -23,7 +23,7 @@ struct ts_config;
 struct ts_state
 {
 	unsigned int		offset;
-	char			cb[40];
+	char			cb[48];
 };
 
 /**
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 545a472273a5..c421c8f80925 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3659,6 +3659,8 @@ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
 	struct ts_state state;
 	unsigned int ret;
 
+	BUILD_BUG_ON(sizeof(struct skb_seq_state) > sizeof(state.cb));
+
 	config->get_next_block = skb_ts_get_next_block;
 	config->finish = skb_ts_finish;
 
-- 
cgit v1.2.3


From c95c34f01bbda4421c25fdc9b04a4a4aab10d36c Mon Sep 17 00:00:00 2001
From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Date: Wed, 3 Mar 2021 19:56:34 +0100
Subject: xsk: Remove dangling function declaration from header file
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

xdp_umem_query() has been dead for a long time, drop the declaration from
include/linux/netdevice.h

Fixes: c9b47cc1fabc ("xsk: fix bug when trying to use both copy and zero-copy on one queue id")
Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Björn Töpel <bjorn.topel@intel.com>
Link: https://lore.kernel.org/bpf/20210303185636.18070-2-maciej.fijalkowski@intel.com
---
 include/linux/netdevice.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f06fbee8638e..5b67ea89d5f2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3959,8 +3959,6 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
 u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode);
 
-int xdp_umem_query(struct net_device *dev, u16 queue_id);
-
 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
 int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb);
-- 
cgit v1.2.3


From 6ed6e1c761f6c8391af654facbbbf1748ae9f386 Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Mon, 1 Mar 2021 10:48:05 -0800
Subject: skmsg: Add function doc for skb->_sk_redir

This should fix the following warning:

include/linux/skbuff.h:932: warning: Function parameter or member
'_sk_redir' not described in 'sk_buff'

Reported-by: Lorenz Bauer <lmb@cloudflare.com>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Lorenz Bauer <lmb@cloudflare.com>
Link: https://lore.kernel.org/bpf/20210301184805.8174-1-xiyou.wangcong@gmail.com
---
 include/linux/skbuff.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index bd84f799c952..0503c917d773 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -656,6 +656,7 @@ typedef unsigned char *sk_buff_data_t;
  *	@protocol: Packet protocol from driver
  *	@destructor: Destruct function
  *	@tcp_tsorted_anchor: list structure for TCP (tp->tsorted_sent_queue)
+ *	@_sk_redir: socket redirection information for skmsg
  *	@_nfct: Associated connection, if any (with nfctinfo bits)
  *	@nf_bridge: Saved data about a bridged frame - see br_netfilter.c
  *	@skb_iif: ifindex of device we arrived on
-- 
cgit v1.2.3


From 4ec591790356f0e5a95f8d278b0cfd04aea2ae52 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 22 Jan 2021 10:33:05 +0800
Subject: scsi: sbitmap: Remove sbitmap_clear_bit_unlock

No one uses this helper any more, so kill it.

Link: https://lore.kernel.org/r/20210122023317.687987-2-ming.lei@redhat.com
Cc: Omar Sandoval <osandov@fb.com>
Cc: Kashyap Desai <kashyap.desai@broadcom.com>
Cc: Sumanesh Samanta <sumanesh.samanta@broadcom.com>
Cc: Ewan D. Milne <emilne@redhat.com>
Cc: Hannes Reinecke <hare@suse.de>
Tested-by: Sumanesh Samanta <sumanesh.samanta@broadcom.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 include/linux/sbitmap.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index 74cc6384715e..16353fbee765 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -315,12 +315,6 @@ static inline void sbitmap_deferred_clear_bit(struct sbitmap *sb, unsigned int b
 	set_bit(SB_NR_TO_BIT(sb, bitnr), addr);
 }
 
-static inline void sbitmap_clear_bit_unlock(struct sbitmap *sb,
-					    unsigned int bitnr)
-{
-	clear_bit_unlock(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr));
-}
-
 static inline int sbitmap_test_bit(struct sbitmap *sb, unsigned int bitnr)
 {
 	return test_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr));
-- 
cgit v1.2.3


From efe1f3a1d5833c0ddd61ee50dbef8908f65a0a5e Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 22 Jan 2021 10:33:06 +0800
Subject: scsi: sbitmap: Maintain allocation round_robin in sbitmap

Currently the allocation round_robin info is maintained by sbitmap_queue.

However, bit allocation really belongs to sbitmap. Move it there.

Link: https://lore.kernel.org/r/20210122023317.687987-3-ming.lei@redhat.com
Cc: Omar Sandoval <osandov@fb.com>
Cc: Kashyap Desai <kashyap.desai@broadcom.com>
Cc: Sumanesh Samanta <sumanesh.samanta@broadcom.com>
Cc: Ewan D. Milne <emilne@redhat.com>
Cc: Hannes Reinecke <hare@suse.de>
Cc: virtualization@lists.linux-foundation.org
Tested-by: Sumanesh Samanta <sumanesh.samanta@broadcom.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 block/blk-mq.c          |  2 +-
 block/kyber-iosched.c   |  3 ++-
 drivers/vhost/scsi.c    |  4 ++--
 include/linux/sbitmap.h | 20 ++++++++++----------
 lib/sbitmap.c           | 28 ++++++++++++++--------------
 5 files changed, 29 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index d4d7c1caa439..f133bbfa6cd4 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2702,7 +2702,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
 		goto free_cpumask;
 
 	if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8),
-				gfp, node))
+				gfp, node, false))
 		goto free_ctxs;
 	hctx->nr_ctx = 0;
 
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index 33d34d69cade..d39ac13f0105 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -478,7 +478,8 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
 
 	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
 		if (sbitmap_init_node(&khd->kcq_map[i], hctx->nr_ctx,
-				      ilog2(8), GFP_KERNEL, hctx->numa_node)) {
+				      ilog2(8), GFP_KERNEL, hctx->numa_node,
+				      false)) {
 			while (--i >= 0)
 				sbitmap_free(&khd->kcq_map[i]);
 			goto err_kcqs;
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 5de21ad4bd05..3412a3ad28e6 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -614,7 +614,7 @@ vhost_scsi_get_cmd(struct vhost_virtqueue *vq, struct vhost_scsi_tpg *tpg,
 		return ERR_PTR(-EIO);
 	}
 
-	tag = sbitmap_get(&svq->scsi_tags, 0, false);
+	tag = sbitmap_get(&svq->scsi_tags, 0);
 	if (tag < 0) {
 		pr_err("Unable to obtain tag for vhost_scsi_cmd\n");
 		return ERR_PTR(-ENOMEM);
@@ -1512,7 +1512,7 @@ static int vhost_scsi_setup_vq_cmds(struct vhost_virtqueue *vq, int max_cmds)
 		return 0;
 
 	if (sbitmap_init_node(&svq->scsi_tags, max_cmds, -1, GFP_KERNEL,
-			      NUMA_NO_NODE))
+			      NUMA_NO_NODE, false))
 		return -ENOMEM;
 	svq->max_cmds = max_cmds;
 
diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index 16353fbee765..734ee6214cd6 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -56,6 +56,11 @@ struct sbitmap {
 	 */
 	unsigned int map_nr;
 
+	/**
+	 * @round_robin: Allocate bits in strict round-robin order.
+	 */
+	bool round_robin;
+
 	/**
 	 * @map: Allocated bitmap.
 	 */
@@ -124,11 +129,6 @@ struct sbitmap_queue {
 	 */
 	atomic_t ws_active;
 
-	/**
-	 * @round_robin: Allocate bits in strict round-robin order.
-	 */
-	bool round_robin;
-
 	/**
 	 * @min_shallow_depth: The minimum shallow depth which may be passed to
 	 * sbitmap_queue_get_shallow() or __sbitmap_queue_get_shallow().
@@ -144,11 +144,14 @@ struct sbitmap_queue {
  *         given, a good default is chosen.
  * @flags: Allocation flags.
  * @node: Memory node to allocate on.
+ * @round_robin: If true, be stricter about allocation order; always allocate
+ *               starting from the last allocated bit. This is less efficient
+ *               than the default behavior (false).
  *
  * Return: Zero on success or negative errno on failure.
  */
 int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
-		      gfp_t flags, int node);
+		      gfp_t flags, int node, bool round_robin);
 
 /**
  * sbitmap_free() - Free memory used by a &struct sbitmap.
@@ -174,15 +177,12 @@ void sbitmap_resize(struct sbitmap *sb, unsigned int depth);
  * sbitmap_get() - Try to allocate a free bit from a &struct sbitmap.
  * @sb: Bitmap to allocate from.
  * @alloc_hint: Hint for where to start searching for a free bit.
- * @round_robin: If true, be stricter about allocation order; always allocate
- *               starting from the last allocated bit. This is less efficient
- *               than the default behavior (false).
  *
  * This operation provides acquire barrier semantics if it succeeds.
  *
  * Return: Non-negative allocated bit number if successful, -1 otherwise.
  */
-int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin);
+int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint);
 
 /**
  * sbitmap_get_shallow() - Try to allocate a free bit from a &struct sbitmap,
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index d693d9213ceb..7000636933b3 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -33,7 +33,7 @@ static inline bool sbitmap_deferred_clear(struct sbitmap_word *map)
 }
 
 int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
-		      gfp_t flags, int node)
+		      gfp_t flags, int node, bool round_robin)
 {
 	unsigned int bits_per_word;
 	unsigned int i;
@@ -58,6 +58,7 @@ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
 	sb->shift = shift;
 	sb->depth = depth;
 	sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word);
+	sb->round_robin = round_robin;
 
 	if (depth == 0) {
 		sb->map = NULL;
@@ -129,14 +130,14 @@ static int __sbitmap_get_word(unsigned long *word, unsigned long depth,
 }
 
 static int sbitmap_find_bit_in_index(struct sbitmap *sb, int index,
-				     unsigned int alloc_hint, bool round_robin)
+				     unsigned int alloc_hint)
 {
 	struct sbitmap_word *map = &sb->map[index];
 	int nr;
 
 	do {
 		nr = __sbitmap_get_word(&map->word, map->depth, alloc_hint,
-					!round_robin);
+					!sb->round_robin);
 		if (nr != -1)
 			break;
 		if (!sbitmap_deferred_clear(map))
@@ -146,7 +147,7 @@ static int sbitmap_find_bit_in_index(struct sbitmap *sb, int index,
 	return nr;
 }
 
-int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
+int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint)
 {
 	unsigned int i, index;
 	int nr = -1;
@@ -158,14 +159,13 @@ int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
 	 * alloc_hint to find the right word index. No point in looping
 	 * twice in find_next_zero_bit() for that case.
 	 */
-	if (round_robin)
+	if (sb->round_robin)
 		alloc_hint = SB_NR_TO_BIT(sb, alloc_hint);
 	else
 		alloc_hint = 0;
 
 	for (i = 0; i < sb->map_nr; i++) {
-		nr = sbitmap_find_bit_in_index(sb, index, alloc_hint,
-						round_robin);
+		nr = sbitmap_find_bit_in_index(sb, index, alloc_hint);
 		if (nr != -1) {
 			nr += index << sb->shift;
 			break;
@@ -350,7 +350,8 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
 	int ret;
 	int i;
 
-	ret = sbitmap_init_node(&sbq->sb, depth, shift, flags, node);
+	ret = sbitmap_init_node(&sbq->sb, depth, shift, flags, node,
+				round_robin);
 	if (ret)
 		return ret;
 
@@ -382,7 +383,6 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
 		atomic_set(&sbq->ws[i].wait_cnt, sbq->wake_batch);
 	}
 
-	sbq->round_robin = round_robin;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(sbitmap_queue_init_node);
@@ -424,12 +424,12 @@ int __sbitmap_queue_get(struct sbitmap_queue *sbq)
 		hint = depth ? prandom_u32() % depth : 0;
 		this_cpu_write(*sbq->alloc_hint, hint);
 	}
-	nr = sbitmap_get(&sbq->sb, hint, sbq->round_robin);
+	nr = sbitmap_get(&sbq->sb, hint);
 
 	if (nr == -1) {
 		/* If the map is full, a hint won't do us much good. */
 		this_cpu_write(*sbq->alloc_hint, 0);
-	} else if (nr == hint || unlikely(sbq->round_robin)) {
+	} else if (nr == hint || unlikely(sbq->sb.round_robin)) {
 		/* Only update the hint if we used it. */
 		hint = nr + 1;
 		if (hint >= depth - 1)
@@ -460,7 +460,7 @@ int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
 	if (nr == -1) {
 		/* If the map is full, a hint won't do us much good. */
 		this_cpu_write(*sbq->alloc_hint, 0);
-	} else if (nr == hint || unlikely(sbq->round_robin)) {
+	} else if (nr == hint || unlikely(sbq->sb.round_robin)) {
 		/* Only update the hint if we used it. */
 		hint = nr + 1;
 		if (hint >= depth - 1)
@@ -576,7 +576,7 @@ void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
 	smp_mb__after_atomic();
 	sbitmap_queue_wake_up(sbq);
 
-	if (likely(!sbq->round_robin && nr < sbq->sb.depth))
+	if (likely(!sbq->sb.round_robin && nr < sbq->sb.depth))
 		*per_cpu_ptr(sbq->alloc_hint, cpu) = nr;
 }
 EXPORT_SYMBOL_GPL(sbitmap_queue_clear);
@@ -633,7 +633,7 @@ void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m)
 	}
 	seq_puts(m, "}\n");
 
-	seq_printf(m, "round_robin=%d\n", sbq->round_robin);
+	seq_printf(m, "round_robin=%d\n", sbq->sb.round_robin);
 	seq_printf(m, "min_shallow_depth=%u\n", sbq->min_shallow_depth);
 }
 EXPORT_SYMBOL_GPL(sbitmap_queue_show);
-- 
cgit v1.2.3


From c548e62bcf6adc7066ff201e9ecc88e536dd8890 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 22 Jan 2021 10:33:08 +0800
Subject: scsi: sbitmap: Move allocation hint into sbitmap

The allocation hint really belongs to sbitmap. Also, when an sbitmap's
depth is high and there is no need to use multiple wakeup queues, users
can still benefit from a percpu allocation hint.

Move allocation hint into sbitmap, then SCSI device queue can benefit from
allocation hint when converting to plain sbitmap.

Convert vhost/scsi.c to use sbitmap allocation with percpu alloc hint. This
is more efficient than the previous approach.

Link: https://lore.kernel.org/r/20210122023317.687987-5-ming.lei@redhat.com
Cc: Omar Sandoval <osandov@fb.com>
Cc: Kashyap Desai <kashyap.desai@broadcom.com>
Cc: Sumanesh Samanta <sumanesh.samanta@broadcom.com>
Cc: Ewan D. Milne <emilne@redhat.com>
Cc: Mike Christie <michael.christie@oracle.com>
Cc: virtualization@lists.linux-foundation.org
Tested-by: Sumanesh Samanta <sumanesh.samanta@broadcom.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 block/blk-mq.c          |   2 +-
 block/kyber-iosched.c   |   2 +-
 drivers/vhost/scsi.c    |   4 +-
 include/linux/sbitmap.h |  41 +++++++++++-------
 lib/sbitmap.c           | 112 ++++++++++++++++++++++++++++--------------------
 5 files changed, 96 insertions(+), 65 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index f133bbfa6cd4..2e8c94e00f58 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2702,7 +2702,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
 		goto free_cpumask;
 
 	if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8),
-				gfp, node, false))
+				gfp, node, false, false))
 		goto free_ctxs;
 	hctx->nr_ctx = 0;
 
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index d39ac13f0105..8969e122f081 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -479,7 +479,7 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
 	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
 		if (sbitmap_init_node(&khd->kcq_map[i], hctx->nr_ctx,
 				      ilog2(8), GFP_KERNEL, hctx->numa_node,
-				      false)) {
+				      false, false)) {
 			while (--i >= 0)
 				sbitmap_free(&khd->kcq_map[i]);
 			goto err_kcqs;
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 3412a3ad28e6..936584250a0b 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -614,7 +614,7 @@ vhost_scsi_get_cmd(struct vhost_virtqueue *vq, struct vhost_scsi_tpg *tpg,
 		return ERR_PTR(-EIO);
 	}
 
-	tag = sbitmap_get(&svq->scsi_tags, 0);
+	tag = sbitmap_get(&svq->scsi_tags);
 	if (tag < 0) {
 		pr_err("Unable to obtain tag for vhost_scsi_cmd\n");
 		return ERR_PTR(-ENOMEM);
@@ -1512,7 +1512,7 @@ static int vhost_scsi_setup_vq_cmds(struct vhost_virtqueue *vq, int max_cmds)
 		return 0;
 
 	if (sbitmap_init_node(&svq->scsi_tags, max_cmds, -1, GFP_KERNEL,
-			      NUMA_NO_NODE, false))
+			      NUMA_NO_NODE, false, true))
 		return -ENOMEM;
 	svq->max_cmds = max_cmds;
 
diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index 734ee6214cd6..247776fcc02c 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -65,6 +65,14 @@ struct sbitmap {
 	 * @map: Allocated bitmap.
 	 */
 	struct sbitmap_word *map;
+
+	/*
+	 * @alloc_hint: Cache of last successfully allocated or freed bit.
+	 *
+	 * This is per-cpu, which allows multiple users to stick to different
+	 * cachelines until the map is exhausted.
+	 */
+	unsigned int __percpu *alloc_hint;
 };
 
 #define SBQ_WAIT_QUEUES 8
@@ -100,14 +108,6 @@ struct sbitmap_queue {
 	 */
 	struct sbitmap sb;
 
-	/*
-	 * @alloc_hint: Cache of last successfully allocated or freed bit.
-	 *
-	 * This is per-cpu, which allows multiple users to stick to different
-	 * cachelines until the map is exhausted.
-	 */
-	unsigned int __percpu *alloc_hint;
-
 	/**
 	 * @wake_batch: Number of bits which must be freed before we wake up any
 	 * waiters.
@@ -147,11 +147,13 @@ struct sbitmap_queue {
  * @round_robin: If true, be stricter about allocation order; always allocate
  *               starting from the last allocated bit. This is less efficient
  *               than the default behavior (false).
+ * @alloc_hint: If true, apply percpu hint for where to start searching for
+ *              a free bit.
  *
  * Return: Zero on success or negative errno on failure.
  */
 int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
-		      gfp_t flags, int node, bool round_robin);
+		      gfp_t flags, int node, bool round_robin, bool alloc_hint);
 
 /**
  * sbitmap_free() - Free memory used by a &struct sbitmap.
@@ -159,6 +161,7 @@ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
  */
 static inline void sbitmap_free(struct sbitmap *sb)
 {
+	free_percpu(sb->alloc_hint);
 	kfree(sb->map);
 	sb->map = NULL;
 }
@@ -176,19 +179,17 @@ void sbitmap_resize(struct sbitmap *sb, unsigned int depth);
 /**
  * sbitmap_get() - Try to allocate a free bit from a &struct sbitmap.
  * @sb: Bitmap to allocate from.
- * @alloc_hint: Hint for where to start searching for a free bit.
  *
  * This operation provides acquire barrier semantics if it succeeds.
  *
  * Return: Non-negative allocated bit number if successful, -1 otherwise.
  */
-int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint);
+int sbitmap_get(struct sbitmap *sb);
 
 /**
  * sbitmap_get_shallow() - Try to allocate a free bit from a &struct sbitmap,
  * limiting the depth used from each word.
  * @sb: Bitmap to allocate from.
- * @alloc_hint: Hint for where to start searching for a free bit.
  * @shallow_depth: The maximum number of bits to allocate from a single word.
  *
  * This rather specific operation allows for having multiple users with
@@ -200,8 +201,7 @@ int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint);
  *
  * Return: Non-negative allocated bit number if successful, -1 otherwise.
  */
-int sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint,
-			unsigned long shallow_depth);
+int sbitmap_get_shallow(struct sbitmap *sb, unsigned long shallow_depth);
 
 /**
  * sbitmap_any_bit_set() - Check for a set bit in a &struct sbitmap.
@@ -315,6 +315,18 @@ static inline void sbitmap_deferred_clear_bit(struct sbitmap *sb, unsigned int b
 	set_bit(SB_NR_TO_BIT(sb, bitnr), addr);
 }
 
+/*
+ * Counterpart of sbitmap_get(): defer-clears the bit and, when applicable,
+ * records it as this CPU's allocation hint.
+ */
+static inline void sbitmap_put(struct sbitmap *sb, unsigned int bitnr)
+{
+	sbitmap_deferred_clear_bit(sb, bitnr);
+
+	if (likely(sb->alloc_hint && !sb->round_robin && bitnr < sb->depth))
+		*this_cpu_ptr(sb->alloc_hint) = bitnr;
+}
+
 static inline int sbitmap_test_bit(struct sbitmap *sb, unsigned int bitnr)
 {
 	return test_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr));
@@ -363,7 +375,6 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
 static inline void sbitmap_queue_free(struct sbitmap_queue *sbq)
 {
 	kfree(sbq->ws);
-	free_percpu(sbq->alloc_hint);
 	sbitmap_free(&sbq->sb);
 }
 
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 2b43a6aefec3..e395435654aa 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -9,52 +9,51 @@
 #include <linux/sbitmap.h>
 #include <linux/seq_file.h>
 
-static int init_alloc_hint(struct sbitmap_queue *sbq, gfp_t flags)
+static int init_alloc_hint(struct sbitmap *sb, gfp_t flags)
 {
-	unsigned depth = sbq->sb.depth;
+	unsigned depth = sb->depth;
 
-	sbq->alloc_hint = alloc_percpu_gfp(unsigned int, flags);
-	if (!sbq->alloc_hint)
+	sb->alloc_hint = alloc_percpu_gfp(unsigned int, flags);
+	if (!sb->alloc_hint)
 		return -ENOMEM;
 
-	if (depth && !sbq->sb.round_robin) {
+	if (depth && !sb->round_robin) {
 		int i;
 
 		for_each_possible_cpu(i)
-			*per_cpu_ptr(sbq->alloc_hint, i) = prandom_u32() % depth;
+			*per_cpu_ptr(sb->alloc_hint, i) = prandom_u32() % depth;
 	}
-
 	return 0;
 }
 
-static inline unsigned update_alloc_hint_before_get(struct sbitmap_queue *sbq,
+static inline unsigned update_alloc_hint_before_get(struct sbitmap *sb,
 						    unsigned int depth)
 {
 	unsigned hint;
 
-	hint = this_cpu_read(*sbq->alloc_hint);
+	hint = this_cpu_read(*sb->alloc_hint);
 	if (unlikely(hint >= depth)) {
 		hint = depth ? prandom_u32() % depth : 0;
-		this_cpu_write(*sbq->alloc_hint, hint);
+		this_cpu_write(*sb->alloc_hint, hint);
 	}
 
 	return hint;
 }
 
-static inline void update_alloc_hint_after_get(struct sbitmap_queue *sbq,
+static inline void update_alloc_hint_after_get(struct sbitmap *sb,
 					       unsigned int depth,
 					       unsigned int hint,
 					       unsigned int nr)
 {
 	if (nr == -1) {
 		/* If the map is full, a hint won't do us much good. */
-		this_cpu_write(*sbq->alloc_hint, 0);
-	} else if (nr == hint || unlikely(sbq->sb.round_robin)) {
+		this_cpu_write(*sb->alloc_hint, 0);
+	} else if (nr == hint || unlikely(sb->round_robin)) {
 		/* Only update the hint if we used it. */
 		hint = nr + 1;
 		if (hint >= depth - 1)
 			hint = 0;
-		this_cpu_write(*sbq->alloc_hint, hint);
+		this_cpu_write(*sb->alloc_hint, hint);
 	}
 }
 
@@ -82,7 +81,8 @@ static inline bool sbitmap_deferred_clear(struct sbitmap_word *map)
 }
 
 int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
-		      gfp_t flags, int node, bool round_robin)
+		      gfp_t flags, int node, bool round_robin,
+		      bool alloc_hint)
 {
 	unsigned int bits_per_word;
 	unsigned int i;
@@ -114,9 +114,18 @@ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
 		return 0;
 	}
 
+	if (alloc_hint) {
+		if (init_alloc_hint(sb, flags))
+			return -ENOMEM;
+	} else {
+		sb->alloc_hint = NULL;
+	}
+
 	sb->map = kcalloc_node(sb->map_nr, sizeof(*sb->map), flags, node);
-	if (!sb->map)
+	if (!sb->map) {
+		free_percpu(sb->alloc_hint);
 		return -ENOMEM;
+	}
 
 	for (i = 0; i < sb->map_nr; i++) {
 		sb->map[i].depth = min(depth, bits_per_word);
@@ -196,7 +205,7 @@ static int sbitmap_find_bit_in_index(struct sbitmap *sb, int index,
 	return nr;
 }
 
-int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint)
+static int __sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint)
 {
 	unsigned int i, index;
 	int nr = -1;
@@ -228,10 +237,27 @@ int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint)
 
 	return nr;
 }
+
+int sbitmap_get(struct sbitmap *sb)
+{
+	int nr;
+	unsigned int hint, depth;
+
+	if (WARN_ON_ONCE(unlikely(!sb->alloc_hint)))
+		return -1;
+
+	depth = READ_ONCE(sb->depth);
+	hint = update_alloc_hint_before_get(sb, depth);
+	nr = __sbitmap_get(sb, hint);
+	update_alloc_hint_after_get(sb, depth, hint, nr);
+
+	return nr;
+}
 EXPORT_SYMBOL_GPL(sbitmap_get);
 
-int sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint,
-			unsigned long shallow_depth)
+static int __sbitmap_get_shallow(struct sbitmap *sb,
+				 unsigned int alloc_hint,
+				 unsigned long shallow_depth)
 {
 	unsigned int i, index;
 	int nr = -1;
@@ -263,6 +289,22 @@ again:
 
 	return nr;
 }
+
+int sbitmap_get_shallow(struct sbitmap *sb, unsigned long shallow_depth)
+{
+	int nr;
+	unsigned int hint, depth;
+
+	if (WARN_ON_ONCE(unlikely(!sb->alloc_hint)))
+		return -1;
+
+	depth = READ_ONCE(sb->depth);
+	hint = update_alloc_hint_before_get(sb, depth);
+	nr = __sbitmap_get_shallow(sb, hint, shallow_depth);
+	update_alloc_hint_after_get(sb, depth, hint, nr);
+
+	return nr;
+}
 EXPORT_SYMBOL_GPL(sbitmap_get_shallow);
 
 bool sbitmap_any_bit_set(const struct sbitmap *sb)
@@ -400,15 +442,10 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
 	int i;
 
 	ret = sbitmap_init_node(&sbq->sb, depth, shift, flags, node,
-				round_robin);
+				round_robin, true);
 	if (ret)
 		return ret;
 
-	if (init_alloc_hint(sbq, flags) != 0) {
-		sbitmap_free(&sbq->sb);
-		return -ENOMEM;
-	}
-
 	sbq->min_shallow_depth = UINT_MAX;
 	sbq->wake_batch = sbq_calc_wake_batch(sbq, depth);
 	atomic_set(&sbq->wake_index, 0);
@@ -416,7 +453,6 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
 
 	sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node);
 	if (!sbq->ws) {
-		free_percpu(sbq->alloc_hint);
 		sbitmap_free(&sbq->sb);
 		return -ENOMEM;
 	}
@@ -458,32 +494,16 @@ EXPORT_SYMBOL_GPL(sbitmap_queue_resize);
 
 int __sbitmap_queue_get(struct sbitmap_queue *sbq)
 {
-	unsigned int hint, depth;
-	int nr;
-
-	depth = READ_ONCE(sbq->sb.depth);
-	hint = update_alloc_hint_before_get(sbq, depth);
-	nr = sbitmap_get(&sbq->sb, hint);
-	update_alloc_hint_after_get(sbq, depth, hint, nr);
-
-	return nr;
+	return sbitmap_get(&sbq->sb);
 }
 EXPORT_SYMBOL_GPL(__sbitmap_queue_get);
 
 int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
 				unsigned int shallow_depth)
 {
-	unsigned int hint, depth;
-	int nr;
-
 	WARN_ON_ONCE(shallow_depth < sbq->min_shallow_depth);
 
-	depth = READ_ONCE(sbq->sb.depth);
-	hint = update_alloc_hint_before_get(sbq, depth);
-	nr = sbitmap_get_shallow(&sbq->sb, hint, shallow_depth);
-	update_alloc_hint_after_get(sbq, depth, hint, nr);
-
-	return nr;
+	return sbitmap_get_shallow(&sbq->sb, shallow_depth);
 }
 EXPORT_SYMBOL_GPL(__sbitmap_queue_get_shallow);
 
@@ -592,7 +612,7 @@ void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
 	sbitmap_queue_wake_up(sbq);
 
 	if (likely(!sbq->sb.round_robin && nr < sbq->sb.depth))
-		*per_cpu_ptr(sbq->alloc_hint, cpu) = nr;
+		*per_cpu_ptr(sbq->sb.alloc_hint, cpu) = nr;
 }
 EXPORT_SYMBOL_GPL(sbitmap_queue_clear);
 
@@ -630,7 +650,7 @@ void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m)
 		if (!first)
 			seq_puts(m, ", ");
 		first = false;
-		seq_printf(m, "%u", *per_cpu_ptr(sbq->alloc_hint, i));
+		seq_printf(m, "%u", *per_cpu_ptr(sbq->sb.alloc_hint, i));
 	}
 	seq_puts(m, "}\n");
 
-- 
cgit v1.2.3


From cbb9950b41dd9dfb7c2be3429ba09f83b8b1ff98 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 22 Jan 2021 10:33:09 +0800
Subject: scsi: sbitmap: Export sbitmap_weight

SCSI's .device_busy will be converted to sbitmap and sbitmap_weight is
needed. Export the helper.

The only existing user of sbitmap_weight() uses it to find out how many
bits are set and not cleared. Align sbitmap_weight() meaning with this
usage model.

Link: https://lore.kernel.org/r/20210122023317.687987-6-ming.lei@redhat.com
Cc: Omar Sandoval <osandov@fb.com>
Cc: Kashyap Desai <kashyap.desai@broadcom.com>
Cc: Sumanesh Samanta <sumanesh.samanta@broadcom.com>
Cc: Ewan D. Milne <emilne@redhat.com>
Tested-by: Sumanesh Samanta <sumanesh.samanta@broadcom.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 include/linux/sbitmap.h | 10 ++++++++++
 lib/sbitmap.c           | 11 ++++++-----
 2 files changed, 16 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index 247776fcc02c..c65ba887dcc3 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -341,6 +341,16 @@ static inline int sbitmap_test_bit(struct sbitmap *sb, unsigned int bitnr)
  */
 void sbitmap_show(struct sbitmap *sb, struct seq_file *m);
 
+
+/**
+ * sbitmap_weight() - Return the number of set and not cleared bits in a
+ * &struct sbitmap.
+ * @sb: Bitmap to check.
+ *
+ * Return: Number of bits that are set and not cleared.
+ */
+unsigned int sbitmap_weight(const struct sbitmap *sb);
+
 /**
  * sbitmap_bitmap_show() - Write a hex dump of a &struct sbitmap to a &struct
  * seq_file.
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index e395435654aa..73da26ad021e 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -334,20 +334,21 @@ static unsigned int __sbitmap_weight(const struct sbitmap *sb, bool set)
 	return weight;
 }
 
-static unsigned int sbitmap_weight(const struct sbitmap *sb)
+static unsigned int sbitmap_cleared(const struct sbitmap *sb)
 {
-	return __sbitmap_weight(sb, true);
+	return __sbitmap_weight(sb, false);
 }
 
-static unsigned int sbitmap_cleared(const struct sbitmap *sb)
+unsigned int sbitmap_weight(const struct sbitmap *sb)
 {
-	return __sbitmap_weight(sb, false);
+	return __sbitmap_weight(sb, true) - sbitmap_cleared(sb);
 }
+EXPORT_SYMBOL_GPL(sbitmap_weight);
 
 void sbitmap_show(struct sbitmap *sb, struct seq_file *m)
 {
 	seq_printf(m, "depth=%u\n", sb->depth);
-	seq_printf(m, "busy=%u\n", sbitmap_weight(sb) - sbitmap_cleared(sb));
+	seq_printf(m, "busy=%u\n", sbitmap_weight(sb));
 	seq_printf(m, "cleared=%u\n", sbitmap_cleared(sb));
 	seq_printf(m, "bits_per_word=%u\n", 1U << sb->shift);
 	seq_printf(m, "map_nr=%u\n", sb->map_nr);
-- 
cgit v1.2.3


From 2d13b1ea9f4affdaa7af0e0e4a1358d28f80c54f Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 22 Jan 2021 10:33:10 +0800
Subject: scsi: sbitmap: Add sbitmap_calculate_shift() helper

Move code for calculating default shift into a public helper which can be
used by SCSI.

Link: https://lore.kernel.org/r/20210122023317.687987-7-ming.lei@redhat.com
Cc: Omar Sandoval <osandov@fb.com>
Cc: Kashyap Desai <kashyap.desai@broadcom.com>
Cc: Sumanesh Samanta <sumanesh.samanta@broadcom.com>
Cc: Ewan D. Milne <emilne@redhat.com>
Tested-by: Sumanesh Samanta <sumanesh.samanta@broadcom.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 include/linux/sbitmap.h | 18 ++++++++++++++++++
 lib/sbitmap.c           | 16 +++-------------
 2 files changed, 21 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index c65ba887dcc3..3087e1f15fdd 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -332,6 +332,24 @@ static inline int sbitmap_test_bit(struct sbitmap *sb, unsigned int bitnr)
 	return test_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr));
 }
 
+static inline int sbitmap_calculate_shift(unsigned int depth)
+{
+	int	shift = ilog2(BITS_PER_LONG);
+
+	/*
+	 * If the bitmap is small, shrink the number of bits per word so
+	 * we spread over a few cachelines, at least. If less than 4
+	 * bits, just forget about it, it's not going to work optimally
+	 * anyway.
+	 */
+	if (depth >= 4) {
+		while ((4U << shift) > depth)
+			shift--;
+	}
+
+	return shift;
+}
+
 /**
  * sbitmap_show() - Dump &struct sbitmap information to a &struct seq_file.
  * @sb: Bitmap to show.
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 73da26ad021e..47b3691058eb 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -87,19 +87,9 @@ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
 	unsigned int bits_per_word;
 	unsigned int i;
 
-	if (shift < 0) {
-		shift = ilog2(BITS_PER_LONG);
-		/*
-		 * If the bitmap is small, shrink the number of bits per word so
-		 * we spread over a few cachelines, at least. If less than 4
-		 * bits, just forget about it, it's not going to work optimally
-		 * anyway.
-		 */
-		if (depth >= 4) {
-			while ((4U << shift) > depth)
-				shift--;
-		}
-	}
+	if (shift < 0)
+		shift = sbitmap_calculate_shift(depth);
+
 	bits_per_word = 1U << shift;
 	if (bits_per_word > BITS_PER_LONG)
 		return -EINVAL;
-- 
cgit v1.2.3


From d022d18c045fc2ccf92d0f14cf80f98eb0a8e119 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 22 Jan 2021 10:33:11 +0800
Subject: scsi: blk-mq: Add callbacks for storing & retrieving budget token

Since SCSI is the only driver which requires dispatch budget, move the
token from struct request to struct scsi_cmnd.

Link: https://lore.kernel.org/r/20210122023317.687987-8-ming.lei@redhat.com
Cc: Omar Sandoval <osandov@fb.com>
Cc: Kashyap Desai <kashyap.desai@broadcom.com>
Cc: Sumanesh Samanta <sumanesh.samanta@broadcom.com>
Cc: Ewan D. Milne <emilne@redhat.com>
Cc: Hannes Reinecke <hare@suse.de>
Tested-by: Sumanesh Samanta <sumanesh.samanta@broadcom.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/scsi_lib.c  | 18 ++++++++++++++++++
 include/linux/blk-mq.h   |  9 +++++++++
 include/scsi/scsi_cmnd.h |  2 ++
 3 files changed, 29 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 7d52a11e1b61..02da34ab335d 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1642,6 +1642,20 @@ static bool scsi_mq_get_budget(struct request_queue *q)
 	return false;
 }
 
+static void scsi_mq_set_rq_budget_token(struct request *req, int token)
+{
+	struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
+
+	cmd->budget_token = token;
+}
+
+static int scsi_mq_get_rq_budget_token(struct request *req)
+{
+	struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
+
+	return cmd->budget_token;
+}
+
 static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
 			 const struct blk_mq_queue_data *bd)
 {
@@ -1856,6 +1870,8 @@ static const struct blk_mq_ops scsi_mq_ops_no_commit = {
 	.cleanup_rq	= scsi_cleanup_rq,
 	.busy		= scsi_mq_lld_busy,
 	.map_queues	= scsi_map_queues,
+	.set_rq_budget_token = scsi_mq_set_rq_budget_token,
+	.get_rq_budget_token = scsi_mq_get_rq_budget_token,
 };
 
 
@@ -1884,6 +1900,8 @@ static const struct blk_mq_ops scsi_mq_ops = {
 	.cleanup_rq	= scsi_cleanup_rq,
 	.busy		= scsi_mq_lld_busy,
 	.map_queues	= scsi_map_queues,
+	.set_rq_budget_token = scsi_mq_set_rq_budget_token,
+	.get_rq_budget_token = scsi_mq_get_rq_budget_token,
 };
 
 struct request_queue *scsi_mq_alloc_queue(struct scsi_device *sdev)
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 2c473c9b8990..5fae401f083d 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -313,6 +313,15 @@ struct blk_mq_ops {
 	 */
 	void (*put_budget)(struct request_queue *);
 
+	/*
+	 * @set_rq_budget_toekn: store rq's budget token
+	 */
+	void (*set_rq_budget_token)(struct request *, int);
+	/*
+	 * @get_rq_budget_toekn: retrieve rq's budget token
+	 */
+	int (*get_rq_budget_token)(struct request *);
+
 	/**
 	 * @timeout: Called on request timeout.
 	 */
diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
index ace15b5dc956..0cf2b5ccc1f7 100644
--- a/include/scsi/scsi_cmnd.h
+++ b/include/scsi/scsi_cmnd.h
@@ -75,6 +75,8 @@ struct scsi_cmnd {
 
 	int eh_eflags;		/* Used by error handlr */
 
+	int budget_token;
+
 	/*
 	 * This is set to jiffies as it was when the command was first
 	 * allocated.  It is used to time how long the command has
-- 
cgit v1.2.3


From 2a5a24aa83382a88c43d18a901fab66e6ffe1199 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 22 Jan 2021 10:33:12 +0800
Subject: scsi: blk-mq: Return budget token from .get_budget callback

SCSI uses a global atomic variable to track queue depth for each
LUN/request queue.

This doesn't scale well when there are lots of CPU cores and the disk is
very fast. It has been observed that IOPS is affected a lot by tracking
queue depth via sdev->device_busy in the I/O path.

Return budget token from .get_budget callback. The budget token can be
passed to driver so that we can replace the atomic variable with
sbitmap_queue and alleviate the scaling problems that way.

Link: https://lore.kernel.org/r/20210122023317.687987-9-ming.lei@redhat.com
Cc: Omar Sandoval <osandov@fb.com>
Cc: Kashyap Desai <kashyap.desai@broadcom.com>
Cc: Sumanesh Samanta <sumanesh.samanta@broadcom.com>
Cc: Ewan D. Milne <emilne@redhat.com>
Tested-by: Sumanesh Samanta <sumanesh.samanta@broadcom.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 block/blk-mq-sched.c    | 17 +++++++++++++----
 block/blk-mq.c          | 36 +++++++++++++++++++++++++-----------
 block/blk-mq.h          | 25 +++++++++++++++++++++----
 drivers/scsi/scsi_lib.c | 16 +++++++++++-----
 include/linux/blk-mq.h  |  4 ++--
 5 files changed, 72 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index ddb65e9e6fd9..712b7f0afd63 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -131,6 +131,7 @@ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 
 	do {
 		struct request *rq;
+		int budget_token;
 
 		if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
 			break;
@@ -140,12 +141,13 @@ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 			break;
 		}
 
-		if (!blk_mq_get_dispatch_budget(q))
+		budget_token = blk_mq_get_dispatch_budget(q);
+		if (budget_token < 0)
 			break;
 
 		rq = e->type->ops.dispatch_request(hctx);
 		if (!rq) {
-			blk_mq_put_dispatch_budget(q);
+			blk_mq_put_dispatch_budget(q, budget_token);
 			/*
 			 * We're releasing without dispatching. Holding the
 			 * budget could have blocked any "hctx"s with the
@@ -157,6 +159,8 @@ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 			break;
 		}
 
+		blk_mq_set_rq_budget_token(rq, budget_token);
+
 		/*
 		 * Now this rq owns the budget which has to be released
 		 * if this rq won't be queued to driver via .queue_rq()
@@ -230,6 +234,8 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
 	struct request *rq;
 
 	do {
+		int budget_token;
+
 		if (!list_empty_careful(&hctx->dispatch)) {
 			ret = -EAGAIN;
 			break;
@@ -238,12 +244,13 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
 		if (!sbitmap_any_bit_set(&hctx->ctx_map))
 			break;
 
-		if (!blk_mq_get_dispatch_budget(q))
+		budget_token = blk_mq_get_dispatch_budget(q);
+		if (budget_token < 0)
 			break;
 
 		rq = blk_mq_dequeue_from_ctx(hctx, ctx);
 		if (!rq) {
-			blk_mq_put_dispatch_budget(q);
+			blk_mq_put_dispatch_budget(q, budget_token);
 			/*
 			 * We're releasing without dispatching. Holding the
 			 * budget could have blocked any "hctx"s with the
@@ -255,6 +262,8 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
 			break;
 		}
 
+		blk_mq_set_rq_budget_token(rq, budget_token);
+
 		/*
 		 * Now this rq owns the budget which has to be released
 		 * if this rq won't be queued to driver via .queue_rq()
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 2e8c94e00f58..2e825a7a3606 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1277,10 +1277,15 @@ static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq,
 						  bool need_budget)
 {
 	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
+	int budget_token = -1;
 
-	if (need_budget && !blk_mq_get_dispatch_budget(rq->q)) {
-		blk_mq_put_driver_tag(rq);
-		return PREP_DISPATCH_NO_BUDGET;
+	if (need_budget) {
+		budget_token = blk_mq_get_dispatch_budget(rq->q);
+		if (budget_token < 0) {
+			blk_mq_put_driver_tag(rq);
+			return PREP_DISPATCH_NO_BUDGET;
+		}
+		blk_mq_set_rq_budget_token(rq, budget_token);
 	}
 
 	if (!blk_mq_get_driver_tag(rq)) {
@@ -1297,7 +1302,7 @@ static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq,
 			 * together during handling partial dispatch
 			 */
 			if (need_budget)
-				blk_mq_put_dispatch_budget(rq->q);
+				blk_mq_put_dispatch_budget(rq->q, budget_token);
 			return PREP_DISPATCH_NO_TAG;
 		}
 	}
@@ -1307,12 +1312,16 @@ static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq,
 
 /* release all allocated budgets before calling to blk_mq_dispatch_rq_list */
 static void blk_mq_release_budgets(struct request_queue *q,
-		unsigned int nr_budgets)
+		struct list_head *list)
 {
-	int i;
+	struct request *rq;
 
-	for (i = 0; i < nr_budgets; i++)
-		blk_mq_put_dispatch_budget(q);
+	list_for_each_entry(rq, list, queuelist) {
+		int budget_token = blk_mq_get_rq_budget_token(rq);
+
+		if (budget_token >= 0)
+			blk_mq_put_dispatch_budget(q, budget_token);
+	}
 }
 
 /*
@@ -1410,7 +1419,8 @@ out:
 			(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED);
 		bool no_budget_avail = prep == PREP_DISPATCH_NO_BUDGET;
 
-		blk_mq_release_budgets(q, nr_budgets);
+		if (nr_budgets)
+			blk_mq_release_budgets(q, list);
 
 		spin_lock(&hctx->lock);
 		list_splice_tail_init(list, &hctx->dispatch);
@@ -2009,6 +2019,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 {
 	struct request_queue *q = rq->q;
 	bool run_queue = true;
+	int budget_token;
 
 	/*
 	 * RCU or SRCU read lock is needed before checking quiesced flag.
@@ -2026,11 +2037,14 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 	if (q->elevator && !bypass_insert)
 		goto insert;
 
-	if (!blk_mq_get_dispatch_budget(q))
+	budget_token = blk_mq_get_dispatch_budget(q);
+	if (budget_token < 0)
 		goto insert;
 
+	blk_mq_set_rq_budget_token(rq, budget_token);
+
 	if (!blk_mq_get_driver_tag(rq)) {
-		blk_mq_put_dispatch_budget(q);
+		blk_mq_put_dispatch_budget(q, budget_token);
 		goto insert;
 	}
 
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 3616453ca28c..9ce64bc4a6c8 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -187,17 +187,34 @@ unsigned int blk_mq_in_flight(struct request_queue *q,
 void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
 		unsigned int inflight[2]);
 
-static inline void blk_mq_put_dispatch_budget(struct request_queue *q)
+static inline void blk_mq_put_dispatch_budget(struct request_queue *q,
+					      int budget_token)
 {
 	if (q->mq_ops->put_budget)
-		q->mq_ops->put_budget(q);
+		q->mq_ops->put_budget(q, budget_token);
 }
 
-static inline bool blk_mq_get_dispatch_budget(struct request_queue *q)
+static inline int blk_mq_get_dispatch_budget(struct request_queue *q)
 {
 	if (q->mq_ops->get_budget)
 		return q->mq_ops->get_budget(q);
-	return true;
+	return 0;
+}
+
+static inline void blk_mq_set_rq_budget_token(struct request *rq, int token)
+{
+	if (token < 0)
+		return;
+
+	if (rq->q->mq_ops->set_rq_budget_token)
+		rq->q->mq_ops->set_rq_budget_token(rq, token);
+}
+
+static inline int blk_mq_get_rq_budget_token(struct request *rq)
+{
+	if (rq->q->mq_ops->get_rq_budget_token)
+		return rq->q->mq_ops->get_rq_budget_token(rq);
+	return -1;
 }
 
 static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx)
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 02da34ab335d..dae9f66caffd 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -329,6 +329,7 @@ void scsi_device_unbusy(struct scsi_device *sdev, struct scsi_cmnd *cmd)
 		atomic_dec(&starget->target_busy);
 
 	atomic_dec(&sdev->device_busy);
+	cmd->budget_token = -1;
 }
 
 static void scsi_kick_queue(struct request_queue *q)
@@ -1143,6 +1144,7 @@ void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd)
 	unsigned long jiffies_at_alloc;
 	int retries, to_clear;
 	bool in_flight;
+	int budget_token = cmd->budget_token;
 
 	if (!blk_rq_is_scsi(rq) && !(flags & SCMD_INITIALIZED)) {
 		flags |= SCMD_INITIALIZED;
@@ -1171,6 +1173,7 @@ void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd)
 	cmd->retries = retries;
 	if (in_flight)
 		__set_bit(SCMD_STATE_INFLIGHT, &cmd->state);
+	cmd->budget_token = budget_token;
 
 }
 
@@ -1605,19 +1608,19 @@ static void scsi_mq_done(struct scsi_cmnd *cmd)
 	blk_mq_complete_request(cmd->request);
 }
 
-static void scsi_mq_put_budget(struct request_queue *q)
+static void scsi_mq_put_budget(struct request_queue *q, int budget_token)
 {
 	struct scsi_device *sdev = q->queuedata;
 
 	atomic_dec(&sdev->device_busy);
 }
 
-static bool scsi_mq_get_budget(struct request_queue *q)
+static int scsi_mq_get_budget(struct request_queue *q)
 {
 	struct scsi_device *sdev = q->queuedata;
 
 	if (scsi_dev_queue_ready(q, sdev))
-		return true;
+		return 0;
 
 	atomic_inc(&sdev->restarts);
 
@@ -1639,7 +1642,7 @@ static bool scsi_mq_get_budget(struct request_queue *q)
 	if (unlikely(atomic_read(&sdev->device_busy) == 0 &&
 				!scsi_device_blocked(sdev)))
 		blk_mq_delay_run_hw_queues(sdev->request_queue, SCSI_QUEUE_DELAY);
-	return false;
+	return -1;
 }
 
 static void scsi_mq_set_rq_budget_token(struct request *req, int token)
@@ -1667,6 +1670,8 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
 	blk_status_t ret;
 	int reason;
 
+	WARN_ON_ONCE(cmd->budget_token < 0);
+
 	/*
 	 * If the device is not in running state we will reject some or all
 	 * commands.
@@ -1718,7 +1723,8 @@ out_dec_target_busy:
 	if (scsi_target(sdev)->can_queue > 0)
 		atomic_dec(&scsi_target(sdev)->target_busy);
 out_put_budget:
-	scsi_mq_put_budget(q);
+	scsi_mq_put_budget(q, cmd->budget_token);
+	cmd->budget_token = -1;
 	switch (ret) {
 	case BLK_STS_OK:
 		break;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 5fae401f083d..3bd3ee651143 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -306,12 +306,12 @@ struct blk_mq_ops {
 	 * reserved budget. Also we have to handle failure case
 	 * of .get_budget for avoiding I/O deadlock.
 	 */
-	bool (*get_budget)(struct request_queue *);
+	int (*get_budget)(struct request_queue *);
 
 	/**
 	 * @put_budget: Release the reserved budget.
 	 */
-	void (*put_budget)(struct request_queue *);
+	void (*put_budget)(struct request_queue *, int);
 
 	/*
 	 * @set_rq_budget_toekn: store rq's budget token
-- 
cgit v1.2.3


From 7c32e8f8bc33a5f4b113a630857e46634e3e143b Mon Sep 17 00:00:00 2001
From: Lorenz Bauer <lmb@cloudflare.com>
Date: Wed, 3 Mar 2021 10:18:13 +0000
Subject: bpf: Add PROG_TEST_RUN support for sk_lookup programs

Allow to pass sk_lookup programs to PROG_TEST_RUN. User space
provides the full bpf_sk_lookup struct as context. Since the
context includes a socket pointer that can't be exposed
to user space we define that PROG_TEST_RUN returns the cookie
of the selected socket or zero in place of the socket pointer.

We don't support testing programs that select a reuseport socket,
since this would mean running another (unrelated) BPF program
from the sk_lookup test handler.

Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210303101816.36774-3-lmb@cloudflare.com
---
 include/linux/bpf.h            |  10 ++++
 include/uapi/linux/bpf.h       |   5 +-
 net/bpf/test_run.c             | 105 +++++++++++++++++++++++++++++++++++++++++
 net/core/filter.c              |   1 +
 tools/include/uapi/linux/bpf.h |   5 +-
 5 files changed, 124 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4c730863fa77..c931bc97019d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1491,6 +1491,9 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
 int bpf_prog_test_run_raw_tp(struct bpf_prog *prog,
 			     const union bpf_attr *kattr,
 			     union bpf_attr __user *uattr);
+int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog,
+				const union bpf_attr *kattr,
+				union bpf_attr __user *uattr);
 bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 		    const struct bpf_prog *prog,
 		    struct bpf_insn_access_aux *info);
@@ -1692,6 +1695,13 @@ static inline int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
 	return -ENOTSUPP;
 }
 
+static inline int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog,
+					      const union bpf_attr *kattr,
+					      union bpf_attr __user *uattr)
+{
+	return -ENOTSUPP;
+}
+
 static inline void bpf_map_put(struct bpf_map *map)
 {
 }
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 63a56ed6a785..7f530e349aff 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5953,7 +5953,10 @@ struct bpf_pidns_info {
 
 /* User accessible data for SK_LOOKUP programs. Add new fields at the end. */
 struct bpf_sk_lookup {
-	__bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */
+	union {
+		__bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */
+		__u64 cookie; /* Non-zero if socket was selected in PROG_TEST_RUN */
+	};
 
 	__u32 family;		/* Protocol family (AF_INET, AF_INET6) */
 	__u32 protocol;		/* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index eb3c78cd4d7c..0abdd67f44b1 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -10,8 +10,10 @@
 #include <net/bpf_sk_storage.h>
 #include <net/sock.h>
 #include <net/tcp.h>
+#include <net/net_namespace.h>
 #include <linux/error-injection.h>
 #include <linux/smp.h>
+#include <linux/sock_diag.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/bpf_test_run.h>
@@ -781,3 +783,106 @@ out:
 	kfree(data);
 	return ret;
 }
+
+int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kattr,
+				union bpf_attr __user *uattr)
+{
+	struct bpf_test_timer t = { NO_PREEMPT };
+	struct bpf_prog_array *progs = NULL;
+	struct bpf_sk_lookup_kern ctx = {};
+	u32 repeat = kattr->test.repeat;
+	struct bpf_sk_lookup *user_ctx;
+	u32 retval, duration;
+	int ret = -EINVAL;
+
+	if (prog->type != BPF_PROG_TYPE_SK_LOOKUP)
+		return -EINVAL;
+
+	if (kattr->test.flags || kattr->test.cpu)
+		return -EINVAL;
+
+	if (kattr->test.data_in || kattr->test.data_size_in || kattr->test.data_out ||
+	    kattr->test.data_size_out)
+		return -EINVAL;
+
+	if (!repeat)
+		repeat = 1;
+
+	user_ctx = bpf_ctx_init(kattr, sizeof(*user_ctx));
+	if (IS_ERR(user_ctx))
+		return PTR_ERR(user_ctx);
+
+	if (!user_ctx)
+		return -EINVAL;
+
+	if (user_ctx->sk)
+		goto out;
+
+	if (!range_is_zero(user_ctx, offsetofend(typeof(*user_ctx), local_port), sizeof(*user_ctx)))
+		goto out;
+
+	if (user_ctx->local_port > U16_MAX || user_ctx->remote_port > U16_MAX) {
+		ret = -ERANGE;
+		goto out;
+	}
+
+	ctx.family = (u16)user_ctx->family;
+	ctx.protocol = (u16)user_ctx->protocol;
+	ctx.dport = (u16)user_ctx->local_port;
+	ctx.sport = (__force __be16)user_ctx->remote_port;
+
+	switch (ctx.family) {
+	case AF_INET:
+		ctx.v4.daddr = (__force __be32)user_ctx->local_ip4;
+		ctx.v4.saddr = (__force __be32)user_ctx->remote_ip4;
+		break;
+
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6:
+		ctx.v6.daddr = (struct in6_addr *)user_ctx->local_ip6;
+		ctx.v6.saddr = (struct in6_addr *)user_ctx->remote_ip6;
+		break;
+#endif
+
+	default:
+		ret = -EAFNOSUPPORT;
+		goto out;
+	}
+
+	progs = bpf_prog_array_alloc(1, GFP_KERNEL);
+	if (!progs) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	progs->items[0].prog = prog;
+
+	bpf_test_timer_enter(&t);
+	do {
+		ctx.selected_sk = NULL;
+		retval = BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, BPF_PROG_RUN);
+	} while (bpf_test_timer_continue(&t, repeat, &ret, &duration));
+	bpf_test_timer_leave(&t);
+
+	if (ret < 0)
+		goto out;
+
+	user_ctx->cookie = 0;
+	if (ctx.selected_sk) {
+		if (ctx.selected_sk->sk_reuseport && !ctx.no_reuseport) {
+			ret = -EOPNOTSUPP;
+			goto out;
+		}
+
+		user_ctx->cookie = sock_gen_cookie(ctx.selected_sk);
+	}
+
+	ret = bpf_test_finish(kattr, uattr, NULL, 0, retval, duration);
+	if (!ret)
+		ret = bpf_ctx_finish(kattr, uattr, user_ctx, sizeof(*user_ctx));
+
+out:
+	bpf_prog_array_free(progs);
+	kfree(user_ctx);
+	return ret;
+}
diff --git a/net/core/filter.c b/net/core/filter.c
index 13bcf248ee7b..a526db494c62 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -10457,6 +10457,7 @@ static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type,
 }
 
 const struct bpf_prog_ops sk_lookup_prog_ops = {
+	.test_run = bpf_prog_test_run_sk_lookup,
 };
 
 const struct bpf_verifier_ops sk_lookup_verifier_ops = {
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 63a56ed6a785..7f530e349aff 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5953,7 +5953,10 @@ struct bpf_pidns_info {
 
 /* User accessible data for SK_LOOKUP programs. Add new fields at the end. */
 struct bpf_sk_lookup {
-	__bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */
+	union {
+		__bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */
+		__u64 cookie; /* Non-zero if socket was selected in PROG_TEST_RUN */
+	};
 
 	__u32 family;		/* Protocol family (AF_INET, AF_INET6) */
 	__u32 protocol;		/* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */
-- 
cgit v1.2.3


From 183f47fcaa54a5ffe671d990186d330ac8c63b10 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 18 Feb 2021 18:31:24 +0100
Subject: kcov: Remove kcov include from sched.h and move it to its users.

The recent addition of in_serving_softirq() to kcov.h results in
compile failure on PREEMPT_RT because it requires
task_struct::softirq_disable_cnt. This is not available if kcov.h is
included from sched.h.

It is not needed to include kcov.h from sched.h. All but the net/ user
already include the kcov header file.

Move the include of the kcov.h header from sched.h to its users.
Additionally include sched.h from kcov.h to ensure that everything
task_struct related is available.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Johannes Berg <johannes@sipsolutions.net>
Acked-by: Andrey Konovalov <andreyknvl@google.com>
Link: https://lkml.kernel.org/r/20210218173124.iy5iyqv3a4oia4vv@linutronix.de
---
 drivers/usb/usbip/usbip_common.h | 1 +
 include/linux/kcov.h             | 1 +
 include/linux/sched.h            | 1 -
 net/core/skbuff.c                | 1 +
 net/mac80211/iface.c             | 1 +
 net/mac80211/rx.c                | 1 +
 6 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/usb/usbip/usbip_common.h b/drivers/usb/usbip/usbip_common.h
index d60ce17d3dd2..a7dd6c66aee5 100644
--- a/drivers/usb/usbip/usbip_common.h
+++ b/drivers/usb/usbip/usbip_common.h
@@ -18,6 +18,7 @@
 #include <linux/usb.h>
 #include <linux/wait.h>
 #include <linux/sched/task.h>
+#include <linux/kcov.h>
 #include <uapi/linux/usbip.h>
 
 #undef pr_fmt
diff --git a/include/linux/kcov.h b/include/linux/kcov.h
index 4e3037dc1204..55dc338f6bcd 100644
--- a/include/linux/kcov.h
+++ b/include/linux/kcov.h
@@ -2,6 +2,7 @@
 #ifndef _LINUX_KCOV_H
 #define _LINUX_KCOV_H
 
+#include <linux/sched.h>
 #include <uapi/linux/kcov.h>
 
 struct task_struct;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ef00bb22164c..cf245bc237e7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -14,7 +14,6 @@
 #include <linux/pid.h>
 #include <linux/sem.h>
 #include <linux/shm.h>
-#include <linux/kcov.h>
 #include <linux/mutex.h>
 #include <linux/plist.h>
 #include <linux/hrtimer.h>
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 545a472273a5..420f23c0f81f 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -60,6 +60,7 @@
 #include <linux/prefetch.h>
 #include <linux/if_vlan.h>
 #include <linux/mpls.h>
+#include <linux/kcov.h>
 
 #include <net/protocol.h>
 #include <net/dst.h>
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index b80c9b016b2b..c127debdc12e 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -15,6 +15,7 @@
 #include <linux/if_arp.h>
 #include <linux/netdevice.h>
 #include <linux/rtnetlink.h>
+#include <linux/kcov.h>
 #include <net/mac80211.h>
 #include <net/ieee80211_radiotap.h>
 #include "ieee80211_i.h"
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index c1343c028b76..62047e93e217 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -17,6 +17,7 @@
 #include <linux/etherdevice.h>
 #include <linux/rcupdate.h>
 #include <linux/export.h>
+#include <linux/kcov.h>
 #include <linux/bitops.h>
 #include <net/mac80211.h>
 #include <net/ieee80211_radiotap.h>
-- 
cgit v1.2.3


From e7fcd762282332f765af2035a9568fb126fa3c01 Mon Sep 17 00:00:00 2001
From: Chengming Zhou <zhouchengming@bytedance.com>
Date: Wed, 3 Mar 2021 11:46:56 +0800
Subject: psi: Add PSI_CPU_FULL state

The FULL state doesn't exist for the CPU resource at the system level,
but it does exist at the cgroup level: it means that all non-idle tasks
in a cgroup are delayed on the CPU resource, which is either used by
others outside of the cgroup or throttled by the cgroup cpu.max
configuration.

Co-developed-by: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Link: https://lkml.kernel.org/r/20210303034659.91735-2-zhouchengming@bytedance.com
---
 include/linux/psi_types.h |  3 ++-
 kernel/sched/psi.c        | 14 +++++++++++---
 2 files changed, 13 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index b95f3211566a..0a23300d49af 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -50,9 +50,10 @@ enum psi_states {
 	PSI_MEM_SOME,
 	PSI_MEM_FULL,
 	PSI_CPU_SOME,
+	PSI_CPU_FULL,
 	/* Only per-CPU, to weigh the CPU in the global average: */
 	PSI_NONIDLE,
-	NR_PSI_STATES = 6,
+	NR_PSI_STATES = 7,
 };
 
 enum psi_aggregators {
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 967732c0766c..2293c45d289d 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -34,7 +34,10 @@
  * delayed on that resource such that nobody is advancing and the CPU
  * goes idle. This leaves both workload and CPU unproductive.
  *
- * (Naturally, the FULL state doesn't exist for the CPU resource.)
+ * Naturally, the FULL state doesn't exist for the CPU resource at the
+ * system level, but exist at the cgroup level, means all non-idle tasks
+ * in a cgroup are delayed on the CPU resource which used by others outside
+ * of the cgroup or throttled by the cgroup cpu.max configuration.
  *
  *	SOME = nr_delayed_tasks != 0
  *	FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0
@@ -225,6 +228,8 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
 		return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING];
 	case PSI_CPU_SOME:
 		return tasks[NR_RUNNING] > tasks[NR_ONCPU];
+	case PSI_CPU_FULL:
+		return tasks[NR_RUNNING] && !tasks[NR_ONCPU];
 	case PSI_NONIDLE:
 		return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
 			tasks[NR_RUNNING];
@@ -678,8 +683,11 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
 		}
 	}
 
-	if (groupc->state_mask & (1 << PSI_CPU_SOME))
+	if (groupc->state_mask & (1 << PSI_CPU_SOME)) {
 		groupc->times[PSI_CPU_SOME] += delta;
+		if (groupc->state_mask & (1 << PSI_CPU_FULL))
+			groupc->times[PSI_CPU_FULL] += delta;
+	}
 
 	if (groupc->state_mask & (1 << PSI_NONIDLE))
 		groupc->times[PSI_NONIDLE] += delta;
@@ -1018,7 +1026,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
 		group->avg_next_update = update_averages(group, now);
 	mutex_unlock(&group->avgs_lock);
 
-	for (full = 0; full < 2 - (res == PSI_CPU); full++) {
+	for (full = 0; full < 2; full++) {
 		unsigned long avg[3];
 		u64 total;
 		int w;
-- 
cgit v1.2.3


From 7fae6c8171d20ac55402930ee8ae760cf85dff7b Mon Sep 17 00:00:00 2001
From: Chengming Zhou <zhouchengming@bytedance.com>
Date: Wed, 3 Mar 2021 11:46:57 +0800
Subject: psi: Use ONCPU state tracking machinery to detect reclaim

Move the reclaim detection from the timer tick to the task state
tracking machinery using the recently added ONCPU state. And we
also add task psi_flags changes checking in the psi_task_switch()
optimization to update the parents properly.

In terms of performance and cost, this ONCPU task state tracking
is not cheaper than previous timer tick in aggregate. But the code is
simpler and shorter this way, so it's a maintainability win. And
Johannes did some testing with perf bench, the performance and cost
changes would be acceptable for real workloads.

Thanks to Johannes Weiner for pointing out the psi_task_switch()
optimization things and the clearer changelog.

Co-developed-by: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Link: https://lkml.kernel.org/r/20210303034659.91735-3-zhouchengming@bytedance.com
---
 include/linux/psi.h  |  1 -
 kernel/sched/core.c  |  1 -
 kernel/sched/psi.c   | 65 +++++++++++++++++++---------------------------------
 kernel/sched/stats.h |  9 --------
 4 files changed, 24 insertions(+), 52 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/psi.h b/include/linux/psi.h
index 7361023f3fdd..65eb1476ac70 100644
--- a/include/linux/psi.h
+++ b/include/linux/psi.h
@@ -20,7 +20,6 @@ void psi_task_change(struct task_struct *task, int clear, int set);
 void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 		     bool sleep);
 
-void psi_memstall_tick(struct task_struct *task, int cpu);
 void psi_memstall_enter(unsigned long *flags);
 void psi_memstall_leave(unsigned long *flags);
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 361974efc243..d2629fd2af07 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4551,7 +4551,6 @@ void scheduler_tick(void)
 	update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
 	curr->sched_class->task_tick(rq, curr, 0);
 	calc_global_load_tick(rq);
-	psi_task_tick(rq);
 
 	rq_unlock(rq, &rf);
 
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 2293c45d289d..0fe6ff6a6a15 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -644,8 +644,7 @@ static void poll_timer_fn(struct timer_list *t)
 	wake_up_interruptible(&group->poll_wait);
 }
 
-static void record_times(struct psi_group_cpu *groupc, int cpu,
-			 bool memstall_tick)
+static void record_times(struct psi_group_cpu *groupc, int cpu)
 {
 	u32 delta;
 	u64 now;
@@ -664,23 +663,6 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
 		groupc->times[PSI_MEM_SOME] += delta;
 		if (groupc->state_mask & (1 << PSI_MEM_FULL))
 			groupc->times[PSI_MEM_FULL] += delta;
-		else if (memstall_tick) {
-			u32 sample;
-			/*
-			 * Since we care about lost potential, a
-			 * memstall is FULL when there are no other
-			 * working tasks, but also when the CPU is
-			 * actively reclaiming and nothing productive
-			 * could run even if it were runnable.
-			 *
-			 * When the timer tick sees a reclaiming CPU,
-			 * regardless of runnable tasks, sample a FULL
-			 * tick (or less if it hasn't been a full tick
-			 * since the last state change).
-			 */
-			sample = min(delta, (u32)jiffies_to_nsecs(1));
-			groupc->times[PSI_MEM_FULL] += sample;
-		}
 	}
 
 	if (groupc->state_mask & (1 << PSI_CPU_SOME)) {
@@ -714,7 +696,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
 	 */
 	write_seqcount_begin(&groupc->seq);
 
-	record_times(groupc, cpu, false);
+	record_times(groupc, cpu);
 
 	for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
 		if (!(m & (1 << t)))
@@ -738,6 +720,18 @@ static void psi_group_change(struct psi_group *group, int cpu,
 		if (test_state(groupc->tasks, s))
 			state_mask |= (1 << s);
 	}
+
+	/*
+	 * Since we care about lost potential, a memstall is FULL
+	 * when there are no other working tasks, but also when
+	 * the CPU is actively reclaiming and nothing productive
+	 * could run even if it were runnable. So when the current
+	 * task in a cgroup is in_memstall, the corresponding groupc
+	 * on that cpu is in PSI_MEM_FULL state.
+	 */
+	if (groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall)
+		state_mask |= (1 << PSI_MEM_FULL);
+
 	groupc->state_mask = state_mask;
 
 	write_seqcount_end(&groupc->seq);
@@ -823,17 +817,21 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 	void *iter;
 
 	if (next->pid) {
+		bool identical_state;
+
 		psi_flags_change(next, 0, TSK_ONCPU);
 		/*
-		 * When moving state between tasks, the group that
-		 * contains them both does not change: we can stop
-		 * updating the tree once we reach the first common
-		 * ancestor. Iterate @next's ancestors until we
-		 * encounter @prev's state.
+		 * When switching between tasks that have an identical
+		 * runtime state, the cgroup that contains both tasks
+		 * does not change: we can stop updating the tree once
+		 * we reach the first common ancestor. Iterate @next's
+		 * ancestors only until we encounter @prev's ONCPU.
 		 */
+		identical_state = prev->psi_flags == next->psi_flags;
 		iter = NULL;
 		while ((group = iterate_groups(next, &iter))) {
-			if (per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
+			if (identical_state &&
+			    per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
 				common = group;
 				break;
 			}
@@ -859,21 +857,6 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 	}
 }
 
-void psi_memstall_tick(struct task_struct *task, int cpu)
-{
-	struct psi_group *group;
-	void *iter = NULL;
-
-	while ((group = iterate_groups(task, &iter))) {
-		struct psi_group_cpu *groupc;
-
-		groupc = per_cpu_ptr(group->pcpu, cpu);
-		write_seqcount_begin(&groupc->seq);
-		record_times(groupc, cpu, true);
-		write_seqcount_end(&groupc->seq);
-	}
-}
-
 /**
  * psi_memstall_enter - mark the beginning of a memory stall section
  * @flags: flags to handle nested sections
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 33d0daf83842..9e4e67a94731 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -144,14 +144,6 @@ static inline void psi_sched_switch(struct task_struct *prev,
 	psi_task_switch(prev, next, sleep);
 }
 
-static inline void psi_task_tick(struct rq *rq)
-{
-	if (static_branch_likely(&psi_disabled))
-		return;
-
-	if (unlikely(rq->curr->in_memstall))
-		psi_memstall_tick(rq->curr, cpu_of(rq));
-}
 #else /* CONFIG_PSI */
 static inline void psi_enqueue(struct task_struct *p, bool wakeup) {}
 static inline void psi_dequeue(struct task_struct *p, bool sleep) {}
@@ -159,7 +151,6 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {}
 static inline void psi_sched_switch(struct task_struct *prev,
 				    struct task_struct *next,
 				    bool sleep) {}
-static inline void psi_task_tick(struct rq *rq) {}
 #endif /* CONFIG_PSI */
 
 #ifdef CONFIG_SCHED_INFO
-- 
cgit v1.2.3


From cbe16f35bee6880becca6f20d2ebf6b457148552 Mon Sep 17 00:00:00 2001
From: Barry Song <song.bao.hua@hisilicon.com>
Date: Wed, 3 Mar 2021 11:49:15 +1300
Subject: genirq: Add IRQF_NO_AUTOEN for request_irq/nmi()

Many drivers don't want interrupts enabled automatically via request_irq().
So they handle this in one of the following two ways:

(1)
  irq_set_status_flags(irq, IRQ_NOAUTOEN);
  request_irq(dev, irq...);

(2)
  request_irq(dev, irq...);
  disable_irq(irq);

The code in the second way is silly and unsafe. In the small time gap
between request_irq() and disable_irq(), interrupts can still come.

The code in the first way is safe though it's suboptimal.

Add a new IRQF_NO_AUTOEN flag which can be handed in by drivers to
request_irq() and request_nmi(). It prevents the automatic enabling of the
requested interrupt/nmi in the same safe way as #1 above. With that the
various usage sites of #1 and #2 above can be simplified and corrected.

Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: dmitry.torokhov@gmail.com
Link: https://lore.kernel.org/r/20210302224916.13980-2-song.bao.hua@hisilicon.com
---
 include/linux/interrupt.h |  4 ++++
 kernel/irq/manage.c       | 11 +++++++++--
 2 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 967e25767153..76f1161a441a 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -61,6 +61,9 @@
  *                interrupt handler after suspending interrupts. For system
  *                wakeup devices users need to implement wakeup detection in
  *                their interrupt handlers.
+ * IRQF_NO_AUTOEN - Don't enable IRQ or NMI automatically when users request it.
+ *                Users will enable it explicitly by enable_irq() or enable_nmi()
+ *                later.
  */
 #define IRQF_SHARED		0x00000080
 #define IRQF_PROBE_SHARED	0x00000100
@@ -74,6 +77,7 @@
 #define IRQF_NO_THREAD		0x00010000
 #define IRQF_EARLY_RESUME	0x00020000
 #define IRQF_COND_SUSPEND	0x00040000
+#define IRQF_NO_AUTOEN		0x00080000
 
 #define IRQF_TIMER		(__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD)
 
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index dec3f73e8db9..97c231a5644c 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1693,7 +1693,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 			irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
 		}
 
-		if (irq_settings_can_autoenable(desc)) {
+		if (!(new->flags & IRQF_NO_AUTOEN) &&
+		    irq_settings_can_autoenable(desc)) {
 			irq_startup(desc, IRQ_RESEND, IRQ_START_COND);
 		} else {
 			/*
@@ -2086,10 +2087,15 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
 	 * which interrupt is which (messes up the interrupt freeing
 	 * logic etc).
 	 *
+	 * Also shared interrupts do not go well with disabling auto enable.
+	 * The sharing interrupt might request it while it's still disabled
+	 * and then wait for interrupts forever.
+	 *
 	 * Also IRQF_COND_SUSPEND only makes sense for shared interrupts and
 	 * it cannot be set along with IRQF_NO_SUSPEND.
 	 */
 	if (((irqflags & IRQF_SHARED) && !dev_id) ||
+	    ((irqflags & IRQF_SHARED) && (irqflags & IRQF_NO_AUTOEN)) ||
 	    (!(irqflags & IRQF_SHARED) && (irqflags & IRQF_COND_SUSPEND)) ||
 	    ((irqflags & IRQF_NO_SUSPEND) && (irqflags & IRQF_COND_SUSPEND)))
 		return -EINVAL;
@@ -2245,7 +2251,8 @@ int request_nmi(unsigned int irq, irq_handler_t handler,
 
 	desc = irq_to_desc(irq);
 
-	if (!desc || irq_settings_can_autoenable(desc) ||
+	if (!desc || (irq_settings_can_autoenable(desc) &&
+	    !(irqflags & IRQF_NO_AUTOEN)) ||
 	    !irq_settings_can_request(desc) ||
 	    WARN_ON(irq_settings_is_per_cpu_devid(desc)) ||
 	    !irq_supports_nmi(desc))
-- 
cgit v1.2.3


From 3e31f94752e454bdd0ca4a1d046ee21f80c166c5 Mon Sep 17 00:00:00 2001
From: Shuah Khan <skhan@linuxfoundation.org>
Date: Fri, 26 Feb 2021 17:06:58 -0700
Subject: lockdep: Add lockdep_assert_not_held()

Some kernel functions must be called without holding a specific lock.
Add lockdep_assert_not_held() to be used in these functions to detect
incorrect calls while holding a lock.

lockdep_assert_not_held() provides the opposite functionality of
lockdep_assert_held() which is used to assert calls that require
holding a specific lock.

Incorporates suggestions from Peter Zijlstra to avoid misfires when
lockdep_off() is employed.

The need for lockdep_assert_not_held() came up in a discussion on an
ath10k patch. ath10k_drain_tx() and i915_vma_pin_ww() are examples
of functions that can use lockdep_assert_not_held().

Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/linux-wireless/871rdmu9z9.fsf@codeaurora.org/
---
 include/linux/lockdep.h  | 11 ++++++++---
 kernel/locking/lockdep.c |  6 +++++-
 2 files changed, 13 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 7b7ebf2e28ec..dbd9ea846b36 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -301,8 +301,12 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie);
 
 #define lockdep_depth(tsk)	(debug_locks ? (tsk)->lockdep_depth : 0)
 
-#define lockdep_assert_held(l)	do {				\
-		WARN_ON(debug_locks && !lockdep_is_held(l));	\
+#define lockdep_assert_held(l)	do {					\
+		WARN_ON(debug_locks && lockdep_is_held(l) == 0);	\
+	} while (0)
+
+#define lockdep_assert_not_held(l)	do {				\
+		WARN_ON(debug_locks && lockdep_is_held(l) == 1);	\
 	} while (0)
 
 #define lockdep_assert_held_write(l)	do {			\
@@ -393,7 +397,8 @@ extern int lockdep_is_held(const void *);
 #define lockdep_is_held_type(l, r)		(1)
 
 #define lockdep_assert_held(l)			do { (void)(l); } while (0)
-#define lockdep_assert_held_write(l)	do { (void)(l); } while (0)
+#define lockdep_assert_not_held(l)		do { (void)(l); } while (0)
+#define lockdep_assert_held_write(l)		do { (void)(l); } while (0)
 #define lockdep_assert_held_read(l)		do { (void)(l); } while (0)
 #define lockdep_assert_held_once(l)		do { (void)(l); } while (0)
 
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index c6d0c1dc6253..969736b33185 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -5539,8 +5539,12 @@ noinstr int lock_is_held_type(const struct lockdep_map *lock, int read)
 	unsigned long flags;
 	int ret = 0;
 
+	/*
+	 * Avoid false negative lockdep_assert_held() and
+	 * lockdep_assert_not_held().
+	 */
 	if (unlikely(!lockdep_enabled()))
-		return 1; /* avoid false negative lockdep_assert_held() */
+		return -1;
 
 	raw_local_irq_save(flags);
 	check_flags(flags);
-- 
cgit v1.2.3


From f8cfa46608f8aa5ca5421ce281ab314129c15411 Mon Sep 17 00:00:00 2001
From: Shuah Khan <skhan@linuxfoundation.org>
Date: Fri, 26 Feb 2021 17:06:59 -0700
Subject: lockdep: Add lockdep lock state defines

Add defines for the lock states returned by lock_is_held_type(), based on
Johannes Berg's suggestion, as they make the lock states easier to read
and maintain. These are defines rather than an enum, to avoid changes to
the lock_is_held_type() and lockdep_is_held() return types.

Update lock_is_held_type() and __lock_is_held() to use the new
defines.

Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/linux-wireless/871rdmu9z9.fsf@codeaurora.org/
---
 include/linux/lockdep.h  | 11 +++++++++--
 kernel/locking/lockdep.c | 11 ++++++-----
 2 files changed, 15 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index dbd9ea846b36..17805aac0e85 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -268,6 +268,11 @@ extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 
 extern void lock_release(struct lockdep_map *lock, unsigned long ip);
 
+/* lock_is_held_type() returns */
+#define LOCK_STATE_UNKNOWN	-1
+#define LOCK_STATE_NOT_HELD	0
+#define LOCK_STATE_HELD		1
+
 /*
  * Same "read" as for lock_acquire(), except -1 means any.
  */
@@ -302,11 +307,13 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie);
 #define lockdep_depth(tsk)	(debug_locks ? (tsk)->lockdep_depth : 0)
 
 #define lockdep_assert_held(l)	do {					\
-		WARN_ON(debug_locks && lockdep_is_held(l) == 0);	\
+		WARN_ON(debug_locks &&					\
+			lockdep_is_held(l) == LOCK_STATE_NOT_HELD);	\
 	} while (0)
 
 #define lockdep_assert_not_held(l)	do {				\
-		WARN_ON(debug_locks && lockdep_is_held(l) == 1);	\
+		WARN_ON(debug_locks &&					\
+			lockdep_is_held(l) == LOCK_STATE_HELD);		\
 	} while (0)
 
 #define lockdep_assert_held_write(l)	do {			\
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 969736b33185..c0b8926a67f0 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -54,6 +54,7 @@
 #include <linux/nmi.h>
 #include <linux/rcupdate.h>
 #include <linux/kprobes.h>
+#include <linux/lockdep.h>
 
 #include <asm/sections.h>
 
@@ -5252,13 +5253,13 @@ int __lock_is_held(const struct lockdep_map *lock, int read)
 
 		if (match_held_lock(hlock, lock)) {
 			if (read == -1 || hlock->read == read)
-				return 1;
+				return LOCK_STATE_HELD;
 
-			return 0;
+			return LOCK_STATE_NOT_HELD;
 		}
 	}
 
-	return 0;
+	return LOCK_STATE_NOT_HELD;
 }
 
 static struct pin_cookie __lock_pin_lock(struct lockdep_map *lock)
@@ -5537,14 +5538,14 @@ EXPORT_SYMBOL_GPL(lock_release);
 noinstr int lock_is_held_type(const struct lockdep_map *lock, int read)
 {
 	unsigned long flags;
-	int ret = 0;
+	int ret = LOCK_STATE_NOT_HELD;
 
 	/*
 	 * Avoid false negative lockdep_assert_held() and
 	 * lockdep_assert_not_held().
 	 */
 	if (unlikely(!lockdep_enabled()))
-		return -1;
+		return LOCK_STATE_UNKNOWN;
 
 	raw_local_irq_save(flags);
 	check_flags(flags);
-- 
cgit v1.2.3


From a5398bffc01fe044848c5024e5e867e407f239b8 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Mon, 30 Nov 2020 11:38:40 -0800
Subject: perf/core: Flush PMU internal buffers for per-CPU events

Sometimes the PMU internal buffers have to be flushed for per-CPU events
during a context switch, e.g., large PEBS. Otherwise, the perf tool may
report samples in locations that do not belong to the process where the
samples are processed in, because PEBS does not tag samples with PID/TID.

The current code only flushes the buffers for a per-task event. It doesn't
check for a per-CPU event.

Add a new event state flag, PERF_ATTACH_SCHED_CB, to indicate that the
PMU internal buffers have to be flushed for this event during a context
switch.

Add sched_cb_entry and perf_sched_cb_usages back to track the PMU/cpuctx
which is required to be flushed.

Only need to invoke the sched_task() for per-CPU events in this patch.
The per-task events have been handled in perf_event_context_sched_in/out
already.

Fixes: 9c964efa4330 ("perf/x86/intel: Drain the PEBS buffer during context switches")
Reported-by: Gabriel Marin <gmx@google.com>
Originally-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/20201130193842.10569-1-kan.liang@linux.intel.com
---
 include/linux/perf_event.h |  2 ++
 kernel/events/core.c       | 42 ++++++++++++++++++++++++++++++++++++++----
 2 files changed, 40 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index fab42cfbd350..3f7f89ea5e51 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -606,6 +606,7 @@ struct swevent_hlist {
 #define PERF_ATTACH_TASK	0x04
 #define PERF_ATTACH_TASK_DATA	0x08
 #define PERF_ATTACH_ITRACE	0x10
+#define PERF_ATTACH_SCHED_CB	0x20
 
 struct perf_cgroup;
 struct perf_buffer;
@@ -872,6 +873,7 @@ struct perf_cpu_context {
 	struct list_head		cgrp_cpuctx_entry;
 #endif
 
+	struct list_head		sched_cb_entry;
 	int				sched_cb_usage;
 
 	int				online;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0aeca5f3c0ac..03db40f6cba9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -386,6 +386,7 @@ static DEFINE_MUTEX(perf_sched_mutex);
 static atomic_t perf_sched_count;
 
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
+static DEFINE_PER_CPU(int, perf_sched_cb_usages);
 static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
 
 static atomic_t nr_mmap_events __read_mostly;
@@ -3461,11 +3462,16 @@ unlock:
 	}
 }
 
+static DEFINE_PER_CPU(struct list_head, sched_cb_list);
+
 void perf_sched_cb_dec(struct pmu *pmu)
 {
 	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 
-	--cpuctx->sched_cb_usage;
+	this_cpu_dec(perf_sched_cb_usages);
+
+	if (!--cpuctx->sched_cb_usage)
+		list_del(&cpuctx->sched_cb_entry);
 }
 
 
@@ -3473,7 +3479,10 @@ void perf_sched_cb_inc(struct pmu *pmu)
 {
 	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 
-	cpuctx->sched_cb_usage++;
+	if (!cpuctx->sched_cb_usage++)
+		list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+
+	this_cpu_inc(perf_sched_cb_usages);
 }
 
 /*
@@ -3502,6 +3511,24 @@ static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in
 	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 }
 
+static void perf_pmu_sched_task(struct task_struct *prev,
+				struct task_struct *next,
+				bool sched_in)
+{
+	struct perf_cpu_context *cpuctx;
+
+	if (prev == next)
+		return;
+
+	list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
+		/* will be handled in perf_event_context_sched_in/out */
+		if (cpuctx->task_ctx)
+			continue;
+
+		__perf_pmu_sched_task(cpuctx, sched_in);
+	}
+}
+
 static void perf_event_switch(struct task_struct *task,
 			      struct task_struct *next_prev, bool sched_in);
 
@@ -3524,6 +3551,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
 {
 	int ctxn;
 
+	if (__this_cpu_read(perf_sched_cb_usages))
+		perf_pmu_sched_task(task, next, false);
+
 	if (atomic_read(&nr_switch_events))
 		perf_event_switch(task, next, false);
 
@@ -3832,6 +3862,9 @@ void __perf_event_task_sched_in(struct task_struct *prev,
 
 	if (atomic_read(&nr_switch_events))
 		perf_event_switch(task, prev, true);
+
+	if (__this_cpu_read(perf_sched_cb_usages))
+		perf_pmu_sched_task(prev, task, true);
 }
 
 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -4656,7 +4689,7 @@ static void unaccount_event(struct perf_event *event)
 	if (event->parent)
 		return;
 
-	if (event->attach_state & PERF_ATTACH_TASK)
+	if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
 		dec = true;
 	if (event->attr.mmap || event->attr.mmap_data)
 		atomic_dec(&nr_mmap_events);
@@ -11175,7 +11208,7 @@ static void account_event(struct perf_event *event)
 	if (event->parent)
 		return;
 
-	if (event->attach_state & PERF_ATTACH_TASK)
+	if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
 		inc = true;
 	if (event->attr.mmap || event->attr.mmap_data)
 		atomic_inc(&nr_mmap_events);
@@ -12972,6 +13005,7 @@ static void __init perf_event_init_all_cpus(void)
 #ifdef CONFIG_CGROUP_PERF
 		INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
 #endif
+		INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
 	}
 }
 
-- 
cgit v1.2.3


From 291c4011dd7ac0cd0cebb727a75ee5a50d16dcf7 Mon Sep 17 00:00:00 2001
From: Nadav Amit <namit@vmware.com>
Date: Sat, 20 Feb 2021 15:17:10 -0800
Subject: cpumask: Mark functions as pure

cpumask_next_and() and cpumask_any_but() are pure, and marking them as
such seems to generate different and presumably better code for
native_flush_tlb_multi().

Signed-off-by: Nadav Amit <namit@vmware.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: https://lore.kernel.org/r/20210220231712.2475218-8-namit@vmware.com
---
 include/linux/cpumask.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 383684e30f12..c53364c4296d 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -235,7 +235,7 @@ static inline unsigned int cpumask_last(const struct cpumask *srcp)
 	return find_last_bit(cpumask_bits(srcp), nr_cpumask_bits);
 }
 
-unsigned int cpumask_next(int n, const struct cpumask *srcp);
+unsigned int __pure cpumask_next(int n, const struct cpumask *srcp);
 
 /**
  * cpumask_next_zero - get the next unset cpu in a cpumask
@@ -252,8 +252,8 @@ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
 	return find_next_zero_bit(cpumask_bits(srcp), nr_cpumask_bits, n+1);
 }
 
-int cpumask_next_and(int n, const struct cpumask *, const struct cpumask *);
-int cpumask_any_but(const struct cpumask *mask, unsigned int cpu);
+int __pure cpumask_next_and(int n, const struct cpumask *, const struct cpumask *);
+int __pure cpumask_any_but(const struct cpumask *mask, unsigned int cpu);
 unsigned int cpumask_local_spread(unsigned int i, int node);
 int cpumask_any_and_distribute(const struct cpumask *src1p,
 			       const struct cpumask *src2p);
-- 
cgit v1.2.3


From a5aa5ce300597224ec76dacc8e63ba3ad7a18bbd Mon Sep 17 00:00:00 2001
From: Nadav Amit <namit@vmware.com>
Date: Sat, 20 Feb 2021 15:17:12 -0800
Subject: smp: Inline on_each_cpu_cond() and on_each_cpu()

Simplify the code and avoid having an additional function on the stack
by inlining on_each_cpu_cond() and on_each_cpu().

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Nadav Amit <namit@vmware.com>
[ Minor edits. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20210220231712.2475218-10-namit@vmware.com
---
 include/linux/smp.h | 50 +++++++++++++++++++++++++++++++++--------------
 kernel/smp.c        | 56 -----------------------------------------------------
 kernel/up.c         | 38 +-----------------------------------
 3 files changed, 37 insertions(+), 107 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/smp.h b/include/linux/smp.h
index 70c6f6284dcf..84a0b4828f66 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -50,30 +50,52 @@ extern unsigned int total_cpus;
 int smp_call_function_single(int cpuid, smp_call_func_t func, void *info,
 			     int wait);
 
+void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
+			   void *info, bool wait, const struct cpumask *mask);
+
+int smp_call_function_single_async(int cpu, call_single_data_t *csd);
+
 /*
  * Call a function on all processors
  */
-void on_each_cpu(smp_call_func_t func, void *info, int wait);
+static inline void on_each_cpu(smp_call_func_t func, void *info, int wait)
+{
+	on_each_cpu_cond_mask(NULL, func, info, wait, cpu_online_mask);
+}
 
-/*
- * Call a function on processors specified by mask, which might include
- * the local one.
+/**
+ * on_each_cpu_mask(): Run a function on processors specified by
+ * cpumask, which may include the local processor.
+ * @mask: The set of cpus to run on (only runs on online subset).
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @wait: If true, wait (atomically) until function has completed
+ *        on other CPUs.
+ *
+ * If @wait is true, then returns once @func has returned.
+ *
+ * You must not call this function with disabled interrupts or from a
+ * hardware interrupt handler or from a bottom half handler.  The
+ * exception is that it may be used during early boot while
+ * early_boot_irqs_disabled is set.
  */
-void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func,
-		void *info, bool wait);
+static inline void on_each_cpu_mask(const struct cpumask *mask,
+				    smp_call_func_t func, void *info, bool wait)
+{
+	on_each_cpu_cond_mask(NULL, func, info, wait, mask);
+}
 
 /*
  * Call a function on each processor for which the supplied function
  * cond_func returns a positive value. This may include the local
- * processor.
+ * processor.  May be used during early boot while early_boot_irqs_disabled is
+ * set. Use local_irq_save/restore() instead of local_irq_disable/enable().
  */
-void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func,
-		      void *info, bool wait);
-
-void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
-			   void *info, bool wait, const struct cpumask *mask);
-
-int smp_call_function_single_async(int cpu, call_single_data_t *csd);
+static inline void on_each_cpu_cond(smp_cond_func_t cond_func,
+				    smp_call_func_t func, void *info, bool wait)
+{
+	on_each_cpu_cond_mask(cond_func, func, info, wait, cpu_online_mask);
+}
 
 #ifdef CONFIG_SMP
 
diff --git a/kernel/smp.c b/kernel/smp.c
index c8a5a1facc1a..b6375d775e93 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -847,55 +847,6 @@ void __init smp_init(void)
 	smp_cpus_done(setup_max_cpus);
 }
 
-/*
- * Call a function on all processors.  May be used during early boot while
- * early_boot_irqs_disabled is set.  Use local_irq_save/restore() instead
- * of local_irq_disable/enable().
- */
-void on_each_cpu(smp_call_func_t func, void *info, int wait)
-{
-	unsigned long flags;
-
-	preempt_disable();
-	smp_call_function(func, info, wait);
-	local_irq_save(flags);
-	func(info);
-	local_irq_restore(flags);
-	preempt_enable();
-}
-EXPORT_SYMBOL(on_each_cpu);
-
-/**
- * on_each_cpu_mask(): Run a function on processors specified by
- * cpumask, which may include the local processor.
- * @mask: The set of cpus to run on (only runs on online subset).
- * @func: The function to run. This must be fast and non-blocking.
- * @info: An arbitrary pointer to pass to the function.
- * @wait: If true, wait (atomically) until function has completed
- *        on other CPUs.
- *
- * If @wait is true, then returns once @func has returned.
- *
- * You must not call this function with disabled interrupts or from a
- * hardware interrupt handler or from a bottom half handler.  The
- * exception is that it may be used during early boot while
- * early_boot_irqs_disabled is set.
- */
-void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func,
-			void *info, bool wait)
-{
-	unsigned int scf_flags;
-
-	scf_flags = SCF_RUN_LOCAL;
-	if (wait)
-		scf_flags |= SCF_WAIT;
-
-	preempt_disable();
-	smp_call_function_many_cond(mask, func, info, scf_flags, NULL);
-	preempt_enable();
-}
-EXPORT_SYMBOL(on_each_cpu_mask);
-
 /*
  * on_each_cpu_cond(): Call a function on each processor for which
  * the supplied function cond_func returns true, optionally waiting
@@ -932,13 +883,6 @@ void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
 }
 EXPORT_SYMBOL(on_each_cpu_cond_mask);
 
-void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func,
-		      void *info, bool wait)
-{
-	on_each_cpu_cond_mask(cond_func, func, info, wait, cpu_online_mask);
-}
-EXPORT_SYMBOL(on_each_cpu_cond);
-
 static void do_nothing(void *unused)
 {
 }
diff --git a/kernel/up.c b/kernel/up.c
index c6f323dcd45b..bf20b4a9af60 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -36,35 +36,6 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd)
 }
 EXPORT_SYMBOL(smp_call_function_single_async);
 
-void on_each_cpu(smp_call_func_t func, void *info, int wait)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	func(info);
-	local_irq_restore(flags);
-}
-EXPORT_SYMBOL(on_each_cpu);
-
-/*
- * Note we still need to test the mask even for UP
- * because we actually can get an empty mask from
- * code that on SMP might call us without the local
- * CPU in the mask.
- */
-void on_each_cpu_mask(const struct cpumask *mask,
-		      smp_call_func_t func, void *info, bool wait)
-{
-	unsigned long flags;
-
-	if (cpumask_test_cpu(0, mask)) {
-		local_irq_save(flags);
-		func(info);
-		local_irq_restore(flags);
-	}
-}
-EXPORT_SYMBOL(on_each_cpu_mask);
-
 /*
  * Preemption is disabled here to make sure the cond_func is called under the
  * same condtions in UP and SMP.
@@ -75,7 +46,7 @@ void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
 	unsigned long flags;
 
 	preempt_disable();
-	if (cond_func(0, info)) {
+	if ((!cond_func || cond_func(0, info)) && cpumask_test_cpu(0, mask)) {
 		local_irq_save(flags);
 		func(info);
 		local_irq_restore(flags);
@@ -84,13 +55,6 @@ void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
 }
 EXPORT_SYMBOL(on_each_cpu_cond_mask);
 
-void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func,
-		      void *info, bool wait)
-{
-	on_each_cpu_cond_mask(cond_func, func, info, wait, NULL);
-}
-EXPORT_SYMBOL(on_each_cpu_cond);
-
 int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys)
 {
 	int ret;
-- 
cgit v1.2.3


From baf186c4d345f5a105e63df01100936ad622f369 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sat, 6 Mar 2021 11:02:15 +0000
Subject: io_uring: index io_uring->xa by ctx not file

We don't use task file notes anymore, so there is no need left to index
task->io_uring->xa by file; index it by ctx instead. It's better
design-wise, especially since we keep a dangling file, and so have to
take care not to dereference it.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c            | 24 +++++++++++-------------
 include/linux/io_uring.h |  2 +-
 2 files changed, 12 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index f448213267c8..01a7fa4a4889 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -809,7 +809,6 @@ struct io_kiocb {
 struct io_tctx_node {
 	struct list_head	ctx_node;
 	struct task_struct	*task;
-	struct file		*file;
 	struct io_ring_ctx	*ctx;
 };
 
@@ -8540,7 +8539,7 @@ static bool io_run_ctx_fallback(struct io_ring_ctx *ctx)
 struct io_tctx_exit {
 	struct callback_head		task_work;
 	struct completion		completion;
-	unsigned long			index;
+	struct io_ring_ctx		*ctx;
 };
 
 static void io_tctx_exit_cb(struct callback_head *cb)
@@ -8554,7 +8553,7 @@ static void io_tctx_exit_cb(struct callback_head *cb)
 	 * node. It'll be removed by the end of cancellation, just ignore it.
 	 */
 	if (!atomic_read(&tctx->in_idle))
-		io_uring_del_task_file(work->index);
+		io_uring_del_task_file((unsigned long)work->ctx);
 	complete(&work->completion);
 }
 
@@ -8579,7 +8578,7 @@ static void io_ring_exit_work(struct work_struct *work)
 	while (!list_empty(&ctx->tctx_list)) {
 		node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
 					ctx_node);
-		exit.index = (unsigned long)node->file;
+		exit.ctx = ctx;
 		init_completion(&exit.completion);
 		init_task_work(&exit.task_work, io_tctx_exit_cb);
 		ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
@@ -8798,7 +8797,7 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
 /*
  * Note that this task has used io_uring. We use it for cancelation purposes.
  */
-static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file)
+static int io_uring_add_task_file(struct io_ring_ctx *ctx)
 {
 	struct io_uring_task *tctx = current->io_uring;
 	struct io_tctx_node *node;
@@ -8810,18 +8809,17 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file)
 			return ret;
 		tctx = current->io_uring;
 	}
-	if (tctx->last != file) {
-		void *old = xa_load(&tctx->xa, (unsigned long)file);
+	if (tctx->last != ctx) {
+		void *old = xa_load(&tctx->xa, (unsigned long)ctx);
 
 		if (!old) {
 			node = kmalloc(sizeof(*node), GFP_KERNEL);
 			if (!node)
 				return -ENOMEM;
 			node->ctx = ctx;
-			node->file = file;
 			node->task = current;
 
-			ret = xa_err(xa_store(&tctx->xa, (unsigned long)file,
+			ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
 						node, GFP_KERNEL));
 			if (ret) {
 				kfree(node);
@@ -8832,7 +8830,7 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file)
 			list_add(&node->ctx_node, &ctx->tctx_list);
 			mutex_unlock(&ctx->uring_lock);
 		}
-		tctx->last = file;
+		tctx->last = ctx;
 	}
 
 	/*
@@ -8867,7 +8865,7 @@ static void io_uring_del_task_file(unsigned long index)
 	list_del(&node->ctx_node);
 	mutex_unlock(&node->ctx->uring_lock);
 
-	if (tctx->last == node->file)
+	if (tctx->last == node->ctx)
 		tctx->last = NULL;
 	kfree(node);
 }
@@ -9166,7 +9164,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 		}
 		submitted = to_submit;
 	} else if (to_submit) {
-		ret = io_uring_add_task_file(ctx, f.file);
+		ret = io_uring_add_task_file(ctx);
 		if (unlikely(ret))
 			goto out;
 		mutex_lock(&ctx->uring_lock);
@@ -9375,7 +9373,7 @@ static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
 	if (fd < 0)
 		return fd;
 
-	ret = io_uring_add_task_file(ctx, file);
+	ret = io_uring_add_task_file(ctx);
 	if (ret) {
 		put_unused_fd(fd);
 		return ret;
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 7cb7bd0e334c..9761a0ec9f95 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -18,7 +18,7 @@ struct io_uring_task {
 	/* submission side */
 	struct xarray		xa;
 	struct wait_queue_head	wait;
-	struct file		*last;
+	void			*last;
 	void			*io_wq;
 	struct percpu_counter	inflight;
 	atomic_t		in_idle;
-- 
cgit v1.2.3


From ae2177cf318d169e349319b24a26881ba0e5248f Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 4 Mar 2021 16:08:20 +0200
Subject: mtd: spi-nor: intel-spi: Move platform data header to x86 subfolder

In order to group x86 related platform data move intel-spi.h to x86 folder.

While at it, remove duplicate inclusion in C file.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
[ta: s/x85/x86]
Signed-off-by: Tudor Ambarus <tudor.ambarus@microchip.com>
Reviewed-by: Vignesh Raghavendra <vigneshr@ti.com>
Link: https://lore.kernel.org/r/20210304140820.56692-1-andriy.shevchenko@linux.intel.com
---
 drivers/mtd/spi-nor/controllers/intel-spi.c |  1 -
 drivers/mtd/spi-nor/controllers/intel-spi.h |  2 +-
 include/linux/mfd/lpc_ich.h                 |  2 +-
 include/linux/platform_data/intel-spi.h     | 29 -----------------------------
 include/linux/platform_data/x86/intel-spi.h | 29 +++++++++++++++++++++++++++++
 5 files changed, 31 insertions(+), 32 deletions(-)
 delete mode 100644 include/linux/platform_data/intel-spi.h
 create mode 100644 include/linux/platform_data/x86/intel-spi.h

(limited to 'include/linux')

diff --git a/drivers/mtd/spi-nor/controllers/intel-spi.c b/drivers/mtd/spi-nor/controllers/intel-spi.c
index b54a56a68100..a413892ff449 100644
--- a/drivers/mtd/spi-nor/controllers/intel-spi.c
+++ b/drivers/mtd/spi-nor/controllers/intel-spi.c
@@ -15,7 +15,6 @@
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/partitions.h>
 #include <linux/mtd/spi-nor.h>
-#include <linux/platform_data/intel-spi.h>
 
 #include "intel-spi.h"
 
diff --git a/drivers/mtd/spi-nor/controllers/intel-spi.h b/drivers/mtd/spi-nor/controllers/intel-spi.h
index e2f41b8827bf..f2871179fd34 100644
--- a/drivers/mtd/spi-nor/controllers/intel-spi.h
+++ b/drivers/mtd/spi-nor/controllers/intel-spi.h
@@ -9,7 +9,7 @@
 #ifndef INTEL_SPI_H
 #define INTEL_SPI_H
 
-#include <linux/platform_data/intel-spi.h>
+#include <linux/platform_data/x86/intel-spi.h>
 
 struct intel_spi;
 struct resource;
diff --git a/include/linux/mfd/lpc_ich.h b/include/linux/mfd/lpc_ich.h
index 6ddca2bbb3a8..39967a5eca6d 100644
--- a/include/linux/mfd/lpc_ich.h
+++ b/include/linux/mfd/lpc_ich.h
@@ -8,7 +8,7 @@
 #ifndef LPC_ICH_H
 #define LPC_ICH_H
 
-#include <linux/platform_data/intel-spi.h>
+#include <linux/platform_data/x86/intel-spi.h>
 
 /* GPIO resources */
 #define ICH_RES_GPIO	0
diff --git a/include/linux/platform_data/intel-spi.h b/include/linux/platform_data/intel-spi.h
deleted file mode 100644
index 7f53a5c6f35e..000000000000
--- a/include/linux/platform_data/intel-spi.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Intel PCH/PCU SPI flash driver.
- *
- * Copyright (C) 2016, Intel Corporation
- * Author: Mika Westerberg <mika.westerberg@linux.intel.com>
- */
-
-#ifndef INTEL_SPI_PDATA_H
-#define INTEL_SPI_PDATA_H
-
-enum intel_spi_type {
-	INTEL_SPI_BYT = 1,
-	INTEL_SPI_LPT,
-	INTEL_SPI_BXT,
-	INTEL_SPI_CNL,
-};
-
-/**
- * struct intel_spi_boardinfo - Board specific data for Intel SPI driver
- * @type: Type which this controller is compatible with
- * @writeable: The chip is writeable
- */
-struct intel_spi_boardinfo {
-	enum intel_spi_type type;
-	bool writeable;
-};
-
-#endif /* INTEL_SPI_PDATA_H */
diff --git a/include/linux/platform_data/x86/intel-spi.h b/include/linux/platform_data/x86/intel-spi.h
new file mode 100644
index 000000000000..7f53a5c6f35e
--- /dev/null
+++ b/include/linux/platform_data/x86/intel-spi.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Intel PCH/PCU SPI flash driver.
+ *
+ * Copyright (C) 2016, Intel Corporation
+ * Author: Mika Westerberg <mika.westerberg@linux.intel.com>
+ */
+
+#ifndef INTEL_SPI_PDATA_H
+#define INTEL_SPI_PDATA_H
+
+enum intel_spi_type {
+	INTEL_SPI_BYT = 1,
+	INTEL_SPI_LPT,
+	INTEL_SPI_BXT,
+	INTEL_SPI_CNL,
+};
+
+/**
+ * struct intel_spi_boardinfo - Board specific data for Intel SPI driver
+ * @type: Type which this controller is compatible with
+ * @writeable: The chip is writeable
+ */
+struct intel_spi_boardinfo {
+	enum intel_spi_type type;
+	bool writeable;
+};
+
+#endif /* INTEL_SPI_PDATA_H */
-- 
cgit v1.2.3


From f567d6ef8606fb427636e824c867229ecb5aefab Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maxtram95@gmail.com>
Date: Sun, 7 Feb 2021 16:47:40 +0200
Subject: HID: plantronics: Workaround for double volume key presses

Plantronics Blackwire 3220 Series (047f:c056) sends HID reports twice
for each volume key press. This patch adds a quirk to hid-plantronics
for this product ID, which will ignore the second volume key press if
it happens within 5 ms from the last one that was handled.

The patch was tested on the mentioned model only. It shouldn't affect
other models; however, this quirk might be needed for them too.
Auto-repeat (when a key is held pressed) is not affected, because the
rate is about 3 times per second, which is far less frequent than once
in 5 ms.

Fixes: 81bb773faed7 ("HID: plantronics: Update to map volume up/down controls")
Signed-off-by: Maxim Mikityanskiy <maxtram95@gmail.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/hid/hid-ids.h         |  1 +
 drivers/hid/hid-plantronics.c | 60 +++++++++++++++++++++++++++++++++++++++++--
 include/linux/hid.h           |  2 ++
 3 files changed, 61 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h
index e42aaae3138f..4c74d8706b51 100644
--- a/drivers/hid/hid-ids.h
+++ b/drivers/hid/hid-ids.h
@@ -944,6 +944,7 @@
 #define USB_DEVICE_ID_ORTEK_IHOME_IMAC_A210S	0x8003
 
 #define USB_VENDOR_ID_PLANTRONICS	0x047f
+#define USB_DEVICE_ID_PLANTRONICS_BLACKWIRE_3220_SERIES	0xc056
 
 #define USB_VENDOR_ID_PANASONIC		0x04da
 #define USB_DEVICE_ID_PANABOARD_UBT780	0x1044
diff --git a/drivers/hid/hid-plantronics.c b/drivers/hid/hid-plantronics.c
index 85b685efc12f..e81b7cec2d12 100644
--- a/drivers/hid/hid-plantronics.c
+++ b/drivers/hid/hid-plantronics.c
@@ -13,6 +13,7 @@
 
 #include <linux/hid.h>
 #include <linux/module.h>
+#include <linux/jiffies.h>
 
 #define PLT_HID_1_0_PAGE	0xffa00000
 #define PLT_HID_2_0_PAGE	0xffa20000
@@ -36,6 +37,16 @@
 #define PLT_ALLOW_CONSUMER (field->application == HID_CP_CONSUMERCONTROL && \
 			    (usage->hid & HID_USAGE_PAGE) == HID_UP_CONSUMER)
 
+#define PLT_QUIRK_DOUBLE_VOLUME_KEYS BIT(0)
+
+#define PLT_DOUBLE_KEY_TIMEOUT 5 /* ms */
+
+struct plt_drv_data {
+	unsigned long device_type;
+	unsigned long last_volume_key_ts;
+	u32 quirks;
+};
+
 static int plantronics_input_mapping(struct hid_device *hdev,
 				     struct hid_input *hi,
 				     struct hid_field *field,
@@ -43,7 +54,8 @@ static int plantronics_input_mapping(struct hid_device *hdev,
 				     unsigned long **bit, int *max)
 {
 	unsigned short mapped_key;
-	unsigned long plt_type = (unsigned long)hid_get_drvdata(hdev);
+	struct plt_drv_data *drv_data = hid_get_drvdata(hdev);
+	unsigned long plt_type = drv_data->device_type;
 
 	/* special case for PTT products */
 	if (field->application == HID_GD_JOYSTICK)
@@ -105,6 +117,30 @@ mapped:
 	return 1;
 }
 
+static int plantronics_event(struct hid_device *hdev, struct hid_field *field,
+			     struct hid_usage *usage, __s32 value)
+{
+	struct plt_drv_data *drv_data = hid_get_drvdata(hdev);
+
+	if (drv_data->quirks & PLT_QUIRK_DOUBLE_VOLUME_KEYS) {
+		unsigned long prev_ts, cur_ts;
+
+		/* Usages are filtered in plantronics_usages. */
+
+		if (!value) /* Handle key presses only. */
+			return 0;
+
+		prev_ts = drv_data->last_volume_key_ts;
+		cur_ts = jiffies;
+		if (jiffies_to_msecs(cur_ts - prev_ts) <= PLT_DOUBLE_KEY_TIMEOUT)
+			return 1; /* Ignore the repeated key. */
+
+		drv_data->last_volume_key_ts = cur_ts;
+	}
+
+	return 0;
+}
+
 static unsigned long plantronics_device_type(struct hid_device *hdev)
 {
 	unsigned i, col_page;
@@ -133,15 +169,24 @@ exit:
 static int plantronics_probe(struct hid_device *hdev,
 			     const struct hid_device_id *id)
 {
+	struct plt_drv_data *drv_data;
 	int ret;
 
+	drv_data = devm_kzalloc(&hdev->dev, sizeof(*drv_data), GFP_KERNEL);
+	if (!drv_data)
+		return -ENOMEM;
+
 	ret = hid_parse(hdev);
 	if (ret) {
 		hid_err(hdev, "parse failed\n");
 		goto err;
 	}
 
-	hid_set_drvdata(hdev, (void *)plantronics_device_type(hdev));
+	drv_data->device_type = plantronics_device_type(hdev);
+	drv_data->quirks = id->driver_data;
+	drv_data->last_volume_key_ts = jiffies - msecs_to_jiffies(PLT_DOUBLE_KEY_TIMEOUT);
+
+	hid_set_drvdata(hdev, drv_data);
 
 	ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT |
 		HID_CONNECT_HIDINPUT_FORCE | HID_CONNECT_HIDDEV_FORCE);
@@ -153,15 +198,26 @@ err:
 }
 
 static const struct hid_device_id plantronics_devices[] = {
+	{ HID_USB_DEVICE(USB_VENDOR_ID_PLANTRONICS,
+					 USB_DEVICE_ID_PLANTRONICS_BLACKWIRE_3220_SERIES),
+		.driver_data = PLT_QUIRK_DOUBLE_VOLUME_KEYS },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_PLANTRONICS, HID_ANY_ID) },
 	{ }
 };
 MODULE_DEVICE_TABLE(hid, plantronics_devices);
 
+static const struct hid_usage_id plantronics_usages[] = {
+	{ HID_CP_VOLUMEUP, EV_KEY, HID_ANY_ID },
+	{ HID_CP_VOLUMEDOWN, EV_KEY, HID_ANY_ID },
+	{ HID_TERMINATOR, HID_TERMINATOR, HID_TERMINATOR }
+};
+
 static struct hid_driver plantronics_driver = {
 	.name = "plantronics",
 	.id_table = plantronics_devices,
+	.usage_table = plantronics_usages,
 	.input_mapping = plantronics_input_mapping,
+	.event = plantronics_event,
 	.probe = plantronics_probe,
 };
 module_hid_driver(plantronics_driver);
diff --git a/include/linux/hid.h b/include/linux/hid.h
index ef702b3f56e3..3e33eb14118c 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -262,6 +262,8 @@ struct hid_item {
 #define HID_CP_SELECTION	0x000c0080
 #define HID_CP_MEDIASELECTION	0x000c0087
 #define HID_CP_SELECTDISC	0x000c00ba
+#define HID_CP_VOLUMEUP		0x000c00e9
+#define HID_CP_VOLUMEDOWN	0x000c00ea
 #define HID_CP_PLAYBACKSPEED	0x000c00f1
 #define HID_CP_PROXIMITY	0x000c0109
 #define HID_CP_SPEAKERSYSTEM	0x000c0160
-- 
cgit v1.2.3


From 03ee318391707e822eb915f4f30fe42c78b9d89b Mon Sep 17 00:00:00 2001
From: Maximilian Luz <luzmaximilian@gmail.com>
Date: Thu, 4 Mar 2021 20:05:24 +0100
Subject: platform/surface: aggregator: Make SSAM_DEFINE_SYNC_REQUEST_x define
 static functions

The SSAM_DEFINE_SYNC_REQUEST_x() macros are intended to reduce
boiler-plate code for SSAM request definitions by defining a wrapper
function for the specified request. The client device variants of those
macros, i.e. SSAM_DEFINE_SYNC_REQUEST_CL_x() in particular rely on the
multi-device (MD) variants, e.g.:

    #define SSAM_DEFINE_SYNC_REQUEST_CL_R(name, rtype, spec...)   \
        SSAM_DEFINE_SYNC_REQUEST_MD_R(__raw_##name, rtype, spec)  \
        int name(struct ssam_device *sdev, rtype *ret)            \
        {                                                         \
            return __raw_##name(sdev->ctrl, sdev->uid.target,     \
                                sdev->uid.instance, ret);         \
        }

This now creates the problem that it is not possible to declare the
generated functions static via

    static SSAM_DEFINE_SYNC_REQUEST_CL_R(...)

as this will only apply to the function defined by the multi-device
macro, i.e. SSAM_DEFINE_SYNC_REQUEST_MD_R(). Thus compiling with
`-Wmissing-prototypes' rightfully complains that there is a 'static'
keyword missing.

To solve this, make all SSAM_DEFINE_SYNC_REQUEST_x() macros define
static functions. Non-client-device macros are also changed for
consistency. In general, we expect those functions to be only used
locally in the respective drivers for the corresponding interfaces, so
having to define a wrapper function to be able to export this should be
the odd case out.

Reported-by: kernel test robot <lkp@intel.com>
Fixes: b78b4982d763 ("platform/surface: Add platform profile driver")
Signed-off-by: Maximilian Luz <luzmaximilian@gmail.com>
Link: https://lore.kernel.org/r/20210304190524.1172197-1-luzmaximilian@gmail.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 .../driver-api/surface_aggregator/client.rst       |  4 +-
 drivers/platform/surface/aggregator/controller.c   | 10 +--
 .../platform/surface/surface_aggregator_registry.c |  2 +-
 .../platform/surface/surface_platform_profile.c    |  4 +-
 include/linux/surface_aggregator/controller.h      | 74 +++++++++++-----------
 include/linux/surface_aggregator/device.h          | 31 ++++-----
 6 files changed, 63 insertions(+), 62 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/driver-api/surface_aggregator/client.rst b/Documentation/driver-api/surface_aggregator/client.rst
index 26d13085a117..e519d374c378 100644
--- a/Documentation/driver-api/surface_aggregator/client.rst
+++ b/Documentation/driver-api/surface_aggregator/client.rst
@@ -248,7 +248,7 @@ This example defines a function
 
 .. code-block:: c
 
-   int __ssam_tmp_perf_mode_set(struct ssam_controller *ctrl, const __le32 *arg);
+   static int __ssam_tmp_perf_mode_set(struct ssam_controller *ctrl, const __le32 *arg);
 
 executing the specified request, with the controller passed in when calling
 said function. In this example, the argument is provided via the ``arg``
@@ -296,7 +296,7 @@ This invocation of the macro defines a function
 
 .. code-block:: c
 
-   int ssam_bat_get_sta(struct ssam_device *sdev, __le32 *ret);
+   static int ssam_bat_get_sta(struct ssam_device *sdev, __le32 *ret);
 
 executing the specified request, using the device IDs and controller given
 in the client device. The full list of such macros for client devices is:
diff --git a/drivers/platform/surface/aggregator/controller.c b/drivers/platform/surface/aggregator/controller.c
index 5bcb59ed579d..aa6f37b4f46e 100644
--- a/drivers/platform/surface/aggregator/controller.c
+++ b/drivers/platform/surface/aggregator/controller.c
@@ -1750,35 +1750,35 @@ EXPORT_SYMBOL_GPL(ssam_request_sync_with_buffer);
 
 /* -- Internal SAM requests. ------------------------------------------------ */
 
-static SSAM_DEFINE_SYNC_REQUEST_R(ssam_ssh_get_firmware_version, __le32, {
+SSAM_DEFINE_SYNC_REQUEST_R(ssam_ssh_get_firmware_version, __le32, {
 	.target_category = SSAM_SSH_TC_SAM,
 	.target_id       = 0x01,
 	.command_id      = 0x13,
 	.instance_id     = 0x00,
 });
 
-static SSAM_DEFINE_SYNC_REQUEST_R(ssam_ssh_notif_display_off, u8, {
+SSAM_DEFINE_SYNC_REQUEST_R(ssam_ssh_notif_display_off, u8, {
 	.target_category = SSAM_SSH_TC_SAM,
 	.target_id       = 0x01,
 	.command_id      = 0x15,
 	.instance_id     = 0x00,
 });
 
-static SSAM_DEFINE_SYNC_REQUEST_R(ssam_ssh_notif_display_on, u8, {
+SSAM_DEFINE_SYNC_REQUEST_R(ssam_ssh_notif_display_on, u8, {
 	.target_category = SSAM_SSH_TC_SAM,
 	.target_id       = 0x01,
 	.command_id      = 0x16,
 	.instance_id     = 0x00,
 });
 
-static SSAM_DEFINE_SYNC_REQUEST_R(ssam_ssh_notif_d0_exit, u8, {
+SSAM_DEFINE_SYNC_REQUEST_R(ssam_ssh_notif_d0_exit, u8, {
 	.target_category = SSAM_SSH_TC_SAM,
 	.target_id       = 0x01,
 	.command_id      = 0x33,
 	.instance_id     = 0x00,
 });
 
-static SSAM_DEFINE_SYNC_REQUEST_R(ssam_ssh_notif_d0_entry, u8, {
+SSAM_DEFINE_SYNC_REQUEST_R(ssam_ssh_notif_d0_entry, u8, {
 	.target_category = SSAM_SSH_TC_SAM,
 	.target_id       = 0x01,
 	.command_id      = 0x34,
diff --git a/drivers/platform/surface/surface_aggregator_registry.c b/drivers/platform/surface/surface_aggregator_registry.c
index caee90d135c5..cdb4a95af3e8 100644
--- a/drivers/platform/surface/surface_aggregator_registry.c
+++ b/drivers/platform/surface/surface_aggregator_registry.c
@@ -302,7 +302,7 @@ struct ssam_base_hub {
 	struct ssam_event_notifier notif;
 };
 
-static SSAM_DEFINE_SYNC_REQUEST_R(ssam_bas_query_opmode, u8, {
+SSAM_DEFINE_SYNC_REQUEST_R(ssam_bas_query_opmode, u8, {
 	.target_category = SSAM_SSH_TC_BAS,
 	.target_id       = 0x01,
 	.command_id      = 0x0d,
diff --git a/drivers/platform/surface/surface_platform_profile.c b/drivers/platform/surface/surface_platform_profile.c
index 0081b01a5b0f..6373d3b5eb7f 100644
--- a/drivers/platform/surface/surface_platform_profile.c
+++ b/drivers/platform/surface/surface_platform_profile.c
@@ -32,12 +32,12 @@ struct ssam_tmp_profile_device {
 	struct platform_profile_handler handler;
 };
 
-static SSAM_DEFINE_SYNC_REQUEST_CL_R(__ssam_tmp_profile_get, struct ssam_tmp_profile_info, {
+SSAM_DEFINE_SYNC_REQUEST_CL_R(__ssam_tmp_profile_get, struct ssam_tmp_profile_info, {
 	.target_category = SSAM_SSH_TC_TMP,
 	.command_id      = 0x02,
 });
 
-static SSAM_DEFINE_SYNC_REQUEST_CL_W(__ssam_tmp_profile_set, __le32, {
+SSAM_DEFINE_SYNC_REQUEST_CL_W(__ssam_tmp_profile_set, __le32, {
 	.target_category = SSAM_SSH_TC_TMP,
 	.command_id      = 0x03,
 });
diff --git a/include/linux/surface_aggregator/controller.h b/include/linux/surface_aggregator/controller.h
index f4b1ba887384..0806796eabcb 100644
--- a/include/linux/surface_aggregator/controller.h
+++ b/include/linux/surface_aggregator/controller.h
@@ -344,16 +344,16 @@ struct ssam_request_spec_md {
  * request has been fully completed. The required transport buffer will be
  * allocated on the stack.
  *
- * The generated function is defined as ``int name(struct ssam_controller
- * *ctrl)``, returning the status of the request, which is zero on success and
- * negative on failure. The ``ctrl`` parameter is the controller via which the
- * request is being sent.
+ * The generated function is defined as ``static int name(struct
+ * ssam_controller *ctrl)``, returning the status of the request, which is
+ * zero on success and negative on failure. The ``ctrl`` parameter is the
+ * controller via which the request is being sent.
  *
  * Refer to ssam_request_sync_onstack() for more details on the behavior of
  * the generated function.
  */
 #define SSAM_DEFINE_SYNC_REQUEST_N(name, spec...)				\
-	int name(struct ssam_controller *ctrl)					\
+	static int name(struct ssam_controller *ctrl)				\
 	{									\
 		struct ssam_request_spec s = (struct ssam_request_spec)spec;	\
 		struct ssam_request rqst;					\
@@ -383,17 +383,17 @@ struct ssam_request_spec_md {
  * returning once the request has been fully completed. The required transport
  * buffer will be allocated on the stack.
  *
- * The generated function is defined as ``int name(struct ssam_controller
- * *ctrl, const atype *arg)``, returning the status of the request, which is
- * zero on success and negative on failure. The ``ctrl`` parameter is the
- * controller via which the request is sent. The request argument is specified
- * via the ``arg`` pointer.
+ * The generated function is defined as ``static int name(struct
+ * ssam_controller *ctrl, const atype *arg)``, returning the status of the
+ * request, which is zero on success and negative on failure. The ``ctrl``
+ * parameter is the controller via which the request is sent. The request
+ * argument is specified via the ``arg`` pointer.
  *
  * Refer to ssam_request_sync_onstack() for more details on the behavior of
  * the generated function.
  */
 #define SSAM_DEFINE_SYNC_REQUEST_W(name, atype, spec...)			\
-	int name(struct ssam_controller *ctrl, const atype *arg)		\
+	static int name(struct ssam_controller *ctrl, const atype *arg)		\
 	{									\
 		struct ssam_request_spec s = (struct ssam_request_spec)spec;	\
 		struct ssam_request rqst;					\
@@ -424,17 +424,17 @@ struct ssam_request_spec_md {
  * request itself, returning once the request has been fully completed. The
  * required transport buffer will be allocated on the stack.
  *
- * The generated function is defined as ``int name(struct ssam_controller
- * *ctrl, rtype *ret)``, returning the status of the request, which is zero on
- * success and negative on failure. The ``ctrl`` parameter is the controller
- * via which the request is sent. The request's return value is written to the
- * memory pointed to by the ``ret`` parameter.
+ * The generated function is defined as ``static int name(struct
+ * ssam_controller *ctrl, rtype *ret)``, returning the status of the request,
+ * which is zero on success and negative on failure. The ``ctrl`` parameter is
+ * the controller via which the request is sent. The request's return value is
+ * written to the memory pointed to by the ``ret`` parameter.
  *
  * Refer to ssam_request_sync_onstack() for more details on the behavior of
  * the generated function.
  */
 #define SSAM_DEFINE_SYNC_REQUEST_R(name, rtype, spec...)			\
-	int name(struct ssam_controller *ctrl, rtype *ret)			\
+	static int name(struct ssam_controller *ctrl, rtype *ret)		\
 	{									\
 		struct ssam_request_spec s = (struct ssam_request_spec)spec;	\
 		struct ssam_request rqst;					\
@@ -483,17 +483,17 @@ struct ssam_request_spec_md {
  * returning once the request has been fully completed. The required transport
  * buffer will be allocated on the stack.
  *
- * The generated function is defined as ``int name(struct ssam_controller
- * *ctrl, u8 tid, u8 iid)``, returning the status of the request, which is
- * zero on success and negative on failure. The ``ctrl`` parameter is the
- * controller via which the request is sent, ``tid`` the target ID for the
- * request, and ``iid`` the instance ID.
+ * The generated function is defined as ``static int name(struct
+ * ssam_controller *ctrl, u8 tid, u8 iid)``, returning the status of the
+ * request, which is zero on success and negative on failure. The ``ctrl``
+ * parameter is the controller via which the request is sent, ``tid`` the
+ * target ID for the request, and ``iid`` the instance ID.
  *
  * Refer to ssam_request_sync_onstack() for more details on the behavior of
  * the generated function.
  */
 #define SSAM_DEFINE_SYNC_REQUEST_MD_N(name, spec...)				\
-	int name(struct ssam_controller *ctrl, u8 tid, u8 iid)			\
+	static int name(struct ssam_controller *ctrl, u8 tid, u8 iid)		\
 	{									\
 		struct ssam_request_spec_md s = (struct ssam_request_spec_md)spec; \
 		struct ssam_request rqst;					\
@@ -524,18 +524,18 @@ struct ssam_request_spec_md {
  * the request itself, returning once the request has been fully completed.
  * The required transport buffer will be allocated on the stack.
  *
- * The generated function is defined as ``int name(struct ssam_controller
- * *ctrl, u8 tid, u8 iid, const atype *arg)``, returning the status of the
- * request, which is zero on success and negative on failure. The ``ctrl``
- * parameter is the controller via which the request is sent, ``tid`` the
- * target ID for the request, and ``iid`` the instance ID. The request argument
- * is specified via the ``arg`` pointer.
+ * The generated function is defined as ``static int name(struct
+ * ssam_controller *ctrl, u8 tid, u8 iid, const atype *arg)``, returning the
+ * status of the request, which is zero on success and negative on failure.
+ * The ``ctrl`` parameter is the controller via which the request is sent,
+ * ``tid`` the target ID for the request, and ``iid`` the instance ID. The
+ * request argument is specified via the ``arg`` pointer.
  *
  * Refer to ssam_request_sync_onstack() for more details on the behavior of
  * the generated function.
  */
 #define SSAM_DEFINE_SYNC_REQUEST_MD_W(name, atype, spec...)			\
-	int name(struct ssam_controller *ctrl, u8 tid, u8 iid, const atype *arg)\
+	static int name(struct ssam_controller *ctrl, u8 tid, u8 iid, const atype *arg) \
 	{									\
 		struct ssam_request_spec_md s = (struct ssam_request_spec_md)spec; \
 		struct ssam_request rqst;					\
@@ -567,18 +567,18 @@ struct ssam_request_spec_md {
  * execution of the request itself, returning once the request has been fully
  * completed. The required transport buffer will be allocated on the stack.
  *
- * The generated function is defined as ``int name(struct ssam_controller
- * *ctrl, u8 tid, u8 iid, rtype *ret)``, returning the status of the request,
- * which is zero on success and negative on failure. The ``ctrl`` parameter is
- * the controller via which the request is sent, ``tid`` the target ID for the
- * request, and ``iid`` the instance ID. The request's return value is written
- * to the memory pointed to by the ``ret`` parameter.
+ * The generated function is defined as ``static int name(struct
+ * ssam_controller *ctrl, u8 tid, u8 iid, rtype *ret)``, returning the status
+ * of the request, which is zero on success and negative on failure. The
+ * ``ctrl`` parameter is the controller via which the request is sent, ``tid``
+ * the target ID for the request, and ``iid`` the instance ID. The request's
+ * return value is written to the memory pointed to by the ``ret`` parameter.
  *
  * Refer to ssam_request_sync_onstack() for more details on the behavior of
  * the generated function.
  */
 #define SSAM_DEFINE_SYNC_REQUEST_MD_R(name, rtype, spec...)			\
-	int name(struct ssam_controller *ctrl, u8 tid, u8 iid, rtype *ret)	\
+	static int name(struct ssam_controller *ctrl, u8 tid, u8 iid, rtype *ret) \
 	{									\
 		struct ssam_request_spec_md s = (struct ssam_request_spec_md)spec; \
 		struct ssam_request rqst;					\
diff --git a/include/linux/surface_aggregator/device.h b/include/linux/surface_aggregator/device.h
index 02f3e06c0a60..4441ad667c3f 100644
--- a/include/linux/surface_aggregator/device.h
+++ b/include/linux/surface_aggregator/device.h
@@ -336,17 +336,18 @@ void ssam_device_driver_unregister(struct ssam_device_driver *d);
  * request has been fully completed. The required transport buffer will be
  * allocated on the stack.
  *
- * The generated function is defined as ``int name(struct ssam_device *sdev)``,
- * returning the status of the request, which is zero on success and negative
- * on failure. The ``sdev`` parameter specifies both the target device of the
- * request and by association the controller via which the request is sent.
+ * The generated function is defined as ``static int name(struct ssam_device
+ * *sdev)``, returning the status of the request, which is zero on success and
+ * negative on failure. The ``sdev`` parameter specifies both the target
+ * device of the request and by association the controller via which the
+ * request is sent.
  *
  * Refer to ssam_request_sync_onstack() for more details on the behavior of
  * the generated function.
  */
 #define SSAM_DEFINE_SYNC_REQUEST_CL_N(name, spec...)			\
 	SSAM_DEFINE_SYNC_REQUEST_MD_N(__raw_##name, spec)		\
-	int name(struct ssam_device *sdev)				\
+	static int name(struct ssam_device *sdev)			\
 	{								\
 		return __raw_##name(sdev->ctrl, sdev->uid.target,	\
 				    sdev->uid.instance);		\
@@ -368,19 +369,19 @@ void ssam_device_driver_unregister(struct ssam_device_driver *d);
  * itself, returning once the request has been fully completed. The required
  * transport buffer will be allocated on the stack.
  *
- * The generated function is defined as ``int name(struct ssam_device *sdev,
- * const atype *arg)``, returning the status of the request, which is zero on
- * success and negative on failure. The ``sdev`` parameter specifies both the
- * target device of the request and by association the controller via which
- * the request is sent. The request's argument is specified via the ``arg``
- * pointer.
+ * The generated function is defined as ``static int name(struct ssam_device
+ * *sdev, const atype *arg)``, returning the status of the request, which is
+ * zero on success and negative on failure. The ``sdev`` parameter specifies
+ * both the target device of the request and by association the controller via
+ * which the request is sent. The request's argument is specified via the
+ * ``arg`` pointer.
  *
  * Refer to ssam_request_sync_onstack() for more details on the behavior of
  * the generated function.
  */
 #define SSAM_DEFINE_SYNC_REQUEST_CL_W(name, atype, spec...)		\
 	SSAM_DEFINE_SYNC_REQUEST_MD_W(__raw_##name, atype, spec)	\
-	int name(struct ssam_device *sdev, const atype *arg)		\
+	static int name(struct ssam_device *sdev, const atype *arg)	\
 	{								\
 		return __raw_##name(sdev->ctrl, sdev->uid.target,	\
 				    sdev->uid.instance, arg);		\
@@ -402,8 +403,8 @@ void ssam_device_driver_unregister(struct ssam_device_driver *d);
  * itself, returning once the request has been fully completed. The required
  * transport buffer will be allocated on the stack.
  *
- * The generated function is defined as ``int name(struct ssam_device *sdev,
- * rtype *ret)``, returning the status of the request, which is zero on
+ * The generated function is defined as ``static int name(struct ssam_device
+ * *sdev, rtype *ret)``, returning the status of the request, which is zero on
  * success and negative on failure. The ``sdev`` parameter specifies both the
  * target device of the request and by association the controller via which
  * the request is sent. The request's return value is written to the memory
@@ -414,7 +415,7 @@ void ssam_device_driver_unregister(struct ssam_device_driver *d);
  */
 #define SSAM_DEFINE_SYNC_REQUEST_CL_R(name, rtype, spec...)		\
 	SSAM_DEFINE_SYNC_REQUEST_MD_R(__raw_##name, rtype, spec)	\
-	int name(struct ssam_device *sdev, rtype *ret)			\
+	static int name(struct ssam_device *sdev, rtype *ret)		\
 	{								\
 		return __raw_##name(sdev->ctrl, sdev->uid.target,	\
 				    sdev->uid.instance, ret);		\
-- 
cgit v1.2.3


From 2b329f5694aec86107931584413c7d2ebc6b548d Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Mon, 1 Mar 2021 17:04:04 +0100
Subject: platform/x86: wmi: Make remove callback return void
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The driver core ignores the return value of struct bus_type::remove()
(and so wmi_dev_remove()) because there is little that can be done.

To simplify the quest to make this function return void, let struct
wmi_driver::remove() return void, too. All implementers of this callback
return 0 already and this way it should be obvious to driver authors
that returning an error code is a bad idea.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Reviewed-by: Pali Rohár <pali@kernel.org>
Link: https://lore.kernel.org/r/20210301160404.1677064-1-u.kleine-koenig@pengutronix.de
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 drivers/platform/x86/dell/dell-smbios-wmi.c                        | 3 +--
 drivers/platform/x86/dell/dell-wmi-descriptor.c                    | 3 +--
 drivers/platform/x86/dell/dell-wmi-sysman/biosattr-interface.c     | 3 +--
 drivers/platform/x86/dell/dell-wmi-sysman/passwordattr-interface.c | 3 +--
 drivers/platform/x86/dell/dell-wmi.c                               | 3 +--
 drivers/platform/x86/intel-wmi-sbl-fw-update.c                     | 3 +--
 drivers/platform/x86/intel-wmi-thunderbolt.c                       | 3 +--
 drivers/platform/x86/wmi-bmof.c                                    | 3 +--
 drivers/platform/x86/wmi.c                                         | 5 ++---
 include/linux/wmi.h                                                | 2 +-
 10 files changed, 11 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/platform/x86/dell/dell-smbios-wmi.c b/drivers/platform/x86/dell/dell-smbios-wmi.c
index 27a298b7c541..a1753485159c 100644
--- a/drivers/platform/x86/dell/dell-smbios-wmi.c
+++ b/drivers/platform/x86/dell/dell-smbios-wmi.c
@@ -205,7 +205,7 @@ fail_register:
 	return ret;
 }
 
-static int dell_smbios_wmi_remove(struct wmi_device *wdev)
+static void dell_smbios_wmi_remove(struct wmi_device *wdev)
 {
 	struct wmi_smbios_priv *priv = dev_get_drvdata(&wdev->dev);
 	int count;
@@ -218,7 +218,6 @@ static int dell_smbios_wmi_remove(struct wmi_device *wdev)
 	count = get_order(priv->req_buf_size);
 	free_pages((unsigned long)priv->buf, count);
 	mutex_unlock(&call_mutex);
-	return 0;
 }
 
 static const struct wmi_device_id dell_smbios_wmi_id_table[] = {
diff --git a/drivers/platform/x86/dell/dell-wmi-descriptor.c b/drivers/platform/x86/dell/dell-wmi-descriptor.c
index a068900ae8a1..3c4af7c08bb1 100644
--- a/drivers/platform/x86/dell/dell-wmi-descriptor.c
+++ b/drivers/platform/x86/dell/dell-wmi-descriptor.c
@@ -174,14 +174,13 @@ out:
 	return ret;
 }
 
-static int dell_wmi_descriptor_remove(struct wmi_device *wdev)
+static void dell_wmi_descriptor_remove(struct wmi_device *wdev)
 {
 	struct descriptor_priv *priv = dev_get_drvdata(&wdev->dev);
 
 	mutex_lock(&list_mutex);
 	list_del(&priv->list);
 	mutex_unlock(&list_mutex);
-	return 0;
 }
 
 static const struct wmi_device_id dell_wmi_descriptor_id_table[] = {
diff --git a/drivers/platform/x86/dell/dell-wmi-sysman/biosattr-interface.c b/drivers/platform/x86/dell/dell-wmi-sysman/biosattr-interface.c
index f95d8ddace5a..c2dd2de6bc20 100644
--- a/drivers/platform/x86/dell/dell-wmi-sysman/biosattr-interface.c
+++ b/drivers/platform/x86/dell/dell-wmi-sysman/biosattr-interface.c
@@ -152,12 +152,11 @@ static int bios_attr_set_interface_probe(struct wmi_device *wdev, const void *co
 	return 0;
 }
 
-static int bios_attr_set_interface_remove(struct wmi_device *wdev)
+static void bios_attr_set_interface_remove(struct wmi_device *wdev)
 {
 	mutex_lock(&wmi_priv.mutex);
 	wmi_priv.bios_attr_wdev = NULL;
 	mutex_unlock(&wmi_priv.mutex);
-	return 0;
 }
 
 static const struct wmi_device_id bios_attr_set_interface_id_table[] = {
diff --git a/drivers/platform/x86/dell/dell-wmi-sysman/passwordattr-interface.c b/drivers/platform/x86/dell/dell-wmi-sysman/passwordattr-interface.c
index 5780b4d94759..339a082d6c18 100644
--- a/drivers/platform/x86/dell/dell-wmi-sysman/passwordattr-interface.c
+++ b/drivers/platform/x86/dell/dell-wmi-sysman/passwordattr-interface.c
@@ -119,12 +119,11 @@ static int bios_attr_pass_interface_probe(struct wmi_device *wdev, const void *c
 	return 0;
 }
 
-static int bios_attr_pass_interface_remove(struct wmi_device *wdev)
+static void bios_attr_pass_interface_remove(struct wmi_device *wdev)
 {
 	mutex_lock(&wmi_priv.mutex);
 	wmi_priv.password_attr_wdev = NULL;
 	mutex_unlock(&wmi_priv.mutex);
-	return 0;
 }
 
 static const struct wmi_device_id bios_attr_pass_interface_id_table[] = {
diff --git a/drivers/platform/x86/dell/dell-wmi.c b/drivers/platform/x86/dell/dell-wmi.c
index bbdb3e860892..5e1b7f897df5 100644
--- a/drivers/platform/x86/dell/dell-wmi.c
+++ b/drivers/platform/x86/dell/dell-wmi.c
@@ -714,10 +714,9 @@ static int dell_wmi_probe(struct wmi_device *wdev, const void *context)
 	return dell_wmi_input_setup(wdev);
 }
 
-static int dell_wmi_remove(struct wmi_device *wdev)
+static void dell_wmi_remove(struct wmi_device *wdev)
 {
 	dell_wmi_input_destroy(wdev);
-	return 0;
 }
 static const struct wmi_device_id dell_wmi_id_table[] = {
 	{ .guid_string = DELL_EVENT_GUID },
diff --git a/drivers/platform/x86/intel-wmi-sbl-fw-update.c b/drivers/platform/x86/intel-wmi-sbl-fw-update.c
index ea87fa0786e8..3c86e0108a24 100644
--- a/drivers/platform/x86/intel-wmi-sbl-fw-update.c
+++ b/drivers/platform/x86/intel-wmi-sbl-fw-update.c
@@ -117,10 +117,9 @@ static int intel_wmi_sbl_fw_update_probe(struct wmi_device *wdev,
 	return 0;
 }
 
-static int intel_wmi_sbl_fw_update_remove(struct wmi_device *wdev)
+static void intel_wmi_sbl_fw_update_remove(struct wmi_device *wdev)
 {
 	dev_info(&wdev->dev, "Slim Bootloader signaling driver removed\n");
-	return 0;
 }
 
 static const struct wmi_device_id intel_wmi_sbl_id_table[] = {
diff --git a/drivers/platform/x86/intel-wmi-thunderbolt.c b/drivers/platform/x86/intel-wmi-thunderbolt.c
index 974c22a7ff61..4ae87060d18b 100644
--- a/drivers/platform/x86/intel-wmi-thunderbolt.c
+++ b/drivers/platform/x86/intel-wmi-thunderbolt.c
@@ -66,11 +66,10 @@ static int intel_wmi_thunderbolt_probe(struct wmi_device *wdev,
 	return ret;
 }
 
-static int intel_wmi_thunderbolt_remove(struct wmi_device *wdev)
+static void intel_wmi_thunderbolt_remove(struct wmi_device *wdev)
 {
 	sysfs_remove_group(&wdev->dev.kobj, &tbt_attribute_group);
 	kobject_uevent(&wdev->dev.kobj, KOBJ_CHANGE);
-	return 0;
 }
 
 static const struct wmi_device_id intel_wmi_thunderbolt_id_table[] = {
diff --git a/drivers/platform/x86/wmi-bmof.c b/drivers/platform/x86/wmi-bmof.c
index 66b434d6307f..80137afb9753 100644
--- a/drivers/platform/x86/wmi-bmof.c
+++ b/drivers/platform/x86/wmi-bmof.c
@@ -86,13 +86,12 @@ static int wmi_bmof_probe(struct wmi_device *wdev, const void *context)
 	return ret;
 }
 
-static int wmi_bmof_remove(struct wmi_device *wdev)
+static void wmi_bmof_remove(struct wmi_device *wdev)
 {
 	struct bmof_priv *priv = dev_get_drvdata(&wdev->dev);
 
 	sysfs_remove_bin_file(&wdev->dev.kobj, &priv->bmof_bin_attr);
 	kfree(priv->bmofdata);
-	return 0;
 }
 
 static const struct wmi_device_id wmi_bmof_id_table[] = {
diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c
index f30d791be95b..62e0d56a3332 100644
--- a/drivers/platform/x86/wmi.c
+++ b/drivers/platform/x86/wmi.c
@@ -985,7 +985,6 @@ static int wmi_dev_remove(struct device *dev)
 	struct wmi_block *wblock = dev_to_wblock(dev);
 	struct wmi_driver *wdriver =
 		container_of(dev->driver, struct wmi_driver, driver);
-	int ret = 0;
 
 	if (wdriver->filter_callback) {
 		misc_deregister(&wblock->char_dev);
@@ -994,12 +993,12 @@ static int wmi_dev_remove(struct device *dev)
 	}
 
 	if (wdriver->remove)
-		ret = wdriver->remove(dev_to_wdev(dev));
+		wdriver->remove(dev_to_wdev(dev));
 
 	if (ACPI_FAILURE(wmi_method_enable(wblock, 0)))
 		dev_warn(dev, "failed to disable device\n");
 
-	return ret;
+	return 0;
 }
 
 static struct class wmi_bus_class = {
diff --git a/include/linux/wmi.h b/include/linux/wmi.h
index 8ef7e7faea1e..2cb3913c1f50 100644
--- a/include/linux/wmi.h
+++ b/include/linux/wmi.h
@@ -37,7 +37,7 @@ struct wmi_driver {
 	const struct wmi_device_id *id_table;
 
 	int (*probe)(struct wmi_device *wdev, const void *context);
-	int (*remove)(struct wmi_device *wdev);
+	void (*remove)(struct wmi_device *wdev);
 	void (*notify)(struct wmi_device *device, union acpi_object *data);
 	long (*filter_callback)(struct wmi_device *wdev, unsigned int cmd,
 				struct wmi_ioctl_buffer *arg);
-- 
cgit v1.2.3


From e831e400f73b0e7aa4629200c2ef71e068262414 Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Wed, 3 Mar 2021 11:15:17 +0100
Subject: printk: kmsg_dump: remove unused fields

struct kmsg_dumper still contains some fields that were used to
iterate the old ringbuffer. They are no longer used. Remove them
and update the struct documentation.

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20210303101528.29901-5-john.ogness@linutronix.de
---
 include/linux/kmsg_dump.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h
index 3378bcbe585e..ae38035f1dca 100644
--- a/include/linux/kmsg_dump.h
+++ b/include/linux/kmsg_dump.h
@@ -36,6 +36,9 @@ enum kmsg_dump_reason {
  * 		through the record iterator
  * @max_reason:	filter for highest reason number that should be dumped
  * @registered:	Flag that specifies if this is already registered
+ * @active:	Flag that specifies if this is currently dumping
+ * @cur_seq:	Points to the oldest message to dump
+ * @next_seq:	Points after the newest message to dump
  */
 struct kmsg_dumper {
 	struct list_head list;
@@ -45,8 +48,6 @@ struct kmsg_dumper {
 	bool registered;
 
 	/* private state of the kmsg iterator */
-	u32 cur_idx;
-	u32 next_idx;
 	u64 cur_seq;
 	u64 next_seq;
 };
-- 
cgit v1.2.3


From 726b5097701a8d46f5354be780e1a11fc4ca1187 Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Wed, 3 Mar 2021 11:15:18 +0100
Subject: printk: refactor kmsg_dump_get_buffer()

kmsg_dump_get_buffer() requires nearly the same logic as
syslog_print_all(), but uses different variable names and
does not make use of the ringbuffer loop macros. Modify
kmsg_dump_get_buffer() so that the implementation is as similar
to syslog_print_all() as possible.

A follow-up commit will move this common logic into a
separate helper function.

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20210303101528.29901-6-john.ogness@linutronix.de
---
 include/linux/kmsg_dump.h |  2 +-
 kernel/printk/printk.c    | 62 +++++++++++++++++++++++++----------------------
 2 files changed, 34 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h
index ae38035f1dca..070c994ff19f 100644
--- a/include/linux/kmsg_dump.h
+++ b/include/linux/kmsg_dump.h
@@ -62,7 +62,7 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
 			char *line, size_t size, size_t *len);
 
 bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
-			  char *buf, size_t size, size_t *len);
+			  char *buf, size_t size, size_t *len_out);
 
 void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper);
 
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 77ae2704e979..ed678d84dc51 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3410,7 +3410,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
  * @syslog: include the "<4>" prefixes
  * @buf: buffer to copy the line to
  * @size: maximum size of the buffer
- * @len: length of line placed into buffer
+ * @len_out: length of line placed into buffer
  *
  * Start at the end of the kmsg buffer and fill the provided buffer
  * with as many of the *youngest* kmsg records that fit into it.
@@ -3424,7 +3424,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
  * read.
  */
 bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
-			  char *buf, size_t size, size_t *len)
+			  char *buf, size_t size, size_t *len_out)
 {
 	struct printk_info info;
 	unsigned int line_count;
@@ -3432,12 +3432,10 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
 	unsigned long flags;
 	u64 seq;
 	u64 next_seq;
-	size_t l = 0;
+	size_t len = 0;
 	bool ret = false;
 	bool time = printk_time;
 
-	prb_rec_init_rd(&r, &info, buf, size);
-
 	if (!dumper->active || !buf || !size)
 		goto out;
 
@@ -3455,48 +3453,54 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
 		goto out;
 	}
 
-	/* calculate length of entire buffer */
-	seq = dumper->cur_seq;
-	while (prb_read_valid_info(prb, seq, &info, &line_count)) {
-		if (r.info->seq >= dumper->next_seq)
+	/*
+	 * Find first record that fits, including all following records,
+	 * into the user-provided buffer for this dump.
+	 */
+
+	prb_for_each_info(dumper->cur_seq, prb, seq, &info, &line_count) {
+		if (info.seq >= dumper->next_seq)
 			break;
-		l += get_record_print_text_size(&info, line_count, syslog, time);
-		seq = r.info->seq + 1;
+		len += get_record_print_text_size(&info, line_count, syslog, time);
 	}
 
-	/* move first record forward until length fits into the buffer */
-	seq = dumper->cur_seq;
-	while (l >= size && prb_read_valid_info(prb, seq,
-						&info, &line_count)) {
-		if (r.info->seq >= dumper->next_seq)
+	/*
+	 * Move first record forward until length fits into the buffer. Ignore
+	 * newest messages that were not counted in the above cycle. Messages
+	 * might appear and get lost in the meantime. This is the best effort
+	 * that prevents an infinite loop.
+	 */
+	prb_for_each_info(dumper->cur_seq, prb, seq, &info, &line_count) {
+		if (len < size || info.seq >= dumper->next_seq)
 			break;
-		l -= get_record_print_text_size(&info, line_count, syslog, time);
-		seq = r.info->seq + 1;
+		len -= get_record_print_text_size(&info, line_count, syslog, time);
 	}
 
-	/* last message in next interation */
+	/*
+	 * Next kmsg_dump_get_buffer() invocation will dump block of
+	 * older records stored right before this one.
+	 */
 	next_seq = seq;
 
-	/* actually read text into the buffer now */
-	l = 0;
-	while (prb_read_valid(prb, seq, &r)) {
+	prb_rec_init_rd(&r, &info, buf, size);
+
+	len = 0;
+	prb_for_each_record(seq, prb, seq, &r) {
 		if (r.info->seq >= dumper->next_seq)
 			break;
 
-		l += record_print_text(&r, syslog, time);
+		len += record_print_text(&r, syslog, time);
 
-		/* adjust record to store to remaining buffer space */
-		prb_rec_init_rd(&r, &info, buf + l, size - l);
-
-		seq = r.info->seq + 1;
+		/* Adjust record to store to remaining buffer space. */
+		prb_rec_init_rd(&r, &info, buf + len, size - len);
 	}
 
 	dumper->next_seq = next_seq;
 	ret = true;
 	logbuf_unlock_irqrestore(flags);
 out:
-	if (len)
-		*len = l;
+	if (len_out)
+		*len_out = len;
 	return ret;
 }
 EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
-- 
cgit v1.2.3


From 5f6c7648e556f41a3064bb6dceb9e102c50b618d Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Wed, 3 Mar 2021 11:15:24 +0100
Subject: printk: kmsg_dumper: remove @active field

None of the 6 kmsg_dumpers benefit from the @active flag:

  (provide their own synchronization)
  - arch/powerpc/kernel/nvram_64.c
  - arch/um/kernel/kmsg_dump.c
  - drivers/mtd/mtdoops.c
  - fs/pstore/platform.c

  (only dump on KMSG_DUMP_PANIC, which does not require
  synchronization)
  - arch/powerpc/platforms/powernv/opal-kmsg.c
  - drivers/hv/vmbus_drv.c

The other 2 kmsg_dump users also do not rely on @active:

  (hard-code @active to always be true)
  - arch/powerpc/xmon/xmon.c
  - kernel/debug/kdb/kdb_main.c

Therefore, @active can be removed.

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20210303101528.29901-12-john.ogness@linutronix.de
---
 arch/powerpc/xmon/xmon.c    |  2 +-
 include/linux/kmsg_dump.h   |  2 --
 kernel/debug/kdb/kdb_main.c |  2 +-
 kernel/printk/printk.c      | 10 +---------
 4 files changed, 3 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index dcd817ca2edf..a777573a0c38 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -3005,7 +3005,7 @@ print_address(unsigned long addr)
 static void
 dump_log_buf(void)
 {
-	struct kmsg_dumper dumper = { .active = 1 };
+	struct kmsg_dumper dumper;
 	unsigned char buf[128];
 	size_t len;
 
diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h
index 070c994ff19f..84eaa2090efa 100644
--- a/include/linux/kmsg_dump.h
+++ b/include/linux/kmsg_dump.h
@@ -36,7 +36,6 @@ enum kmsg_dump_reason {
  * 		through the record iterator
  * @max_reason:	filter for highest reason number that should be dumped
  * @registered:	Flag that specifies if this is already registered
- * @active:	Flag that specifies if this is currently dumping
  * @cur_seq:	Points to the oldest message to dump
  * @next_seq:	Points after the newest message to dump
  */
@@ -44,7 +43,6 @@ struct kmsg_dumper {
 	struct list_head list;
 	void (*dump)(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason);
 	enum kmsg_dump_reason max_reason;
-	bool active;
 	bool registered;
 
 	/* private state of the kmsg iterator */
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 930ac1b25ec7..315169d5e119 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -2101,7 +2101,7 @@ static int kdb_dmesg(int argc, const char **argv)
 	int adjust = 0;
 	int n = 0;
 	int skip = 0;
-	struct kmsg_dumper dumper = { .active = 1 };
+	struct kmsg_dumper dumper;
 	size_t len;
 	char buf[201];
 
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index e794a08de00f..ce4cc64ba7c9 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3408,8 +3408,6 @@ void kmsg_dump(enum kmsg_dump_reason reason)
 			continue;
 
 		/* initialize iterator with data about the stored records */
-		dumper->active = true;
-
 		logbuf_lock_irqsave(flags);
 		dumper->cur_seq = latched_seq_read_nolock(&clear_seq);
 		dumper->next_seq = prb_next_seq(prb);
@@ -3417,9 +3415,6 @@ void kmsg_dump(enum kmsg_dump_reason reason)
 
 		/* invoke dumper which will iterate over records */
 		dumper->dump(dumper, reason);
-
-		/* reset iterator */
-		dumper->active = false;
 	}
 	rcu_read_unlock();
 }
@@ -3454,9 +3449,6 @@ bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
 
 	prb_rec_init_rd(&r, &info, line, size);
 
-	if (!dumper->active)
-		goto out;
-
 	/* Read text or count text lines? */
 	if (line) {
 		if (!prb_read_valid(prb, dumper->cur_seq, &r))
@@ -3542,7 +3534,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
 	bool ret = false;
 	bool time = printk_time;
 
-	if (!dumper->active || !buf || !size)
+	if (!buf || !size)
 		goto out;
 
 	logbuf_lock_irqsave(flags);
-- 
cgit v1.2.3


From f9f3f02db98bbe678a8e57fe9432b196174744a3 Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Wed, 3 Mar 2021 11:15:25 +0100
Subject: printk: introduce a kmsg_dump iterator

Rather than storing the iterator information in the registered
kmsg_dumper structure, create a separate iterator structure. The
kmsg_dump_iter structure can reside on the stack of the caller, thus
allowing lockless use of the kmsg_dump functions.

Update code that accesses the kernel logs using the kmsg_dumper
structure to use the new kmsg_dump_iter structure. For kmsg_dumpers,
this also means adding a call to kmsg_dump_rewind() to initialize
the iterator.

All this is in preparation for removal of @logbuf_lock.

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Kees Cook <keescook@chromium.org> # pstore
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20210303101528.29901-13-john.ogness@linutronix.de
---
 arch/powerpc/kernel/nvram_64.c |  8 ++++--
 arch/powerpc/xmon/xmon.c       |  6 ++--
 arch/um/kernel/kmsg_dump.c     |  5 +++-
 drivers/hv/vmbus_drv.c         |  4 ++-
 drivers/mtd/mtdoops.c          |  5 +++-
 fs/pstore/platform.c           |  5 +++-
 include/linux/kmsg_dump.h      | 36 +++++++++++++-----------
 kernel/debug/kdb/kdb_main.c    | 10 +++----
 kernel/printk/printk.c         | 63 +++++++++++++++++++++---------------------
 9 files changed, 80 insertions(+), 62 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c
index 532f22637783..3c8d9bbb51cf 100644
--- a/arch/powerpc/kernel/nvram_64.c
+++ b/arch/powerpc/kernel/nvram_64.c
@@ -647,6 +647,7 @@ static void oops_to_nvram(struct kmsg_dumper *dumper,
 {
 	struct oops_log_info *oops_hdr = (struct oops_log_info *)oops_buf;
 	static unsigned int oops_count = 0;
+	static struct kmsg_dump_iter iter;
 	static bool panicking = false;
 	static DEFINE_SPINLOCK(lock);
 	unsigned long flags;
@@ -681,13 +682,14 @@ static void oops_to_nvram(struct kmsg_dumper *dumper,
 		return;
 
 	if (big_oops_buf) {
-		kmsg_dump_get_buffer(dumper, false,
+		kmsg_dump_rewind(&iter);
+		kmsg_dump_get_buffer(&iter, false,
 				     big_oops_buf, big_oops_buf_sz, &text_len);
 		rc = zip_oops(text_len);
 	}
 	if (rc != 0) {
-		kmsg_dump_rewind(dumper);
-		kmsg_dump_get_buffer(dumper, false,
+		kmsg_dump_rewind(&iter);
+		kmsg_dump_get_buffer(&iter, false,
 				     oops_data, oops_data_sz, &text_len);
 		err_type = ERR_TYPE_KERNEL_PANIC;
 		oops_hdr->version = cpu_to_be16(OOPS_HDR_VERSION);
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index a777573a0c38..dbcf403b1d8d 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -3005,7 +3005,7 @@ print_address(unsigned long addr)
 static void
 dump_log_buf(void)
 {
-	struct kmsg_dumper dumper;
+	struct kmsg_dump_iter iter;
 	unsigned char buf[128];
 	size_t len;
 
@@ -3017,9 +3017,9 @@ dump_log_buf(void)
 	catch_memory_errors = 1;
 	sync();
 
-	kmsg_dump_rewind_nolock(&dumper);
+	kmsg_dump_rewind_nolock(&iter);
 	xmon_start_pagination();
-	while (kmsg_dump_get_line_nolock(&dumper, false, buf, sizeof(buf), &len)) {
+	while (kmsg_dump_get_line_nolock(&iter, false, buf, sizeof(buf), &len)) {
 		buf[len] = '\0';
 		printf("%s", buf);
 	}
diff --git a/arch/um/kernel/kmsg_dump.c b/arch/um/kernel/kmsg_dump.c
index a765d235e50e..0224fcb36e22 100644
--- a/arch/um/kernel/kmsg_dump.c
+++ b/arch/um/kernel/kmsg_dump.c
@@ -10,6 +10,7 @@
 static void kmsg_dumper_stdout(struct kmsg_dumper *dumper,
 				enum kmsg_dump_reason reason)
 {
+	static struct kmsg_dump_iter iter;
 	static DEFINE_SPINLOCK(lock);
 	static char line[1024];
 	struct console *con;
@@ -35,8 +36,10 @@ static void kmsg_dumper_stdout(struct kmsg_dumper *dumper,
 	if (!spin_trylock_irqsave(&lock, flags))
 		return;
 
+	kmsg_dump_rewind(&iter);
+
 	printf("kmsg_dump:\n");
-	while (kmsg_dump_get_line(dumper, true, line, sizeof(line), &len)) {
+	while (kmsg_dump_get_line(&iter, true, line, sizeof(line), &len)) {
 		line[len] = '\0';
 		printf("%s", line);
 	}
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 10dce9f91216..b341b144bde8 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -1391,6 +1391,7 @@ static void vmbus_isr(void)
 static void hv_kmsg_dump(struct kmsg_dumper *dumper,
 			 enum kmsg_dump_reason reason)
 {
+	struct kmsg_dump_iter iter;
 	size_t bytes_written;
 	phys_addr_t panic_pa;
 
@@ -1404,7 +1405,8 @@ static void hv_kmsg_dump(struct kmsg_dumper *dumper,
 	 * Write dump contents to the page. No need to synchronize; panic should
 	 * be single-threaded.
 	 */
-	kmsg_dump_get_buffer(dumper, false, hv_panic_page, HV_HYP_PAGE_SIZE,
+	kmsg_dump_rewind(&iter);
+	kmsg_dump_get_buffer(&iter, false, hv_panic_page, HV_HYP_PAGE_SIZE,
 			     &bytes_written);
 	if (bytes_written)
 		hyperv_report_panic_msg(panic_pa, bytes_written);
diff --git a/drivers/mtd/mtdoops.c b/drivers/mtd/mtdoops.c
index 8bbfba40a554..862c4a889234 100644
--- a/drivers/mtd/mtdoops.c
+++ b/drivers/mtd/mtdoops.c
@@ -277,14 +277,17 @@ static void mtdoops_do_dump(struct kmsg_dumper *dumper,
 {
 	struct mtdoops_context *cxt = container_of(dumper,
 			struct mtdoops_context, dump);
+	struct kmsg_dump_iter iter;
 
 	/* Only dump oopses if dump_oops is set */
 	if (reason == KMSG_DUMP_OOPS && !dump_oops)
 		return;
 
+	kmsg_dump_rewind(&iter);
+
 	if (test_and_set_bit(0, &cxt->oops_buf_busy))
 		return;
-	kmsg_dump_get_buffer(dumper, true, cxt->oops_buf + MTDOOPS_HEADER_SIZE,
+	kmsg_dump_get_buffer(&iter, true, cxt->oops_buf + MTDOOPS_HEADER_SIZE,
 			     record_size - MTDOOPS_HEADER_SIZE, NULL);
 	clear_bit(0, &cxt->oops_buf_busy);
 
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index d963ae7902f9..b9614db48b1d 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -385,6 +385,7 @@ void pstore_record_init(struct pstore_record *record,
 static void pstore_dump(struct kmsg_dumper *dumper,
 			enum kmsg_dump_reason reason)
 {
+	struct kmsg_dump_iter iter;
 	unsigned long	total = 0;
 	const char	*why;
 	unsigned int	part = 1;
@@ -405,6 +406,8 @@ static void pstore_dump(struct kmsg_dumper *dumper,
 		}
 	}
 
+	kmsg_dump_rewind(&iter);
+
 	oopscount++;
 	while (total < kmsg_bytes) {
 		char *dst;
@@ -435,7 +438,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
 		dst_size -= header_size;
 
 		/* Write dump contents. */
-		if (!kmsg_dump_get_buffer(dumper, true, dst + header_size,
+		if (!kmsg_dump_get_buffer(&iter, true, dst + header_size,
 					  dst_size, &dump_size))
 			break;
 
diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h
index 84eaa2090efa..36c8c57e1051 100644
--- a/include/linux/kmsg_dump.h
+++ b/include/linux/kmsg_dump.h
@@ -29,6 +29,16 @@ enum kmsg_dump_reason {
 	KMSG_DUMP_MAX
 };
 
+/**
+ * struct kmsg_dump_iter - iterator for retrieving kernel messages
+ * @cur_seq:	Points to the oldest message to dump
+ * @next_seq:	Points after the newest message to dump
+ */
+struct kmsg_dump_iter {
+	u64	cur_seq;
+	u64	next_seq;
+};
+
 /**
  * struct kmsg_dumper - kernel crash message dumper structure
  * @list:	Entry in the dumper list (private)
@@ -36,35 +46,29 @@ enum kmsg_dump_reason {
  * 		through the record iterator
  * @max_reason:	filter for highest reason number that should be dumped
  * @registered:	Flag that specifies if this is already registered
- * @cur_seq:	Points to the oldest message to dump
- * @next_seq:	Points after the newest message to dump
  */
 struct kmsg_dumper {
 	struct list_head list;
 	void (*dump)(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason);
 	enum kmsg_dump_reason max_reason;
 	bool registered;
-
-	/* private state of the kmsg iterator */
-	u64 cur_seq;
-	u64 next_seq;
 };
 
 #ifdef CONFIG_PRINTK
 void kmsg_dump(enum kmsg_dump_reason reason);
 
-bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
+bool kmsg_dump_get_line_nolock(struct kmsg_dump_iter *iter, bool syslog,
 			       char *line, size_t size, size_t *len);
 
-bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
+bool kmsg_dump_get_line(struct kmsg_dump_iter *iter, bool syslog,
 			char *line, size_t size, size_t *len);
 
-bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
+bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog,
 			  char *buf, size_t size, size_t *len_out);
 
-void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper);
+void kmsg_dump_rewind_nolock(struct kmsg_dump_iter *iter);
 
-void kmsg_dump_rewind(struct kmsg_dumper *dumper);
+void kmsg_dump_rewind(struct kmsg_dump_iter *iter);
 
 int kmsg_dump_register(struct kmsg_dumper *dumper);
 
@@ -76,30 +80,30 @@ static inline void kmsg_dump(enum kmsg_dump_reason reason)
 {
 }
 
-static inline bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper,
+static inline bool kmsg_dump_get_line_nolock(struct kmsg_dump_iter *iter,
 					     bool syslog, const char *line,
 					     size_t size, size_t *len)
 {
 	return false;
 }
 
-static inline bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
+static inline bool kmsg_dump_get_line(struct kmsg_dump_iter *iter, bool syslog,
 				const char *line, size_t size, size_t *len)
 {
 	return false;
 }
 
-static inline bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
+static inline bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog,
 					char *buf, size_t size, size_t *len)
 {
 	return false;
 }
 
-static inline void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper)
+static inline void kmsg_dump_rewind_nolock(struct kmsg_dump_iter *iter)
 {
 }
 
-static inline void kmsg_dump_rewind(struct kmsg_dumper *dumper)
+static inline void kmsg_dump_rewind(struct kmsg_dump_iter *iter)
 {
 }
 
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 315169d5e119..8544d7a55a57 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -2101,7 +2101,7 @@ static int kdb_dmesg(int argc, const char **argv)
 	int adjust = 0;
 	int n = 0;
 	int skip = 0;
-	struct kmsg_dumper dumper;
+	struct kmsg_dump_iter iter;
 	size_t len;
 	char buf[201];
 
@@ -2126,8 +2126,8 @@ static int kdb_dmesg(int argc, const char **argv)
 		kdb_set(2, setargs);
 	}
 
-	kmsg_dump_rewind_nolock(&dumper);
-	while (kmsg_dump_get_line_nolock(&dumper, 1, NULL, 0, NULL))
+	kmsg_dump_rewind_nolock(&iter);
+	while (kmsg_dump_get_line_nolock(&iter, 1, NULL, 0, NULL))
 		n++;
 
 	if (lines < 0) {
@@ -2159,8 +2159,8 @@ static int kdb_dmesg(int argc, const char **argv)
 	if (skip >= n || skip < 0)
 		return 0;
 
-	kmsg_dump_rewind_nolock(&dumper);
-	while (kmsg_dump_get_line_nolock(&dumper, 1, buf, sizeof(buf), &len)) {
+	kmsg_dump_rewind_nolock(&iter);
+	while (kmsg_dump_get_line_nolock(&iter, 1, buf, sizeof(buf), &len)) {
 		if (skip) {
 			skip--;
 			continue;
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index ce4cc64ba7c9..b49dee256947 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3390,7 +3390,6 @@ EXPORT_SYMBOL_GPL(kmsg_dump_reason_str);
 void kmsg_dump(enum kmsg_dump_reason reason)
 {
 	struct kmsg_dumper *dumper;
-	unsigned long flags;
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(dumper, &dump_list, list) {
@@ -3407,12 +3406,6 @@ void kmsg_dump(enum kmsg_dump_reason reason)
 		if (reason > max_reason)
 			continue;
 
-		/* initialize iterator with data about the stored records */
-		logbuf_lock_irqsave(flags);
-		dumper->cur_seq = latched_seq_read_nolock(&clear_seq);
-		dumper->next_seq = prb_next_seq(prb);
-		logbuf_unlock_irqrestore(flags);
-
 		/* invoke dumper which will iterate over records */
 		dumper->dump(dumper, reason);
 	}
@@ -3421,7 +3414,7 @@ void kmsg_dump(enum kmsg_dump_reason reason)
 
 /**
  * kmsg_dump_get_line_nolock - retrieve one kmsg log line (unlocked version)
- * @dumper: registered kmsg dumper
+ * @iter: kmsg dump iterator
  * @syslog: include the "<4>" prefixes
  * @line: buffer to copy the line to
  * @size: maximum size of the buffer
@@ -3438,24 +3431,28 @@ void kmsg_dump(enum kmsg_dump_reason reason)
  *
  * The function is similar to kmsg_dump_get_line(), but grabs no locks.
  */
-bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
+bool kmsg_dump_get_line_nolock(struct kmsg_dump_iter *iter, bool syslog,
 			       char *line, size_t size, size_t *len)
 {
+	u64 min_seq = latched_seq_read_nolock(&clear_seq);
 	struct printk_info info;
 	unsigned int line_count;
 	struct printk_record r;
 	size_t l = 0;
 	bool ret = false;
 
+	if (iter->cur_seq < min_seq)
+		iter->cur_seq = min_seq;
+
 	prb_rec_init_rd(&r, &info, line, size);
 
 	/* Read text or count text lines? */
 	if (line) {
-		if (!prb_read_valid(prb, dumper->cur_seq, &r))
+		if (!prb_read_valid(prb, iter->cur_seq, &r))
 			goto out;
 		l = record_print_text(&r, syslog, printk_time);
 	} else {
-		if (!prb_read_valid_info(prb, dumper->cur_seq,
+		if (!prb_read_valid_info(prb, iter->cur_seq,
 					 &info, &line_count)) {
 			goto out;
 		}
@@ -3464,7 +3461,7 @@ bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
 
 	}
 
-	dumper->cur_seq = r.info->seq + 1;
+	iter->cur_seq = r.info->seq + 1;
 	ret = true;
 out:
 	if (len)
@@ -3474,7 +3471,7 @@ out:
 
 /**
  * kmsg_dump_get_line - retrieve one kmsg log line
- * @dumper: registered kmsg dumper
+ * @iter: kmsg dump iterator
  * @syslog: include the "<4>" prefixes
  * @line: buffer to copy the line to
  * @size: maximum size of the buffer
@@ -3489,14 +3486,14 @@ out:
  * A return value of FALSE indicates that there are no more records to
  * read.
  */
-bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
+bool kmsg_dump_get_line(struct kmsg_dump_iter *iter, bool syslog,
 			char *line, size_t size, size_t *len)
 {
 	unsigned long flags;
 	bool ret;
 
 	logbuf_lock_irqsave(flags);
-	ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len);
+	ret = kmsg_dump_get_line_nolock(iter, syslog, line, size, len);
 	logbuf_unlock_irqrestore(flags);
 
 	return ret;
@@ -3505,7 +3502,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
 
 /**
  * kmsg_dump_get_buffer - copy kmsg log lines
- * @dumper: registered kmsg dumper
+ * @iter: kmsg dump iterator
  * @syslog: include the "<4>" prefixes
  * @buf: buffer to copy the line to
  * @size: maximum size of the buffer
@@ -3522,9 +3519,10 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
  * A return value of FALSE indicates that there are no more records to
  * read.
  */
-bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
+bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog,
 			  char *buf, size_t size, size_t *len_out)
 {
+	u64 min_seq = latched_seq_read_nolock(&clear_seq);
 	struct printk_info info;
 	struct printk_record r;
 	unsigned long flags;
@@ -3537,16 +3535,19 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
 	if (!buf || !size)
 		goto out;
 
+	if (iter->cur_seq < min_seq)
+		iter->cur_seq = min_seq;
+
 	logbuf_lock_irqsave(flags);
-	if (prb_read_valid_info(prb, dumper->cur_seq, &info, NULL)) {
-		if (info.seq != dumper->cur_seq) {
+	if (prb_read_valid_info(prb, iter->cur_seq, &info, NULL)) {
+		if (info.seq != iter->cur_seq) {
 			/* messages are gone, move to first available one */
-			dumper->cur_seq = info.seq;
+			iter->cur_seq = info.seq;
 		}
 	}
 
 	/* last entry */
-	if (dumper->cur_seq >= dumper->next_seq) {
+	if (iter->cur_seq >= iter->next_seq) {
 		logbuf_unlock_irqrestore(flags);
 		goto out;
 	}
@@ -3557,7 +3558,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
 	 * because this function (by way of record_print_text()) will
 	 * not write more than size-1 bytes of text into @buf.
 	 */
-	seq = find_first_fitting_seq(dumper->cur_seq, dumper->next_seq,
+	seq = find_first_fitting_seq(iter->cur_seq, iter->next_seq,
 				     size - 1, syslog, time);
 
 	/*
@@ -3570,7 +3571,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
 
 	len = 0;
 	prb_for_each_record(seq, prb, seq, &r) {
-		if (r.info->seq >= dumper->next_seq)
+		if (r.info->seq >= iter->next_seq)
 			break;
 
 		len += record_print_text(&r, syslog, time);
@@ -3579,7 +3580,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
 		prb_rec_init_rd(&r, &info, buf + len, size - len);
 	}
 
-	dumper->next_seq = next_seq;
+	iter->next_seq = next_seq;
 	ret = true;
 	logbuf_unlock_irqrestore(flags);
 out:
@@ -3591,7 +3592,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
 
 /**
  * kmsg_dump_rewind_nolock - reset the iterator (unlocked version)
- * @dumper: registered kmsg dumper
+ * @iter: kmsg dump iterator
  *
  * Reset the dumper's iterator so that kmsg_dump_get_line() and
  * kmsg_dump_get_buffer() can be called again and used multiple
@@ -3599,26 +3600,26 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
  *
  * The function is similar to kmsg_dump_rewind(), but grabs no locks.
  */
-void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper)
+void kmsg_dump_rewind_nolock(struct kmsg_dump_iter *iter)
 {
-	dumper->cur_seq = latched_seq_read_nolock(&clear_seq);
-	dumper->next_seq = prb_next_seq(prb);
+	iter->cur_seq = latched_seq_read_nolock(&clear_seq);
+	iter->next_seq = prb_next_seq(prb);
 }
 
 /**
  * kmsg_dump_rewind - reset the iterator
- * @dumper: registered kmsg dumper
+ * @iter: kmsg dump iterator
  *
  * Reset the dumper's iterator so that kmsg_dump_get_line() and
  * kmsg_dump_get_buffer() can be called again and used multiple
  * times within the same dumper.dump() callback.
  */
-void kmsg_dump_rewind(struct kmsg_dumper *dumper)
+void kmsg_dump_rewind(struct kmsg_dump_iter *iter)
 {
 	unsigned long flags;
 
 	logbuf_lock_irqsave(flags);
-	kmsg_dump_rewind_nolock(dumper);
+	kmsg_dump_rewind_nolock(iter);
 	logbuf_unlock_irqrestore(flags);
 }
 EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
-- 
cgit v1.2.3


From a4f987653241db9fa1f99531b430cebb83f1eae1 Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Wed, 3 Mar 2021 11:15:27 +0100
Subject: printk: kmsg_dump: remove _nolock() variants

kmsg_dump_rewind() and kmsg_dump_get_line() are lockless, so there is
no need for _nolock() variants. Remove these functions and switch all
callers of the _nolock() variants.

The functions without _nolock() were chosen because they are already
exported to kernel modules.

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20210303101528.29901-15-john.ogness@linutronix.de
---
 arch/powerpc/xmon/xmon.c    |  4 +--
 include/linux/kmsg_dump.h   | 16 ------------
 kernel/debug/kdb/kdb_main.c |  8 +++---
 kernel/printk/printk.c      | 60 ++++++---------------------------------------
 4 files changed, 14 insertions(+), 74 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index dbcf403b1d8d..dd0e0aac954e 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -3017,9 +3017,9 @@ dump_log_buf(void)
 	catch_memory_errors = 1;
 	sync();
 
-	kmsg_dump_rewind_nolock(&iter);
+	kmsg_dump_rewind(&iter);
 	xmon_start_pagination();
-	while (kmsg_dump_get_line_nolock(&iter, false, buf, sizeof(buf), &len)) {
+	while (kmsg_dump_get_line(&iter, false, buf, sizeof(buf), &len)) {
 		buf[len] = '\0';
 		printf("%s", buf);
 	}
diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h
index 36c8c57e1051..906521c2329c 100644
--- a/include/linux/kmsg_dump.h
+++ b/include/linux/kmsg_dump.h
@@ -57,17 +57,12 @@ struct kmsg_dumper {
 #ifdef CONFIG_PRINTK
 void kmsg_dump(enum kmsg_dump_reason reason);
 
-bool kmsg_dump_get_line_nolock(struct kmsg_dump_iter *iter, bool syslog,
-			       char *line, size_t size, size_t *len);
-
 bool kmsg_dump_get_line(struct kmsg_dump_iter *iter, bool syslog,
 			char *line, size_t size, size_t *len);
 
 bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog,
 			  char *buf, size_t size, size_t *len_out);
 
-void kmsg_dump_rewind_nolock(struct kmsg_dump_iter *iter);
-
 void kmsg_dump_rewind(struct kmsg_dump_iter *iter);
 
 int kmsg_dump_register(struct kmsg_dumper *dumper);
@@ -80,13 +75,6 @@ static inline void kmsg_dump(enum kmsg_dump_reason reason)
 {
 }
 
-static inline bool kmsg_dump_get_line_nolock(struct kmsg_dump_iter *iter,
-					     bool syslog, const char *line,
-					     size_t size, size_t *len)
-{
-	return false;
-}
-
 static inline bool kmsg_dump_get_line(struct kmsg_dump_iter *iter, bool syslog,
 				const char *line, size_t size, size_t *len)
 {
@@ -99,10 +87,6 @@ static inline bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog
 	return false;
 }
 
-static inline void kmsg_dump_rewind_nolock(struct kmsg_dump_iter *iter)
-{
-}
-
 static inline void kmsg_dump_rewind(struct kmsg_dump_iter *iter)
 {
 }
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 8544d7a55a57..67d9f2403b52 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -2126,8 +2126,8 @@ static int kdb_dmesg(int argc, const char **argv)
 		kdb_set(2, setargs);
 	}
 
-	kmsg_dump_rewind_nolock(&iter);
-	while (kmsg_dump_get_line_nolock(&iter, 1, NULL, 0, NULL))
+	kmsg_dump_rewind(&iter);
+	while (kmsg_dump_get_line(&iter, 1, NULL, 0, NULL))
 		n++;
 
 	if (lines < 0) {
@@ -2159,8 +2159,8 @@ static int kdb_dmesg(int argc, const char **argv)
 	if (skip >= n || skip < 0)
 		return 0;
 
-	kmsg_dump_rewind_nolock(&iter);
-	while (kmsg_dump_get_line_nolock(&iter, 1, buf, sizeof(buf), &len)) {
+	kmsg_dump_rewind(&iter);
+	while (kmsg_dump_get_line(&iter, 1, buf, sizeof(buf), &len)) {
 		if (skip) {
 			skip--;
 			continue;
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 8994bc192b88..602de86d4e76 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3373,7 +3373,7 @@ void kmsg_dump(enum kmsg_dump_reason reason)
 }
 
 /**
- * kmsg_dump_get_line_nolock - retrieve one kmsg log line (unlocked version)
+ * kmsg_dump_get_line - retrieve one kmsg log line
  * @iter: kmsg dump iterator
  * @syslog: include the "<4>" prefixes
  * @line: buffer to copy the line to
@@ -3388,22 +3388,22 @@ void kmsg_dump(enum kmsg_dump_reason reason)
  *
  * A return value of FALSE indicates that there are no more records to
  * read.
- *
- * The function is similar to kmsg_dump_get_line(), but grabs no locks.
  */
-bool kmsg_dump_get_line_nolock(struct kmsg_dump_iter *iter, bool syslog,
-			       char *line, size_t size, size_t *len)
+bool kmsg_dump_get_line(struct kmsg_dump_iter *iter, bool syslog,
+			char *line, size_t size, size_t *len)
 {
 	u64 min_seq = latched_seq_read_nolock(&clear_seq);
 	struct printk_info info;
 	unsigned int line_count;
 	struct printk_record r;
+	unsigned long flags;
 	size_t l = 0;
 	bool ret = false;
 
 	if (iter->cur_seq < min_seq)
 		iter->cur_seq = min_seq;
 
+	printk_safe_enter_irqsave(flags);
 	prb_rec_init_rd(&r, &info, line, size);
 
 	/* Read text or count text lines? */
@@ -3424,40 +3424,11 @@ bool kmsg_dump_get_line_nolock(struct kmsg_dump_iter *iter, bool syslog,
 	iter->cur_seq = r.info->seq + 1;
 	ret = true;
 out:
+	printk_safe_exit_irqrestore(flags);
 	if (len)
 		*len = l;
 	return ret;
 }
-
-/**
- * kmsg_dump_get_line - retrieve one kmsg log line
- * @iter: kmsg dump iterator
- * @syslog: include the "<4>" prefixes
- * @line: buffer to copy the line to
- * @size: maximum size of the buffer
- * @len: length of line placed into buffer
- *
- * Start at the beginning of the kmsg buffer, with the oldest kmsg
- * record, and copy one record into the provided buffer.
- *
- * Consecutive calls will return the next available record moving
- * towards the end of the buffer with the youngest messages.
- *
- * A return value of FALSE indicates that there are no more records to
- * read.
- */
-bool kmsg_dump_get_line(struct kmsg_dump_iter *iter, bool syslog,
-			char *line, size_t size, size_t *len)
-{
-	unsigned long flags;
-	bool ret;
-
-	printk_safe_enter_irqsave(flags);
-	ret = kmsg_dump_get_line_nolock(iter, syslog, line, size, len);
-	printk_safe_exit_irqrestore(flags);
-
-	return ret;
-}
 EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
 
 /**
@@ -3550,22 +3521,6 @@ out:
 }
 EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
 
-/**
- * kmsg_dump_rewind_nolock - reset the iterator (unlocked version)
- * @iter: kmsg dump iterator
- *
- * Reset the dumper's iterator so that kmsg_dump_get_line() and
- * kmsg_dump_get_buffer() can be called again and used multiple
- * times within the same dumper.dump() callback.
- *
- * The function is similar to kmsg_dump_rewind(), but grabs no locks.
- */
-void kmsg_dump_rewind_nolock(struct kmsg_dump_iter *iter)
-{
-	iter->cur_seq = latched_seq_read_nolock(&clear_seq);
-	iter->next_seq = prb_next_seq(prb);
-}
-
 /**
  * kmsg_dump_rewind - reset the iterator
  * @iter: kmsg dump iterator
@@ -3579,7 +3534,8 @@ void kmsg_dump_rewind(struct kmsg_dump_iter *iter)
 	unsigned long flags;
 
 	printk_safe_enter_irqsave(flags);
-	kmsg_dump_rewind_nolock(iter);
+	iter->cur_seq = latched_seq_read_nolock(&clear_seq);
+	iter->next_seq = prb_next_seq(prb);
 	printk_safe_exit_irqrestore(flags);
 }
 EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
-- 
cgit v1.2.3


From 62d5247d239d4b48762192a251c647d7c997616a Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 25 Feb 2021 18:33:18 +0200
Subject: gpiolib: acpi: Add ACPI_GPIO_QUIRK_ABSOLUTE_NUMBER quirk

On some systems the ACPI tables have a wrong pin number: instead of a
relative one, they provide an absolute one in the global GPIO number
space.

Add ACPI_GPIO_QUIRK_ABSOLUTE_NUMBER quirk to cope with such cases.

Fixes: ba8c90c61847 ("gpio: pca953x: Override IRQ for one of the expanders on Galileo Gen 2")
Depends-on: 0ea683931adb ("gpio: dwapb: Convert driver to using the GPIO-lib-based IRQ-chip")
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpiolib-acpi.c   | 7 ++++++-
 include/linux/gpio/consumer.h | 2 ++
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpiolib-acpi.c b/drivers/gpio/gpiolib-acpi.c
index 86efa2d9bf7f..0fa0127d50ec 100644
--- a/drivers/gpio/gpiolib-acpi.c
+++ b/drivers/gpio/gpiolib-acpi.c
@@ -677,6 +677,7 @@ static int acpi_populate_gpio_lookup(struct acpi_resource *ares, void *data)
 	if (!lookup->desc) {
 		const struct acpi_resource_gpio *agpio = &ares->data.gpio;
 		bool gpioint = agpio->connection_type == ACPI_RESOURCE_GPIO_TYPE_INT;
+		struct gpio_desc *desc;
 		u16 pin_index;
 
 		if (lookup->info.quirks & ACPI_GPIO_QUIRK_ONLY_GPIOIO && gpioint)
@@ -689,8 +690,12 @@ static int acpi_populate_gpio_lookup(struct acpi_resource *ares, void *data)
 		if (pin_index >= agpio->pin_table_length)
 			return 1;
 
-		lookup->desc = acpi_get_gpiod(agpio->resource_source.string_ptr,
+		if (lookup->info.quirks & ACPI_GPIO_QUIRK_ABSOLUTE_NUMBER)
+			desc = gpio_to_desc(agpio->pin_table[pin_index]);
+		else
+			desc = acpi_get_gpiod(agpio->resource_source.string_ptr,
 					      agpio->pin_table[pin_index]);
+		lookup->desc = desc;
 		lookup->info.pin_config = agpio->pin_config;
 		lookup->info.debounce = agpio->debounce_timeout;
 		lookup->info.gpioint = gpioint;
diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h
index ef49307611d2..c73b25bc9213 100644
--- a/include/linux/gpio/consumer.h
+++ b/include/linux/gpio/consumer.h
@@ -674,6 +674,8 @@ struct acpi_gpio_mapping {
  * get GpioIo type explicitly, this quirk may be used.
  */
 #define ACPI_GPIO_QUIRK_ONLY_GPIOIO		BIT(1)
+/* Use given pin as an absolute GPIO number in the system */
+#define ACPI_GPIO_QUIRK_ABSOLUTE_NUMBER		BIT(2)
 
 	unsigned int quirks;
 };
-- 
cgit v1.2.3


From 809390219fb9c2421239afe5c9eb862d73978ba0 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 25 Feb 2021 18:33:19 +0200
Subject: gpiolib: acpi: Allow to find GpioInt() resource by name and index

Currently only search by index is supported. However, in some cases
we might need to pass the quirks to the acpi_dev_gpio_irq_get().

For this, split out acpi_dev_gpio_irq_get_by() and turn
acpi_dev_gpio_irq_get() into a wrapper that calls it with NULL for the
name parameter.

Fixes: ba8c90c61847 ("gpio: pca953x: Override IRQ for one of the expanders on Galileo Gen 2")
Depends-on: 0ea683931adb ("gpio: dwapb: Convert driver to using the GPIO-lib-based IRQ-chip")
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpiolib-acpi.c | 12 ++++++++----
 include/linux/acpi.h        | 10 ++++++++--
 2 files changed, 16 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpiolib-acpi.c b/drivers/gpio/gpiolib-acpi.c
index 0fa0127d50ec..1aacd2a5a1fd 100644
--- a/drivers/gpio/gpiolib-acpi.c
+++ b/drivers/gpio/gpiolib-acpi.c
@@ -945,8 +945,9 @@ struct gpio_desc *acpi_node_get_gpiod(struct fwnode_handle *fwnode,
 }
 
 /**
- * acpi_dev_gpio_irq_get() - Find GpioInt and translate it to Linux IRQ number
+ * acpi_dev_gpio_irq_get_by() - Find GpioInt and translate it to Linux IRQ number
  * @adev: pointer to a ACPI device to get IRQ from
+ * @name: optional name of GpioInt resource
  * @index: index of GpioInt resource (starting from %0)
  *
  * If the device has one or more GpioInt resources, this function can be
@@ -956,9 +957,12 @@ struct gpio_desc *acpi_node_get_gpiod(struct fwnode_handle *fwnode,
  * The function is idempotent, though each time it runs it will configure GPIO
  * pin direction according to the flags in GpioInt resource.
  *
+ * The function takes optional @name parameter. If the resource has a property
+ * name, then only those will be taken into account.
+ *
  * Return: Linux IRQ number (> %0) on success, negative errno on failure.
  */
-int acpi_dev_gpio_irq_get(struct acpi_device *adev, int index)
+int acpi_dev_gpio_irq_get_by(struct acpi_device *adev, const char *name, int index)
 {
 	int idx, i;
 	unsigned int irq_flags;
@@ -968,7 +972,7 @@ int acpi_dev_gpio_irq_get(struct acpi_device *adev, int index)
 		struct acpi_gpio_info info;
 		struct gpio_desc *desc;
 
-		desc = acpi_get_gpiod_by_index(adev, NULL, i, &info);
+		desc = acpi_get_gpiod_by_index(adev, name, i, &info);
 
 		/* Ignore -EPROBE_DEFER, it only matters if idx matches */
 		if (IS_ERR(desc) && PTR_ERR(desc) != -EPROBE_DEFER)
@@ -1013,7 +1017,7 @@ int acpi_dev_gpio_irq_get(struct acpi_device *adev, int index)
 	}
 	return -ENOENT;
 }
-EXPORT_SYMBOL_GPL(acpi_dev_gpio_irq_get);
+EXPORT_SYMBOL_GPL(acpi_dev_gpio_irq_get_by);
 
 static acpi_status
 acpi_gpio_adr_space_handler(u32 function, acpi_physical_address address,
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 9f432411e988..fcdaab723916 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1079,19 +1079,25 @@ void __acpi_handle_debug(struct _ddebug *descriptor, acpi_handle handle, const c
 #if defined(CONFIG_ACPI) && defined(CONFIG_GPIOLIB)
 bool acpi_gpio_get_irq_resource(struct acpi_resource *ares,
 				struct acpi_resource_gpio **agpio);
-int acpi_dev_gpio_irq_get(struct acpi_device *adev, int index);
+int acpi_dev_gpio_irq_get_by(struct acpi_device *adev, const char *name, int index);
 #else
 static inline bool acpi_gpio_get_irq_resource(struct acpi_resource *ares,
 					      struct acpi_resource_gpio **agpio)
 {
 	return false;
 }
-static inline int acpi_dev_gpio_irq_get(struct acpi_device *adev, int index)
+static inline int acpi_dev_gpio_irq_get_by(struct acpi_device *adev,
+					   const char *name, int index)
 {
 	return -ENXIO;
 }
 #endif
 
+static inline int acpi_dev_gpio_irq_get(struct acpi_device *adev, int index)
+{
+	return acpi_dev_gpio_irq_get_by(adev, NULL, index);
+}
+
 /* Device properties */
 
 #ifdef CONFIG_ACPI
-- 
cgit v1.2.3


From 6e3e2c4362e41a2f18e3f7a5ad81bd2f49a47b85 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 1 Mar 2021 20:37:10 -0500
Subject: new helper: inode_wrong_type()

inode_wrong_type(inode, mode) returns true if setting inode->i_mode
to given value would've changed the inode type.  We have enough of
those checks open-coded to make a helper worthwhile.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/9p/vfs_inode.c      | 4 ++--
 fs/9p/vfs_inode_dotl.c | 4 ++--
 fs/cifs/inode.c        | 5 ++---
 fs/fuse/dir.c          | 6 +++---
 fs/fuse/inode.c        | 2 +-
 fs/fuse/readdir.c      | 2 +-
 fs/nfs/inode.c         | 6 +++---
 fs/nfsd/nfsproc.c      | 2 +-
 fs/overlayfs/namei.c   | 4 ++--
 include/linux/fs.h     | 5 +++++
 10 files changed, 22 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 8d97f0b45e9c..795706520b5e 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -399,7 +399,7 @@ static int v9fs_test_inode(struct inode *inode, void *data)
 
 	umode = p9mode2unixmode(v9ses, st, &rdev);
 	/* don't match inode of different type */
-	if ((inode->i_mode & S_IFMT) != (umode & S_IFMT))
+	if (inode_wrong_type(inode, umode))
 		return 0;
 
 	/* compare qid details */
@@ -1390,7 +1390,7 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
 	 * Don't update inode if the file type is different
 	 */
 	umode = p9mode2unixmode(v9ses, st, &rdev);
-	if ((inode->i_mode & S_IFMT) != (umode & S_IFMT))
+	if (inode_wrong_type(inode, umode))
 		goto out;
 
 	/*
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 1dc7af046615..df0b87b05c42 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -59,7 +59,7 @@ static int v9fs_test_inode_dotl(struct inode *inode, void *data)
 	struct p9_stat_dotl *st = (struct p9_stat_dotl *)data;
 
 	/* don't match inode of different type */
-	if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT))
+	if (inode_wrong_type(inode, st->st_mode))
 		return 0;
 
 	if (inode->i_generation != st->st_gen)
@@ -959,7 +959,7 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
 	/*
 	 * Don't update inode if the file type is different
 	 */
-	if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT))
+	if (inode_wrong_type(inode, st->st_mode))
 		goto out;
 
 	/*
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 7c61bc9573c0..d46b36d52211 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -426,8 +426,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 		}
 
 		/* if filetype is different, return error */
-		if (unlikely(((*pinode)->i_mode & S_IFMT) !=
-		    (fattr.cf_mode & S_IFMT))) {
+		if (unlikely(inode_wrong_type(*pinode, fattr.cf_mode))) {
 			CIFS_I(*pinode)->time = 0; /* force reval */
 			rc = -ESTALE;
 			goto cgiiu_exit;
@@ -1249,7 +1248,7 @@ cifs_find_inode(struct inode *inode, void *opaque)
 		return 0;
 
 	/* don't match inode of different type */
-	if ((inode->i_mode & S_IFMT) != (fattr->cf_mode & S_IFMT))
+	if (inode_wrong_type(inode, fattr->cf_mode))
 		return 0;
 
 	/* if it's not a directory or has no dentries, then flag it */
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 06a18700a845..2400b98e8808 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -252,7 +252,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
 		if (ret == -ENOMEM)
 			goto out;
 		if (ret || fuse_invalid_attr(&outarg.attr) ||
-		    (outarg.attr.mode ^ inode->i_mode) & S_IFMT)
+		    inode_wrong_type(inode, outarg.attr.mode))
 			goto invalid;
 
 		forget_all_cached_acls(inode);
@@ -1054,7 +1054,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
 	err = fuse_simple_request(fm, &args);
 	if (!err) {
 		if (fuse_invalid_attr(&outarg.attr) ||
-		    (inode->i_mode ^ outarg.attr.mode) & S_IFMT) {
+		    inode_wrong_type(inode, outarg.attr.mode)) {
 			fuse_make_bad(inode);
 			err = -EIO;
 		} else {
@@ -1703,7 +1703,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
 	}
 
 	if (fuse_invalid_attr(&outarg.attr) ||
-	    (inode->i_mode ^ outarg.attr.mode) & S_IFMT) {
+	    inode_wrong_type(inode, outarg.attr.mode)) {
 		fuse_make_bad(inode);
 		err = -EIO;
 		goto error;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index b0e18b470e91..b4b956da3851 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -350,7 +350,7 @@ retry:
 		inode->i_generation = generation;
 		fuse_init_inode(inode, attr);
 		unlock_new_inode(inode);
-	} else if ((inode->i_mode ^ attr->mode) & S_IFMT) {
+	} else if (inode_wrong_type(inode, attr->mode)) {
 		/* Inode has changed type, any I/O on the old should fail */
 		fuse_make_bad(inode);
 		iput(inode);
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index 3441ffa740f3..277f7041d55a 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -202,7 +202,7 @@ retry:
 		inode = d_inode(dentry);
 		if (!inode ||
 		    get_node_id(inode) != o->nodeid ||
-		    ((o->attr.mode ^ inode->i_mode) & S_IFMT)) {
+		    inode_wrong_type(inode, o->attr.mode)) {
 			d_invalidate(dentry);
 			dput(dentry);
 			goto retry;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 749bbea14d99..b0da2408816d 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -334,7 +334,7 @@ nfs_find_actor(struct inode *inode, void *opaque)
 
 	if (NFS_FILEID(inode) != fattr->fileid)
 		return 0;
-	if ((S_IFMT & inode->i_mode) != (S_IFMT & fattr->mode))
+	if (inode_wrong_type(inode, fattr->mode))
 		return 0;
 	if (nfs_compare_fh(NFS_FH(inode), fh))
 		return 0;
@@ -1460,7 +1460,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
 			return 0;
 		return -ESTALE;
 	}
-	if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
+	if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && inode_wrong_type(inode, fattr->mode))
 		return -ESTALE;
 
 
@@ -1875,7 +1875,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 	/*
 	 * Make sure the inode's type hasn't changed.
 	 */
-	if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) {
+	if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && inode_wrong_type(inode, fattr->mode)) {
 		/*
 		* Big trouble! The inode has become a different object.
 		*/
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index a8d5449dd0e9..6d51687a0585 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -381,7 +381,7 @@ nfsd_proc_create(struct svc_rqst *rqstp)
 
 		/* Make sure the type and device matches */
 		resp->status = nfserr_exist;
-		if (inode && type != (inode->i_mode & S_IFMT))
+		if (inode && inode_wrong_type(inode, type))
 			goto out_unlock;
 	}
 
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index 3fe05fb5d145..1d573972ce22 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -371,7 +371,7 @@ int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected,
 		return PTR_ERR(origin);
 
 	if (upperdentry && !ovl_is_whiteout(upperdentry) &&
-	    ((d_inode(origin)->i_mode ^ d_inode(upperdentry)->i_mode) & S_IFMT))
+	    inode_wrong_type(d_inode(upperdentry), d_inode(origin)->i_mode))
 		goto invalid;
 
 	if (!*stackp)
@@ -730,7 +730,7 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper,
 		index = ERR_PTR(-ESTALE);
 		goto out;
 	} else if (ovl_dentry_weird(index) || ovl_is_whiteout(index) ||
-		   ((inode->i_mode ^ d_inode(origin)->i_mode) & S_IFMT)) {
+		   inode_wrong_type(inode, d_inode(origin)->i_mode)) {
 		/*
 		 * Index should always be of the same file type as origin
 		 * except for the case of a whiteout index. A whiteout
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ec8f3ddf4a6a..9e0d76a41229 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2884,6 +2884,11 @@ static inline bool execute_ok(struct inode *inode)
 	return (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode);
 }
 
+static inline bool inode_wrong_type(const struct inode *inode, umode_t mode)
+{
+	return (inode->i_mode ^ mode) & S_IFMT;
+}
+
 static inline void file_start_write(struct file *file)
 {
 	if (!S_ISREG(file_inode(file)->i_mode))
-- 
cgit v1.2.3


From d0f1088b31db2d03497a74ca67755df5515f8ff4 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 8 Mar 2020 09:16:37 -0400
Subject: coredump: don't bother with do_truncate()

Have dump_skip() just remember how much needs to be skipped, and
leave the actual seeking/writing of zeroes to the next dump_emit()
or the end of coredump output, whichever comes first.
Instead of playing with do_truncate() at the end, just write one
NUL at the end of the last gap (if any).

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/powerpc/platforms/cell/spufs/coredump.c |  3 +-
 fs/binfmt_elf.c                              |  4 +-
 fs/binfmt_elf_fdpic.c                        |  3 +-
 fs/coredump.c                                | 72 +++++++++++++++++-----------
 include/linux/binfmts.h                      |  1 +
 include/linux/coredump.h                     |  4 +-
 6 files changed, 49 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/platforms/cell/spufs/coredump.c b/arch/powerpc/platforms/cell/spufs/coredump.c
index 60b5583e9eaf..1a587618015c 100644
--- a/arch/powerpc/platforms/cell/spufs/coredump.c
+++ b/arch/powerpc/platforms/cell/spufs/coredump.c
@@ -149,8 +149,7 @@ static int spufs_arch_write_note(struct spu_context *ctx, int i,
 			return -EIO;
 	}
 
-	if (!dump_skip(cprm, roundup(cprm->pos - ret + sz, 4) - cprm->pos))
-		return -EIO;
+	dump_skip_to(cprm, roundup(cprm->pos - ret + sz, 4));
 	return 0;
 }
 
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index b12ba98ae9f5..187b3f2b9202 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -2267,8 +2267,7 @@ static int elf_core_dump(struct coredump_params *cprm)
 		goto end_coredump;
 
 	/* Align to page */
-	if (!dump_skip(cprm, dataoff - cprm->pos))
-		goto end_coredump;
+	dump_skip_to(cprm, dataoff);
 
 	for (i = 0; i < vma_count; i++) {
 		struct core_vma_metadata *meta = vma_meta + i;
@@ -2276,7 +2275,6 @@ static int elf_core_dump(struct coredump_params *cprm)
 		if (!dump_user_range(cprm, meta->start, meta->dump_size))
 			goto end_coredump;
 	}
-	dump_truncate(cprm);
 
 	if (!elf_core_write_extra_data(cprm))
 		goto end_coredump;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 3cfd6cd46f26..2c99b102c860 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1631,8 +1631,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
 				goto end_coredump;
 	}
 
-	if (!dump_skip(cprm, dataoff - cprm->pos))
-		goto end_coredump;
+	dump_skip_to(cprm, dataoff);
 
 	if (!elf_fdpic_dump_segments(cprm, vma_meta, vma_count))
 		goto end_coredump;
diff --git a/fs/coredump.c b/fs/coredump.c
index 1c0fdc1aa70b..2868e3e171ae 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -809,6 +809,16 @@ void do_coredump(const kernel_siginfo_t *siginfo)
 		}
 		file_start_write(cprm.file);
 		core_dumped = binfmt->core_dump(&cprm);
+		/*
+		 * Ensures that file size is big enough to contain the current
+		 * file postion. This prevents gdb from complaining about
+		 * a truncated file if the last "write" to the file was
+		 * dump_skip.
+		 */
+		if (cprm.to_skip) {
+			cprm.to_skip--;
+			dump_emit(&cprm, "", 1);
+		}
 		file_end_write(cprm.file);
 	}
 	if (ispipe && core_pipe_limit)
@@ -835,7 +845,7 @@ fail:
  * do on a core-file: use only these functions to write out all the
  * necessary info.
  */
-int dump_emit(struct coredump_params *cprm, const void *addr, int nr)
+static int __dump_emit(struct coredump_params *cprm, const void *addr, int nr)
 {
 	struct file *file = cprm->file;
 	loff_t pos = file->f_pos;
@@ -855,9 +865,8 @@ int dump_emit(struct coredump_params *cprm, const void *addr, int nr)
 
 	return 1;
 }
-EXPORT_SYMBOL(dump_emit);
 
-int dump_skip(struct coredump_params *cprm, size_t nr)
+static int __dump_skip(struct coredump_params *cprm, size_t nr)
 {
 	static char zeroes[PAGE_SIZE];
 	struct file *file = cprm->file;
@@ -869,13 +878,35 @@ int dump_skip(struct coredump_params *cprm, size_t nr)
 		return 1;
 	} else {
 		while (nr > PAGE_SIZE) {
-			if (!dump_emit(cprm, zeroes, PAGE_SIZE))
+			if (!__dump_emit(cprm, zeroes, PAGE_SIZE))
 				return 0;
 			nr -= PAGE_SIZE;
 		}
-		return dump_emit(cprm, zeroes, nr);
+		return __dump_emit(cprm, zeroes, nr);
 	}
 }
+
+int dump_emit(struct coredump_params *cprm, const void *addr, int nr)
+{
+	if (cprm->to_skip) {
+		if (!__dump_skip(cprm, cprm->to_skip))
+			return 0;
+		cprm->to_skip = 0;
+	}
+	return __dump_emit(cprm, addr, nr);
+}
+EXPORT_SYMBOL(dump_emit);
+
+void dump_skip_to(struct coredump_params *cprm, unsigned long pos)
+{
+	cprm->to_skip = pos - cprm->pos;
+}
+EXPORT_SYMBOL(dump_skip_to);
+
+void dump_skip(struct coredump_params *cprm, size_t nr)
+{
+	cprm->to_skip += nr;
+}
 EXPORT_SYMBOL(dump_skip);
 
 #ifdef CONFIG_ELF_CORE
@@ -902,11 +933,11 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start,
 			stop = !dump_emit(cprm, kaddr, PAGE_SIZE);
 			kunmap_local(kaddr);
 			put_page(page);
+			if (stop)
+				return 0;
 		} else {
-			stop = !dump_skip(cprm, PAGE_SIZE);
+			dump_skip(cprm, PAGE_SIZE);
 		}
-		if (stop)
-			return 0;
 	}
 	return 1;
 }
@@ -914,32 +945,15 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start,
 
 int dump_align(struct coredump_params *cprm, int align)
 {
-	unsigned mod = cprm->pos & (align - 1);
+	unsigned mod = (cprm->pos + cprm->to_skip) & (align - 1);
 	if (align & (align - 1))
 		return 0;
-	return mod ? dump_skip(cprm, align - mod) : 1;
+	if (mod)
+		cprm->to_skip += align - mod;
+	return 1;
 }
 EXPORT_SYMBOL(dump_align);
 
-/*
- * Ensures that file size is big enough to contain the current file
- * postion. This prevents gdb from complaining about a truncated file
- * if the last "write" to the file was dump_skip.
- */
-void dump_truncate(struct coredump_params *cprm)
-{
-	struct file *file = cprm->file;
-	loff_t offset;
-
-	if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
-		offset = file->f_op->llseek(file, 0, SEEK_CUR);
-		if (i_size_read(file->f_mapping->host) < offset)
-			do_truncate(file_mnt_user_ns(file), file->f_path.dentry,
-				    offset, 0, file);
-	}
-}
-EXPORT_SYMBOL(dump_truncate);
-
 /*
  * The purpose of always_dump_vma() is to make sure that special kernel mappings
  * that are useful for post-mortem analysis are included in every core dump.
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 0abd93efc181..049cf9421d83 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -86,6 +86,7 @@ struct coredump_params {
 	unsigned long mm_flags;
 	loff_t written;
 	loff_t pos;
+	loff_t to_skip;
 };
 
 /*
diff --git a/include/linux/coredump.h b/include/linux/coredump.h
index e58e8c207782..247aae4e69c8 100644
--- a/include/linux/coredump.h
+++ b/include/linux/coredump.h
@@ -18,10 +18,10 @@ struct core_vma_metadata {
  * functions to write out all the necessary info.
  */
 struct coredump_params;
-extern int dump_skip(struct coredump_params *cprm, size_t nr);
+extern void dump_skip_to(struct coredump_params *cprm, unsigned long to);
+extern void dump_skip(struct coredump_params *cprm, size_t nr);
 extern int dump_emit(struct coredump_params *cprm, const void *addr, int nr);
 extern int dump_align(struct coredump_params *cprm, int align);
-extern void dump_truncate(struct coredump_params *cprm);
 int dump_user_range(struct coredump_params *cprm, unsigned long start,
 		    unsigned long len);
 int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count,
-- 
cgit v1.2.3


From a64b89088bb1413bb84424f0b16a4d1f9bb0e947 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 3 Jan 2021 20:51:48 -0500
Subject: coredump.h: move CONFIG_COREDUMP-only stuff inside the ifdef

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/coredump.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/coredump.h b/include/linux/coredump.h
index 247aae4e69c8..78fcd776b185 100644
--- a/include/linux/coredump.h
+++ b/include/linux/coredump.h
@@ -7,12 +7,17 @@
 #include <linux/fs.h>
 #include <asm/siginfo.h>
 
+#ifdef CONFIG_COREDUMP
 struct core_vma_metadata {
 	unsigned long start, end;
 	unsigned long flags;
 	unsigned long dump_size;
 };
 
+extern int core_uses_pid;
+extern char core_pattern[];
+extern unsigned int core_pipe_limit;
+
 /*
  * These are the only things you should do on a core-file: use only these
  * functions to write out all the necessary info.
@@ -27,14 +32,9 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start,
 int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count,
 		      struct core_vma_metadata **vma_meta,
 		      size_t *vma_data_size_ptr);
-#ifdef CONFIG_COREDUMP
 extern void do_coredump(const kernel_siginfo_t *siginfo);
 #else
 static inline void do_coredump(const kernel_siginfo_t *siginfo) {}
 #endif
 
-extern int core_uses_pid;
-extern char core_pattern[];
-extern unsigned int core_pipe_limit;
-
 #endif /* _LINUX_COREDUMP_H */
-- 
cgit v1.2.3


From 4c324548f09fec413b4ee589174dabacfe17d953 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Fri, 5 Mar 2021 19:41:44 +0100
Subject: ACPI: utils: Introduce acpi_evaluation_failure_warn()

Quite a few users of ACPI objects want to log a warning message if
the evaluation fails, which is a repeating pattern, so introduce a
helper function for that purpose and convert some code where it is
open-coded to use it.

No intentional functional impact.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/pci_link.c             |  6 ++----
 drivers/acpi/processor_perflib.c    | 10 +++-------
 drivers/acpi/processor_throttling.c | 16 ++++------------
 drivers/acpi/utils.c                | 14 ++++++++++++++
 include/linux/acpi.h                |  5 +++++
 5 files changed, 28 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/pci_link.c b/drivers/acpi/pci_link.c
index b9b80e26cb5c..cb7b900d9466 100644
--- a/drivers/acpi/pci_link.c
+++ b/drivers/acpi/pci_link.c
@@ -256,8 +256,7 @@ static int acpi_pci_link_get_current(struct acpi_pci_link *link)
 	status = acpi_walk_resources(handle, METHOD_NAME__CRS,
 				     acpi_pci_link_check_current, &irq);
 	if (ACPI_FAILURE(status)) {
-		acpi_handle_warn(handle, "_CRS evaluation failed: %s\n",
-				 acpi_format_exception(status));
+		acpi_evaluation_failure_warn(handle, "_CRS", status);
 		result = -ENODEV;
 		goto end;
 	}
@@ -345,8 +344,7 @@ static int acpi_pci_link_set(struct acpi_pci_link *link, int irq)
 
 	/* check for total failure */
 	if (ACPI_FAILURE(status)) {
-		acpi_handle_warn(handle, "_SRS evaluation failed: %s",
-				 acpi_format_exception(status));
+		acpi_evaluation_failure_warn(handle, "_SRS", status);
 		result = -ENODEV;
 		goto end;
 	}
diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c
index 686b23fb968c..1b6aa635bff6 100644
--- a/drivers/acpi/processor_perflib.c
+++ b/drivers/acpi/processor_perflib.c
@@ -67,9 +67,7 @@ static int acpi_processor_get_platform_limit(struct acpi_processor *pr)
 		acpi_processor_ppc_in_use = true;
 
 		if (ACPI_FAILURE(status)) {
-			acpi_handle_warn(pr->handle,
-					 "_PPC evaluation failed: %s\n",
-					 acpi_format_exception(status));
+			acpi_evaluation_failure_warn(pr->handle, "_PPC", status);
 			return -ENODEV;
 		}
 	}
@@ -199,8 +197,7 @@ static int acpi_processor_get_performance_control(struct acpi_processor *pr)
 
 	status = acpi_evaluate_object(pr->handle, "_PCT", NULL, &buffer);
 	if (ACPI_FAILURE(status)) {
-		acpi_handle_warn(pr->handle, "_PCT evaluation failed: %s\n",
-				 acpi_format_exception(status));
+		acpi_evaluation_failure_warn(pr->handle, "_PCT", status);
 		return -ENODEV;
 	}
 
@@ -300,8 +297,7 @@ static int acpi_processor_get_performance_states(struct acpi_processor *pr)
 
 	status = acpi_evaluate_object(pr->handle, "_PSS", NULL, &buffer);
 	if (ACPI_FAILURE(status)) {
-		acpi_handle_warn(pr->handle, "_PSS evaluation failed: %s\n",
-				 acpi_format_exception(status));
+		acpi_evaluation_failure_warn(pr->handle, "_PSS", status);
 		return -ENODEV;
 	}
 
diff --git a/drivers/acpi/processor_throttling.c b/drivers/acpi/processor_throttling.c
index fb6834cb0882..e61b8f038364 100644
--- a/drivers/acpi/processor_throttling.c
+++ b/drivers/acpi/processor_throttling.c
@@ -281,9 +281,7 @@ static int acpi_processor_get_platform_limit(struct acpi_processor *pr)
 	status = acpi_evaluate_integer(pr->handle, "_TPC", NULL, &tpc);
 	if (ACPI_FAILURE(status)) {
 		if (status != AE_NOT_FOUND)
-			acpi_handle_warn(pr->handle,
-					 "_TPC evaluation failed: %s\n",
-					 acpi_format_exception(status));
+			acpi_evaluation_failure_warn(pr->handle, "_TPC", status);
 
 		return -ENODEV;
 	}
@@ -416,9 +414,7 @@ static int acpi_processor_get_throttling_control(struct acpi_processor *pr)
 	status = acpi_evaluate_object(pr->handle, "_PTC", NULL, &buffer);
 	if (ACPI_FAILURE(status)) {
 		if (status != AE_NOT_FOUND)
-			acpi_handle_warn(pr->handle,
-					 "_PTC evaluation failed: %s\n",
-					 acpi_format_exception(status));
+			acpi_evaluation_failure_warn(pr->handle, "_PTC", status);
 
 		return -ENODEV;
 	}
@@ -503,9 +499,7 @@ static int acpi_processor_get_throttling_states(struct acpi_processor *pr)
 	status = acpi_evaluate_object(pr->handle, "_TSS", NULL, &buffer);
 	if (ACPI_FAILURE(status)) {
 		if (status != AE_NOT_FOUND)
-			acpi_handle_warn(pr->handle,
-					 "_TSS evaluation failed: %s\n",
-					 acpi_format_exception(status));
+			acpi_evaluation_failure_warn(pr->handle, "_TSS", status);
 
 		return -ENODEV;
 	}
@@ -586,9 +580,7 @@ static int acpi_processor_get_tsd(struct acpi_processor *pr)
 	status = acpi_evaluate_object(pr->handle, "_TSD", NULL, &buffer);
 	if (ACPI_FAILURE(status)) {
 		if (status != AE_NOT_FOUND)
-			acpi_handle_warn(pr->handle,
-					 "_TSD evaluation failed: %s\n",
-					 acpi_format_exception(status));
+			acpi_evaluation_failure_warn(pr->handle, "_TSD", status);
 
 		return -ENODEV;
 	}
diff --git a/drivers/acpi/utils.c b/drivers/acpi/utils.c
index 682edd913b3b..f1aff4dab476 100644
--- a/drivers/acpi/utils.c
+++ b/drivers/acpi/utils.c
@@ -511,6 +511,20 @@ __acpi_handle_debug(struct _ddebug *descriptor, acpi_handle handle,
 EXPORT_SYMBOL(__acpi_handle_debug);
 #endif
 
+/**
+ * acpi_evaluation_failure_warn - Log evaluation failure warning.
+ * @handle: Parent object handle.
+ * @name: Name of the object whose evaluation has failed.
+ * @status: Status value returned by the failing object evaluation.
+ */
+void acpi_evaluation_failure_warn(acpi_handle handle, const char *name,
+				  acpi_status status)
+{
+	acpi_handle_warn(handle, "%s evaluation failed: %s\n", name,
+			 acpi_format_exception(status));
+}
+EXPORT_SYMBOL_GPL(acpi_evaluation_failure_warn);
+
 /**
  * acpi_has_method: Check whether @handle has a method named @name
  * @handle: ACPI device handle
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 9f432411e988..35aa70defc57 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1027,9 +1027,14 @@ static inline void acpi_ec_set_gpe_wake_mask(u8 action) {}
 __printf(3, 4)
 void acpi_handle_printk(const char *level, acpi_handle handle,
 			const char *fmt, ...);
+void acpi_evaluation_failure_warn(acpi_handle handle, const char *name,
+				  acpi_status status);
 #else	/* !CONFIG_ACPI */
 static inline __printf(3, 4) void
 acpi_handle_printk(const char *level, void *handle, const char *fmt, ...) {}
+static inline void acpi_evaluation_failure_warn(acpi_handle handle,
+						const char *name,
+						acpi_status status) {}
 #endif	/* !CONFIG_ACPI */
 
 #if defined(CONFIG_ACPI) && defined(CONFIG_DYNAMIC_DEBUG)
-- 
cgit v1.2.3


From 9336a5f64b54d2913fb5daa1ac0280ff36f1c5ed Mon Sep 17 00:00:00 2001
From: Lakshmi Ramasubramanian <nramas@linux.microsoft.com>
Date: Sun, 21 Feb 2021 09:49:18 -0800
Subject: kexec: Move ELF fields to struct kimage

ELF related fields elf_headers, elf_headers_sz, and elf_load_addr are
defined in the architecture specific 'struct kimage_arch' for x86, powerpc,
and arm64.  The names of these fields differ between these architectures,
which makes it hard to have common code for setting up the device tree
for the kexec system call.

Move the ELF fields to 'struct kimage' defined in include/linux/kexec.h
so common code can use it.

Suggested-by: Rob Herring <robh@kernel.org>
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Lakshmi Ramasubramanian <nramas@linux.microsoft.com>
Reviewed-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Signed-off-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20210221174930.27324-2-nramas@linux.microsoft.com
---
 include/linux/kexec.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 8a7aa1d7e0e3..0bfab392367f 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -305,6 +305,11 @@ struct kimage {
 	/* Virtual address of IMA measurement buffer for kexec syscall */
 	void *ima_buffer;
 #endif
+
+	/* Core ELF header buffer */
+	void *elf_headers;
+	unsigned long elf_headers_sz;
+	unsigned long elf_load_addr;
 };
 
 /* kexec interface functions */
-- 
cgit v1.2.3


From b30be4dc733e5067b56def359b0823f1e54ded8c Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Sun, 21 Feb 2021 09:49:22 -0800
Subject: of: Add a common kexec FDT setup function

Both arm64 and powerpc do essentially the same FDT /chosen setup for
kexec.  The differences are either omissions that arm64 should have
or additional properties that will be ignored.  The setup code can be
combined and shared by both powerpc and arm64.

The differences relative to the arm64 version:
 - If /chosen doesn't exist, it will be created (should never happen).
 - Any old dtb and initrd reserved memory will be released.
 - The new initrd and elfcorehdr are marked reserved.
 - "linux,booted-from-kexec" is set.

The differences relative to the powerpc version:
 - "kaslr-seed" and "rng-seed" may be set.
 - "linux,elfcorehdr" is set.
 - Any existing "linux,usable-memory-range" is removed.

Combine the code for setting up the /chosen node in the FDT and updating
the memory reservation for kexec, for powerpc and arm64, in
of_kexec_alloc_and_setup_fdt() and move it to "drivers/of/kexec.c".

Signed-off-by: Rob Herring <robh@kernel.org>
Signed-off-by: Lakshmi Ramasubramanian <nramas@linux.microsoft.com>
Reviewed-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Signed-off-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20210221174930.27324-6-nramas@linux.microsoft.com
---
 drivers/of/Makefile |   6 ++
 drivers/of/kexec.c  | 265 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/of.h  |   5 +
 3 files changed, 276 insertions(+)
 create mode 100644 drivers/of/kexec.c

(limited to 'include/linux')

diff --git a/drivers/of/Makefile b/drivers/of/Makefile
index 6e1e5212f058..c13b982084a3 100644
--- a/drivers/of/Makefile
+++ b/drivers/of/Makefile
@@ -14,4 +14,10 @@ obj-$(CONFIG_OF_RESOLVE)  += resolver.o
 obj-$(CONFIG_OF_OVERLAY) += overlay.o
 obj-$(CONFIG_OF_NUMA) += of_numa.o
 
+ifdef CONFIG_KEXEC_FILE
+ifdef CONFIG_OF_FLATTREE
+obj-y	+= kexec.o
+endif
+endif
+
 obj-$(CONFIG_OF_UNITTEST) += unittest-data/
diff --git a/drivers/of/kexec.c b/drivers/of/kexec.c
new file mode 100644
index 000000000000..8fa8946cda42
--- /dev/null
+++ b/drivers/of/kexec.c
@@ -0,0 +1,265 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Arm Limited
+ *
+ * Based on arch/arm64/kernel/machine_kexec_file.c:
+ *  Copyright (C) 2018 Linaro Limited
+ *
+ * And arch/powerpc/kexec/file_load.c:
+ *  Copyright (C) 2016  IBM Corporation
+ */
+
+#include <linux/kernel.h>
+#include <linux/kexec.h>
+#include <linux/libfdt.h>
+#include <linux/of.h>
+#include <linux/of_fdt.h>
+#include <linux/random.h>
+#include <linux/types.h>
+
+/* relevant device tree properties */
+#define FDT_PROP_KEXEC_ELFHDR	"linux,elfcorehdr"
+#define FDT_PROP_MEM_RANGE	"linux,usable-memory-range"
+#define FDT_PROP_INITRD_START	"linux,initrd-start"
+#define FDT_PROP_INITRD_END	"linux,initrd-end"
+#define FDT_PROP_BOOTARGS	"bootargs"
+#define FDT_PROP_KASLR_SEED	"kaslr-seed"
+#define FDT_PROP_RNG_SEED	"rng-seed"
+#define RNG_SEED_SIZE		128
+
+/*
+ * Additional space needed for the FDT buffer so that we can add initrd,
+ * bootargs, kaslr-seed, rng-seed, usable-memory-range and elfcorehdr.
+ */
+#define FDT_EXTRA_SPACE 0x1000
+
+/**
+ * fdt_find_and_del_mem_rsv - delete memory reservation with given address and size
+ *
+ * @fdt:	Flattened device tree for the current kernel.
+ * @start:	Starting address of the reserved memory.
+ * @size:	Size of the reserved memory.
+ *
+ * Return: 0 on success, or negative errno on error.
+ */
+static int fdt_find_and_del_mem_rsv(void *fdt, unsigned long start, unsigned long size)
+{
+	int i, ret, num_rsvs = fdt_num_mem_rsv(fdt);
+
+	for (i = 0; i < num_rsvs; i++) {
+		u64 rsv_start, rsv_size;
+
+		ret = fdt_get_mem_rsv(fdt, i, &rsv_start, &rsv_size);
+		if (ret) {
+			pr_err("Malformed device tree.\n");
+			return -EINVAL;
+		}
+
+		if (rsv_start == start && rsv_size == size) {
+			ret = fdt_del_mem_rsv(fdt, i);
+			if (ret) {
+				pr_err("Error deleting device tree reservation.\n");
+				return -EINVAL;
+			}
+
+			return 0;
+		}
+	}
+
+	return -ENOENT;
+}
+
+/*
+ * of_kexec_alloc_and_setup_fdt - Alloc and setup a new Flattened Device Tree
+ *
+ * @image:		kexec image being loaded.
+ * @initrd_load_addr:	Address where the next initrd will be loaded.
+ * @initrd_len:		Size of the next initrd, or 0 if there will be none.
+ * @cmdline:		Command line for the next kernel, or NULL if there will
+ *			be none.
+ * @extra_fdt_size:	Additional size for the new FDT buffer.
+ *
+ * Return: fdt on success, or NULL on error.
+ */
+void *of_kexec_alloc_and_setup_fdt(const struct kimage *image,
+				   unsigned long initrd_load_addr,
+				   unsigned long initrd_len,
+				   const char *cmdline, size_t extra_fdt_size)
+{
+	void *fdt;
+	int ret, chosen_node;
+	const void *prop;
+	size_t fdt_size;
+
+	fdt_size = fdt_totalsize(initial_boot_params) +
+		   (cmdline ? strlen(cmdline) : 0) +
+		   FDT_EXTRA_SPACE +
+		   extra_fdt_size;
+	fdt = kvmalloc(fdt_size, GFP_KERNEL);
+	if (!fdt)
+		return NULL;
+
+	ret = fdt_open_into(initial_boot_params, fdt, fdt_size);
+	if (ret < 0) {
+		pr_err("Error %d setting up the new device tree.\n", ret);
+		goto out;
+	}
+
+	/* Remove memory reservation for the current device tree. */
+	ret = fdt_find_and_del_mem_rsv(fdt, __pa(initial_boot_params),
+				       fdt_totalsize(initial_boot_params));
+	if (ret == -EINVAL) {
+		pr_err("Error removing memory reservation.\n");
+		goto out;
+	}
+
+	chosen_node = fdt_path_offset(fdt, "/chosen");
+	if (chosen_node == -FDT_ERR_NOTFOUND)
+		chosen_node = fdt_add_subnode(fdt, fdt_path_offset(fdt, "/"),
+					      "chosen");
+	if (chosen_node < 0) {
+		ret = chosen_node;
+		goto out;
+	}
+
+	ret = fdt_delprop(fdt, chosen_node, FDT_PROP_KEXEC_ELFHDR);
+	if (ret && ret != -FDT_ERR_NOTFOUND)
+		goto out;
+	ret = fdt_delprop(fdt, chosen_node, FDT_PROP_MEM_RANGE);
+	if (ret && ret != -FDT_ERR_NOTFOUND)
+		goto out;
+
+	/* Did we boot using an initrd? */
+	prop = fdt_getprop(fdt, chosen_node, "linux,initrd-start", NULL);
+	if (prop) {
+		u64 tmp_start, tmp_end, tmp_size;
+
+		tmp_start = fdt64_to_cpu(*((const fdt64_t *) prop));
+
+		prop = fdt_getprop(fdt, chosen_node, "linux,initrd-end", NULL);
+		if (!prop) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		tmp_end = fdt64_to_cpu(*((const fdt64_t *) prop));
+
+		/*
+		 * kexec reserves exact initrd size, while firmware may
+		 * reserve a multiple of PAGE_SIZE, so check for both.
+		 */
+		tmp_size = tmp_end - tmp_start;
+		ret = fdt_find_and_del_mem_rsv(fdt, tmp_start, tmp_size);
+		if (ret == -ENOENT)
+			ret = fdt_find_and_del_mem_rsv(fdt, tmp_start,
+						       round_up(tmp_size, PAGE_SIZE));
+		if (ret == -EINVAL)
+			goto out;
+	}
+
+	/* add initrd-* */
+	if (initrd_load_addr) {
+		ret = fdt_setprop_u64(fdt, chosen_node, FDT_PROP_INITRD_START,
+				      initrd_load_addr);
+		if (ret)
+			goto out;
+
+		ret = fdt_setprop_u64(fdt, chosen_node, FDT_PROP_INITRD_END,
+				      initrd_load_addr + initrd_len);
+		if (ret)
+			goto out;
+
+		ret = fdt_add_mem_rsv(fdt, initrd_load_addr, initrd_len);
+		if (ret)
+			goto out;
+
+	} else {
+		ret = fdt_delprop(fdt, chosen_node, FDT_PROP_INITRD_START);
+		if (ret && (ret != -FDT_ERR_NOTFOUND))
+			goto out;
+
+		ret = fdt_delprop(fdt, chosen_node, FDT_PROP_INITRD_END);
+		if (ret && (ret != -FDT_ERR_NOTFOUND))
+			goto out;
+	}
+
+	if (image->type == KEXEC_TYPE_CRASH) {
+		/* add linux,elfcorehdr */
+		ret = fdt_appendprop_addrrange(fdt, 0, chosen_node,
+				FDT_PROP_KEXEC_ELFHDR,
+				image->elf_load_addr,
+				image->elf_headers_sz);
+		if (ret)
+			goto out;
+
+		/*
+		 * Prevent elfcorehdr from being stomped on in kdump kernel by
+		 * setting up memory reserve map.
+		 */
+		ret = fdt_add_mem_rsv(fdt, image->elf_load_addr,
+				      image->elf_headers_sz);
+		if (ret)
+			goto out;
+
+		/* add linux,usable-memory-range */
+		ret = fdt_appendprop_addrrange(fdt, 0, chosen_node,
+				FDT_PROP_MEM_RANGE,
+				crashk_res.start,
+				crashk_res.end - crashk_res.start + 1);
+		if (ret)
+			goto out;
+	}
+
+	/* add bootargs */
+	if (cmdline) {
+		ret = fdt_setprop_string(fdt, chosen_node, FDT_PROP_BOOTARGS, cmdline);
+		if (ret)
+			goto out;
+	} else {
+		ret = fdt_delprop(fdt, chosen_node, FDT_PROP_BOOTARGS);
+		if (ret && (ret != -FDT_ERR_NOTFOUND))
+			goto out;
+	}
+
+	/* add kaslr-seed */
+	ret = fdt_delprop(fdt, chosen_node, FDT_PROP_KASLR_SEED);
+	if (ret == -FDT_ERR_NOTFOUND)
+		ret = 0;
+	else if (ret)
+		goto out;
+
+	if (rng_is_initialized()) {
+		u64 seed = get_random_u64();
+
+		ret = fdt_setprop_u64(fdt, chosen_node, FDT_PROP_KASLR_SEED, seed);
+		if (ret)
+			goto out;
+	} else {
+		pr_notice("RNG is not initialised: omitting \"%s\" property\n",
+				FDT_PROP_KASLR_SEED);
+	}
+
+	/* add rng-seed */
+	if (rng_is_initialized()) {
+		void *rng_seed;
+
+		ret = fdt_setprop_placeholder(fdt, chosen_node, FDT_PROP_RNG_SEED,
+				RNG_SEED_SIZE, &rng_seed);
+		if (ret)
+			goto out;
+		get_random_bytes(rng_seed, RNG_SEED_SIZE);
+	} else {
+		pr_notice("RNG is not initialised: omitting \"%s\" property\n",
+				FDT_PROP_RNG_SEED);
+	}
+
+	ret = fdt_setprop(fdt, chosen_node, "linux,booted-from-kexec", NULL, 0);
+
+out:
+	if (ret) {
+		kvfree(fdt);
+		fdt = NULL;
+	}
+
+	return fdt;
+}
diff --git a/include/linux/of.h b/include/linux/of.h
index 4b27c9a27df3..d66c915df910 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -560,6 +560,11 @@ int of_map_id(struct device_node *np, u32 id,
 
 phys_addr_t of_dma_get_max_cpu_address(struct device_node *np);
 
+struct kimage;
+void *of_kexec_alloc_and_setup_fdt(const struct kimage *image,
+				   unsigned long initrd_load_addr,
+				   unsigned long initrd_len,
+				   const char *cmdline, size_t extra_fdt_size);
 #else /* CONFIG_OF */
 
 static inline void of_core_init(void)
-- 
cgit v1.2.3


From 0c605158be32104bb85cbd12fb575e6f1e17d3e7 Mon Sep 17 00:00:00 2001
From: Lakshmi Ramasubramanian <nramas@linux.microsoft.com>
Date: Sun, 21 Feb 2021 09:49:25 -0800
Subject: powerpc: Move ima buffer fields to struct kimage

The fields ima_buffer_addr and ima_buffer_size in "struct kimage_arch"
for powerpc are used to carry forward the IMA measurement list across
kexec system call.  These fields are not architecture specific, but are
currently limited to powerpc.

arch_ima_add_kexec_buffer() defined in "arch/powerpc/kexec/ima.c"
sets ima_buffer_addr and ima_buffer_size for the kexec system call.
This function does not have architecture specific code, but is
currently limited to powerpc.

Move ima_buffer_addr and ima_buffer_size to "struct kimage".
Set ima_buffer_addr and ima_buffer_size in ima_add_kexec_buffer()
in security/integrity/ima/ima_kexec.c.

Co-developed-by: Prakhar Srivastava <prsriva@linux.microsoft.com>
Signed-off-by: Prakhar Srivastava <prsriva@linux.microsoft.com>
Signed-off-by: Lakshmi Ramasubramanian <nramas@linux.microsoft.com>
Suggested-by: Will Deacon <will@kernel.org>
Reviewed-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Signed-off-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20210221174930.27324-9-nramas@linux.microsoft.com
---
 arch/powerpc/include/asm/ima.h     |  3 ---
 arch/powerpc/include/asm/kexec.h   |  5 -----
 arch/powerpc/kexec/ima.c           | 29 ++++++-----------------------
 include/linux/kexec.h              |  3 +++
 security/integrity/ima/ima_kexec.c |  8 ++------
 5 files changed, 11 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/include/asm/ima.h b/arch/powerpc/include/asm/ima.h
index ead488cf3981..51f64fd06c19 100644
--- a/arch/powerpc/include/asm/ima.h
+++ b/arch/powerpc/include/asm/ima.h
@@ -14,9 +14,6 @@ static inline void remove_ima_buffer(void *fdt, int chosen_node) {}
 #endif
 
 #ifdef CONFIG_IMA_KEXEC
-int arch_ima_add_kexec_buffer(struct kimage *image, unsigned long load_addr,
-			      size_t size);
-
 int setup_ima_buffer(const struct kimage *image, void *fdt, int chosen_node);
 #else
 static inline int setup_ima_buffer(const struct kimage *image, void *fdt,
diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h
index 33d628084b3c..977bcbf94528 100644
--- a/arch/powerpc/include/asm/kexec.h
+++ b/arch/powerpc/include/asm/kexec.h
@@ -108,11 +108,6 @@ struct kimage_arch {
 	unsigned long backup_start;
 	void *backup_buf;
 	void *fdt;
-
-#ifdef CONFIG_IMA_KEXEC
-	phys_addr_t ima_buffer_addr;
-	size_t ima_buffer_size;
-#endif
 };
 
 char *setup_kdump_cmdline(struct kimage *image, char *cmdline,
diff --git a/arch/powerpc/kexec/ima.c b/arch/powerpc/kexec/ima.c
index 720e50e490b6..ed38125e2f87 100644
--- a/arch/powerpc/kexec/ima.c
+++ b/arch/powerpc/kexec/ima.c
@@ -128,23 +128,6 @@ void remove_ima_buffer(void *fdt, int chosen_node)
 }
 
 #ifdef CONFIG_IMA_KEXEC
-/**
- * arch_ima_add_kexec_buffer - do arch-specific steps to add the IMA buffer
- *
- * Architectures should use this function to pass on the IMA buffer
- * information to the next kernel.
- *
- * Return: 0 on success, negative errno on error.
- */
-int arch_ima_add_kexec_buffer(struct kimage *image, unsigned long load_addr,
-			      size_t size)
-{
-	image->arch.ima_buffer_addr = load_addr;
-	image->arch.ima_buffer_size = size;
-
-	return 0;
-}
-
 static int write_number(void *p, u64 value, int cells)
 {
 	if (cells == 1) {
@@ -180,7 +163,7 @@ int setup_ima_buffer(const struct kimage *image, void *fdt, int chosen_node)
 	u8 value[16];
 
 	remove_ima_buffer(fdt, chosen_node);
-	if (!image->arch.ima_buffer_size)
+	if (!image->ima_buffer_size)
 		return 0;
 
 	ret = get_addr_size_cells(&addr_cells, &size_cells);
@@ -192,11 +175,11 @@ int setup_ima_buffer(const struct kimage *image, void *fdt, int chosen_node)
 	if (entry_size > sizeof(value))
 		return -EINVAL;
 
-	ret = write_number(value, image->arch.ima_buffer_addr, addr_cells);
+	ret = write_number(value, image->ima_buffer_addr, addr_cells);
 	if (ret)
 		return ret;
 
-	ret = write_number(value + 4 * addr_cells, image->arch.ima_buffer_size,
+	ret = write_number(value + 4 * addr_cells, image->ima_buffer_size,
 			   size_cells);
 	if (ret)
 		return ret;
@@ -206,13 +189,13 @@ int setup_ima_buffer(const struct kimage *image, void *fdt, int chosen_node)
 	if (ret < 0)
 		return -EINVAL;
 
-	ret = fdt_add_mem_rsv(fdt, image->arch.ima_buffer_addr,
-			      image->arch.ima_buffer_size);
+	ret = fdt_add_mem_rsv(fdt, image->ima_buffer_addr,
+			      image->ima_buffer_size);
 	if (ret)
 		return -EINVAL;
 
 	pr_debug("IMA buffer at 0x%llx, size = 0x%zx\n",
-		 image->arch.ima_buffer_addr, image->arch.ima_buffer_size);
+		 image->ima_buffer_addr, image->ima_buffer_size);
 
 	return 0;
 }
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 0bfab392367f..0c994ae37729 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -304,6 +304,9 @@ struct kimage {
 #ifdef CONFIG_IMA_KEXEC
 	/* Virtual address of IMA measurement buffer for kexec syscall */
 	void *ima_buffer;
+
+	phys_addr_t ima_buffer_addr;
+	size_t ima_buffer_size;
 #endif
 
 	/* Core ELF header buffer */
diff --git a/security/integrity/ima/ima_kexec.c b/security/integrity/ima/ima_kexec.c
index e29bea3dd4cc..8b1a3d50c49c 100644
--- a/security/integrity/ima/ima_kexec.c
+++ b/security/integrity/ima/ima_kexec.c
@@ -123,12 +123,8 @@ void ima_add_kexec_buffer(struct kimage *image)
 		return;
 	}
 
-	ret = arch_ima_add_kexec_buffer(image, kbuf.mem, kexec_segment_size);
-	if (ret) {
-		pr_err("Error passing over kexec measurement buffer.\n");
-		return;
-	}
-
+	image->ima_buffer_addr = kbuf.mem;
+	image->ima_buffer_size = kexec_segment_size;
 	image->ima_buffer = kexec_buffer;
 
 	pr_debug("kexec measurement buffer for the loaded kernel at 0x%lx.\n",
-- 
cgit v1.2.3


From fee3ff99bc67604fba77f19da0106f3ec52b1956 Mon Sep 17 00:00:00 2001
From: Lakshmi Ramasubramanian <nramas@linux.microsoft.com>
Date: Sun, 21 Feb 2021 09:49:27 -0800
Subject: powerpc: Move arch independent ima kexec functions to
 drivers/of/kexec.c

The functions defined in "arch/powerpc/kexec/ima.c" handle setting up
and freeing the resources required to carry over the IMA measurement
list from the current kernel to the next kernel across kexec system call.
These functions do not have architecture specific code, but are
currently limited to powerpc.

Move remove_ima_buffer() and setup_ima_buffer() calls into
of_kexec_alloc_and_setup_fdt() defined in "drivers/of/kexec.c".

Move the remaining architecture independent functions from
"arch/powerpc/kexec/ima.c" to "drivers/of/kexec.c".
Delete "arch/powerpc/kexec/ima.c" and "arch/powerpc/include/asm/ima.h".
Remove references to the deleted files and functions in powerpc and
in ima.

Co-developed-by: Prakhar Srivastava <prsriva@linux.microsoft.com>
Signed-off-by: Prakhar Srivastava <prsriva@linux.microsoft.com>
Signed-off-by: Lakshmi Ramasubramanian <nramas@linux.microsoft.com>
Reviewed-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Tested-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Signed-off-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20210221174930.27324-11-nramas@linux.microsoft.com
---
 arch/powerpc/include/asm/ima.h     |  27 -----
 arch/powerpc/include/asm/kexec.h   |   3 -
 arch/powerpc/kexec/Makefile        |   7 --
 arch/powerpc/kexec/file_load.c     |  25 ----
 arch/powerpc/kexec/file_load_64.c  |   4 -
 arch/powerpc/kexec/ima.c           | 202 -------------------------------
 drivers/of/kexec.c                 | 240 +++++++++++++++++++++++++++++++++++++
 include/linux/of.h                 |   2 +
 security/integrity/ima/ima.h       |   4 -
 security/integrity/ima/ima_kexec.c |   1 +
 10 files changed, 243 insertions(+), 272 deletions(-)
 delete mode 100644 arch/powerpc/include/asm/ima.h
 delete mode 100644 arch/powerpc/kexec/ima.c

(limited to 'include/linux')

diff --git a/arch/powerpc/include/asm/ima.h b/arch/powerpc/include/asm/ima.h
deleted file mode 100644
index 51f64fd06c19..000000000000
--- a/arch/powerpc/include/asm/ima.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_POWERPC_IMA_H
-#define _ASM_POWERPC_IMA_H
-
-struct kimage;
-
-int ima_get_kexec_buffer(void **addr, size_t *size);
-int ima_free_kexec_buffer(void);
-
-#ifdef CONFIG_IMA
-void remove_ima_buffer(void *fdt, int chosen_node);
-#else
-static inline void remove_ima_buffer(void *fdt, int chosen_node) {}
-#endif
-
-#ifdef CONFIG_IMA_KEXEC
-int setup_ima_buffer(const struct kimage *image, void *fdt, int chosen_node);
-#else
-static inline int setup_ima_buffer(const struct kimage *image, void *fdt,
-				   int chosen_node)
-{
-	remove_ima_buffer(fdt, chosen_node);
-	return 0;
-}
-#endif /* CONFIG_IMA_KEXEC */
-
-#endif /* _ASM_POWERPC_IMA_H */
diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h
index 977bcbf94528..f2c5e606c8b3 100644
--- a/arch/powerpc/include/asm/kexec.h
+++ b/arch/powerpc/include/asm/kexec.h
@@ -115,9 +115,6 @@ char *setup_kdump_cmdline(struct kimage *image, char *cmdline,
 int setup_purgatory(struct kimage *image, const void *slave_code,
 		    const void *fdt, unsigned long kernel_load_addr,
 		    unsigned long fdt_load_addr);
-int setup_new_fdt(const struct kimage *image, void *fdt,
-		  unsigned long initrd_load_addr, unsigned long initrd_len,
-		  const char *cmdline);
 int delete_fdt_mem_rsv(void *fdt, unsigned long start, unsigned long size);
 
 #ifdef CONFIG_PPC64
diff --git a/arch/powerpc/kexec/Makefile b/arch/powerpc/kexec/Makefile
index 4aff6846c772..b6c52608cb49 100644
--- a/arch/powerpc/kexec/Makefile
+++ b/arch/powerpc/kexec/Makefile
@@ -9,13 +9,6 @@ obj-$(CONFIG_PPC32)		+= relocate_32.o
 
 obj-$(CONFIG_KEXEC_FILE)	+= file_load.o ranges.o file_load_$(BITS).o elf_$(BITS).o
 
-ifdef CONFIG_HAVE_IMA_KEXEC
-ifdef CONFIG_IMA
-obj-y				+= ima.o
-endif
-endif
-
-
 # Disable GCOV, KCOV & sanitizers in odd or sensitive code
 GCOV_PROFILE_core_$(BITS).o := n
 KCOV_INSTRUMENT_core_$(BITS).o := n
diff --git a/arch/powerpc/kexec/file_load.c b/arch/powerpc/kexec/file_load.c
index c6bbd06d13e2..dc28cb7813c8 100644
--- a/arch/powerpc/kexec/file_load.c
+++ b/arch/powerpc/kexec/file_load.c
@@ -19,7 +19,6 @@
 #include <linux/of_fdt.h>
 #include <linux/libfdt.h>
 #include <asm/setup.h>
-#include <asm/ima.h>
 
 #define SLAVE_CODE_SIZE		256	/* First 0x100 bytes */
 
@@ -140,27 +139,3 @@ int delete_fdt_mem_rsv(void *fdt, unsigned long start, unsigned long size)
 
 	return -ENOENT;
 }
-
-/*
- * setup_new_fdt - modify /chosen and memory reservation for the next kernel
- * @image:		kexec image being loaded.
- * @fdt:		Flattened device tree for the next kernel.
- * @initrd_load_addr:	Address where the next initrd will be loaded.
- * @initrd_len:		Size of the next initrd, or 0 if there will be none.
- * @cmdline:		Command line for the next kernel, or NULL if there will
- *			be none.
- *
- * Return: 0 on success, or negative errno on error.
- */
-int setup_new_fdt(const struct kimage *image, void *fdt,
-		  unsigned long initrd_load_addr, unsigned long initrd_len,
-		  const char *cmdline)
-{
-	int ret;
-
-	ret = setup_ima_buffer(image, fdt, fdt_path_offset(fdt, "/chosen"));
-	if (ret)
-		pr_err("Error setting up the new device tree.\n");
-
-	return ret;
-}
diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c
index dff207cfa59b..3e94c91c3cb1 100644
--- a/arch/powerpc/kexec/file_load_64.c
+++ b/arch/powerpc/kexec/file_load_64.c
@@ -979,10 +979,6 @@ int setup_new_fdt_ppc64(const struct kimage *image, void *fdt,
 	struct crash_mem *umem = NULL, *rmem = NULL;
 	int i, nr_ranges, ret;
 
-	ret = setup_new_fdt(image, fdt, initrd_load_addr, initrd_len, cmdline);
-	if (ret)
-		goto out;
-
 	/*
 	 * Restrict memory usage for kdump kernel by setting up
 	 * usable memory ranges and memory reserve map.
diff --git a/arch/powerpc/kexec/ima.c b/arch/powerpc/kexec/ima.c
deleted file mode 100644
index ed38125e2f87..000000000000
--- a/arch/powerpc/kexec/ima.c
+++ /dev/null
@@ -1,202 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Copyright (C) 2016 IBM Corporation
- *
- * Authors:
- * Thiago Jung Bauermann <bauerman@linux.vnet.ibm.com>
- */
-
-#include <linux/slab.h>
-#include <linux/kexec.h>
-#include <linux/of.h>
-#include <linux/memblock.h>
-#include <linux/libfdt.h>
-
-static int get_addr_size_cells(int *addr_cells, int *size_cells)
-{
-	struct device_node *root;
-
-	root = of_find_node_by_path("/");
-	if (!root)
-		return -EINVAL;
-
-	*addr_cells = of_n_addr_cells(root);
-	*size_cells = of_n_size_cells(root);
-
-	of_node_put(root);
-
-	return 0;
-}
-
-static int do_get_kexec_buffer(const void *prop, int len, unsigned long *addr,
-			       size_t *size)
-{
-	int ret, addr_cells, size_cells;
-
-	ret = get_addr_size_cells(&addr_cells, &size_cells);
-	if (ret)
-		return ret;
-
-	if (len < 4 * (addr_cells + size_cells))
-		return -ENOENT;
-
-	*addr = of_read_number(prop, addr_cells);
-	*size = of_read_number(prop + 4 * addr_cells, size_cells);
-
-	return 0;
-}
-
-/**
- * ima_get_kexec_buffer - get IMA buffer from the previous kernel
- * @addr:	On successful return, set to point to the buffer contents.
- * @size:	On successful return, set to the buffer size.
- *
- * Return: 0 on success, negative errno on error.
- */
-int ima_get_kexec_buffer(void **addr, size_t *size)
-{
-	int ret, len;
-	unsigned long tmp_addr;
-	size_t tmp_size;
-	const void *prop;
-
-	prop = of_get_property(of_chosen, "linux,ima-kexec-buffer", &len);
-	if (!prop)
-		return -ENOENT;
-
-	ret = do_get_kexec_buffer(prop, len, &tmp_addr, &tmp_size);
-	if (ret)
-		return ret;
-
-	*addr = __va(tmp_addr);
-	*size = tmp_size;
-
-	return 0;
-}
-
-/**
- * ima_free_kexec_buffer - free memory used by the IMA buffer
- */
-int ima_free_kexec_buffer(void)
-{
-	int ret;
-	unsigned long addr;
-	size_t size;
-	struct property *prop;
-
-	prop = of_find_property(of_chosen, "linux,ima-kexec-buffer", NULL);
-	if (!prop)
-		return -ENOENT;
-
-	ret = do_get_kexec_buffer(prop->value, prop->length, &addr, &size);
-	if (ret)
-		return ret;
-
-	ret = of_remove_property(of_chosen, prop);
-	if (ret)
-		return ret;
-
-	return memblock_free(addr, size);
-
-}
-
-/**
- * remove_ima_buffer - remove the IMA buffer property and reservation from @fdt
- *
- * The IMA measurement buffer is of no use to a subsequent kernel, so we always
- * remove it from the device tree.
- */
-void remove_ima_buffer(void *fdt, int chosen_node)
-{
-	int ret, len;
-	unsigned long addr;
-	size_t size;
-	const void *prop;
-
-	prop = fdt_getprop(fdt, chosen_node, "linux,ima-kexec-buffer", &len);
-	if (!prop)
-		return;
-
-	ret = do_get_kexec_buffer(prop, len, &addr, &size);
-	fdt_delprop(fdt, chosen_node, "linux,ima-kexec-buffer");
-	if (ret)
-		return;
-
-	ret = delete_fdt_mem_rsv(fdt, addr, size);
-	if (!ret)
-		pr_debug("Removed old IMA buffer reservation.\n");
-}
-
-#ifdef CONFIG_IMA_KEXEC
-static int write_number(void *p, u64 value, int cells)
-{
-	if (cells == 1) {
-		u32 tmp;
-
-		if (value > U32_MAX)
-			return -EINVAL;
-
-		tmp = cpu_to_be32(value);
-		memcpy(p, &tmp, sizeof(tmp));
-	} else if (cells == 2) {
-		u64 tmp;
-
-		tmp = cpu_to_be64(value);
-		memcpy(p, &tmp, sizeof(tmp));
-	} else
-		return -EINVAL;
-
-	return 0;
-}
-
-/**
- * setup_ima_buffer - add IMA buffer information to the fdt
- * @image:		kexec image being loaded.
- * @fdt:		Flattened device tree for the next kernel.
- * @chosen_node:	Offset to the chosen node.
- *
- * Return: 0 on success, or negative errno on error.
- */
-int setup_ima_buffer(const struct kimage *image, void *fdt, int chosen_node)
-{
-	int ret, addr_cells, size_cells, entry_size;
-	u8 value[16];
-
-	remove_ima_buffer(fdt, chosen_node);
-	if (!image->ima_buffer_size)
-		return 0;
-
-	ret = get_addr_size_cells(&addr_cells, &size_cells);
-	if (ret)
-		return ret;
-
-	entry_size = 4 * (addr_cells + size_cells);
-
-	if (entry_size > sizeof(value))
-		return -EINVAL;
-
-	ret = write_number(value, image->ima_buffer_addr, addr_cells);
-	if (ret)
-		return ret;
-
-	ret = write_number(value + 4 * addr_cells, image->ima_buffer_size,
-			   size_cells);
-	if (ret)
-		return ret;
-
-	ret = fdt_setprop(fdt, chosen_node, "linux,ima-kexec-buffer", value,
-			  entry_size);
-	if (ret < 0)
-		return -EINVAL;
-
-	ret = fdt_add_mem_rsv(fdt, image->ima_buffer_addr,
-			      image->ima_buffer_size);
-	if (ret)
-		return -EINVAL;
-
-	pr_debug("IMA buffer at 0x%llx, size = 0x%zx\n",
-		 image->ima_buffer_addr, image->ima_buffer_size);
-
-	return 0;
-}
-#endif /* CONFIG_IMA_KEXEC */
diff --git a/drivers/of/kexec.c b/drivers/of/kexec.c
index 8fa8946cda42..6512e25671df 100644
--- a/drivers/of/kexec.c
+++ b/drivers/of/kexec.c
@@ -11,6 +11,7 @@
 
 #include <linux/kernel.h>
 #include <linux/kexec.h>
+#include <linux/memblock.h>
 #include <linux/libfdt.h>
 #include <linux/of.h>
 #include <linux/of_fdt.h>
@@ -69,6 +70,240 @@ static int fdt_find_and_del_mem_rsv(void *fdt, unsigned long start, unsigned lon
 	return -ENOENT;
 }
 
+/**
+ * get_addr_size_cells - Get address and size cell counts of the root node
+ *
+ * @addr_cells: Return the number of address cells of the root node
+ * @size_cells: Return the number of size cells of the root node
+ *
+ * Return: 0 on success, or negative errno on error.
+ */
+static int get_addr_size_cells(int *addr_cells, int *size_cells)
+{
+	struct device_node *root;
+
+	root = of_find_node_by_path("/");
+	if (!root)
+		return -EINVAL;
+
+	*addr_cells = of_n_addr_cells(root);
+	*size_cells = of_n_size_cells(root);
+
+	of_node_put(root);
+
+	return 0;
+}
+
+/**
+ * do_get_kexec_buffer - Decode address and size from a property value
+ *
+ * @prop: Device tree property value
+ * @len: Size of @prop in bytes
+ * @addr: Return address read from @prop
+ * @size: Return size read from @prop
+ *
+ * Return: 0 on success, or negative errno on error.
+ */
+static int do_get_kexec_buffer(const void *prop, int len, unsigned long *addr,
+			       size_t *size)
+{
+	int ret, addr_cells, size_cells;
+
+	ret = get_addr_size_cells(&addr_cells, &size_cells);
+	if (ret)
+		return ret;
+
+	if (len < 4 * (addr_cells + size_cells))
+		return -ENOENT;
+
+	*addr = of_read_number(prop, addr_cells);
+	*size = of_read_number(prop + 4 * addr_cells, size_cells);
+
+	return 0;
+}
+
+/**
+ * ima_get_kexec_buffer - get IMA buffer from the previous kernel
+ * @addr:	On successful return, set to point to the buffer contents.
+ * @size:	On successful return, set to the buffer size.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+int ima_get_kexec_buffer(void **addr, size_t *size)
+{
+	int ret, len;
+	unsigned long tmp_addr;
+	size_t tmp_size;
+	const void *prop;
+
+	if (!IS_ENABLED(CONFIG_HAVE_IMA_KEXEC))
+		return -ENOTSUPP;
+
+	prop = of_get_property(of_chosen, "linux,ima-kexec-buffer", &len);
+	if (!prop)
+		return -ENOENT;
+
+	ret = do_get_kexec_buffer(prop, len, &tmp_addr, &tmp_size);
+	if (ret)
+		return ret;
+
+	*addr = __va(tmp_addr);
+	*size = tmp_size;
+
+	return 0;
+}
+
+/**
+ * ima_free_kexec_buffer - free memory used by the IMA buffer
+ */
+int ima_free_kexec_buffer(void)
+{
+	int ret;
+	unsigned long addr;
+	size_t size;
+	struct property *prop;
+
+	if (!IS_ENABLED(CONFIG_HAVE_IMA_KEXEC))
+		return -ENOTSUPP;
+
+	prop = of_find_property(of_chosen, "linux,ima-kexec-buffer", NULL);
+	if (!prop)
+		return -ENOENT;
+
+	ret = do_get_kexec_buffer(prop->value, prop->length, &addr, &size);
+	if (ret)
+		return ret;
+
+	ret = of_remove_property(of_chosen, prop);
+	if (ret)
+		return ret;
+
+	return memblock_free(addr, size);
+
+}
+
+/**
+ * remove_ima_buffer - remove the IMA buffer property and reservation from @fdt
+ *
+ * @fdt: Flattened Device Tree to update
+ * @chosen_node: Offset to the chosen node in the device tree
+ *
+ * The IMA measurement buffer is of no use to a subsequent kernel, so we always
+ * remove it from the device tree.
+ */
+static void remove_ima_buffer(void *fdt, int chosen_node)
+{
+	int ret, len;
+	unsigned long addr;
+	size_t size;
+	const void *prop;
+
+	if (!IS_ENABLED(CONFIG_HAVE_IMA_KEXEC))
+		return;
+
+	prop = fdt_getprop(fdt, chosen_node, "linux,ima-kexec-buffer", &len);
+	if (!prop)
+		return;
+
+	ret = do_get_kexec_buffer(prop, len, &addr, &size);
+	fdt_delprop(fdt, chosen_node, "linux,ima-kexec-buffer");
+	if (ret)
+		return;
+
+	ret = fdt_find_and_del_mem_rsv(fdt, addr, size);
+	if (!ret)
+		pr_debug("Removed old IMA buffer reservation.\n");
+}
+
+#ifdef CONFIG_IMA_KEXEC
+/**
+ * write_number - Convert number to big-endian format
+ *
+ * @p:		Buffer to write the number to
+ * @value:	Number to convert
+ * @cells:	Number of cells
+ *
+ * Return: 0 on success, or negative errno on error.
+ */
+static int write_number(void *p, u64 value, int cells)
+{
+	if (cells == 1) {
+		u32 tmp;
+
+		if (value > U32_MAX)
+			return -EINVAL;
+
+		tmp = cpu_to_be32(value);
+		memcpy(p, &tmp, sizeof(tmp));
+	} else if (cells == 2) {
+		u64 tmp;
+
+		tmp = cpu_to_be64(value);
+		memcpy(p, &tmp, sizeof(tmp));
+	} else
+		return -EINVAL;
+
+	return 0;
+}
+
+/**
+ * setup_ima_buffer - add IMA buffer information to the fdt
+ * @image:		kexec image being loaded.
+ * @fdt:		Flattened device tree for the next kernel.
+ * @chosen_node:	Offset to the chosen node.
+ *
+ * Return: 0 on success, or negative errno on error.
+ */
+static int setup_ima_buffer(const struct kimage *image, void *fdt,
+			    int chosen_node)
+{
+	int ret, addr_cells, size_cells, entry_size;
+	u8 value[16];
+
+	if (!image->ima_buffer_size)
+		return 0;
+
+	ret = get_addr_size_cells(&addr_cells, &size_cells);
+	if (ret)
+		return ret;
+
+	entry_size = 4 * (addr_cells + size_cells);
+
+	if (entry_size > sizeof(value))
+		return -EINVAL;
+
+	ret = write_number(value, image->ima_buffer_addr, addr_cells);
+	if (ret)
+		return ret;
+
+	ret = write_number(value + 4 * addr_cells, image->ima_buffer_size,
+			   size_cells);
+	if (ret)
+		return ret;
+
+	ret = fdt_setprop(fdt, chosen_node, "linux,ima-kexec-buffer", value,
+			  entry_size);
+	if (ret < 0)
+		return -EINVAL;
+
+	ret = fdt_add_mem_rsv(fdt, image->ima_buffer_addr,
+			      image->ima_buffer_size);
+	if (ret)
+		return -EINVAL;
+
+	pr_debug("IMA buffer at 0x%llx, size = 0x%zx\n",
+		 image->ima_buffer_addr, image->ima_buffer_size);
+
+	return 0;
+}
+#else /* CONFIG_IMA_KEXEC */
+static inline int setup_ima_buffer(const struct kimage *image, void *fdt,
+				   int chosen_node)
+{
+	return 0;
+}
+#endif /* CONFIG_IMA_KEXEC */
+
 /*
  * of_kexec_alloc_and_setup_fdt - Alloc and setup a new Flattened Device Tree
  *
@@ -254,6 +489,11 @@ void *of_kexec_alloc_and_setup_fdt(const struct kimage *image,
 	}
 
 	ret = fdt_setprop(fdt, chosen_node, "linux,booted-from-kexec", NULL, 0);
+	if (ret)
+		goto out;
+
+	remove_ima_buffer(fdt, chosen_node);
+	ret = setup_ima_buffer(image, fdt, fdt_path_offset(fdt, "/chosen"));
 
 out:
 	if (ret) {
diff --git a/include/linux/of.h b/include/linux/of.h
index d66c915df910..e9209ef44cc0 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -565,6 +565,8 @@ void *of_kexec_alloc_and_setup_fdt(const struct kimage *image,
 				   unsigned long initrd_load_addr,
 				   unsigned long initrd_len,
 				   const char *cmdline, size_t extra_fdt_size);
+int ima_get_kexec_buffer(void **addr, size_t *size);
+int ima_free_kexec_buffer(void);
 #else /* CONFIG_OF */
 
 static inline void of_core_init(void)
diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h
index 8e8b5251550e..f0e448ed1f9f 100644
--- a/security/integrity/ima/ima.h
+++ b/security/integrity/ima/ima.h
@@ -24,10 +24,6 @@
 
 #include "../integrity.h"
 
-#ifdef CONFIG_HAVE_IMA_KEXEC
-#include <asm/ima.h>
-#endif
-
 enum ima_show_type { IMA_SHOW_BINARY, IMA_SHOW_BINARY_NO_FIELD_LEN,
 		     IMA_SHOW_BINARY_OLD_STRING_FMT, IMA_SHOW_ASCII };
 enum tpm_pcrs { TPM_PCR0 = 0, TPM_PCR8 = 8, TPM_PCR10 = 10 };
diff --git a/security/integrity/ima/ima_kexec.c b/security/integrity/ima/ima_kexec.c
index 8b1a3d50c49c..667887665823 100644
--- a/security/integrity/ima/ima_kexec.c
+++ b/security/integrity/ima/ima_kexec.c
@@ -10,6 +10,7 @@
 #include <linux/seq_file.h>
 #include <linux/vmalloc.h>
 #include <linux/kexec.h>
+#include <linux/of.h>
 #include "ima.h"
 
 #ifdef CONFIG_IMA_KEXEC
-- 
cgit v1.2.3


From 69dd4503a7e6bae3389b8e028e5768008be8f2d7 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 16 Feb 2021 15:36:07 +0100
Subject: irqdomain: Remove debugfs_file from struct irq_domain

There's no need to keep around a dentry pointer to a simple file that
debugfs itself can look up when we need to remove it from the system.
So simplify the code by deleting the variable and cleaning up the logic
around the debugfs file.

Cc: Marc Zyngier <maz@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/YCvYV53ZdzQSWY6w@kroah.com
---
 include/linux/irqdomain.h | 4 ----
 kernel/irq/irqdomain.c    | 9 ++++-----
 2 files changed, 4 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 42d196805f58..33cacc8af26d 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -150,7 +150,6 @@ struct irq_domain_chip_generic;
  *      setting up one or more generic chips for interrupt controllers
  *      drivers using the generic chip library which uses this pointer.
  * @parent: Pointer to parent irq_domain to support hierarchy irq_domains
- * @debugfs_file: dentry for the domain debugfs file
  *
  * Revmap data, used internally by irq_domain
  * @revmap_direct_max_irq: The largest hwirq that can be set for controllers that
@@ -174,9 +173,6 @@ struct irq_domain {
 #ifdef	CONFIG_IRQ_DOMAIN_HIERARCHY
 	struct irq_domain *parent;
 #endif
-#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
-	struct dentry		*debugfs_file;
-#endif
 
 	/* reverse map data. The linear map gets appended to the irq_domain */
 	irq_hw_number_t hwirq_max;
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 288151393a06..d10ab1d689d5 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1898,16 +1898,15 @@ DEFINE_SHOW_ATTRIBUTE(irq_domain_debug);
 
 static void debugfs_add_domain_dir(struct irq_domain *d)
 {
-	if (!d->name || !domain_dir || d->debugfs_file)
+	if (!d->name || !domain_dir)
 		return;
-	d->debugfs_file = debugfs_create_file(d->name, 0444, domain_dir, d,
-					      &irq_domain_debug_fops);
+	debugfs_create_file(d->name, 0444, domain_dir, d,
+			    &irq_domain_debug_fops);
 }
 
 static void debugfs_remove_domain_dir(struct irq_domain *d)
 {
-	debugfs_remove(d->debugfs_file);
-	d->debugfs_file = NULL;
+	debugfs_remove(debugfs_lookup(d->name, domain_dir));
 }
 
 void __init irq_domain_debugfs_init(struct dentry *root)
-- 
cgit v1.2.3


From 5bb1369d4bea078dd1298dfc2c6ce781d9e34dde Mon Sep 17 00:00:00 2001
From: Akira Yokosawa <akiyks@gmail.com>
Date: Sat, 16 Jan 2021 00:11:45 +0900
Subject: rculist: Replace reference to atomic_ops.rst

The hlist_nulls_for_each_entry_rcu() docbook header references the
atomic_ops.rst file, which was removed in commit f0400a77ebdc ("atomic:
Delete obsolete documentation").  This commit therefore substitutes a
section in memory-barriers.txt discussing the use of barrier() in loops.

Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Akira Yokosawa <akiyks@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rculist_nulls.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h
index ff3e94779e73..d8afdb8784c1 100644
--- a/include/linux/rculist_nulls.h
+++ b/include/linux/rculist_nulls.h
@@ -161,7 +161,7 @@ static inline void hlist_nulls_add_fake(struct hlist_nulls_node *n)
  *
  * The barrier() is needed to make sure compiler doesn't cache first element [1],
  * as this loop can be restarted [2]
- * [1] Documentation/core-api/atomic_ops.rst around line 114
+ * [1] Documentation/memory-barriers.txt around line 1533
  * [2] Documentation/RCU/rculist_nulls.rst around line 146
  */
 #define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member)			\
-- 
cgit v1.2.3


From e75956bd00cf4246067c6aee7751faf313233435 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Thu, 14 Jan 2021 08:22:02 +0100
Subject: rcu: Fix kfree_rcu() docbook errors

After commit 5130b8fd0690 ("rcu: Introduce kfree_rcu() single-argument macro"),
kernel-doc now emits two warnings:

	./include/linux/rcupdate.h:884: warning: Excess function parameter 'ptr' description in 'kfree_rcu'
	./include/linux/rcupdate.h:884: warning: Excess function parameter 'rhf' description in 'kfree_rcu'

This commit added some macro magic in order to call two different versions
of kfree_rcu(), the first having just one argument and the second having
two arguments.  That makes it difficult to document the kfree_rcu() arguments
in the docbook header.

In order to make clearer that this macro accepts optional arguments,
this commit uses macro concatenation so that this macro changes from:
	#define kfree_rcu kvfree_rcu

to:
	#define kfree_rcu(ptr, rhf...) kvfree_rcu(ptr, ## rhf)

That not only helps kernel-doc understand the macro arguments, but also
provides a better C definition that makes clearer that the first argument
is mandatory and the second one is optional.

Fixes: 5130b8fd0690 ("rcu: Introduce kfree_rcu() single-argument macro")
Tested-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcupdate.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index bd04f722714f..5cc6deaa5df2 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -881,7 +881,7 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
  * The BUILD_BUG_ON check must not involve any function calls, hence the
  * checks are done in macros here.
  */
-#define kfree_rcu kvfree_rcu
+#define kfree_rcu(ptr, rhf...) kvfree_rcu(ptr, ## rhf)
 
 /**
  * kvfree_rcu() - kvfree an object after a grace period.
-- 
cgit v1.2.3


From 5bb1bb353cfe343fc3c84faf06f72ba309fde541 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 7 Jan 2021 13:46:11 -0800
Subject: mm: Don't build mem_dump_obj() on CONFIG_PRINTK=n kernels

The mem_dump_obj() functionality adds a few hundred bytes, which is a
small price to pay.  Except on kernels built with CONFIG_PRINTK=n, in
which mem_dump_obj() messages will be suppressed.  This commit therefore
makes mem_dump_obj() be a static inline empty function on kernels built
with CONFIG_PRINTK=n and excludes all of its support functions as well.
This avoids kernel bloat on systems that cannot use mem_dump_obj().

Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: <linux-mm@kvack.org>
Suggested-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/mm.h      | 4 ++++
 include/linux/slab.h    | 2 ++
 include/linux/vmalloc.h | 2 +-
 mm/slab.c               | 2 ++
 mm/slab.h               | 2 ++
 mm/slab_common.c        | 2 ++
 mm/slob.c               | 2 ++
 mm/slub.c               | 2 ++
 mm/util.c               | 2 ++
 mm/vmalloc.c            | 2 ++
 10 files changed, 21 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 77e64e3eac80..89fca443e6f1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3135,7 +3135,11 @@ unsigned long wp_shared_mapping_range(struct address_space *mapping,
 
 extern int sysctl_nr_trim_pages;
 
+#ifdef CONFIG_PRINTK
 void mem_dump_obj(void *object);
+#else
+static inline void mem_dump_obj(void *object) {}
+#endif
 
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 7ae604076767..0c97d788762c 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -186,8 +186,10 @@ void kfree(const void *);
 void kfree_sensitive(const void *);
 size_t __ksize(const void *);
 size_t ksize(const void *);
+#ifdef CONFIG_PRINTK
 bool kmem_valid_obj(void *object);
 void kmem_dump_obj(void *object);
+#endif
 
 #ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR
 void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index df92211cf771..3de7be6dd17c 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -241,7 +241,7 @@ pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
 int register_vmap_purge_notifier(struct notifier_block *nb);
 int unregister_vmap_purge_notifier(struct notifier_block *nb);
 
-#ifdef CONFIG_MMU
+#if defined(CONFIG_MMU) && defined(CONFIG_PRINTK)
 bool vmalloc_dump_obj(void *object);
 #else
 static inline bool vmalloc_dump_obj(void *object) { return false; }
diff --git a/mm/slab.c b/mm/slab.c
index 51fd424e0d6d..2e64efeb99a1 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3651,6 +3651,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
 EXPORT_SYMBOL(__kmalloc_node_track_caller);
 #endif /* CONFIG_NUMA */
 
+#ifdef CONFIG_PRINTK
 void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page)
 {
 	struct kmem_cache *cachep;
@@ -3670,6 +3671,7 @@ void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page)
 	if (DEBUG && cachep->flags & SLAB_STORE_USER)
 		kpp->kp_ret = *dbg_userword(cachep, objp);
 }
+#endif
 
 /**
  * __do_kmalloc - allocate memory
diff --git a/mm/slab.h b/mm/slab.h
index 076582f58f68..120b1d0dfb6d 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -619,6 +619,7 @@ static inline bool slab_want_init_on_free(struct kmem_cache *c)
 	return false;
 }
 
+#ifdef CONFIG_PRINTK
 #define KS_ADDRS_COUNT 16
 struct kmem_obj_info {
 	void *kp_ptr;
@@ -630,5 +631,6 @@ struct kmem_obj_info {
 	void *kp_stack[KS_ADDRS_COUNT];
 };
 void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page);
+#endif
 
 #endif /* MM_SLAB_H */
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 88e833986332..cec95363e621 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -526,6 +526,7 @@ bool slab_is_available(void)
 	return slab_state >= UP;
 }
 
+#ifdef CONFIG_PRINTK
 /**
  * kmem_valid_obj - does the pointer reference a valid slab object?
  * @object: pointer to query.
@@ -600,6 +601,7 @@ void kmem_dump_obj(void *object)
 		pr_info("    %pS\n", kp.kp_stack[i]);
 	}
 }
+#endif
 
 #ifndef CONFIG_SLOB
 /* Create a cache during boot when no slab services are available yet */
diff --git a/mm/slob.c b/mm/slob.c
index 0578429b991b..74d3f6e60666 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -461,11 +461,13 @@ out:
 	spin_unlock_irqrestore(&slob_lock, flags);
 }
 
+#ifdef CONFIG_PRINTK
 void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page)
 {
 	kpp->kp_ptr = object;
 	kpp->kp_page = page;
 }
+#endif
 
 /*
  * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend.
diff --git a/mm/slub.c b/mm/slub.c
index e26c274b4657..077a019e4d7a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3963,6 +3963,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
 	return 0;
 }
 
+#ifdef CONFIG_PRINTK
 void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page)
 {
 	void *base;
@@ -4002,6 +4003,7 @@ void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page)
 #endif
 #endif
 }
+#endif
 
 /********************************************************************
  *		Kmalloc subsystem
diff --git a/mm/util.c b/mm/util.c
index 54870226cea6..2d497fe0f17d 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -983,6 +983,7 @@ int __weak memcmp_pages(struct page *page1, struct page *page2)
 	return ret;
 }
 
+#ifdef CONFIG_PRINTK
 /**
  * mem_dump_obj - Print available provenance information
  * @object: object for which to find provenance information.
@@ -1013,3 +1014,4 @@ void mem_dump_obj(void *object)
 	}
 	pr_cont(" non-slab/vmalloc memory.\n");
 }
+#endif
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 4f5f8c907897..d5f2a84e488a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3450,6 +3450,7 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
 }
 #endif	/* CONFIG_SMP */
 
+#ifdef CONFIG_PRINTK
 bool vmalloc_dump_obj(void *object)
 {
 	struct vm_struct *vm;
@@ -3462,6 +3463,7 @@ bool vmalloc_dump_obj(void *object)
 		vm->nr_pages, (unsigned long)vm->addr, vm->caller);
 	return true;
 }
+#endif
 
 #ifdef CONFIG_PROC_FS
 static void *s_start(struct seq_file *m, loff_t *pos)
-- 
cgit v1.2.3


From bd0ccc4afca2d6ae0029cae35c4f1d2e2ade7579 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Fri, 15 Jan 2021 18:09:53 +0100
Subject: kcsan: Add missing license and copyright headers

Adds missing license and/or copyright headers for KCSAN source files.

Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/dev-tools/kcsan.rst | 3 +++
 include/linux/kcsan-checks.h      | 6 ++++++
 include/linux/kcsan.h             | 7 +++++++
 kernel/kcsan/atomic.h             | 5 +++++
 kernel/kcsan/core.c               | 5 +++++
 kernel/kcsan/debugfs.c            | 5 +++++
 kernel/kcsan/encoding.h           | 5 +++++
 kernel/kcsan/kcsan.h              | 3 ++-
 kernel/kcsan/report.c             | 5 +++++
 kernel/kcsan/selftest.c           | 5 +++++
 10 files changed, 48 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/Documentation/dev-tools/kcsan.rst b/Documentation/dev-tools/kcsan.rst
index be7a0b0e1f28..d85ce238ace7 100644
--- a/Documentation/dev-tools/kcsan.rst
+++ b/Documentation/dev-tools/kcsan.rst
@@ -1,3 +1,6 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. Copyright (C) 2019, Google LLC.
+
 The Kernel Concurrency Sanitizer (KCSAN)
 ========================================
 
diff --git a/include/linux/kcsan-checks.h b/include/linux/kcsan-checks.h
index cf14840609ce..9fd0ad80fef6 100644
--- a/include/linux/kcsan-checks.h
+++ b/include/linux/kcsan-checks.h
@@ -1,4 +1,10 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * KCSAN access checks and modifiers. These can be used to explicitly check
+ * uninstrumented accesses, or change KCSAN checking behaviour of accesses.
+ *
+ * Copyright (C) 2019, Google LLC.
+ */
 
 #ifndef _LINUX_KCSAN_CHECKS_H
 #define _LINUX_KCSAN_CHECKS_H
diff --git a/include/linux/kcsan.h b/include/linux/kcsan.h
index 53340d8789f9..fc266ecb2a4d 100644
--- a/include/linux/kcsan.h
+++ b/include/linux/kcsan.h
@@ -1,4 +1,11 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * The Kernel Concurrency Sanitizer (KCSAN) infrastructure. Public interface and
+ * data structures to set up runtime. See kcsan-checks.h for explicit checks and
+ * modifiers. For more info please see Documentation/dev-tools/kcsan.rst.
+ *
+ * Copyright (C) 2019, Google LLC.
+ */
 
 #ifndef _LINUX_KCSAN_H
 #define _LINUX_KCSAN_H
diff --git a/kernel/kcsan/atomic.h b/kernel/kcsan/atomic.h
index 75fe701f4127..530ae1bda8e7 100644
--- a/kernel/kcsan/atomic.h
+++ b/kernel/kcsan/atomic.h
@@ -1,4 +1,9 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Rules for implicitly atomic memory accesses.
+ *
+ * Copyright (C) 2019, Google LLC.
+ */
 
 #ifndef _KERNEL_KCSAN_ATOMIC_H
 #define _KERNEL_KCSAN_ATOMIC_H
diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c
index 23e7acb5c667..45c821d4e8bd 100644
--- a/kernel/kcsan/core.c
+++ b/kernel/kcsan/core.c
@@ -1,4 +1,9 @@
 // SPDX-License-Identifier: GPL-2.0
+/*
+ * KCSAN core runtime.
+ *
+ * Copyright (C) 2019, Google LLC.
+ */
 
 #define pr_fmt(fmt) "kcsan: " fmt
 
diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c
index 209ad8dcfcec..c1dd02f3be8b 100644
--- a/kernel/kcsan/debugfs.c
+++ b/kernel/kcsan/debugfs.c
@@ -1,4 +1,9 @@
 // SPDX-License-Identifier: GPL-2.0
+/*
+ * KCSAN debugfs interface.
+ *
+ * Copyright (C) 2019, Google LLC.
+ */
 
 #define pr_fmt(fmt) "kcsan: " fmt
 
diff --git a/kernel/kcsan/encoding.h b/kernel/kcsan/encoding.h
index 7ee405524904..170a2bb22f53 100644
--- a/kernel/kcsan/encoding.h
+++ b/kernel/kcsan/encoding.h
@@ -1,4 +1,9 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * KCSAN watchpoint encoding.
+ *
+ * Copyright (C) 2019, Google LLC.
+ */
 
 #ifndef _KERNEL_KCSAN_ENCODING_H
 #define _KERNEL_KCSAN_ENCODING_H
diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h
index 87ccdb3b051f..9881099d4179 100644
--- a/kernel/kcsan/kcsan.h
+++ b/kernel/kcsan/kcsan.h
@@ -1,8 +1,9 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-
 /*
  * The Kernel Concurrency Sanitizer (KCSAN) infrastructure. For more info please
  * see Documentation/dev-tools/kcsan.rst.
+ *
+ * Copyright (C) 2019, Google LLC.
  */
 
 #ifndef _KERNEL_KCSAN_KCSAN_H
diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c
index d3bf87e6007c..13dce3c664d6 100644
--- a/kernel/kcsan/report.c
+++ b/kernel/kcsan/report.c
@@ -1,4 +1,9 @@
 // SPDX-License-Identifier: GPL-2.0
+/*
+ * KCSAN reporting.
+ *
+ * Copyright (C) 2019, Google LLC.
+ */
 
 #include <linux/debug_locks.h>
 #include <linux/delay.h>
diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c
index 9014a3a82cf9..7f29cb0f5e63 100644
--- a/kernel/kcsan/selftest.c
+++ b/kernel/kcsan/selftest.c
@@ -1,4 +1,9 @@
 // SPDX-License-Identifier: GPL-2.0
+/*
+ * KCSAN short boot-time selftests.
+ *
+ * Copyright (C) 2019, Google LLC.
+ */
 
 #define pr_fmt(fmt) "kcsan: " fmt
 
-- 
cgit v1.2.3


From 1019d7923d9d4cc878a1a85d4fc2d6619cfe1a6a Mon Sep 17 00:00:00 2001
From: Tong Zhang <ztong0001@gmail.com>
Date: Sun, 7 Mar 2021 22:25:28 -0500
Subject: atm: fix a typo in the struct description

phy_data means private PHY data not date

Signed-off-by: Tong Zhang <ztong0001@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/atmdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/atmdev.h b/include/linux/atmdev.h
index 60cd25c0461b..9b02961d65ee 100644
--- a/include/linux/atmdev.h
+++ b/include/linux/atmdev.h
@@ -151,7 +151,7 @@ struct atm_dev {
 	const char	*type;		/* device type name */
 	int		number;		/* device index */
 	void		*dev_data;	/* per-device data */
-	void		*phy_data;	/* private PHY date */
+	void		*phy_data;	/* private PHY data */
 	unsigned long	flags;		/* device flags (ATM_DF_*) */
 	struct list_head local;		/* local ATM addresses */
 	struct list_head lecs;		/* LECS ATM addresses learned via ILMI */
-- 
cgit v1.2.3


From 1224451bb6f938023dd7fa4e7ba43bfb185bc9e3 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Mon, 8 Mar 2021 14:30:37 +0100
Subject: PM / devfreq: Register devfreq as a cooling device on demand

Currently the default behavior is to have each devfreq backend
manually register itself as a devfreq cooling device.

Instead of adding the code in the drivers for the thermal cooling
device registering, let's provide a flag in the devfreq's profile to
tell the common devfreq code to register the newly created devfreq as
a cooling device.

Suggested-by: Chanwoo Choi <cwchoi00@gmail.com>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Reviewed-by: Steven Price <steven.price@arm.com>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
---
 drivers/devfreq/devfreq.c | 9 +++++++++
 include/linux/devfreq.h   | 9 +++++++++
 2 files changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c
index bf3047896e41..b6d3e7db0b09 100644
--- a/drivers/devfreq/devfreq.c
+++ b/drivers/devfreq/devfreq.c
@@ -11,6 +11,7 @@
 #include <linux/kmod.h>
 #include <linux/sched.h>
 #include <linux/debugfs.h>
+#include <linux/devfreq_cooling.h>
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/init.h>
@@ -935,6 +936,12 @@ struct devfreq *devfreq_add_device(struct device *dev,
 
 	mutex_unlock(&devfreq_list_lock);
 
+	if (devfreq->profile->is_cooling_device) {
+		devfreq->cdev = devfreq_cooling_em_register(devfreq, NULL);
+		if (IS_ERR(devfreq->cdev))
+			devfreq->cdev = NULL;
+	}
+
 	return devfreq;
 
 err_init:
@@ -960,6 +967,8 @@ int devfreq_remove_device(struct devfreq *devfreq)
 	if (!devfreq)
 		return -EINVAL;
 
+	devfreq_cooling_unregister(devfreq->cdev);
+
 	if (devfreq->governor) {
 		devfreq->governor->event_handler(devfreq,
 						 DEVFREQ_GOV_STOP, NULL);
diff --git a/include/linux/devfreq.h b/include/linux/devfreq.h
index 26ea0850be9b..142474b4af96 100644
--- a/include/linux/devfreq.h
+++ b/include/linux/devfreq.h
@@ -38,6 +38,7 @@ enum devfreq_timer {
 
 struct devfreq;
 struct devfreq_governor;
+struct thermal_cooling_device;
 
 /**
  * struct devfreq_dev_status - Data given from devfreq user device to
@@ -98,11 +99,15 @@ struct devfreq_dev_status {
  * @freq_table:		Optional list of frequencies to support statistics
  *			and freq_table must be generated in ascending order.
  * @max_state:		The size of freq_table.
+ *
+ * @is_cooling_device: A self-explanatory boolean giving the device a
+ *                     cooling effect property.
  */
 struct devfreq_dev_profile {
 	unsigned long initial_freq;
 	unsigned int polling_ms;
 	enum devfreq_timer timer;
+	bool is_cooling_device;
 
 	int (*target)(struct device *dev, unsigned long *freq, u32 flags);
 	int (*get_dev_status)(struct device *dev,
@@ -156,6 +161,7 @@ struct devfreq_stats {
  * @suspend_count:	 suspend requests counter for a device.
  * @stats:	Statistics of devfreq device behavior
  * @transition_notifier_list: list head of DEVFREQ_TRANSITION_NOTIFIER notifier
+ * @cdev:	Cooling device pointer if the devfreq has cooling property
  * @nb_min:		Notifier block for DEV_PM_QOS_MIN_FREQUENCY
  * @nb_max:		Notifier block for DEV_PM_QOS_MAX_FREQUENCY
  *
@@ -198,6 +204,9 @@ struct devfreq {
 
 	struct srcu_notifier_head transition_notifier_list;
 
+	/* Pointer to the cooling device if used for thermal mitigation */
+	struct thermal_cooling_device *cdev;
+
 	struct notifier_block nb_min;
 	struct notifier_block nb_max;
 };
-- 
cgit v1.2.3


From f38a1644832792ece8c63c9a5d9cd7122c62bf64 Mon Sep 17 00:00:00 2001
From: Ray Chi <raychi@google.com>
Date: Mon, 8 Mar 2021 21:31:46 +0800
Subject: power: supply: core: provide function stubs if CONFIG_POWER_SUPPLY=n

Fix build error when CONFIG_POWER_SUPPLY is not enabled.

The build error occurs in mips (cavium_octeon_defconfig).

mips-linux-gnu-ld: drivers/usb/dwc3/core.o: in function `dwc3_remove':
drivers/usb/dwc3/core.c:1657: undefined reference to `power_supply_put'
mips-linux-gnu-ld: drivers/usb/dwc3/core.o: in function `dwc3_get_properties':
drivers/usb/dwc3/core.c:1270: undefined reference to `power_supply_get_by_name'
mips-linux-gnu-ld: drivers/usb/dwc3/core.o: in function `dwc3_probe':
drivers/usb/dwc3/core.c:1632: undefined reference to `power_supply_put'

Fixes: 59fa3def35de ("usb: dwc3: add a power supply for current control")
Reported-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Signed-off-by: Ray Chi <raychi@google.com>
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 include/linux/power_supply.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index 81a55e974feb..6e776be5bfa0 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -381,8 +381,14 @@ struct power_supply_battery_info {
 extern struct atomic_notifier_head power_supply_notifier;
 extern int power_supply_reg_notifier(struct notifier_block *nb);
 extern void power_supply_unreg_notifier(struct notifier_block *nb);
+#if IS_ENABLED(CONFIG_POWER_SUPPLY)
 extern struct power_supply *power_supply_get_by_name(const char *name);
 extern void power_supply_put(struct power_supply *psy);
+#else
+static inline void power_supply_put(struct power_supply *psy) {}
+static inline struct power_supply *power_supply_get_by_name(const char *name)
+{ return NULL; }
+#endif
 #ifdef CONFIG_OF
 extern struct power_supply *power_supply_get_by_phandle(struct device_node *np,
 							const char *property);
-- 
cgit v1.2.3


From 05a68ce5fa51a83c360381630f823545c5757aa2 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Tue, 9 Mar 2021 10:50:28 -0800
Subject: bpf: Don't do bpf_cgroup_storage_set() for kuprobe/tp programs

For kuprobe and tracepoint bpf programs, kernel calls
trace_call_bpf() which calls BPF_PROG_RUN_ARRAY_CHECK()
to run the program array. Currently, BPF_PROG_RUN_ARRAY_CHECK()
also calls bpf_cgroup_storage_set() to set percpu
cgroup local storage with NULL value. This is
due to Commit 394e40a29788 ("bpf: extend bpf_prog_array to store
pointers to the cgroup storage") which modified
__BPF_PROG_RUN_ARRAY() to call bpf_cgroup_storage_set()
and this macro is also used by BPF_PROG_RUN_ARRAY_CHECK().

kuprobe and tracepoint programs are not allowed to call
the bpf_get_local_storage() helper and hence do not
access percpu cgroup local storage. Let us
change BPF_PROG_RUN_ARRAY_CHECK() not to
modify percpu cgroup local storage.

The issue is observed when I tried to debug [1] where
percpu data is overwritten due to
  preempt_disable -> migration_disable
change. This patch does not completely fix the above issue,
which will be addressed separately, e.g., multiple cgroup
prog runs may preempt each other. But it does fix
any potential issue caused by tracing program
overwriting percpu cgroup storage:
 - in a busy system, a tracing program is to run between
   bpf_cgroup_storage_set() and the cgroup prog run.
 - a kprobe program is triggered by a helper in cgroup prog
   before bpf_get_local_storage() is called.

 [1] https://lore.kernel.org/bpf/CAKH8qBuXCfUz=w8L+Fj74OaUpbosO29niYwTki7e3Ag044_aww@mail.gmail.com/T

Fixes: 394e40a29788 ("bpf: extend bpf_prog_array to store pointers to the cgroup storage")
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Roman Gushchin <guro@fb.com>
Link: https://lore.kernel.org/bpf/20210309185028.3763817-1-yhs@fb.com
---
 include/linux/bpf.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index cccaef1088ea..d7e0f479a5b0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1093,7 +1093,7 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
 		_ret;							\
 	 })
 
-#define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null)	\
+#define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null, set_cg_storage)	\
 	({						\
 		struct bpf_prog_array_item *_item;	\
 		struct bpf_prog *_prog;			\
@@ -1106,7 +1106,8 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
 			goto _out;			\
 		_item = &_array->items[0];		\
 		while ((_prog = READ_ONCE(_item->prog))) {		\
-			bpf_cgroup_storage_set(_item->cgroup_storage);	\
+			if (set_cg_storage)		\
+				bpf_cgroup_storage_set(_item->cgroup_storage);	\
 			_ret &= func(_prog, ctx);	\
 			_item++;			\
 		}					\
@@ -1153,10 +1154,10 @@ _out:							\
 	})
 
 #define BPF_PROG_RUN_ARRAY(array, ctx, func)		\
-	__BPF_PROG_RUN_ARRAY(array, ctx, func, false)
+	__BPF_PROG_RUN_ARRAY(array, ctx, func, false, true)
 
 #define BPF_PROG_RUN_ARRAY_CHECK(array, ctx, func)	\
-	__BPF_PROG_RUN_ARRAY(array, ctx, func, true)
+	__BPF_PROG_RUN_ARRAY(array, ctx, func, true, false)
 
 #ifdef CONFIG_BPF_SYSCALL
 DECLARE_PER_CPU(int, bpf_prog_active);
-- 
cgit v1.2.3


From e6a4750ffe9d701c4d55212b14b615e63571d235 Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Mon, 8 Mar 2021 12:29:06 +0100
Subject: bpf, xdp: Make bpf_redirect_map() a map operation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently the bpf_redirect_map() implementation dispatches to the
correct map-lookup function via a switch-statement. To avoid the
dispatching, this change adds bpf_redirect_map() as a map
operation. Each map provides its bpf_redirect_map() version, and
the correct function is automatically selected by the BPF verifier.

A nice side-effect of the code movement is that the map lookup
functions are now local to the map implementation files, which removes
one additional function call.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://lore.kernel.org/bpf/20210308112907.559576-2-bjorn.topel@gmail.com
---
 include/linux/bpf.h    | 26 ++++++--------------------
 include/linux/filter.h | 27 +++++++++++++++++++++++++++
 include/net/xdp_sock.h | 19 -------------------
 kernel/bpf/cpumap.c    |  8 +++++++-
 kernel/bpf/devmap.c    | 16 ++++++++++++++--
 kernel/bpf/verifier.c  | 13 +++++++++++--
 net/core/filter.c      | 39 +--------------------------------------
 net/xdp/xskmap.c       | 16 ++++++++++++++++
 8 files changed, 82 insertions(+), 82 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c931bc97019d..a25730eaa148 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -118,6 +118,9 @@ struct bpf_map_ops {
 					   void *owner, u32 size);
 	struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner);
 
+	/* Misc helpers.*/
+	int (*map_redirect)(struct bpf_map *map, u32 ifindex, u64 flags);
+
 	/* map_meta_equal must be implemented for maps that can be
 	 * used as an inner map.  It is a runtime check to ensure
 	 * an inner map can be inserted to an outer map.
@@ -1450,9 +1453,9 @@ struct btf *bpf_get_btf_vmlinux(void);
 /* Map specifics */
 struct xdp_buff;
 struct sk_buff;
+struct bpf_dtab_netdev;
+struct bpf_cpu_map_entry;
 
-struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
-struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key);
 void __dev_flush(void);
 int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
 		    struct net_device *dev_rx);
@@ -1462,7 +1465,6 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
 			     struct bpf_prog *xdp_prog);
 bool dev_map_can_have_prog(struct bpf_map *map);
 
-struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key);
 void __cpu_map_flush(void);
 int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
 		    struct net_device *dev_rx);
@@ -1593,17 +1595,6 @@ static inline int bpf_obj_get_user(const char __user *pathname, int flags)
 	return -EOPNOTSUPP;
 }
 
-static inline struct net_device  *__dev_map_lookup_elem(struct bpf_map *map,
-						       u32 key)
-{
-	return NULL;
-}
-
-static inline struct net_device  *__dev_map_hash_lookup_elem(struct bpf_map *map,
-							     u32 key)
-{
-	return NULL;
-}
 static inline bool dev_map_can_have_prog(struct bpf_map *map)
 {
 	return false;
@@ -1615,6 +1606,7 @@ static inline void __dev_flush(void)
 
 struct xdp_buff;
 struct bpf_dtab_netdev;
+struct bpf_cpu_map_entry;
 
 static inline
 int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
@@ -1639,12 +1631,6 @@ static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst,
 	return 0;
 }
 
-static inline
-struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
-{
-	return NULL;
-}
-
 static inline void __cpu_map_flush(void)
 {
 }
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 3b00fc906ccd..008691fd3b58 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1472,4 +1472,31 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol,
 }
 #endif /* IS_ENABLED(CONFIG_IPV6) */
 
+static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifindex, u64 flags,
+						  void *lookup_elem(struct bpf_map *map, u32 key))
+{
+	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+
+	/* Lower bits of the flags are used as return code on lookup failure */
+	if (unlikely(flags > XDP_TX))
+		return XDP_ABORTED;
+
+	ri->tgt_value = lookup_elem(map, ifindex);
+	if (unlikely(!ri->tgt_value)) {
+		/* If the lookup fails we want to clear out the state in the
+		 * redirect_info struct completely, so that if an eBPF program
+		 * performs multiple lookups, the last one always takes
+		 * precedence.
+		 */
+		WRITE_ONCE(ri->map, NULL);
+		return flags;
+	}
+
+	ri->flags = flags;
+	ri->tgt_index = ifindex;
+	WRITE_ONCE(ri->map, map);
+
+	return XDP_REDIRECT;
+}
+
 #endif /* __LINUX_FILTER_H__ */
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index cc17bc957548..9c0722c6d7ac 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -80,19 +80,6 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
 int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp);
 void __xsk_map_flush(void);
 
-static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map,
-						     u32 key)
-{
-	struct xsk_map *m = container_of(map, struct xsk_map, map);
-	struct xdp_sock *xs;
-
-	if (key >= map->max_entries)
-		return NULL;
-
-	xs = READ_ONCE(m->xsk_map[key]);
-	return xs;
-}
-
 #else
 
 static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
@@ -109,12 +96,6 @@ static inline void __xsk_map_flush(void)
 {
 }
 
-static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map,
-						     u32 key)
-{
-	return NULL;
-}
-
 #endif /* CONFIG_XDP_SOCKETS */
 
 #endif /* _LINUX_XDP_SOCK_H */
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 5d1469de6921..7352d4160b7f 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -563,7 +563,7 @@ static void cpu_map_free(struct bpf_map *map)
 	kfree(cmap);
 }
 
-struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
+static void *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
 {
 	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
 	struct bpf_cpu_map_entry *rcpu;
@@ -600,6 +600,11 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 	return 0;
 }
 
+static int cpu_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
+{
+	return __bpf_xdp_redirect_map(map, ifindex, flags, __cpu_map_lookup_elem);
+}
+
 static int cpu_map_btf_id;
 const struct bpf_map_ops cpu_map_ops = {
 	.map_meta_equal		= bpf_map_meta_equal,
@@ -612,6 +617,7 @@ const struct bpf_map_ops cpu_map_ops = {
 	.map_check_btf		= map_check_no_btf,
 	.map_btf_name		= "bpf_cpu_map",
 	.map_btf_id		= &cpu_map_btf_id,
+	.map_redirect		= cpu_map_redirect,
 };
 
 static void bq_flush_to_queue(struct xdp_bulk_queue *bq)
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 85d9d1b72a33..f7f42448259f 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -258,7 +258,7 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 	return 0;
 }
 
-struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
+static void *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 	struct hlist_head *head = dev_map_index_hash(dtab, key);
@@ -392,7 +392,7 @@ void __dev_flush(void)
  * update happens in parallel here a dev_put wont happen until after reading the
  * ifindex.
  */
-struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
+static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 	struct bpf_dtab_netdev *obj;
@@ -735,6 +735,16 @@ static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
 					 map, key, value, map_flags);
 }
 
+static int dev_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
+{
+	return __bpf_xdp_redirect_map(map, ifindex, flags, __dev_map_lookup_elem);
+}
+
+static int dev_hash_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
+{
+	return __bpf_xdp_redirect_map(map, ifindex, flags, __dev_map_hash_lookup_elem);
+}
+
 static int dev_map_btf_id;
 const struct bpf_map_ops dev_map_ops = {
 	.map_meta_equal = bpf_map_meta_equal,
@@ -747,6 +757,7 @@ const struct bpf_map_ops dev_map_ops = {
 	.map_check_btf = map_check_no_btf,
 	.map_btf_name = "bpf_dtab",
 	.map_btf_id = &dev_map_btf_id,
+	.map_redirect = dev_map_redirect,
 };
 
 static int dev_map_hash_map_btf_id;
@@ -761,6 +772,7 @@ const struct bpf_map_ops dev_map_hash_ops = {
 	.map_check_btf = map_check_no_btf,
 	.map_btf_name = "bpf_dtab",
 	.map_btf_id = &dev_map_hash_map_btf_id,
+	.map_redirect = dev_hash_map_redirect,
 };
 
 static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9fe90ce52a65..97eb0b2435b8 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5582,7 +5582,8 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
 	    func_id != BPF_FUNC_map_push_elem &&
 	    func_id != BPF_FUNC_map_pop_elem &&
 	    func_id != BPF_FUNC_map_peek_elem &&
-	    func_id != BPF_FUNC_for_each_map_elem)
+	    func_id != BPF_FUNC_for_each_map_elem &&
+	    func_id != BPF_FUNC_redirect_map)
 		return 0;
 
 	if (map == NULL) {
@@ -12017,7 +12018,8 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 		     insn->imm == BPF_FUNC_map_delete_elem ||
 		     insn->imm == BPF_FUNC_map_push_elem   ||
 		     insn->imm == BPF_FUNC_map_pop_elem    ||
-		     insn->imm == BPF_FUNC_map_peek_elem)) {
+		     insn->imm == BPF_FUNC_map_peek_elem   ||
+		     insn->imm == BPF_FUNC_redirect_map)) {
 			aux = &env->insn_aux_data[i + delta];
 			if (bpf_map_ptr_poisoned(aux))
 				goto patch_call_imm;
@@ -12059,6 +12061,9 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 				     (int (*)(struct bpf_map *map, void *value))NULL));
 			BUILD_BUG_ON(!__same_type(ops->map_peek_elem,
 				     (int (*)(struct bpf_map *map, void *value))NULL));
+			BUILD_BUG_ON(!__same_type(ops->map_redirect,
+				     (int (*)(struct bpf_map *map, u32 ifindex, u64 flags))NULL));
+
 patch_map_ops_generic:
 			switch (insn->imm) {
 			case BPF_FUNC_map_lookup_elem:
@@ -12085,6 +12090,10 @@ patch_map_ops_generic:
 				insn->imm = BPF_CAST_CALL(ops->map_peek_elem) -
 					    __bpf_call_base;
 				continue;
+			case BPF_FUNC_redirect_map:
+				insn->imm = BPF_CAST_CALL(ops->map_redirect) -
+					    __bpf_call_base;
+				continue;
 			}
 
 			goto patch_call_imm;
diff --git a/net/core/filter.c b/net/core/filter.c
index 588b19ba0da8..183b0aa6b027 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3943,22 +3943,6 @@ void xdp_do_flush(void)
 }
 EXPORT_SYMBOL_GPL(xdp_do_flush);
 
-static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
-{
-	switch (map->map_type) {
-	case BPF_MAP_TYPE_DEVMAP:
-		return __dev_map_lookup_elem(map, index);
-	case BPF_MAP_TYPE_DEVMAP_HASH:
-		return __dev_map_hash_lookup_elem(map, index);
-	case BPF_MAP_TYPE_CPUMAP:
-		return __cpu_map_lookup_elem(map, index);
-	case BPF_MAP_TYPE_XSKMAP:
-		return __xsk_map_lookup_elem(map, index);
-	default:
-		return NULL;
-	}
-}
-
 void bpf_clear_redirect_map(struct bpf_map *map)
 {
 	struct bpf_redirect_info *ri;
@@ -4112,28 +4096,7 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = {
 BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex,
 	   u64, flags)
 {
-	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
-
-	/* Lower bits of the flags are used as return code on lookup failure */
-	if (unlikely(flags > XDP_TX))
-		return XDP_ABORTED;
-
-	ri->tgt_value = __xdp_map_lookup_elem(map, ifindex);
-	if (unlikely(!ri->tgt_value)) {
-		/* If the lookup fails we want to clear out the state in the
-		 * redirect_info struct completely, so that if an eBPF program
-		 * performs multiple lookups, the last one always takes
-		 * precedence.
-		 */
-		WRITE_ONCE(ri->map, NULL);
-		return flags;
-	}
-
-	ri->flags = flags;
-	ri->tgt_index = ifindex;
-	WRITE_ONCE(ri->map, map);
-
-	return XDP_REDIRECT;
+	return map->ops->map_redirect(map, ifindex, flags);
 }
 
 static const struct bpf_func_proto bpf_xdp_redirect_map_proto = {
diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c
index 113fd9017203..fbeb4870f798 100644
--- a/net/xdp/xskmap.c
+++ b/net/xdp/xskmap.c
@@ -125,6 +125,16 @@ static int xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
 	return insn - insn_buf;
 }
 
+static void *__xsk_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+
+	if (key >= map->max_entries)
+		return NULL;
+
+	return READ_ONCE(m->xsk_map[key]);
+}
+
 static void *xsk_map_lookup_elem(struct bpf_map *map, void *key)
 {
 	WARN_ON_ONCE(!rcu_read_lock_held());
@@ -215,6 +225,11 @@ static int xsk_map_delete_elem(struct bpf_map *map, void *key)
 	return 0;
 }
 
+static int xsk_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
+{
+	return __bpf_xdp_redirect_map(map, ifindex, flags, __xsk_map_lookup_elem);
+}
+
 void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
 			     struct xdp_sock **map_entry)
 {
@@ -247,4 +262,5 @@ const struct bpf_map_ops xsk_map_ops = {
 	.map_check_btf = map_check_no_btf,
 	.map_btf_name = "xsk_map",
 	.map_btf_id = &xsk_map_btf_id,
+	.map_redirect = xsk_map_redirect,
 };
-- 
cgit v1.2.3


From ee75aef23afe6e88497151c127c13ed69f41aaa2 Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Mon, 8 Mar 2021 12:29:07 +0100
Subject: bpf, xdp: Restructure redirect actions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The XDP_REDIRECT implementations for maps and non-maps are fairly
similar, but obviously need to take different code paths depending on
if the target is using a map or not. Today, a redirect target for
XDP either uses a map, or is based on ifindex.

Here, the map type and id are added to bpf_redirect_info, instead of
the actual map. Map type, map item/ifindex, and the map_id (if any) are
passed to xdp_do_redirect().

For ifindex-based redirect, used by the bpf_redirect() XDP BPF helper,
a special map type/id pair is used. Map type of UNSPEC together with map id
equal to INT_MAX has the special meaning of an ifindex based
redirect. Note that valid map ids are 1 inclusive, INT_MAX exclusive
([1,INT_MAX[).

In addition to making the code easier to follow, using explicit type
and id in bpf_redirect_info has a slight positive performance impact
by avoiding a pointer indirection for the map type lookup, and instead
using the cacheline for bpf_redirect_info.

Since the actual map is not passed via bpf_redirect_info anymore, the
map lookup is only done in the BPF helper. This means that the
bpf_clear_redirect_map() function can be removed. The actual map item
is RCU protected.

The bpf_redirect_info flags member is not used by XDP, and not
read/written any more. The map member is only written to when
required/used, and not unconditionally.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://lore.kernel.org/bpf/20210308112907.559576-3-bjorn.topel@gmail.com
---
 include/linux/filter.h     |  10 +--
 include/trace/events/xdp.h |  62 ++++++++++-------
 kernel/bpf/cpumap.c        |   1 -
 kernel/bpf/devmap.c        |   1 -
 net/core/filter.c          | 170 ++++++++++++++++++++-------------------------
 net/xdp/xskmap.c           |   1 -
 6 files changed, 116 insertions(+), 129 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 008691fd3b58..b2b85b2cad8e 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -646,7 +646,8 @@ struct bpf_redirect_info {
 	u32 flags;
 	u32 tgt_index;
 	void *tgt_value;
-	struct bpf_map *map;
+	u32 map_id;
+	enum bpf_map_type map_type;
 	u32 kern_flags;
 	struct bpf_nh_params nh;
 };
@@ -1488,13 +1489,14 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifind
 		 * performs multiple lookups, the last one always takes
 		 * precedence.
 		 */
-		WRITE_ONCE(ri->map, NULL);
+		ri->map_id = INT_MAX; /* Valid map id idr range: [1,INT_MAX[ */
+		ri->map_type = BPF_MAP_TYPE_UNSPEC;
 		return flags;
 	}
 
-	ri->flags = flags;
 	ri->tgt_index = ifindex;
-	WRITE_ONCE(ri->map, map);
+	ri->map_id = map->id;
+	ri->map_type = map->map_type;
 
 	return XDP_REDIRECT;
 }
diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index 76a97176ab81..fcad3645a70b 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -86,19 +86,15 @@ struct _bpf_dtab_netdev {
 };
 #endif /* __DEVMAP_OBJ_TYPE */
 
-#define devmap_ifindex(tgt, map)				\
-	(((map->map_type == BPF_MAP_TYPE_DEVMAP ||	\
-		  map->map_type == BPF_MAP_TYPE_DEVMAP_HASH)) ? \
-	  ((struct _bpf_dtab_netdev *)tgt)->dev->ifindex : 0)
-
 DECLARE_EVENT_CLASS(xdp_redirect_template,
 
 	TP_PROTO(const struct net_device *dev,
 		 const struct bpf_prog *xdp,
 		 const void *tgt, int err,
-		 const struct bpf_map *map, u32 index),
+		 enum bpf_map_type map_type,
+		 u32 map_id, u32 index),
 
-	TP_ARGS(dev, xdp, tgt, err, map, index),
+	TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index),
 
 	TP_STRUCT__entry(
 		__field(int, prog_id)
@@ -111,14 +107,22 @@ DECLARE_EVENT_CLASS(xdp_redirect_template,
 	),
 
 	TP_fast_assign(
+		u32 ifindex = 0, map_index = index;
+
+		if (map_type == BPF_MAP_TYPE_DEVMAP || map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
+			ifindex = ((struct _bpf_dtab_netdev *)tgt)->dev->ifindex;
+		} else if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) {
+			ifindex = index;
+			map_index = 0;
+		}
+
 		__entry->prog_id	= xdp->aux->id;
 		__entry->act		= XDP_REDIRECT;
 		__entry->ifindex	= dev->ifindex;
 		__entry->err		= err;
-		__entry->to_ifindex	= map ? devmap_ifindex(tgt, map) :
-						index;
-		__entry->map_id		= map ? map->id : 0;
-		__entry->map_index	= map ? index : 0;
+		__entry->to_ifindex	= ifindex;
+		__entry->map_id		= map_id;
+		__entry->map_index	= map_index;
 	),
 
 	TP_printk("prog_id=%d action=%s ifindex=%d to_ifindex=%d err=%d"
@@ -133,45 +137,49 @@ DEFINE_EVENT(xdp_redirect_template, xdp_redirect,
 	TP_PROTO(const struct net_device *dev,
 		 const struct bpf_prog *xdp,
 		 const void *tgt, int err,
-		 const struct bpf_map *map, u32 index),
-	TP_ARGS(dev, xdp, tgt, err, map, index)
+		 enum bpf_map_type map_type,
+		 u32 map_id, u32 index),
+	TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index)
 );
 
 DEFINE_EVENT(xdp_redirect_template, xdp_redirect_err,
 	TP_PROTO(const struct net_device *dev,
 		 const struct bpf_prog *xdp,
 		 const void *tgt, int err,
-		 const struct bpf_map *map, u32 index),
-	TP_ARGS(dev, xdp, tgt, err, map, index)
+		 enum bpf_map_type map_type,
+		 u32 map_id, u32 index),
+	TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index)
 );
 
-#define _trace_xdp_redirect(dev, xdp, to)				\
-	 trace_xdp_redirect(dev, xdp, NULL, 0, NULL, to)
+#define _trace_xdp_redirect(dev, xdp, to)						\
+	 trace_xdp_redirect(dev, xdp, NULL, 0, BPF_MAP_TYPE_UNSPEC, INT_MAX, to)
 
-#define _trace_xdp_redirect_err(dev, xdp, to, err)			\
-	 trace_xdp_redirect_err(dev, xdp, NULL, err, NULL, to)
+#define _trace_xdp_redirect_err(dev, xdp, to, err)					\
+	 trace_xdp_redirect_err(dev, xdp, NULL, err, BPF_MAP_TYPE_UNSPEC, INT_MAX, to)
 
-#define _trace_xdp_redirect_map(dev, xdp, to, map, index)		\
-	 trace_xdp_redirect(dev, xdp, to, 0, map, index)
+#define _trace_xdp_redirect_map(dev, xdp, to, map_type, map_id, index) \
+	 trace_xdp_redirect(dev, xdp, to, 0, map_type, map_id, index)
 
-#define _trace_xdp_redirect_map_err(dev, xdp, to, map, index, err)	\
-	 trace_xdp_redirect_err(dev, xdp, to, err, map, index)
+#define _trace_xdp_redirect_map_err(dev, xdp, to, map_type, map_id, index, err) \
+	 trace_xdp_redirect_err(dev, xdp, to, err, map_type, map_id, index)
 
 /* not used anymore, but kept around so as not to break old programs */
 DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map,
 	TP_PROTO(const struct net_device *dev,
 		 const struct bpf_prog *xdp,
 		 const void *tgt, int err,
-		 const struct bpf_map *map, u32 index),
-	TP_ARGS(dev, xdp, tgt, err, map, index)
+		 enum bpf_map_type map_type,
+		 u32 map_id, u32 index),
+	TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index)
 );
 
 DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map_err,
 	TP_PROTO(const struct net_device *dev,
 		 const struct bpf_prog *xdp,
 		 const void *tgt, int err,
-		 const struct bpf_map *map, u32 index),
-	TP_ARGS(dev, xdp, tgt, err, map, index)
+		 enum bpf_map_type map_type,
+		 u32 map_id, u32 index),
+	TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index)
 );
 
 TRACE_EVENT(xdp_cpumap_kthread,
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 7352d4160b7f..0cf2791d5099 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -543,7 +543,6 @@ static void cpu_map_free(struct bpf_map *map)
 	 * complete.
 	 */
 
-	bpf_clear_redirect_map(map);
 	synchronize_rcu();
 
 	/* For cpu_map the remote CPUs can still be using the entries
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index f7f42448259f..7a5ad7331c3b 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -197,7 +197,6 @@ static void dev_map_free(struct bpf_map *map)
 	list_del_rcu(&dtab->list);
 	spin_unlock(&dev_map_lock);
 
-	bpf_clear_redirect_map(map);
 	synchronize_rcu();
 
 	/* Make sure prior __dev_map_entry_free() have completed. */
diff --git a/net/core/filter.c b/net/core/filter.c
index 183b0aa6b027..b6732000d8a2 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3918,23 +3918,6 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 
-static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
-			    struct bpf_map *map, struct xdp_buff *xdp)
-{
-	switch (map->map_type) {
-	case BPF_MAP_TYPE_DEVMAP:
-	case BPF_MAP_TYPE_DEVMAP_HASH:
-		return dev_map_enqueue(fwd, xdp, dev_rx);
-	case BPF_MAP_TYPE_CPUMAP:
-		return cpu_map_enqueue(fwd, xdp, dev_rx);
-	case BPF_MAP_TYPE_XSKMAP:
-		return __xsk_map_redirect(fwd, xdp);
-	default:
-		return -EBADRQC;
-	}
-	return 0;
-}
-
 void xdp_do_flush(void)
 {
 	__dev_flush();
@@ -3943,55 +3926,52 @@ void xdp_do_flush(void)
 }
 EXPORT_SYMBOL_GPL(xdp_do_flush);
 
-void bpf_clear_redirect_map(struct bpf_map *map)
-{
-	struct bpf_redirect_info *ri;
-	int cpu;
-
-	for_each_possible_cpu(cpu) {
-		ri = per_cpu_ptr(&bpf_redirect_info, cpu);
-		/* Avoid polluting remote cacheline due to writes if
-		 * not needed. Once we pass this test, we need the
-		 * cmpxchg() to make sure it hasn't been changed in
-		 * the meantime by remote CPU.
-		 */
-		if (unlikely(READ_ONCE(ri->map) == map))
-			cmpxchg(&ri->map, map, NULL);
-	}
-}
-
 int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
 		    struct bpf_prog *xdp_prog)
 {
 	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
-	struct bpf_map *map = READ_ONCE(ri->map);
-	u32 index = ri->tgt_index;
+	enum bpf_map_type map_type = ri->map_type;
 	void *fwd = ri->tgt_value;
+	u32 map_id = ri->map_id;
 	int err;
 
-	ri->tgt_index = 0;
-	ri->tgt_value = NULL;
-	WRITE_ONCE(ri->map, NULL);
+	ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
+	ri->map_type = BPF_MAP_TYPE_UNSPEC;
 
-	if (unlikely(!map)) {
-		fwd = dev_get_by_index_rcu(dev_net(dev), index);
-		if (unlikely(!fwd)) {
-			err = -EINVAL;
-			goto err;
+	switch (map_type) {
+	case BPF_MAP_TYPE_DEVMAP:
+		fallthrough;
+	case BPF_MAP_TYPE_DEVMAP_HASH:
+		err = dev_map_enqueue(fwd, xdp, dev);
+		break;
+	case BPF_MAP_TYPE_CPUMAP:
+		err = cpu_map_enqueue(fwd, xdp, dev);
+		break;
+	case BPF_MAP_TYPE_XSKMAP:
+		err = __xsk_map_redirect(fwd, xdp);
+		break;
+	case BPF_MAP_TYPE_UNSPEC:
+		if (map_id == INT_MAX) {
+			fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index);
+			if (unlikely(!fwd)) {
+				err = -EINVAL;
+				break;
+			}
+			err = dev_xdp_enqueue(fwd, xdp, dev);
+			break;
 		}
-
-		err = dev_xdp_enqueue(fwd, xdp, dev);
-	} else {
-		err = __bpf_tx_xdp_map(dev, fwd, map, xdp);
+		fallthrough;
+	default:
+		err = -EBADRQC;
 	}
 
 	if (unlikely(err))
 		goto err;
 
-	_trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index);
+	_trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
 	return 0;
 err:
-	_trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err);
+	_trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
 	return err;
 }
 EXPORT_SYMBOL_GPL(xdp_do_redirect);
@@ -4000,41 +3980,36 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
 				       struct sk_buff *skb,
 				       struct xdp_buff *xdp,
 				       struct bpf_prog *xdp_prog,
-				       struct bpf_map *map)
+				       void *fwd,
+				       enum bpf_map_type map_type, u32 map_id)
 {
 	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
-	u32 index = ri->tgt_index;
-	void *fwd = ri->tgt_value;
-	int err = 0;
-
-	ri->tgt_index = 0;
-	ri->tgt_value = NULL;
-	WRITE_ONCE(ri->map, NULL);
-
-	if (map->map_type == BPF_MAP_TYPE_DEVMAP ||
-	    map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
-		struct bpf_dtab_netdev *dst = fwd;
+	int err;
 
-		err = dev_map_generic_redirect(dst, skb, xdp_prog);
+	switch (map_type) {
+	case BPF_MAP_TYPE_DEVMAP:
+		fallthrough;
+	case BPF_MAP_TYPE_DEVMAP_HASH:
+		err = dev_map_generic_redirect(fwd, skb, xdp_prog);
 		if (unlikely(err))
 			goto err;
-	} else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
-		struct xdp_sock *xs = fwd;
-
-		err = xsk_generic_rcv(xs, xdp);
+		break;
+	case BPF_MAP_TYPE_XSKMAP:
+		err = xsk_generic_rcv(fwd, xdp);
 		if (err)
 			goto err;
 		consume_skb(skb);
-	} else {
+		break;
+	default:
 		/* TODO: Handle BPF_MAP_TYPE_CPUMAP */
 		err = -EBADRQC;
 		goto err;
 	}
 
-	_trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index);
+	_trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
 	return 0;
 err:
-	_trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err);
+	_trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
 	return err;
 }
 
@@ -4042,31 +4017,34 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
 			    struct xdp_buff *xdp, struct bpf_prog *xdp_prog)
 {
 	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
-	struct bpf_map *map = READ_ONCE(ri->map);
-	u32 index = ri->tgt_index;
-	struct net_device *fwd;
-	int err = 0;
-
-	if (map)
-		return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog,
-						   map);
-	ri->tgt_index = 0;
-	fwd = dev_get_by_index_rcu(dev_net(dev), index);
-	if (unlikely(!fwd)) {
-		err = -EINVAL;
-		goto err;
-	}
+	enum bpf_map_type map_type = ri->map_type;
+	void *fwd = ri->tgt_value;
+	u32 map_id = ri->map_id;
+	int err;
 
-	err = xdp_ok_fwd_dev(fwd, skb->len);
-	if (unlikely(err))
-		goto err;
+	ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
+	ri->map_type = BPF_MAP_TYPE_UNSPEC;
 
-	skb->dev = fwd;
-	_trace_xdp_redirect(dev, xdp_prog, index);
-	generic_xdp_tx(skb, xdp_prog);
-	return 0;
+	if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) {
+		fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index);
+		if (unlikely(!fwd)) {
+			err = -EINVAL;
+			goto err;
+		}
+
+		err = xdp_ok_fwd_dev(fwd, skb->len);
+		if (unlikely(err))
+			goto err;
+
+		skb->dev = fwd;
+		_trace_xdp_redirect(dev, xdp_prog, ri->tgt_index);
+		generic_xdp_tx(skb, xdp_prog);
+		return 0;
+	}
+
+	return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id);
 err:
-	_trace_xdp_redirect_err(dev, xdp_prog, index, err);
+	_trace_xdp_redirect_err(dev, xdp_prog, ri->tgt_index, err);
 	return err;
 }
 
@@ -4077,10 +4055,12 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
 	if (unlikely(flags))
 		return XDP_ABORTED;
 
-	ri->flags = flags;
+	/* NB! Map type UNSPEC and map_id == INT_MAX (never generated
+	 * by map_idr) is used for ifindex based XDP redirect.
+	 */
 	ri->tgt_index = ifindex;
-	ri->tgt_value = NULL;
-	WRITE_ONCE(ri->map, NULL);
+	ri->map_id = INT_MAX;
+	ri->map_type = BPF_MAP_TYPE_UNSPEC;
 
 	return XDP_REDIRECT;
 }
diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c
index fbeb4870f798..67b4ce504852 100644
--- a/net/xdp/xskmap.c
+++ b/net/xdp/xskmap.c
@@ -87,7 +87,6 @@ static void xsk_map_free(struct bpf_map *map)
 {
 	struct xsk_map *m = container_of(map, struct xsk_map, map);
 
-	bpf_clear_redirect_map(map);
 	synchronize_net();
 	bpf_map_area_free(m);
 }
-- 
cgit v1.2.3


From 924a9bc362a5223cd448ca08c3dde21235adc310 Mon Sep 17 00:00:00 2001
From: Balazs Nemeth <bnemeth@redhat.com>
Date: Tue, 9 Mar 2021 12:31:00 +0100
Subject: net: check if protocol extracted by virtio_net_hdr_set_proto is
 correct

For gso packets, virtio_net_hdr_set_proto sets the protocol (if it isn't
set) based on the type in the virtio net hdr, but the skb could contain
anything since it could come from packet_snd through a raw socket. If
there is a mismatch between what virtio_net_hdr_set_proto sets and
the actual protocol, then the skb could be handled incorrectly later
on.

An example where this poses an issue is with the subsequent call to
skb_flow_dissect_flow_keys_basic which relies on skb->protocol being set
correctly. A specially crafted packet could fool
skb_flow_dissect_flow_keys_basic, preventing EINVAL from being returned.

Avoid blindly trusting the information provided by the virtio net header
by checking that the protocol in the packet actually matches the
protocol set by virtio_net_hdr_set_proto. Note that since the protocol
is only checked if skb->dev implements header_ops->parse_protocol,
packets from devices without the implementation are not checked at this
stage.

Fixes: 9274124f023b ("net: stricter validation of untrusted gso packets")
Signed-off-by: Balazs Nemeth <bnemeth@redhat.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/virtio_net.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index e8a924eeea3d..6b5fcfa1e555 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -79,8 +79,13 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
 		if (gso_type && skb->network_header) {
 			struct flow_keys_basic keys;
 
-			if (!skb->protocol)
+			if (!skb->protocol) {
+				__be16 protocol = dev_parse_header_protocol(skb);
+
 				virtio_net_hdr_set_proto(skb, hdr);
+				if (protocol && protocol != skb->protocol)
+					return -EINVAL;
+			}
 retry:
 			if (!skb_flow_dissect_flow_keys_basic(NULL, skb, &keys,
 							      NULL, 0, 0, 0,
-- 
cgit v1.2.3


From eec73529a9321616ed13cf732cd21a17eb1a2836 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Wed, 10 Mar 2021 08:16:40 +0530
Subject: arch_topology: Rename freq_scale as arch_freq_scale

Rename freq_scale to a less generic name, as it will get exported soon
for modules. Since x86 already names its own implementation of this as
arch_freq_scale, let's stick to that.

Suggested-by: Will Deacon <will@kernel.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 arch/arm64/kernel/topology.c  | 6 +++---
 drivers/base/arch_topology.c  | 4 ++--
 include/linux/arch_topology.h | 4 ++--
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index e08a4126453a..aa8d893619ed 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -256,8 +256,8 @@ static int init_amu_fie_callback(struct notifier_block *nb, unsigned long val,
 	 * initialized AMU support and enabled invariance. The AMU counters will
 	 * keep on working just fine in the absence of the cpufreq driver, and
 	 * for the CPUs for which there are no counters available, the last set
-	 * value of freq_scale will remain valid as that is the frequency those
-	 * CPUs are running at.
+	 * value of arch_freq_scale will remain valid as that is the frequency
+	 * those CPUs are running at.
 	 */
 
 	return 0;
@@ -327,7 +327,7 @@ void topology_scale_freq_tick(void)
 			  const_cnt - prev_const_cnt);
 
 	scale = min_t(unsigned long, scale, SCHED_CAPACITY_SCALE);
-	this_cpu_write(freq_scale, (unsigned long)scale);
+	this_cpu_write(arch_freq_scale, (unsigned long)scale);
 }
 
 #ifdef CONFIG_ACPI_CPPC_LIB
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index de8587cc119e..2a1cecbde0a4 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -31,7 +31,7 @@ __weak bool arch_freq_counters_available(const struct cpumask *cpus)
 {
 	return false;
 }
-DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE;
+DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
 
 void topology_set_freq_scale(const struct cpumask *cpus, unsigned long cur_freq,
 			     unsigned long max_freq)
@@ -53,7 +53,7 @@ void topology_set_freq_scale(const struct cpumask *cpus, unsigned long cur_freq,
 	scale = (cur_freq << SCHED_CAPACITY_SHIFT) / max_freq;
 
 	for_each_cpu(i, cpus)
-		per_cpu(freq_scale, i) = scale;
+		per_cpu(arch_freq_scale, i) = scale;
 }
 
 DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE;
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index 0f6cd6b73a61..583af517f123 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -23,11 +23,11 @@ static inline unsigned long topology_get_cpu_scale(int cpu)
 
 void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity);
 
-DECLARE_PER_CPU(unsigned long, freq_scale);
+DECLARE_PER_CPU(unsigned long, arch_freq_scale);
 
 static inline unsigned long topology_get_freq_scale(int cpu)
 {
-	return per_cpu(freq_scale, cpu);
+	return per_cpu(arch_freq_scale, cpu);
 }
 
 void topology_set_freq_scale(const struct cpumask *cpus, unsigned long cur_freq,
-- 
cgit v1.2.3


From 01e055c120a46e78650b5f903088badbbdaae9ad Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Wed, 10 Mar 2021 08:21:04 +0530
Subject: arch_topology: Allow multiple entities to provide sched_freq_tick()
 callback

This patch attempts to make it generic enough so other parts of the
kernel can also provide their own implementation of scale_freq_tick()
callback, which is called by the scheduler periodically to update the
per-cpu arch_freq_scale variable.

The implementations now need to provide 'struct scale_freq_data' for the
CPUs for which they have hardware counters available, and a callback
gets registered for each possible CPU in a per-cpu variable.

The arch specific (or ARM AMU) counters are updated to adapt to this and
they take the highest priority if they are available, i.e. they will be
used instead of CPPC based counters for example.

The special code to rebuild the sched domains, in case the invariance
status changes for the system, is moved out of arm64 specific code and is
added to arch_topology.c.

Note that this also defines SCALE_FREQ_SOURCE_CPUFREQ but doesn't use it;
it is added to show that cpufreq also acts as a source of
information for FIE and will be used by default if no other counters are
supported for a platform.

Reviewed-by: Ionela Voinescu <ionela.voinescu@arm.com>
Tested-by: Ionela Voinescu <ionela.voinescu@arm.com>
Acked-by: Will Deacon <will@kernel.org> # for arm64
Tested-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 arch/arm64/include/asm/topology.h |  10 +---
 arch/arm64/kernel/topology.c      | 105 ++++++++++++++------------------------
 drivers/base/arch_topology.c      |  82 +++++++++++++++++++++++++++--
 include/linux/arch_topology.h     |  14 ++++-
 4 files changed, 131 insertions(+), 80 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h
index 3b8dca4eb08d..ec2db3419c41 100644
--- a/arch/arm64/include/asm/topology.h
+++ b/arch/arm64/include/asm/topology.h
@@ -17,17 +17,9 @@ int pcibus_to_node(struct pci_bus *bus);
 #include <linux/arch_topology.h>
 
 void update_freq_counters_refs(void);
-void topology_scale_freq_tick(void);
-
-#ifdef CONFIG_ARM64_AMU_EXTN
-/*
- * Replace task scheduler's default counter-based
- * frequency-invariance scale factor setting.
- */
-#define arch_scale_freq_tick topology_scale_freq_tick
-#endif /* CONFIG_ARM64_AMU_EXTN */
 
 /* Replace task scheduler's default frequency-invariant accounting */
+#define arch_scale_freq_tick topology_scale_freq_tick
 #define arch_set_freq_scale topology_set_freq_scale
 #define arch_scale_freq_capacity topology_get_freq_scale
 #define arch_scale_freq_invariant topology_scale_freq_invariant
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index aa8d893619ed..4dd14a6620c1 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -199,12 +199,47 @@ static int freq_inv_set_max_ratio(int cpu, u64 max_rate, u64 ref_rate)
 	return 0;
 }
 
-static DEFINE_STATIC_KEY_FALSE(amu_fie_key);
-#define amu_freq_invariant() static_branch_unlikely(&amu_fie_key)
+static void amu_scale_freq_tick(void)
+{
+	u64 prev_core_cnt, prev_const_cnt;
+	u64 core_cnt, const_cnt, scale;
+
+	prev_const_cnt = this_cpu_read(arch_const_cycles_prev);
+	prev_core_cnt = this_cpu_read(arch_core_cycles_prev);
+
+	update_freq_counters_refs();
+
+	const_cnt = this_cpu_read(arch_const_cycles_prev);
+	core_cnt = this_cpu_read(arch_core_cycles_prev);
+
+	if (unlikely(core_cnt <= prev_core_cnt ||
+		     const_cnt <= prev_const_cnt))
+		return;
+
+	/*
+	 *	    /\core    arch_max_freq_scale
+	 * scale =  ------- * --------------------
+	 *	    /\const   SCHED_CAPACITY_SCALE
+	 *
+	 * See validate_cpu_freq_invariance_counters() for details on
+	 * arch_max_freq_scale and the use of SCHED_CAPACITY_SHIFT.
+	 */
+	scale = core_cnt - prev_core_cnt;
+	scale *= this_cpu_read(arch_max_freq_scale);
+	scale = div64_u64(scale >> SCHED_CAPACITY_SHIFT,
+			  const_cnt - prev_const_cnt);
+
+	scale = min_t(unsigned long, scale, SCHED_CAPACITY_SCALE);
+	this_cpu_write(arch_freq_scale, (unsigned long)scale);
+}
+
+static struct scale_freq_data amu_sfd = {
+	.source = SCALE_FREQ_SOURCE_ARCH,
+	.set_freq_scale = amu_scale_freq_tick,
+};
 
 static void amu_fie_setup(const struct cpumask *cpus)
 {
-	bool invariant;
 	int cpu;
 
 	/* We are already set since the last insmod of cpufreq driver */
@@ -221,25 +256,10 @@ static void amu_fie_setup(const struct cpumask *cpus)
 
 	cpumask_or(amu_fie_cpus, amu_fie_cpus, cpus);
 
-	invariant = topology_scale_freq_invariant();
-
-	/* We aren't fully invariant yet */
-	if (!invariant && !cpumask_equal(amu_fie_cpus, cpu_present_mask))
-		return;
-
-	static_branch_enable(&amu_fie_key);
+	topology_set_scale_freq_source(&amu_sfd, amu_fie_cpus);
 
 	pr_debug("CPUs[%*pbl]: counters will be used for FIE.",
 		 cpumask_pr_args(cpus));
-
-	/*
-	 * Task scheduler behavior depends on frequency invariance support,
-	 * either cpufreq or counter driven. If the support status changes as
-	 * a result of counter initialisation and use, retrigger the build of
-	 * scheduling domains to ensure the information is propagated properly.
-	 */
-	if (!invariant)
-		rebuild_sched_domains_energy();
 }
 
 static int init_amu_fie_callback(struct notifier_block *nb, unsigned long val,
@@ -283,53 +303,6 @@ static int __init init_amu_fie(void)
 }
 core_initcall(init_amu_fie);
 
-bool arch_freq_counters_available(const struct cpumask *cpus)
-{
-	return amu_freq_invariant() &&
-	       cpumask_subset(cpus, amu_fie_cpus);
-}
-
-void topology_scale_freq_tick(void)
-{
-	u64 prev_core_cnt, prev_const_cnt;
-	u64 core_cnt, const_cnt, scale;
-	int cpu = smp_processor_id();
-
-	if (!amu_freq_invariant())
-		return;
-
-	if (!cpumask_test_cpu(cpu, amu_fie_cpus))
-		return;
-
-	prev_const_cnt = this_cpu_read(arch_const_cycles_prev);
-	prev_core_cnt = this_cpu_read(arch_core_cycles_prev);
-
-	update_freq_counters_refs();
-
-	const_cnt = this_cpu_read(arch_const_cycles_prev);
-	core_cnt = this_cpu_read(arch_core_cycles_prev);
-
-	if (unlikely(core_cnt <= prev_core_cnt ||
-		     const_cnt <= prev_const_cnt))
-		return;
-
-	/*
-	 *	    /\core    arch_max_freq_scale
-	 * scale =  ------- * --------------------
-	 *	    /\const   SCHED_CAPACITY_SCALE
-	 *
-	 * See validate_cpu_freq_invariance_counters() for details on
-	 * arch_max_freq_scale and the use of SCHED_CAPACITY_SHIFT.
-	 */
-	scale = core_cnt - prev_core_cnt;
-	scale *= this_cpu_read(arch_max_freq_scale);
-	scale = div64_u64(scale >> SCHED_CAPACITY_SHIFT,
-			  const_cnt - prev_const_cnt);
-
-	scale = min_t(unsigned long, scale, SCHED_CAPACITY_SCALE);
-	this_cpu_write(arch_freq_scale, (unsigned long)scale);
-}
-
 #ifdef CONFIG_ACPI_CPPC_LIB
 #include <acpi/cppc_acpi.h>
 
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index 2a1cecbde0a4..ebcd2ea3091f 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -21,16 +21,90 @@
 #include <linux/sched.h>
 #include <linux/smp.h>
 
+static DEFINE_PER_CPU(struct scale_freq_data *, sft_data);
+static struct cpumask scale_freq_counters_mask;
+static bool scale_freq_invariant;
+
+static bool supports_scale_freq_counters(const struct cpumask *cpus)
+{
+	return cpumask_subset(cpus, &scale_freq_counters_mask);
+}
+
 bool topology_scale_freq_invariant(void)
 {
 	return cpufreq_supports_freq_invariance() ||
-	       arch_freq_counters_available(cpu_online_mask);
+	       supports_scale_freq_counters(cpu_online_mask);
 }
 
-__weak bool arch_freq_counters_available(const struct cpumask *cpus)
+static void update_scale_freq_invariant(bool status)
 {
-	return false;
+	if (scale_freq_invariant == status)
+		return;
+
+	/*
+	 * Task scheduler behavior depends on frequency invariance support,
+	 * either cpufreq or counter driven. If the support status changes as
+	 * a result of counter initialisation and use, retrigger the build of
+	 * scheduling domains to ensure the information is propagated properly.
+	 */
+	if (topology_scale_freq_invariant() == status) {
+		scale_freq_invariant = status;
+		rebuild_sched_domains_energy();
+	}
 }
+
+void topology_set_scale_freq_source(struct scale_freq_data *data,
+				    const struct cpumask *cpus)
+{
+	struct scale_freq_data *sfd;
+	int cpu;
+
+	/*
+	 * Avoid calling rebuild_sched_domains() unnecessarily if FIE is
+	 * supported by cpufreq.
+	 */
+	if (cpumask_empty(&scale_freq_counters_mask))
+		scale_freq_invariant = topology_scale_freq_invariant();
+
+	for_each_cpu(cpu, cpus) {
+		sfd = per_cpu(sft_data, cpu);
+
+		/* Use ARCH provided counters whenever possible */
+		if (!sfd || sfd->source != SCALE_FREQ_SOURCE_ARCH) {
+			per_cpu(sft_data, cpu) = data;
+			cpumask_set_cpu(cpu, &scale_freq_counters_mask);
+		}
+	}
+
+	update_scale_freq_invariant(true);
+}
+
+void topology_clear_scale_freq_source(enum scale_freq_source source,
+				      const struct cpumask *cpus)
+{
+	struct scale_freq_data *sfd;
+	int cpu;
+
+	for_each_cpu(cpu, cpus) {
+		sfd = per_cpu(sft_data, cpu);
+
+		if (sfd && sfd->source == source) {
+			per_cpu(sft_data, cpu) = NULL;
+			cpumask_clear_cpu(cpu, &scale_freq_counters_mask);
+		}
+	}
+
+	update_scale_freq_invariant(false);
+}
+
+void topology_scale_freq_tick(void)
+{
+	struct scale_freq_data *sfd = *this_cpu_ptr(&sft_data);
+
+	if (sfd)
+		sfd->set_freq_scale();
+}
+
 DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
 
 void topology_set_freq_scale(const struct cpumask *cpus, unsigned long cur_freq,
@@ -47,7 +121,7 @@ void topology_set_freq_scale(const struct cpumask *cpus, unsigned long cur_freq,
 	 * want to update the scale factor with information from CPUFREQ.
 	 * Instead the scale factor will be updated from arch_scale_freq_tick.
 	 */
-	if (arch_freq_counters_available(cpus))
+	if (supports_scale_freq_counters(cpus))
 		return;
 
 	scale = (cur_freq << SCHED_CAPACITY_SHIFT) / max_freq;
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index 583af517f123..11e555cfaecb 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -34,7 +34,19 @@ void topology_set_freq_scale(const struct cpumask *cpus, unsigned long cur_freq,
 			     unsigned long max_freq);
 bool topology_scale_freq_invariant(void);
 
-bool arch_freq_counters_available(const struct cpumask *cpus);
+enum scale_freq_source {
+	SCALE_FREQ_SOURCE_CPUFREQ = 0,
+	SCALE_FREQ_SOURCE_ARCH,
+};
+
+struct scale_freq_data {
+	enum scale_freq_source source;
+	void (*set_freq_scale)(void);
+};
+
+void topology_scale_freq_tick(void);
+void topology_set_scale_freq_source(struct scale_freq_data *data, const struct cpumask *cpus);
+void topology_clear_scale_freq_source(enum scale_freq_source source, const struct cpumask *cpus);
 
 DECLARE_PER_CPU(unsigned long, thermal_pressure);
 
-- 
cgit v1.2.3


From 51f24030358bdeeb9e75a38618dd029c5a53beeb Mon Sep 17 00:00:00 2001
From: Shuo Liu <shuo.a.liu@intel.com>
Date: Sun, 21 Feb 2021 21:43:38 +0800
Subject: cpu/hotplug: Fix build error of using {add,remove}_cpu() with
 !CONFIG_SMP
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

279dcf693ac7 ("virt: acrn: Introduce an interface for Service VM to
control vCPU") introduced {add,remove}_cpu() usage, which hits the build
errors below with !CONFIG_SMP:

../drivers/virt/acrn/hsm.c: In function ‘remove_cpu_store’:
../drivers/virt/acrn/hsm.c:389:3: error: implicit declaration of function ‘remove_cpu’; [-Werror=implicit-function-declaration]
   remove_cpu(cpu);

../drivers/virt/acrn/hsm.c:402:2: error: implicit declaration of function ‘add_cpu’; [-Werror=implicit-function-declaration]
   add_cpu(cpu);

Add an add_cpu() stub for !CONFIG_SMP and a remove_cpu() stub for
!CONFIG_HOTPLUG_CPU for such usage.

Fixes: 279dcf693ac7 ("virt: acrn: Introduce an interface for Service VM to control vCPU")
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Qais Yousef <qais.yousef@arm.com>
Reported-by: Randy Dunlap <rdunlap@infradead.org>
Reviewed-by: Qais Yousef <qais.yousef@arm.com>
Acked-by: Randy Dunlap <rdunlap@infradead.org> # build-tested
Signed-off-by: Shuo Liu <shuo.a.liu@intel.com>
Link: https://lore.kernel.org/r/20210221134339.57851-1-shuo.a.liu@intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/cpu.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 3aaa0687e8df..94a578a96202 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -108,6 +108,8 @@ static inline void cpu_maps_update_done(void)
 {
 }
 
+static inline int add_cpu(unsigned int cpu) { return 0;}
+
 #endif /* CONFIG_SMP */
 extern struct bus_type cpu_subsys;
 
@@ -137,6 +139,7 @@ static inline int  cpus_read_trylock(void) { return true; }
 static inline void lockdep_assert_cpus_held(void) { }
 static inline void cpu_hotplug_disable(void) { }
 static inline void cpu_hotplug_enable(void) { }
+static inline int remove_cpu(unsigned int cpu) { return -EPERM; }
 static inline void smp_shutdown_nonboot_cpus(unsigned int primary_cpu) { }
 #endif	/* !CONFIG_HOTPLUG_CPU */
 
-- 
cgit v1.2.3


From 00025161b2d931f97d72418114abf73bfb51ae4c Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Tue, 2 Mar 2021 07:21:33 +0100
Subject: PCI: remove synclink entries from pci_ids

The drivers were removed in a1f714b44e34 (tty: Remove redundant synclink
driver) and 3d608a591b2b (tty: Remove redundant synclinkmp driver).

So also remove the PCI ID entries.

Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: linux-pci@vger.kernel.org
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20210302062214.29627-3-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/pci_ids.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index a76ccb697bef..8a18517696c1 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2065,8 +2065,6 @@
 #define PCI_DEVICE_ID_EXAR_XR17V358	0x0358
 
 #define PCI_VENDOR_ID_MICROGATE		0x13c0
-#define PCI_DEVICE_ID_MICROGATE_USC	0x0010
-#define PCI_DEVICE_ID_MICROGATE_SCA	0x0030
 
 #define PCI_VENDOR_ID_3WARE		0x13C1
 #define PCI_DEVICE_ID_3WARE_1000	0x1000
-- 
cgit v1.2.3


From f76edd8f7ce06cdff2fe5b6b39a49644c684a161 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Tue, 2 Mar 2021 07:21:35 +0100
Subject: tty: cyclades, remove this orphan

The Cyclades driver was orphaned by commit d459883e6c54 (MAINTAINERS:
remove two dead e-mail) 13 years ago. No one stepped up to take care of
it and to fix all the issues the driver has.

On top of that, there is no way to obtain the firmware for Z cards
from the vendor as cyclades.com ceased to exist.

So it's time to drop the driver with all its traces.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20210302062214.29627-5-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/admin-guide/devices.txt              |   10 -
 Documentation/driver-api/serial/cyclades_z.rst     |   11 -
 Documentation/driver-api/serial/index.rst          |    1 -
 Documentation/process/magic-number.rst             |    1 -
 .../translations/it_IT/process/magic-number.rst    |    1 -
 .../translations/zh_CN/process/magic-number.rst    |    1 -
 Documentation/userspace-api/ioctl/ioctl-number.rst |    1 -
 MAINTAINERS                                        |    7 -
 arch/powerpc/configs/ppc6xx_defconfig              |    1 -
 drivers/tty/Kconfig                                |   31 +-
 drivers/tty/Makefile                               |    1 -
 drivers/tty/cyclades.c                             | 4119 --------------------
 drivers/tty/serial/8250/Kconfig                    |    5 +-
 include/linux/cyclades.h                           |  364 --
 include/linux/pci_ids.h                            |    8 -
 include/uapi/linux/cyclades.h                      |  494 ---
 include/uapi/linux/major.h                         |    2 -
 include/uapi/linux/serial.h                        |    4 +-
 18 files changed, 5 insertions(+), 5057 deletions(-)
 delete mode 100644 Documentation/driver-api/serial/cyclades_z.rst
 delete mode 100644 drivers/tty/cyclades.c
 delete mode 100644 include/linux/cyclades.h
 delete mode 100644 include/uapi/linux/cyclades.h

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/devices.txt b/Documentation/admin-guide/devices.txt
index 63fd4e6a014b..b5bd9d46e031 100644
--- a/Documentation/admin-guide/devices.txt
+++ b/Documentation/admin-guide/devices.txt
@@ -477,11 +477,6 @@
   18 block	Sanyo CD-ROM
 		  0 = /dev/sjcd		Sanyo CD-ROM
 
-  19 char	Cyclades serial card
-		  0 = /dev/ttyC0	First Cyclades port
-		    ...
-		 31 = /dev/ttyC31	32nd Cyclades port
-
   19 block	"Double" compressed disk
 		  0 = /dev/double0	First compressed disk
 		    ...
@@ -493,11 +488,6 @@
 		See the Double documentation for the meaning of the
 		mirror devices.
 
-  20 char	Cyclades serial card - alternate devices
-		  0 = /dev/cub0		Callout device for ttyC0
-		    ...
-		 31 = /dev/cub31	Callout device for ttyC31
-
   20 block	Hitachi CD-ROM (under development)
 		  0 = /dev/hitcd	Hitachi CD-ROM
 
diff --git a/Documentation/driver-api/serial/cyclades_z.rst b/Documentation/driver-api/serial/cyclades_z.rst
deleted file mode 100644
index 532ff67e2f1c..000000000000
--- a/Documentation/driver-api/serial/cyclades_z.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-================
-Cyclades-Z notes
-================
-
-The Cyclades-Z must have firmware loaded onto the card before it will
-operate.  This operation should be performed during system startup,
-
-The firmware, loader program and the latest device driver code are
-available from Cyclades at
-
-    ftp://ftp.cyclades.com/pub/cyclades/cyclades-z/linux/
diff --git a/Documentation/driver-api/serial/index.rst b/Documentation/driver-api/serial/index.rst
index 33ad10d05b26..21351b8c95a4 100644
--- a/Documentation/driver-api/serial/index.rst
+++ b/Documentation/driver-api/serial/index.rst
@@ -17,7 +17,6 @@ Serial drivers
 .. toctree::
     :maxdepth: 1
 
-    cyclades_z
     moxa-smartio
     n_gsm
     rocket
diff --git a/Documentation/process/magic-number.rst b/Documentation/process/magic-number.rst
index fa5a62f4150c..d4a30c09bd03 100644
--- a/Documentation/process/magic-number.rst
+++ b/Documentation/process/magic-number.rst
@@ -73,7 +73,6 @@ CMAGIC                0x0111           user                     ``include/linux/
 MKISS_DRIVER_MAGIC    0x04bf           mkiss_channel            ``drivers/net/mkiss.h``
 HDLC_MAGIC            0x239e           n_hdlc                   ``drivers/char/n_hdlc.c``
 APM_BIOS_MAGIC        0x4101           apm_user                 ``arch/x86/kernel/apm_32.c``
-CYCLADES_MAGIC        0x4359           cyclades_port            ``include/linux/cyclades.h``
 DB_MAGIC              0x4442           fc_info                  ``drivers/net/iph5526_novram.c``
 DL_MAGIC              0x444d           fc_info                  ``drivers/net/iph5526_novram.c``
 FASYNC_MAGIC          0x4601           fasync_struct            ``include/linux/fs.h``
diff --git a/Documentation/translations/it_IT/process/magic-number.rst b/Documentation/translations/it_IT/process/magic-number.rst
index 1af30f4228f2..0df2e7e32cd8 100644
--- a/Documentation/translations/it_IT/process/magic-number.rst
+++ b/Documentation/translations/it_IT/process/magic-number.rst
@@ -79,7 +79,6 @@ CMAGIC                0x0111           user                     ``include/linux/
 MKISS_DRIVER_MAGIC    0x04bf           mkiss_channel            ``drivers/net/mkiss.h``
 HDLC_MAGIC            0x239e           n_hdlc                   ``drivers/char/n_hdlc.c``
 APM_BIOS_MAGIC        0x4101           apm_user                 ``arch/x86/kernel/apm_32.c``
-CYCLADES_MAGIC        0x4359           cyclades_port            ``include/linux/cyclades.h``
 DB_MAGIC              0x4442           fc_info                  ``drivers/net/iph5526_novram.c``
 DL_MAGIC              0x444d           fc_info                  ``drivers/net/iph5526_novram.c``
 FASYNC_MAGIC          0x4601           fasync_struct            ``include/linux/fs.h``
diff --git a/Documentation/translations/zh_CN/process/magic-number.rst b/Documentation/translations/zh_CN/process/magic-number.rst
index 7bb9d4165ed3..82d62f6a4406 100644
--- a/Documentation/translations/zh_CN/process/magic-number.rst
+++ b/Documentation/translations/zh_CN/process/magic-number.rst
@@ -62,7 +62,6 @@ CMAGIC                0x0111           user                     ``include/linux/
 MKISS_DRIVER_MAGIC    0x04bf           mkiss_channel            ``drivers/net/mkiss.h``
 HDLC_MAGIC            0x239e           n_hdlc                   ``drivers/char/n_hdlc.c``
 APM_BIOS_MAGIC        0x4101           apm_user                 ``arch/x86/kernel/apm_32.c``
-CYCLADES_MAGIC        0x4359           cyclades_port            ``include/linux/cyclades.h``
 DB_MAGIC              0x4442           fc_info                  ``drivers/net/iph5526_novram.c``
 DL_MAGIC              0x444d           fc_info                  ``drivers/net/iph5526_novram.c``
 FASYNC_MAGIC          0x4601           fasync_struct            ``include/linux/fs.h``
diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index 599bd4493944..0a7b408c0ec7 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -209,7 +209,6 @@ Code  Seq#    Include File                                           Comments
              linux/fs.h,
 'X'   all    fs/ocfs2/ocfs_fs.h                                      conflict!
 'X'   01     linux/pktcdvd.h                                         conflict!
-'Y'   all    linux/cyclades.h
 'Z'   14-15  drivers/message/fusion/mptctl.h
 '['   00-3F  linux/usb/tmc.h                                         USB Test and Measurement Devices
                                                                      <mailto:gregkh@linuxfoundation.org>
diff --git a/MAINTAINERS b/MAINTAINERS
index 363530db37ac..29f20a97d73d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4876,13 +4876,6 @@ S:	Maintained
 W:	http://www.armlinux.org.uk/
 F:	drivers/video/fbdev/cyber2000fb.*
 
-CYCLADES ASYNC MUX DRIVER
-S:	Orphan
-W:	http://www.cyclades.com/
-F:	drivers/tty/cyclades.c
-F:	include/linux/cyclades.h
-F:	include/uapi/linux/cyclades.h
-
 CYCLADES PC300 DRIVER
 S:	Orphan
 F:	drivers/net/wan/pc300*
diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig
index 6677ac0da45a..1fd9d1260f9e 100644
--- a/arch/powerpc/configs/ppc6xx_defconfig
+++ b/arch/powerpc/configs/ppc6xx_defconfig
@@ -595,7 +595,6 @@ CONFIG_GAMEPORT_FM801=m
 # CONFIG_LEGACY_PTYS is not set
 CONFIG_SERIAL_NONSTANDARD=y
 CONFIG_ROCKETPORT=m
-CONFIG_CYCLADES=m
 CONFIG_SYNCLINK_GT=m
 CONFIG_NOZOMI=m
 CONFIG_N_HDLC=m
diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig
index e15cd6b5bb99..397523a8095e 100644
--- a/drivers/tty/Kconfig
+++ b/drivers/tty/Kconfig
@@ -181,7 +181,7 @@ config SERIAL_NONSTANDARD
 	help
 	  Say Y here if you have any non-standard serial boards -- boards
 	  which aren't supported using the standard "dumb" serial driver.
-	  This includes intelligent serial boards such as Cyclades,
+	  This includes intelligent serial boards such as
 	  Digiboards, etc. These are usually used for systems that need many
 	  serial ports because they serve many terminals or dial-in
 	  connections.
@@ -207,35 +207,6 @@ config ROCKETPORT
 	  If you want to compile this driver into the kernel, say Y here.  If
 	  you don't have a Comtrol RocketPort/RocketModem card installed, say N.
 
-config CYCLADES
-	tristate "Cyclades async mux support"
-	depends on SERIAL_NONSTANDARD && (PCI || ISA)
-	select FW_LOADER
-	help
-	  This driver supports Cyclades Z and Y multiserial boards.
-	  You would need something like this to connect more than two modems to
-	  your Linux box, for instance in order to become a dial-in server.
-
-	  For information about the Cyclades-Z card, read
-	  <file:Documentation/driver-api/serial/cyclades_z.rst>.
-
-	  To compile this driver as a module, choose M here: the
-	  module will be called cyclades.
-
-	  If you haven't heard about it, it's safe to say N.
-
-config CYZ_INTR
-	bool "Cyclades-Z interrupt mode operation"
-	depends on CYCLADES && PCI
-	help
-	  The Cyclades-Z family of multiport cards allows 2 (two) driver op
-	  modes: polling and interrupt. In polling mode, the driver will check
-	  the status of the Cyclades-Z ports every certain amount of time
-	  (which is called polling cycle and is configurable). In interrupt
-	  mode, it will use an interrupt line (IRQ) in order to check the
-	  status of the Cyclades-Z ports. The default op mode is polling. If
-	  unsure, say N.
-
 config MOXA_INTELLIO
 	tristate "Moxa Intellio support"
 	depends on SERIAL_NONSTANDARD && (ISA || EISA || PCI)
diff --git a/drivers/tty/Makefile b/drivers/tty/Makefile
index 730de6bf048b..94eb2bf75763 100644
--- a/drivers/tty/Makefile
+++ b/drivers/tty/Makefile
@@ -18,7 +18,6 @@ obj-$(CONFIG_SERIAL_DEV_BUS)	+= serdev/
 
 # tty drivers
 obj-$(CONFIG_AMIGA_BUILTIN_SERIAL) += amiserial.o
-obj-$(CONFIG_CYCLADES)		+= cyclades.o
 obj-$(CONFIG_ISI)		+= isicom.o
 obj-$(CONFIG_MOXA_INTELLIO)	+= moxa.o
 obj-$(CONFIG_MOXA_SMARTIO)	+= mxser.o
diff --git a/drivers/tty/cyclades.c b/drivers/tty/cyclades.c
deleted file mode 100644
index 097266342e5e..000000000000
--- a/drivers/tty/cyclades.c
+++ /dev/null
@@ -1,4119 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#undef	BLOCKMOVE
-#define	Z_WAKE
-#undef	Z_EXT_CHARS_IN_BUFFER
-
-/*
- * This file contains the driver for the Cyclades async multiport
- * serial boards.
- *
- * Initially written by Randolph Bentson <bentson@grieg.seaslug.org>.
- * Modified and maintained by Marcio Saito <marcio@cyclades.com>.
- *
- * Copyright (C) 2007-2009 Jiri Slaby <jirislaby@gmail.com>
- *
- * Much of the design and some of the code came from serial.c
- * which was copyright (C) 1991, 1992  Linus Torvalds.  It was
- * extensively rewritten by Theodore Ts'o, 8/16/92 -- 9/14/92,
- * and then fixed as suggested by Michael K. Johnson 12/12/92.
- * Converted to pci probing and cleaned up by Jiri Slaby.
- *
- */
-
-#define CY_VERSION	"2.6"
-
-/* If you need to install more boards than NR_CARDS, change the constant
-   in the definition below. No other change is necessary to support up to
-   eight boards. Beyond that you'll have to extend cy_isa_addresses. */
-
-#define NR_CARDS	4
-
-/*
-   If the total number of ports is larger than NR_PORTS, change this
-   constant in the definition below. No other change is necessary to
-   support more boards/ports. */
-
-#define NR_PORTS	256
-
-#define ZO_V1	0
-#define ZO_V2	1
-#define ZE_V1	2
-
-#define	SERIAL_PARANOIA_CHECK
-#undef	CY_DEBUG_OPEN
-#undef	CY_DEBUG_THROTTLE
-#undef	CY_DEBUG_OTHER
-#undef	CY_DEBUG_IO
-#undef	CY_DEBUG_COUNT
-#undef	CY_DEBUG_DTR
-#undef	CY_DEBUG_INTERRUPTS
-#undef	CY_16Y_HACK
-#undef	CY_ENABLE_MONITORING
-#undef	CY_PCI_DEBUG
-
-/*
- * Include section
- */
-#include <linux/module.h>
-#include <linux/errno.h>
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/timer.h>
-#include <linux/interrupt.h>
-#include <linux/tty.h>
-#include <linux/tty_flip.h>
-#include <linux/serial.h>
-#include <linux/major.h>
-#include <linux/string.h>
-#include <linux/fcntl.h>
-#include <linux/ptrace.h>
-#include <linux/cyclades.h>
-#include <linux/mm.h>
-#include <linux/ioport.h>
-#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/spinlock.h>
-#include <linux/bitops.h>
-#include <linux/firmware.h>
-#include <linux/device.h>
-#include <linux/slab.h>
-
-#include <linux/io.h>
-#include <linux/uaccess.h>
-
-#include <linux/kernel.h>
-#include <linux/pci.h>
-
-#include <linux/stat.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-
-static void cy_send_xchar(struct tty_struct *tty, char ch);
-
-#ifndef SERIAL_XMIT_SIZE
-#define	SERIAL_XMIT_SIZE	(min(PAGE_SIZE, 4096))
-#endif
-
-/* firmware stuff */
-#define ZL_MAX_BLOCKS	16
-#define DRIVER_VERSION	0x02010203
-#define RAM_SIZE 0x80000
-
-enum zblock_type {
-	ZBLOCK_PRG = 0,
-	ZBLOCK_FPGA = 1
-};
-
-struct zfile_header {
-	char name[64];
-	char date[32];
-	char aux[32];
-	u32 n_config;
-	u32 config_offset;
-	u32 n_blocks;
-	u32 block_offset;
-	u32 reserved[9];
-} __attribute__ ((packed));
-
-struct zfile_config {
-	char name[64];
-	u32 mailbox;
-	u32 function;
-	u32 n_blocks;
-	u32 block_list[ZL_MAX_BLOCKS];
-} __attribute__ ((packed));
-
-struct zfile_block {
-	u32 type;
-	u32 file_offset;
-	u32 ram_offset;
-	u32 size;
-} __attribute__ ((packed));
-
-static struct tty_driver *cy_serial_driver;
-
-#ifdef CONFIG_ISA
-/* This is the address lookup table. The driver will probe for
-   Cyclom-Y/ISA boards at all addresses in here. If you want the
-   driver to probe addresses at a different address, add it to
-   this table.  If the driver is probing some other board and
-   causing problems, remove the offending address from this table.
-*/
-
-static unsigned int cy_isa_addresses[] = {
-	0xD0000,
-	0xD2000,
-	0xD4000,
-	0xD6000,
-	0xD8000,
-	0xDA000,
-	0xDC000,
-	0xDE000,
-	0, 0, 0, 0, 0, 0, 0, 0
-};
-
-#define NR_ISA_ADDRS ARRAY_SIZE(cy_isa_addresses)
-
-static long maddr[NR_CARDS];
-static int irq[NR_CARDS];
-
-module_param_hw_array(maddr, long, iomem, NULL, 0);
-module_param_hw_array(irq, int, irq, NULL, 0);
-
-#endif				/* CONFIG_ISA */
-
-/* This is the per-card data structure containing address, irq, number of
-   channels, etc. This driver supports a maximum of NR_CARDS cards.
-*/
-static struct cyclades_card cy_card[NR_CARDS];
-
-static int cy_next_channel;	/* next minor available */
-
-/*
- * This is used to look up the divisor speeds and the timeouts
- * We're normally limited to 15 distinct baud rates.  The extra
- * are accessed via settings in info->port.flags.
- *      0,     1,     2,     3,     4,     5,     6,     7,     8,     9,
- *     10,    11,    12,    13,    14,    15,    16,    17,    18,    19,
- *                                               HI            VHI
- *     20
- */
-static const int baud_table[] = {
-	0, 50, 75, 110, 134, 150, 200, 300, 600, 1200,
-	1800, 2400, 4800, 9600, 19200, 38400, 57600, 76800, 115200, 150000,
-	230400, 0
-};
-
-static const char baud_co_25[] = {	/* 25 MHz clock option table */
-	/* value =>    00    01   02    03    04 */
-	/* divide by    8    32   128   512  2048 */
-	0x00, 0x04, 0x04, 0x04, 0x04, 0x04, 0x03, 0x03, 0x03, 0x02,
-	0x02, 0x02, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
-};
-
-static const char baud_bpr_25[] = {	/* 25 MHz baud rate period table */
-	0x00, 0xf5, 0xa3, 0x6f, 0x5c, 0x51, 0xf5, 0xa3, 0x51, 0xa3,
-	0x6d, 0x51, 0xa3, 0x51, 0xa3, 0x51, 0x36, 0x29, 0x1b, 0x15
-};
-
-static const char baud_co_60[] = {	/* 60 MHz clock option table (CD1400 J) */
-	/* value =>    00    01   02    03    04 */
-	/* divide by    8    32   128   512  2048 */
-	0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x04, 0x03, 0x03,
-	0x03, 0x02, 0x02, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
-	0x00
-};
-
-static const char baud_bpr_60[] = {	/* 60 MHz baud rate period table (CD1400 J) */
-	0x00, 0x82, 0x21, 0xff, 0xdb, 0xc3, 0x92, 0x62, 0xc3, 0x62,
-	0x41, 0xc3, 0x62, 0xc3, 0x62, 0xc3, 0x82, 0x62, 0x41, 0x32,
-	0x21
-};
-
-static const char baud_cor3[] = {	/* receive threshold */
-	0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a,
-	0x0a, 0x0a, 0x0a, 0x09, 0x09, 0x08, 0x08, 0x08, 0x08, 0x07,
-	0x07
-};
-
-/*
- * The Cyclades driver implements HW flow control as any serial driver.
- * The cyclades_port structure member rflow and the vector rflow_thr
- * allows us to take advantage of a special feature in the CD1400 to avoid
- * data loss even when the system interrupt latency is too high. These flags
- * are to be used only with very special applications. Setting these flags
- * requires the use of a special cable (DTR and RTS reversed). In the new
- * CD1400-based boards (rev. 6.00 or later), there is no need for special
- * cables.
- */
-
-static const char rflow_thr[] = {	/* rflow threshold */
-	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-	0x00, 0x00, 0x00, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a,
-	0x0a
-};
-
-/*  The Cyclom-Ye has placed the sequential chips in non-sequential
- *  address order.  This look-up table overcomes that problem.
- */
-static const unsigned int cy_chip_offset[] = { 0x0000,
-	0x0400,
-	0x0800,
-	0x0C00,
-	0x0200,
-	0x0600,
-	0x0A00,
-	0x0E00
-};
-
-/* PCI related definitions */
-
-#ifdef CONFIG_PCI
-static const struct pci_device_id cy_pci_dev_id[] = {
-	/* PCI < 1Mb */
-	{ PCI_DEVICE(PCI_VENDOR_ID_CYCLADES, PCI_DEVICE_ID_CYCLOM_Y_Lo) },
-	/* PCI > 1Mb */
-	{ PCI_DEVICE(PCI_VENDOR_ID_CYCLADES, PCI_DEVICE_ID_CYCLOM_Y_Hi) },
-	/* 4Y PCI < 1Mb */
-	{ PCI_DEVICE(PCI_VENDOR_ID_CYCLADES, PCI_DEVICE_ID_CYCLOM_4Y_Lo) },
-	/* 4Y PCI > 1Mb */
-	{ PCI_DEVICE(PCI_VENDOR_ID_CYCLADES, PCI_DEVICE_ID_CYCLOM_4Y_Hi) },
-	/* 8Y PCI < 1Mb */
-	{ PCI_DEVICE(PCI_VENDOR_ID_CYCLADES, PCI_DEVICE_ID_CYCLOM_8Y_Lo) },
-	/* 8Y PCI > 1Mb */
-	{ PCI_DEVICE(PCI_VENDOR_ID_CYCLADES, PCI_DEVICE_ID_CYCLOM_8Y_Hi) },
-	/* Z PCI < 1Mb */
-	{ PCI_DEVICE(PCI_VENDOR_ID_CYCLADES, PCI_DEVICE_ID_CYCLOM_Z_Lo) },
-	/* Z PCI > 1Mb */
-	{ PCI_DEVICE(PCI_VENDOR_ID_CYCLADES, PCI_DEVICE_ID_CYCLOM_Z_Hi) },
-	{ }			/* end of table */
-};
-MODULE_DEVICE_TABLE(pci, cy_pci_dev_id);
-#endif
-
-static void cy_start(struct tty_struct *);
-static void cy_set_line_char(struct cyclades_port *, struct tty_struct *);
-static int cyz_issue_cmd(struct cyclades_card *, __u32, __u8, __u32);
-#ifdef CONFIG_ISA
-static unsigned detect_isa_irq(void __iomem *);
-#endif				/* CONFIG_ISA */
-
-#ifndef CONFIG_CYZ_INTR
-static void cyz_poll(struct timer_list *);
-
-/* The Cyclades-Z polling cycle is defined by this variable */
-static long cyz_polling_cycle = CZ_DEF_POLL;
-
-static DEFINE_TIMER(cyz_timerlist, cyz_poll);
-
-#else				/* CONFIG_CYZ_INTR */
-static void cyz_rx_restart(struct timer_list *);
-#endif				/* CONFIG_CYZ_INTR */
-
-static void cyy_writeb(struct cyclades_port *port, u32 reg, u8 val)
-{
-	struct cyclades_card *card = port->card;
-
-	cy_writeb(port->u.cyy.base_addr + (reg << card->bus_index), val);
-}
-
-static u8 cyy_readb(struct cyclades_port *port, u32 reg)
-{
-	struct cyclades_card *card = port->card;
-
-	return readb(port->u.cyy.base_addr + (reg << card->bus_index));
-}
-
-static inline bool cy_is_Z(struct cyclades_card *card)
-{
-	return card->num_chips == (unsigned int)-1;
-}
-
-static inline bool __cyz_fpga_loaded(struct RUNTIME_9060 __iomem *ctl_addr)
-{
-	return readl(&ctl_addr->init_ctrl) & (1 << 17);
-}
-
-static inline bool cyz_fpga_loaded(struct cyclades_card *card)
-{
-	return __cyz_fpga_loaded(card->ctl_addr.p9060);
-}
-
-static bool cyz_is_loaded(struct cyclades_card *card)
-{
-	struct FIRM_ID __iomem *fw_id = card->base_addr + ID_ADDRESS;
-
-	return (card->hw_ver == ZO_V1 || cyz_fpga_loaded(card)) &&
-			readl(&fw_id->signature) == ZFIRM_ID;
-}
-
-static int serial_paranoia_check(struct cyclades_port *info,
-		const char *name, const char *routine)
-{
-#ifdef SERIAL_PARANOIA_CHECK
-	if (!info) {
-		printk(KERN_WARNING "cyc Warning: null cyclades_port for (%s) "
-				"in %s\n", name, routine);
-		return 1;
-	}
-
-	if (info->magic != CYCLADES_MAGIC) {
-		printk(KERN_WARNING "cyc Warning: bad magic number for serial "
-				"struct (%s) in %s\n", name, routine);
-		return 1;
-	}
-#endif
-	return 0;
-}
-
-/***********************************************************/
-/********* Start of block of Cyclom-Y specific code ********/
-
-/* This routine waits up to 1000 micro-seconds for the previous
-   command to the Cirrus chip to complete and then issues the
-   new command.  An error is returned if the previous command
-   didn't finish within the time limit.
-
-   This function is only called from inside spinlock-protected code.
- */
-static int __cyy_issue_cmd(void __iomem *base_addr, u8 cmd, int index)
-{
-	void __iomem *ccr = base_addr + (CyCCR << index);
-	unsigned int i;
-
-	/* Check to see that the previous command has completed */
-	for (i = 0; i < 100; i++) {
-		if (readb(ccr) == 0)
-			break;
-		udelay(10L);
-	}
-	/* if the CCR never cleared, the previous command
-	   didn't finish within the "reasonable time" */
-	if (i == 100)
-		return -1;
-
-	/* Issue the new command */
-	cy_writeb(ccr, cmd);
-
-	return 0;
-}
-
-static inline int cyy_issue_cmd(struct cyclades_port *port, u8 cmd)
-{
-	return __cyy_issue_cmd(port->u.cyy.base_addr, cmd,
-			port->card->bus_index);
-}
-
-#ifdef CONFIG_ISA
-/* ISA interrupt detection code */
-static unsigned detect_isa_irq(void __iomem *address)
-{
-	int irq;
-	unsigned long irqs, flags;
-	int save_xir, save_car;
-	int index = 0;		/* IRQ probing is only for ISA */
-
-	/* forget possible initially masked and pending IRQ */
-	irq = probe_irq_off(probe_irq_on());
-
-	/* Clear interrupts on the board first */
-	cy_writeb(address + (Cy_ClrIntr << index), 0);
-	/* Cy_ClrIntr is 0x1800 */
-
-	irqs = probe_irq_on();
-	/* Wait ... */
-	msleep(5);
-
-	/* Enable the Tx interrupts on the CD1400 */
-	local_irq_save(flags);
-	cy_writeb(address + (CyCAR << index), 0);
-	__cyy_issue_cmd(address, CyCHAN_CTL | CyENB_XMTR, index);
-
-	cy_writeb(address + (CyCAR << index), 0);
-	cy_writeb(address + (CySRER << index),
-		  readb(address + (CySRER << index)) | CyTxRdy);
-	local_irq_restore(flags);
-
-	/* Wait ... */
-	msleep(5);
-
-	/* Check which interrupt is in use */
-	irq = probe_irq_off(irqs);
-
-	/* Clean up */
-	save_xir = (u_char) readb(address + (CyTIR << index));
-	save_car = readb(address + (CyCAR << index));
-	cy_writeb(address + (CyCAR << index), (save_xir & 0x3));
-	cy_writeb(address + (CySRER << index),
-		  readb(address + (CySRER << index)) & ~CyTxRdy);
-	cy_writeb(address + (CyTIR << index), (save_xir & 0x3f));
-	cy_writeb(address + (CyCAR << index), (save_car));
-	cy_writeb(address + (Cy_ClrIntr << index), 0);
-	/* Cy_ClrIntr is 0x1800 */
-
-	return (irq > 0) ? irq : 0;
-}
-#endif				/* CONFIG_ISA */
-
-static void cyy_chip_rx(struct cyclades_card *cinfo, int chip,
-		void __iomem *base_addr)
-{
-	struct cyclades_port *info;
-	struct tty_port *port;
-	int len, index = cinfo->bus_index;
-	u8 ivr, save_xir, channel, save_car, data, char_count;
-
-#ifdef CY_DEBUG_INTERRUPTS
-	printk(KERN_DEBUG "cyy_interrupt: rcvd intr, chip %d\n", chip);
-#endif
-	/* determine the channel & change to that context */
-	save_xir = readb(base_addr + (CyRIR << index));
-	channel = save_xir & CyIRChannel;
-	info = &cinfo->ports[channel + chip * 4];
-	port = &info->port;
-	save_car = cyy_readb(info, CyCAR);
-	cyy_writeb(info, CyCAR, save_xir);
-	ivr = cyy_readb(info, CyRIVR) & CyIVRMask;
-
-	/* there is an open port for this data */
-	if (ivr == CyIVRRxEx) {	/* exception */
-		data = cyy_readb(info, CyRDSR);
-
-		/* For statistics only */
-		if (data & CyBREAK)
-			info->icount.brk++;
-		else if (data & CyFRAME)
-			info->icount.frame++;
-		else if (data & CyPARITY)
-			info->icount.parity++;
-		else if (data & CyOVERRUN)
-			info->icount.overrun++;
-
-		if (data & info->ignore_status_mask) {
-			info->icount.rx++;
-			return;
-		}
-		if (tty_buffer_request_room(port, 1)) {
-			if (data & info->read_status_mask) {
-				if (data & CyBREAK) {
-					tty_insert_flip_char(port,
-						cyy_readb(info, CyRDSR),
-						TTY_BREAK);
-					info->icount.rx++;
-					if (port->flags & ASYNC_SAK) {
-						struct tty_struct *tty =
-							tty_port_tty_get(port);
-						if (tty) {
-							do_SAK(tty);
-							tty_kref_put(tty);
-						}
-					}
-				} else if (data & CyFRAME) {
-					tty_insert_flip_char(port,
-						cyy_readb(info, CyRDSR),
-						TTY_FRAME);
-					info->icount.rx++;
-					info->idle_stats.frame_errs++;
-				} else if (data & CyPARITY) {
-					/* Pieces of seven... */
-					tty_insert_flip_char(port,
-						cyy_readb(info, CyRDSR),
-						TTY_PARITY);
-					info->icount.rx++;
-					info->idle_stats.parity_errs++;
-				} else if (data & CyOVERRUN) {
-					tty_insert_flip_char(port, 0,
-							TTY_OVERRUN);
-					info->icount.rx++;
-					/* If the flip buffer itself is
-					   overflowing, we still lose
-					   the next incoming character.
-					 */
-					tty_insert_flip_char(port,
-						cyy_readb(info, CyRDSR),
-						TTY_FRAME);
-					info->icount.rx++;
-					info->idle_stats.overruns++;
-				/* These two conditions may imply */
-				/* a normal read should be done. */
-				/* } else if(data & CyTIMEOUT) { */
-				/* } else if(data & CySPECHAR) { */
-				} else {
-					tty_insert_flip_char(port, 0,
-							TTY_NORMAL);
-					info->icount.rx++;
-				}
-			} else {
-				tty_insert_flip_char(port, 0, TTY_NORMAL);
-				info->icount.rx++;
-			}
-		} else {
-			/* there was a software buffer overrun and nothing
-			 * could be done about it!!! */
-			info->icount.buf_overrun++;
-			info->idle_stats.overruns++;
-		}
-	} else {	/* normal character reception */
-		/* load # chars available from the chip */
-		char_count = cyy_readb(info, CyRDCR);
-
-#ifdef CY_ENABLE_MONITORING
-		++info->mon.int_count;
-		info->mon.char_count += char_count;
-		if (char_count > info->mon.char_max)
-			info->mon.char_max = char_count;
-		info->mon.char_last = char_count;
-#endif
-		len = tty_buffer_request_room(port, char_count);
-		while (len--) {
-			data = cyy_readb(info, CyRDSR);
-			tty_insert_flip_char(port, data, TTY_NORMAL);
-			info->idle_stats.recv_bytes++;
-			info->icount.rx++;
-#ifdef CY_16Y_HACK
-			udelay(10L);
-#endif
-		}
-		info->idle_stats.recv_idle = jiffies;
-	}
-	tty_schedule_flip(port);
-
-	/* end of service */
-	cyy_writeb(info, CyRIR, save_xir & 0x3f);
-	cyy_writeb(info, CyCAR, save_car);
-}
-
-static void cyy_chip_tx(struct cyclades_card *cinfo, unsigned int chip,
-		void __iomem *base_addr)
-{
-	struct cyclades_port *info;
-	struct tty_struct *tty;
-	int char_count, index = cinfo->bus_index;
-	u8 save_xir, channel, save_car, outch;
-
-	/* Since we only get here when the transmit buffer
-	   is empty, we know we can always stuff a dozen
-	   characters. */
-#ifdef CY_DEBUG_INTERRUPTS
-	printk(KERN_DEBUG "cyy_interrupt: xmit intr, chip %d\n", chip);
-#endif
-
-	/* determine the channel & change to that context */
-	save_xir = readb(base_addr + (CyTIR << index));
-	channel = save_xir & CyIRChannel;
-	save_car = readb(base_addr + (CyCAR << index));
-	cy_writeb(base_addr + (CyCAR << index), save_xir);
-
-	info = &cinfo->ports[channel + chip * 4];
-	tty = tty_port_tty_get(&info->port);
-	if (tty == NULL) {
-		cyy_writeb(info, CySRER, cyy_readb(info, CySRER) & ~CyTxRdy);
-		goto end;
-	}
-
-	/* load the on-chip space for outbound data */
-	char_count = info->xmit_fifo_size;
-
-	if (info->x_char) {	/* send special char */
-		outch = info->x_char;
-		cyy_writeb(info, CyTDR, outch);
-		char_count--;
-		info->icount.tx++;
-		info->x_char = 0;
-	}
-
-	if (info->breakon || info->breakoff) {
-		if (info->breakon) {
-			cyy_writeb(info, CyTDR, 0);
-			cyy_writeb(info, CyTDR, 0x81);
-			info->breakon = 0;
-			char_count -= 2;
-		}
-		if (info->breakoff) {
-			cyy_writeb(info, CyTDR, 0);
-			cyy_writeb(info, CyTDR, 0x83);
-			info->breakoff = 0;
-			char_count -= 2;
-		}
-	}
-
-	while (char_count-- > 0) {
-		if (!info->xmit_cnt) {
-			if (cyy_readb(info, CySRER) & CyTxMpty) {
-				cyy_writeb(info, CySRER,
-					cyy_readb(info, CySRER) & ~CyTxMpty);
-			} else {
-				cyy_writeb(info, CySRER, CyTxMpty |
-					(cyy_readb(info, CySRER) & ~CyTxRdy));
-			}
-			goto done;
-		}
-		if (info->port.xmit_buf == NULL) {
-			cyy_writeb(info, CySRER,
-				cyy_readb(info, CySRER) & ~CyTxRdy);
-			goto done;
-		}
-		if (tty->stopped || tty->hw_stopped) {
-			cyy_writeb(info, CySRER,
-				cyy_readb(info, CySRER) & ~CyTxRdy);
-			goto done;
-		}
-		/* Because the Embedded Transmit Commands have been enabled,
-		 * we must check to see if the escape character, NULL, is being
-		 * sent. If it is, we must ensure that there is room for it to
-		 * be doubled in the output stream.  Therefore we no longer
-		 * advance the pointer when the character is fetched, but
-		 * rather wait until after the check for a NULL output
-		 * character. This is necessary because there may not be room
-		 * for the two chars needed to send a NULL.)
-		 */
-		outch = info->port.xmit_buf[info->xmit_tail];
-		if (outch) {
-			info->xmit_cnt--;
-			info->xmit_tail = (info->xmit_tail + 1) &
-					(SERIAL_XMIT_SIZE - 1);
-			cyy_writeb(info, CyTDR, outch);
-			info->icount.tx++;
-		} else {
-			if (char_count > 1) {
-				info->xmit_cnt--;
-				info->xmit_tail = (info->xmit_tail + 1) &
-					(SERIAL_XMIT_SIZE - 1);
-				cyy_writeb(info, CyTDR, outch);
-				cyy_writeb(info, CyTDR, 0);
-				info->icount.tx++;
-				char_count--;
-			}
-		}
-	}
-
-done:
-	tty_wakeup(tty);
-	tty_kref_put(tty);
-end:
-	/* end of service */
-	cyy_writeb(info, CyTIR, save_xir & 0x3f);
-	cyy_writeb(info, CyCAR, save_car);
-}
-
-static void cyy_chip_modem(struct cyclades_card *cinfo, int chip,
-		void __iomem *base_addr)
-{
-	struct cyclades_port *info;
-	struct tty_struct *tty;
-	int index = cinfo->bus_index;
-	u8 save_xir, channel, save_car, mdm_change, mdm_status;
-
-	/* determine the channel & change to that context */
-	save_xir = readb(base_addr + (CyMIR << index));
-	channel = save_xir & CyIRChannel;
-	info = &cinfo->ports[channel + chip * 4];
-	save_car = cyy_readb(info, CyCAR);
-	cyy_writeb(info, CyCAR, save_xir);
-
-	mdm_change = cyy_readb(info, CyMISR);
-	mdm_status = cyy_readb(info, CyMSVR1);
-
-	tty = tty_port_tty_get(&info->port);
-	if (!tty)
-		goto end;
-
-	if (mdm_change & CyANY_DELTA) {
-		/* For statistics only */
-		if (mdm_change & CyDCD)
-			info->icount.dcd++;
-		if (mdm_change & CyCTS)
-			info->icount.cts++;
-		if (mdm_change & CyDSR)
-			info->icount.dsr++;
-		if (mdm_change & CyRI)
-			info->icount.rng++;
-
-		wake_up_interruptible(&info->port.delta_msr_wait);
-	}
-
-	if ((mdm_change & CyDCD) && tty_port_check_carrier(&info->port)) {
-		if (mdm_status & CyDCD)
-			wake_up_interruptible(&info->port.open_wait);
-		else
-			tty_hangup(tty);
-	}
-	if ((mdm_change & CyCTS) && tty_port_cts_enabled(&info->port)) {
-		if (tty->hw_stopped) {
-			if (mdm_status & CyCTS) {
-				/* cy_start isn't used
-				   because... !!! */
-				tty->hw_stopped = 0;
-				cyy_writeb(info, CySRER,
-					cyy_readb(info, CySRER) | CyTxRdy);
-				tty_wakeup(tty);
-			}
-		} else {
-			if (!(mdm_status & CyCTS)) {
-				/* cy_stop isn't used
-				   because ... !!! */
-				tty->hw_stopped = 1;
-				cyy_writeb(info, CySRER,
-					cyy_readb(info, CySRER) & ~CyTxRdy);
-			}
-		}
-	}
-/*	if (mdm_change & CyDSR) {
-	}
-	if (mdm_change & CyRI) {
-	}*/
-	tty_kref_put(tty);
-end:
-	/* end of service */
-	cyy_writeb(info, CyMIR, save_xir & 0x3f);
-	cyy_writeb(info, CyCAR, save_car);
-}
-
-/* The real interrupt service routine is called
-   whenever the card wants its hand held--chars
-   received, out buffer empty, modem change, etc.
- */
-static irqreturn_t cyy_interrupt(int irq, void *dev_id)
-{
-	int status;
-	struct cyclades_card *cinfo = dev_id;
-	void __iomem *base_addr, *card_base_addr;
-	unsigned int chip, too_many, had_work;
-	int index;
-
-	if (unlikely(cinfo == NULL)) {
-#ifdef CY_DEBUG_INTERRUPTS
-		printk(KERN_DEBUG "cyy_interrupt: spurious interrupt %d\n",
-				irq);
-#endif
-		return IRQ_NONE;	/* spurious interrupt */
-	}
-
-	card_base_addr = cinfo->base_addr;
-	index = cinfo->bus_index;
-
-	/* card was not initialized yet (e.g. DEBUG_SHIRQ) */
-	if (unlikely(card_base_addr == NULL))
-		return IRQ_HANDLED;
-
-	/* This loop checks all chips in the card.  Make a note whenever
-	   _any_ chip had some work to do, as this is considered an
-	   indication that there will be more to do.  Only when no chip
-	   has any work does this outermost loop exit.
-	 */
-	do {
-		had_work = 0;
-		for (chip = 0; chip < cinfo->num_chips; chip++) {
-			base_addr = cinfo->base_addr +
-					(cy_chip_offset[chip] << index);
-			too_many = 0;
-			while ((status = readb(base_addr +
-						(CySVRR << index))) != 0x00) {
-				had_work++;
-			/* The purpose of the following test is to ensure that
-			   no chip can monopolize the driver.  This forces the
-			   chips to be checked in a round-robin fashion (after
-			   draining each of a bunch (1000) of characters).
-			 */
-				if (1000 < too_many++)
-					break;
-				spin_lock(&cinfo->card_lock);
-				if (status & CySRReceive) /* rx intr */
-					cyy_chip_rx(cinfo, chip, base_addr);
-				if (status & CySRTransmit) /* tx intr */
-					cyy_chip_tx(cinfo, chip, base_addr);
-				if (status & CySRModem) /* modem intr */
-					cyy_chip_modem(cinfo, chip, base_addr);
-				spin_unlock(&cinfo->card_lock);
-			}
-		}
-	} while (had_work);
-
-	/* clear interrupts */
-	spin_lock(&cinfo->card_lock);
-	cy_writeb(card_base_addr + (Cy_ClrIntr << index), 0);
-	/* Cy_ClrIntr is 0x1800 */
-	spin_unlock(&cinfo->card_lock);
-	return IRQ_HANDLED;
-}				/* cyy_interrupt */
-
-static void cyy_change_rts_dtr(struct cyclades_port *info, unsigned int set,
-		unsigned int clear)
-{
-	struct cyclades_card *card = info->card;
-	int channel = info->line - card->first_line;
-	u32 rts, dtr, msvrr, msvrd;
-
-	channel &= 0x03;
-
-	if (info->rtsdtr_inv) {
-		msvrr = CyMSVR2;
-		msvrd = CyMSVR1;
-		rts = CyDTR;
-		dtr = CyRTS;
-	} else {
-		msvrr = CyMSVR1;
-		msvrd = CyMSVR2;
-		rts = CyRTS;
-		dtr = CyDTR;
-	}
-	if (set & TIOCM_RTS) {
-		cyy_writeb(info, CyCAR, channel);
-		cyy_writeb(info, msvrr, rts);
-	}
-	if (clear & TIOCM_RTS) {
-		cyy_writeb(info, CyCAR, channel);
-		cyy_writeb(info, msvrr, ~rts);
-	}
-	if (set & TIOCM_DTR) {
-		cyy_writeb(info, CyCAR, channel);
-		cyy_writeb(info, msvrd, dtr);
-#ifdef CY_DEBUG_DTR
-		printk(KERN_DEBUG "cyc:set_modem_info raising DTR\n");
-		printk(KERN_DEBUG "     status: 0x%x, 0x%x\n",
-			cyy_readb(info, CyMSVR1),
-			cyy_readb(info, CyMSVR2));
-#endif
-	}
-	if (clear & TIOCM_DTR) {
-		cyy_writeb(info, CyCAR, channel);
-		cyy_writeb(info, msvrd, ~dtr);
-#ifdef CY_DEBUG_DTR
-		printk(KERN_DEBUG "cyc:set_modem_info dropping DTR\n");
-		printk(KERN_DEBUG "     status: 0x%x, 0x%x\n",
-			cyy_readb(info, CyMSVR1),
-			cyy_readb(info, CyMSVR2));
-#endif
-	}
-}
-
-/***********************************************************/
-/********* End of block of Cyclom-Y specific code **********/
-/******** Start of block of Cyclades-Z specific code *******/
-/***********************************************************/
-
-static int
-cyz_fetch_msg(struct cyclades_card *cinfo,
-		__u32 *channel, __u8 *cmd, __u32 *param)
-{
-	struct BOARD_CTRL __iomem *board_ctrl = cinfo->board_ctrl;
-	unsigned long loc_doorbell;
-
-	loc_doorbell = readl(&cinfo->ctl_addr.p9060->loc_doorbell);
-	if (loc_doorbell) {
-		*cmd = (char)(0xff & loc_doorbell);
-		*channel = readl(&board_ctrl->fwcmd_channel);
-		*param = (__u32) readl(&board_ctrl->fwcmd_param);
-		cy_writel(&cinfo->ctl_addr.p9060->loc_doorbell, 0xffffffff);
-		return 1;
-	}
-	return 0;
-}				/* cyz_fetch_msg */
-
-static int
-cyz_issue_cmd(struct cyclades_card *cinfo,
-		__u32 channel, __u8 cmd, __u32 param)
-{
-	struct BOARD_CTRL __iomem *board_ctrl = cinfo->board_ctrl;
-	__u32 __iomem *pci_doorbell;
-	unsigned int index;
-
-	if (!cyz_is_loaded(cinfo))
-		return -1;
-
-	index = 0;
-	pci_doorbell = &cinfo->ctl_addr.p9060->pci_doorbell;
-	while ((readl(pci_doorbell) & 0xff) != 0) {
-		if (index++ == 1000)
-			return (int)(readl(pci_doorbell) & 0xff);
-		udelay(50L);
-	}
-	cy_writel(&board_ctrl->hcmd_channel, channel);
-	cy_writel(&board_ctrl->hcmd_param, param);
-	cy_writel(pci_doorbell, (long)cmd);
-
-	return 0;
-}				/* cyz_issue_cmd */
-
-static void cyz_handle_rx(struct cyclades_port *info)
-{
-	struct BUF_CTRL __iomem *buf_ctrl = info->u.cyz.buf_ctrl;
-	struct cyclades_card *cinfo = info->card;
-	struct tty_port *port = &info->port;
-	unsigned int char_count;
-	int len;
-#ifdef BLOCKMOVE
-	unsigned char *buf;
-#else
-	char data;
-#endif
-	__u32 rx_put, rx_get, new_rx_get, rx_bufsize, rx_bufaddr;
-
-	rx_get = new_rx_get = readl(&buf_ctrl->rx_get);
-	rx_put = readl(&buf_ctrl->rx_put);
-	rx_bufsize = readl(&buf_ctrl->rx_bufsize);
-	rx_bufaddr = readl(&buf_ctrl->rx_bufaddr);
-	if (rx_put >= rx_get)
-		char_count = rx_put - rx_get;
-	else
-		char_count = rx_put - rx_get + rx_bufsize;
-
-	if (!char_count)
-		return;
-
-#ifdef CY_ENABLE_MONITORING
-	info->mon.int_count++;
-	info->mon.char_count += char_count;
-	if (char_count > info->mon.char_max)
-		info->mon.char_max = char_count;
-	info->mon.char_last = char_count;
-#endif
-
-#ifdef BLOCKMOVE
-	/* we'd like to use memcpy(t, f, n) and memset(s, c, count)
-	   for performance, but because of buffer boundaries, there
-	   may be several steps to the operation */
-	while (1) {
-		len = tty_prepare_flip_string(port, &buf,
-				char_count);
-		if (!len)
-			break;
-
-		len = min_t(unsigned int, min(len, char_count),
-				rx_bufsize - new_rx_get);
-
-		memcpy_fromio(buf, cinfo->base_addr +
-				rx_bufaddr + new_rx_get, len);
-
-		new_rx_get = (new_rx_get + len) &
-				(rx_bufsize - 1);
-		char_count -= len;
-		info->icount.rx += len;
-		info->idle_stats.recv_bytes += len;
-	}
-#else
-	len = tty_buffer_request_room(port, char_count);
-	while (len--) {
-		data = readb(cinfo->base_addr + rx_bufaddr +
-				new_rx_get);
-		new_rx_get = (new_rx_get + 1) &
-					(rx_bufsize - 1);
-		tty_insert_flip_char(port, data, TTY_NORMAL);
-		info->idle_stats.recv_bytes++;
-		info->icount.rx++;
-	}
-#endif
-#ifdef CONFIG_CYZ_INTR
-	/* Recalculate the number of chars in the RX buffer and issue
-	   a cmd in case it's higher than the RX high water mark */
-	rx_put = readl(&buf_ctrl->rx_put);
-	if (rx_put >= rx_get)
-		char_count = rx_put - rx_get;
-	else
-		char_count = rx_put - rx_get + rx_bufsize;
-	if (char_count >= readl(&buf_ctrl->rx_threshold) &&
-			!timer_pending(&info->rx_full_timer))
-		mod_timer(&info->rx_full_timer, jiffies + 1);
-#endif
-	info->idle_stats.recv_idle = jiffies;
-	tty_schedule_flip(&info->port);
-
-	/* Update rx_get */
-	cy_writel(&buf_ctrl->rx_get, new_rx_get);
-}
-
-static void cyz_handle_tx(struct cyclades_port *info)
-{
-	struct BUF_CTRL __iomem *buf_ctrl = info->u.cyz.buf_ctrl;
-	struct cyclades_card *cinfo = info->card;
-	struct tty_struct *tty;
-	u8 data;
-	unsigned int char_count;
-#ifdef BLOCKMOVE
-	int small_count;
-#endif
-	__u32 tx_put, tx_get, tx_bufsize, tx_bufaddr;
-
-	if (info->xmit_cnt <= 0)	/* Nothing to transmit */
-		return;
-
-	tx_get = readl(&buf_ctrl->tx_get);
-	tx_put = readl(&buf_ctrl->tx_put);
-	tx_bufsize = readl(&buf_ctrl->tx_bufsize);
-	tx_bufaddr = readl(&buf_ctrl->tx_bufaddr);
-	if (tx_put >= tx_get)
-		char_count = tx_get - tx_put - 1 + tx_bufsize;
-	else
-		char_count = tx_get - tx_put - 1;
-
-	if (!char_count)
-		return;
-		
-	tty = tty_port_tty_get(&info->port);
-	if (tty == NULL)
-		goto ztxdone;
-
-	if (info->x_char) {	/* send special char */
-		data = info->x_char;
-
-		cy_writeb(cinfo->base_addr + tx_bufaddr + tx_put, data);
-		tx_put = (tx_put + 1) & (tx_bufsize - 1);
-		info->x_char = 0;
-		char_count--;
-		info->icount.tx++;
-	}
-#ifdef BLOCKMOVE
-	while (0 < (small_count = min_t(unsigned int,
-			tx_bufsize - tx_put, min_t(unsigned int,
-				(SERIAL_XMIT_SIZE - info->xmit_tail),
-				min_t(unsigned int, info->xmit_cnt,
-					char_count))))) {
-
-		memcpy_toio((char *)(cinfo->base_addr + tx_bufaddr + tx_put),
-				&info->port.xmit_buf[info->xmit_tail],
-				small_count);
-
-		tx_put = (tx_put + small_count) & (tx_bufsize - 1);
-		char_count -= small_count;
-		info->icount.tx += small_count;
-		info->xmit_cnt -= small_count;
-		info->xmit_tail = (info->xmit_tail + small_count) &
-				(SERIAL_XMIT_SIZE - 1);
-	}
-#else
-	while (info->xmit_cnt && char_count) {
-		data = info->port.xmit_buf[info->xmit_tail];
-		info->xmit_cnt--;
-		info->xmit_tail = (info->xmit_tail + 1) &
-				(SERIAL_XMIT_SIZE - 1);
-
-		cy_writeb(cinfo->base_addr + tx_bufaddr + tx_put, data);
-		tx_put = (tx_put + 1) & (tx_bufsize - 1);
-		char_count--;
-		info->icount.tx++;
-	}
-#endif
-	tty_wakeup(tty);
-	tty_kref_put(tty);
-ztxdone:
-	/* Update tx_put */
-	cy_writel(&buf_ctrl->tx_put, tx_put);
-}
-
-static void cyz_handle_cmd(struct cyclades_card *cinfo)
-{
-	struct BOARD_CTRL __iomem *board_ctrl = cinfo->board_ctrl;
-	struct cyclades_port *info;
-	__u32 channel, param, fw_ver;
-	__u8 cmd;
-	int special_count;
-	int delta_count;
-
-	fw_ver = readl(&board_ctrl->fw_version);
-
-	while (cyz_fetch_msg(cinfo, &channel, &cmd, &param) == 1) {
-		special_count = 0;
-		delta_count = 0;
-		info = &cinfo->ports[channel];
-
-		switch (cmd) {
-		case C_CM_PR_ERROR:
-			tty_insert_flip_char(&info->port, 0, TTY_PARITY);
-			info->icount.rx++;
-			special_count++;
-			break;
-		case C_CM_FR_ERROR:
-			tty_insert_flip_char(&info->port, 0, TTY_FRAME);
-			info->icount.rx++;
-			special_count++;
-			break;
-		case C_CM_RXBRK:
-			tty_insert_flip_char(&info->port, 0, TTY_BREAK);
-			info->icount.rx++;
-			special_count++;
-			break;
-		case C_CM_MDCD:
-			info->icount.dcd++;
-			delta_count++;
-			if (tty_port_check_carrier(&info->port)) {
-				u32 dcd = fw_ver > 241 ? param :
-					readl(&info->u.cyz.ch_ctrl->rs_status);
-				if (dcd & C_RS_DCD)
-					wake_up_interruptible(&info->port.open_wait);
-				else
-					tty_port_tty_hangup(&info->port, false);
-			}
-			break;
-		case C_CM_MCTS:
-			info->icount.cts++;
-			delta_count++;
-			break;
-		case C_CM_MRI:
-			info->icount.rng++;
-			delta_count++;
-			break;
-		case C_CM_MDSR:
-			info->icount.dsr++;
-			delta_count++;
-			break;
-#ifdef Z_WAKE
-		case C_CM_IOCTLW:
-			complete(&info->shutdown_wait);
-			break;
-#endif
-#ifdef CONFIG_CYZ_INTR
-		case C_CM_RXHIWM:
-		case C_CM_RXNNDT:
-		case C_CM_INTBACK2:
-			/* Reception Interrupt */
-#ifdef CY_DEBUG_INTERRUPTS
-			printk(KERN_DEBUG "cyz_interrupt: rcvd intr, card %d, "
-					"port %ld\n", info->card, channel);
-#endif
-			cyz_handle_rx(info);
-			break;
-		case C_CM_TXBEMPTY:
-		case C_CM_TXLOWWM:
-		case C_CM_INTBACK:
-			/* Transmission Interrupt */
-#ifdef CY_DEBUG_INTERRUPTS
-			printk(KERN_DEBUG "cyz_interrupt: xmit intr, card %d, "
-					"port %ld\n", info->card, channel);
-#endif
-			cyz_handle_tx(info);
-			break;
-#endif				/* CONFIG_CYZ_INTR */
-		case C_CM_FATAL:
-			/* should do something with this !!! */
-			break;
-		default:
-			break;
-		}
-		if (delta_count)
-			wake_up_interruptible(&info->port.delta_msr_wait);
-		if (special_count)
-			tty_schedule_flip(&info->port);
-	}
-}
-
-#ifdef CONFIG_CYZ_INTR
-static irqreturn_t cyz_interrupt(int irq, void *dev_id)
-{
-	struct cyclades_card *cinfo = dev_id;
-
-	if (unlikely(!cyz_is_loaded(cinfo))) {
-#ifdef CY_DEBUG_INTERRUPTS
-		printk(KERN_DEBUG "cyz_interrupt: board not yet loaded "
-				"(IRQ%d).\n", irq);
-#endif
-		return IRQ_NONE;
-	}
-
-	/* Handle the interrupts */
-	cyz_handle_cmd(cinfo);
-
-	return IRQ_HANDLED;
-}				/* cyz_interrupt */
-
-static void cyz_rx_restart(struct timer_list *t)
-{
-	struct cyclades_port *info = from_timer(info, t, rx_full_timer);
-	struct cyclades_card *card = info->card;
-	int retval;
-	__u32 channel = info->line - card->first_line;
-	unsigned long flags;
-
-	spin_lock_irqsave(&card->card_lock, flags);
-	retval = cyz_issue_cmd(card, channel, C_CM_INTBACK2, 0L);
-	if (retval != 0) {
-		printk(KERN_ERR "cyc:cyz_rx_restart retval on ttyC%d was %x\n",
-			info->line, retval);
-	}
-	spin_unlock_irqrestore(&card->card_lock, flags);
-}
-
-#else				/* CONFIG_CYZ_INTR */
-
-static void cyz_poll(struct timer_list *unused)
-{
-	struct cyclades_card *cinfo;
-	struct cyclades_port *info;
-	unsigned long expires = jiffies + HZ;
-	unsigned int port, card;
-
-	for (card = 0; card < NR_CARDS; card++) {
-		cinfo = &cy_card[card];
-
-		if (!cy_is_Z(cinfo))
-			continue;
-		if (!cyz_is_loaded(cinfo))
-			continue;
-
-	/* Skip first polling cycle to avoid racing conditions with the FW */
-		if (!cinfo->intr_enabled) {
-			cinfo->intr_enabled = 1;
-			continue;
-		}
-
-		cyz_handle_cmd(cinfo);
-
-		for (port = 0; port < cinfo->nports; port++) {
-			info = &cinfo->ports[port];
-
-			if (!info->throttle)
-				cyz_handle_rx(info);
-			cyz_handle_tx(info);
-		}
-		/* poll every 'cyz_polling_cycle' period */
-		expires = jiffies + cyz_polling_cycle;
-	}
-	mod_timer(&cyz_timerlist, expires);
-}				/* cyz_poll */
-
-#endif				/* CONFIG_CYZ_INTR */
-
-/********** End of block of Cyclades-Z specific code *********/
-/***********************************************************/
-
-/* This is called whenever a port becomes active;
-   interrupts are enabled and DTR & RTS are turned on.
- */
-static int cy_startup(struct cyclades_port *info, struct tty_struct *tty)
-{
-	struct cyclades_card *card;
-	unsigned long flags;
-	int retval = 0;
-	int channel;
-	unsigned long page;
-
-	card = info->card;
-	channel = info->line - card->first_line;
-
-	page = get_zeroed_page(GFP_KERNEL);
-	if (!page)
-		return -ENOMEM;
-
-	spin_lock_irqsave(&card->card_lock, flags);
-
-	if (tty_port_initialized(&info->port))
-		goto errout;
-
-	if (!info->type) {
-		set_bit(TTY_IO_ERROR, &tty->flags);
-		goto errout;
-	}
-
-	if (info->port.xmit_buf)
-		free_page(page);
-	else
-		info->port.xmit_buf = (unsigned char *)page;
-
-	spin_unlock_irqrestore(&card->card_lock, flags);
-
-	cy_set_line_char(info, tty);
-
-	if (!cy_is_Z(card)) {
-		channel &= 0x03;
-
-		spin_lock_irqsave(&card->card_lock, flags);
-
-		cyy_writeb(info, CyCAR, channel);
-
-		cyy_writeb(info, CyRTPR,
-			(info->default_timeout ? info->default_timeout : 0x02));
-		/* 10ms rx timeout */
-
-		cyy_issue_cmd(info, CyCHAN_CTL | CyENB_RCVR | CyENB_XMTR);
-
-		cyy_change_rts_dtr(info, TIOCM_RTS | TIOCM_DTR, 0);
-
-		cyy_writeb(info, CySRER, cyy_readb(info, CySRER) | CyRxData);
-	} else {
-		struct CH_CTRL __iomem *ch_ctrl = info->u.cyz.ch_ctrl;
-
-		if (!cyz_is_loaded(card))
-			return -ENODEV;
-
-#ifdef CY_DEBUG_OPEN
-		printk(KERN_DEBUG "cyc startup Z card %d, channel %d, "
-			"base_addr %p\n", card, channel, card->base_addr);
-#endif
-		spin_lock_irqsave(&card->card_lock, flags);
-
-		cy_writel(&ch_ctrl->op_mode, C_CH_ENABLE);
-#ifdef Z_WAKE
-#ifdef CONFIG_CYZ_INTR
-		cy_writel(&ch_ctrl->intr_enable,
-			  C_IN_TXBEMPTY | C_IN_TXLOWWM | C_IN_RXHIWM |
-			  C_IN_RXNNDT | C_IN_IOCTLW | C_IN_MDCD);
-#else
-		cy_writel(&ch_ctrl->intr_enable,
-			  C_IN_IOCTLW | C_IN_MDCD);
-#endif				/* CONFIG_CYZ_INTR */
-#else
-#ifdef CONFIG_CYZ_INTR
-		cy_writel(&ch_ctrl->intr_enable,
-			  C_IN_TXBEMPTY | C_IN_TXLOWWM | C_IN_RXHIWM |
-			  C_IN_RXNNDT | C_IN_MDCD);
-#else
-		cy_writel(&ch_ctrl->intr_enable, C_IN_MDCD);
-#endif				/* CONFIG_CYZ_INTR */
-#endif				/* Z_WAKE */
-
-		retval = cyz_issue_cmd(card, channel, C_CM_IOCTL, 0L);
-		if (retval != 0) {
-			printk(KERN_ERR "cyc:startup(1) retval on ttyC%d was "
-				"%x\n", info->line, retval);
-		}
-
-		/* Flush RX buffers before raising DTR and RTS */
-		retval = cyz_issue_cmd(card, channel, C_CM_FLUSH_RX, 0L);
-		if (retval != 0) {
-			printk(KERN_ERR "cyc:startup(2) retval on ttyC%d was "
-				"%x\n", info->line, retval);
-		}
-
-		/* set timeout !!! */
-		/* set RTS and DTR !!! */
-		tty_port_raise_dtr_rts(&info->port);
-
-		/* enable send, recv, modem !!! */
-	}
-
-	tty_port_set_initialized(&info->port, 1);
-
-	clear_bit(TTY_IO_ERROR, &tty->flags);
-	info->xmit_cnt = info->xmit_head = info->xmit_tail = 0;
-	info->breakon = info->breakoff = 0;
-	memset((char *)&info->idle_stats, 0, sizeof(info->idle_stats));
-	info->idle_stats.in_use =
-	info->idle_stats.recv_idle =
-	info->idle_stats.xmit_idle = jiffies;
-
-	spin_unlock_irqrestore(&card->card_lock, flags);
-
-#ifdef CY_DEBUG_OPEN
-	printk(KERN_DEBUG "cyc startup done\n");
-#endif
-	return 0;
-
-errout:
-	spin_unlock_irqrestore(&card->card_lock, flags);
-	free_page(page);
-	return retval;
-}				/* startup */
-
-static void start_xmit(struct cyclades_port *info)
-{
-	struct cyclades_card *card = info->card;
-	unsigned long flags;
-	int channel = info->line - card->first_line;
-
-	if (!cy_is_Z(card)) {
-		spin_lock_irqsave(&card->card_lock, flags);
-		cyy_writeb(info, CyCAR, channel & 0x03);
-		cyy_writeb(info, CySRER, cyy_readb(info, CySRER) | CyTxRdy);
-		spin_unlock_irqrestore(&card->card_lock, flags);
-	} else {
-#ifdef CONFIG_CYZ_INTR
-		int retval;
-
-		spin_lock_irqsave(&card->card_lock, flags);
-		retval = cyz_issue_cmd(card, channel, C_CM_INTBACK, 0L);
-		if (retval != 0) {
-			printk(KERN_ERR "cyc:start_xmit retval on ttyC%d was "
-				"%x\n", info->line, retval);
-		}
-		spin_unlock_irqrestore(&card->card_lock, flags);
-#else				/* CONFIG_CYZ_INTR */
-		/* Don't have to do anything at this time */
-#endif				/* CONFIG_CYZ_INTR */
-	}
-}				/* start_xmit */
-
-/*
- * This routine shuts down a serial port; interrupts are disabled,
- * and DTR is dropped if the hangup on close termio flag is on.
- */
-static void cy_shutdown(struct cyclades_port *info, struct tty_struct *tty)
-{
-	struct cyclades_card *card;
-	unsigned long flags;
-
-	if (!tty_port_initialized(&info->port))
-		return;
-
-	card = info->card;
-	if (!cy_is_Z(card)) {
-		spin_lock_irqsave(&card->card_lock, flags);
-
-		/* Clear delta_msr_wait queue to avoid mem leaks. */
-		wake_up_interruptible(&info->port.delta_msr_wait);
-
-		if (info->port.xmit_buf) {
-			unsigned char *temp;
-			temp = info->port.xmit_buf;
-			info->port.xmit_buf = NULL;
-			free_page((unsigned long)temp);
-		}
-		if (C_HUPCL(tty))
-			cyy_change_rts_dtr(info, 0, TIOCM_RTS | TIOCM_DTR);
-
-		cyy_issue_cmd(info, CyCHAN_CTL | CyDIS_RCVR);
-		/* it may be appropriate to clear _XMIT at
-		   some later date (after testing)!!! */
-
-		set_bit(TTY_IO_ERROR, &tty->flags);
-		tty_port_set_initialized(&info->port, 0);
-		spin_unlock_irqrestore(&card->card_lock, flags);
-	} else {
-#ifdef CY_DEBUG_OPEN
-		int channel = info->line - card->first_line;
-		printk(KERN_DEBUG "cyc shutdown Z card %d, channel %d, "
-			"base_addr %p\n", card, channel, card->base_addr);
-#endif
-
-		if (!cyz_is_loaded(card))
-			return;
-
-		spin_lock_irqsave(&card->card_lock, flags);
-
-		if (info->port.xmit_buf) {
-			unsigned char *temp;
-			temp = info->port.xmit_buf;
-			info->port.xmit_buf = NULL;
-			free_page((unsigned long)temp);
-		}
-
-		if (C_HUPCL(tty))
-			tty_port_lower_dtr_rts(&info->port);
-
-		set_bit(TTY_IO_ERROR, &tty->flags);
-		tty_port_set_initialized(&info->port, 0);
-
-		spin_unlock_irqrestore(&card->card_lock, flags);
-	}
-
-#ifdef CY_DEBUG_OPEN
-	printk(KERN_DEBUG "cyc shutdown done\n");
-#endif
-}				/* shutdown */
-
-/*
- * ------------------------------------------------------------
- * cy_open() and friends
- * ------------------------------------------------------------
- */
-
-/*
- * This routine is called whenever a serial port is opened.  It
- * performs the serial-specific initialization for the tty structure.
- */
-static int cy_open(struct tty_struct *tty, struct file *filp)
-{
-	struct cyclades_port *info;
-	unsigned int i, line = tty->index;
-	int retval;
-
-	for (i = 0; i < NR_CARDS; i++)
-		if (line < cy_card[i].first_line + cy_card[i].nports &&
-				line >= cy_card[i].first_line)
-			break;
-	if (i >= NR_CARDS)
-		return -ENODEV;
-	info = &cy_card[i].ports[line - cy_card[i].first_line];
-	if (info->line < 0)
-		return -ENODEV;
-
-	/* If the card's firmware hasn't been loaded,
-	   treat it as absent from the system.  This
-	   will make the user pay attention.
-	 */
-	if (cy_is_Z(info->card)) {
-		struct cyclades_card *cinfo = info->card;
-		struct FIRM_ID __iomem *firm_id = cinfo->base_addr + ID_ADDRESS;
-
-		if (!cyz_is_loaded(cinfo)) {
-			if (cinfo->hw_ver == ZE_V1 && cyz_fpga_loaded(cinfo) &&
-					readl(&firm_id->signature) ==
-					ZFIRM_HLT) {
-				printk(KERN_ERR "cyc:Cyclades-Z Error: you "
-					"need an external power supply for "
-					"this number of ports.\nFirmware "
-					"halted.\n");
-			} else {
-				printk(KERN_ERR "cyc:Cyclades-Z firmware not "
-					"yet loaded\n");
-			}
-			return -ENODEV;
-		}
-#ifdef CONFIG_CYZ_INTR
-		else {
-		/* In case this Z board is operating in interrupt mode, its
-		   interrupts should be enabled as soon as the first open
-		   happens to one of its ports. */
-			if (!cinfo->intr_enabled) {
-				u16 intr;
-
-				/* Enable interrupts on the PLX chip */
-				intr = readw(&cinfo->ctl_addr.p9060->
-						intr_ctrl_stat) | 0x0900;
-				cy_writew(&cinfo->ctl_addr.p9060->
-						intr_ctrl_stat, intr);
-				/* Enable interrupts on the FW */
-				retval = cyz_issue_cmd(cinfo, 0,
-						C_CM_IRQ_ENBL, 0L);
-				if (retval != 0) {
-					printk(KERN_ERR "cyc:IRQ enable retval "
-						"was %x\n", retval);
-				}
-				cinfo->intr_enabled = 1;
-			}
-		}
-#endif				/* CONFIG_CYZ_INTR */
-		/* Make sure this Z port really exists in hardware */
-		if (info->line > (cinfo->first_line + cinfo->nports - 1))
-			return -ENODEV;
-	}
-#ifdef CY_DEBUG_OTHER
-	printk(KERN_DEBUG "cyc:cy_open ttyC%d\n", info->line);
-#endif
-	tty->driver_data = info;
-	if (serial_paranoia_check(info, tty->name, "cy_open"))
-		return -ENODEV;
-
-#ifdef CY_DEBUG_OPEN
-	printk(KERN_DEBUG "cyc:cy_open ttyC%d, count = %d\n", info->line,
-			info->port.count);
-#endif
-	info->port.count++;
-#ifdef CY_DEBUG_COUNT
-	printk(KERN_DEBUG "cyc:cy_open (%d): incrementing count to %d\n",
-		current->pid, info->port.count);
-#endif
-
-	/*
-	 * Start up serial port
-	 */
-	retval = cy_startup(info, tty);
-	if (retval)
-		return retval;
-
-	retval = tty_port_block_til_ready(&info->port, tty, filp);
-	if (retval) {
-#ifdef CY_DEBUG_OPEN
-		printk(KERN_DEBUG "cyc:cy_open returning after block_til_ready "
-			"with %d\n", retval);
-#endif
-		return retval;
-	}
-
-	info->throttle = 0;
-	tty_port_tty_set(&info->port, tty);
-
-#ifdef CY_DEBUG_OPEN
-	printk(KERN_DEBUG "cyc:cy_open done\n");
-#endif
-	return 0;
-}				/* cy_open */
-
-/*
- * cy_wait_until_sent() --- wait until the transmitter is empty
- */
-static void cy_wait_until_sent(struct tty_struct *tty, int timeout)
-{
-	struct cyclades_card *card;
-	struct cyclades_port *info = tty->driver_data;
-	unsigned long orig_jiffies;
-	int char_time;
-
-	if (serial_paranoia_check(info, tty->name, "cy_wait_until_sent"))
-		return;
-
-	if (info->xmit_fifo_size == 0)
-		return;		/* Just in case.... */
-
-	orig_jiffies = jiffies;
-	/*
-	 * Set the check interval to be 1/5 of the estimated time to
-	 * send a single character, and make it at least 1.  The check
-	 * interval should also be less than the timeout.
-	 *
-	 * Note: we have to use pretty tight timings here to satisfy
-	 * the NIST-PCTS.
-	 */
-	char_time = (info->timeout - HZ / 50) / info->xmit_fifo_size;
-	char_time = char_time / 5;
-	if (char_time <= 0)
-		char_time = 1;
-	if (timeout < 0)
-		timeout = 0;
-	if (timeout)
-		char_time = min(char_time, timeout);
-	/*
-	 * If the transmitter hasn't cleared in twice the approximate
-	 * amount of time to send the entire FIFO, it probably won't
-	 * ever clear.  This assumes the UART isn't doing flow
-	 * control, which is currently the case.  Hence, if it ever
-	 * takes longer than info->timeout, this is probably due to a
-	 * UART bug of some kind.  So, we clamp the timeout parameter at
-	 * 2*info->timeout.
-	 */
-	if (!timeout || timeout > 2 * info->timeout)
-		timeout = 2 * info->timeout;
-
-	card = info->card;
-	if (!cy_is_Z(card)) {
-		while (cyy_readb(info, CySRER) & CyTxRdy) {
-			if (msleep_interruptible(jiffies_to_msecs(char_time)))
-				break;
-			if (timeout && time_after(jiffies, orig_jiffies +
-					timeout))
-				break;
-		}
-	}
-	/* Run one more char cycle */
-	msleep_interruptible(jiffies_to_msecs(char_time * 5));
-}
-
-static void cy_flush_buffer(struct tty_struct *tty)
-{
-	struct cyclades_port *info = tty->driver_data;
-	struct cyclades_card *card;
-	int channel, retval;
-	unsigned long flags;
-
-#ifdef CY_DEBUG_IO
-	printk(KERN_DEBUG "cyc:cy_flush_buffer ttyC%d\n", info->line);
-#endif
-
-	if (serial_paranoia_check(info, tty->name, "cy_flush_buffer"))
-		return;
-
-	card = info->card;
-	channel = info->line - card->first_line;
-
-	spin_lock_irqsave(&card->card_lock, flags);
-	info->xmit_cnt = info->xmit_head = info->xmit_tail = 0;
-	spin_unlock_irqrestore(&card->card_lock, flags);
-
-	if (cy_is_Z(card)) {	/* If it is a Z card, flush the on-board
-					   buffers as well */
-		spin_lock_irqsave(&card->card_lock, flags);
-		retval = cyz_issue_cmd(card, channel, C_CM_FLUSH_TX, 0L);
-		if (retval != 0) {
-			printk(KERN_ERR "cyc: flush_buffer retval on ttyC%d "
-				"was %x\n", info->line, retval);
-		}
-		spin_unlock_irqrestore(&card->card_lock, flags);
-	}
-	tty_wakeup(tty);
-}				/* cy_flush_buffer */
-
-
-static void cy_do_close(struct tty_port *port)
-{
-	struct cyclades_port *info = container_of(port, struct cyclades_port,
-								port);
-	struct cyclades_card *card;
-	unsigned long flags;
-	int channel;
-
-	card = info->card;
-	channel = info->line - card->first_line;
-	spin_lock_irqsave(&card->card_lock, flags);
-
-	if (!cy_is_Z(card)) {
-		/* Stop accepting input */
-		cyy_writeb(info, CyCAR, channel & 0x03);
-		cyy_writeb(info, CySRER, cyy_readb(info, CySRER) & ~CyRxData);
-		if (tty_port_initialized(&info->port)) {
-			/* Waiting for on-board buffers to be empty before
-			   closing the port */
-			spin_unlock_irqrestore(&card->card_lock, flags);
-			cy_wait_until_sent(port->tty, info->timeout);
-			spin_lock_irqsave(&card->card_lock, flags);
-		}
-	} else {
-#ifdef Z_WAKE
-		/* Waiting for on-board buffers to be empty before closing
-		   the port */
-		struct CH_CTRL __iomem *ch_ctrl = info->u.cyz.ch_ctrl;
-		int retval;
-
-		if (readl(&ch_ctrl->flow_status) != C_FS_TXIDLE) {
-			retval = cyz_issue_cmd(card, channel, C_CM_IOCTLW, 0L);
-			if (retval != 0) {
-				printk(KERN_DEBUG "cyc:cy_close retval on "
-					"ttyC%d was %x\n", info->line, retval);
-			}
-			spin_unlock_irqrestore(&card->card_lock, flags);
-			wait_for_completion_interruptible(&info->shutdown_wait);
-			spin_lock_irqsave(&card->card_lock, flags);
-		}
-#endif
-	}
-	spin_unlock_irqrestore(&card->card_lock, flags);
-	cy_shutdown(info, port->tty);
-}
-
-/*
- * This routine is called when a particular tty device is closed.
- */
-static void cy_close(struct tty_struct *tty, struct file *filp)
-{
-	struct cyclades_port *info = tty->driver_data;
-	if (!info || serial_paranoia_check(info, tty->name, "cy_close"))
-		return;
-	tty_port_close(&info->port, tty, filp);
-}				/* cy_close */
-
-/* This routine gets called when tty_write has put something into
- * the write_queue.  The characters may come from user space or
- * kernel space.
- *
- * This routine will return the number of characters actually
- * accepted for writing.
- *
- * If the port is not already transmitting stuff, start it off by
- * enabling interrupts.  The interrupt service routine will then
- * ensure that the characters are sent.
- * If the port is already active, there is no need to kick it.
- *
- */
-static int cy_write(struct tty_struct *tty, const unsigned char *buf, int count)
-{
-	struct cyclades_port *info = tty->driver_data;
-	unsigned long flags;
-	int c, ret = 0;
-
-#ifdef CY_DEBUG_IO
-	printk(KERN_DEBUG "cyc:cy_write ttyC%d\n", info->line);
-#endif
-
-	if (serial_paranoia_check(info, tty->name, "cy_write"))
-		return 0;
-
-	if (!info->port.xmit_buf)
-		return 0;
-
-	spin_lock_irqsave(&info->card->card_lock, flags);
-	while (1) {
-		c = min(count, (int)(SERIAL_XMIT_SIZE - info->xmit_cnt - 1));
-		c = min(c, (int)(SERIAL_XMIT_SIZE - info->xmit_head));
-
-		if (c <= 0)
-			break;
-
-		memcpy(info->port.xmit_buf + info->xmit_head, buf, c);
-		info->xmit_head = (info->xmit_head + c) &
-			(SERIAL_XMIT_SIZE - 1);
-		info->xmit_cnt += c;
-		buf += c;
-		count -= c;
-		ret += c;
-	}
-	spin_unlock_irqrestore(&info->card->card_lock, flags);
-
-	info->idle_stats.xmit_bytes += ret;
-	info->idle_stats.xmit_idle = jiffies;
-
-	if (info->xmit_cnt && !tty->stopped && !tty->hw_stopped)
-		start_xmit(info);
-
-	return ret;
-}				/* cy_write */
-
-/*
- * This routine is called by the kernel to write a single
- * character to the tty device.  If the kernel uses this routine,
- * it must call the flush_chars() routine (if defined) when it is
- * done stuffing characters into the driver.  If there is no room
- * in the queue, the character is ignored.
- */
-static int cy_put_char(struct tty_struct *tty, unsigned char ch)
-{
-	struct cyclades_port *info = tty->driver_data;
-	unsigned long flags;
-
-#ifdef CY_DEBUG_IO
-	printk(KERN_DEBUG "cyc:cy_put_char ttyC%d\n", info->line);
-#endif
-
-	if (serial_paranoia_check(info, tty->name, "cy_put_char"))
-		return 0;
-
-	if (!info->port.xmit_buf)
-		return 0;
-
-	spin_lock_irqsave(&info->card->card_lock, flags);
-	if (info->xmit_cnt >= (int)(SERIAL_XMIT_SIZE - 1)) {
-		spin_unlock_irqrestore(&info->card->card_lock, flags);
-		return 0;
-	}
-
-	info->port.xmit_buf[info->xmit_head++] = ch;
-	info->xmit_head &= SERIAL_XMIT_SIZE - 1;
-	info->xmit_cnt++;
-	info->idle_stats.xmit_bytes++;
-	info->idle_stats.xmit_idle = jiffies;
-	spin_unlock_irqrestore(&info->card->card_lock, flags);
-	return 1;
-}				/* cy_put_char */
-
-/*
- * This routine is called by the kernel after it has written a
- * series of characters to the tty device using put_char().
- */
-static void cy_flush_chars(struct tty_struct *tty)
-{
-	struct cyclades_port *info = tty->driver_data;
-
-#ifdef CY_DEBUG_IO
-	printk(KERN_DEBUG "cyc:cy_flush_chars ttyC%d\n", info->line);
-#endif
-
-	if (serial_paranoia_check(info, tty->name, "cy_flush_chars"))
-		return;
-
-	if (info->xmit_cnt <= 0 || tty->stopped || tty->hw_stopped ||
-			!info->port.xmit_buf)
-		return;
-
-	start_xmit(info);
-}				/* cy_flush_chars */
-
-/*
- * This routine returns the numbers of characters the tty driver
- * will accept for queuing to be written.  This number is subject
- * to change as output buffers get emptied, or if the output flow
- * control is activated.
- */
-static int cy_write_room(struct tty_struct *tty)
-{
-	struct cyclades_port *info = tty->driver_data;
-	int ret;
-
-#ifdef CY_DEBUG_IO
-	printk(KERN_DEBUG "cyc:cy_write_room ttyC%d\n", info->line);
-#endif
-
-	if (serial_paranoia_check(info, tty->name, "cy_write_room"))
-		return 0;
-	ret = SERIAL_XMIT_SIZE - info->xmit_cnt - 1;
-	if (ret < 0)
-		ret = 0;
-	return ret;
-}				/* cy_write_room */
-
-static int cy_chars_in_buffer(struct tty_struct *tty)
-{
-	struct cyclades_port *info = tty->driver_data;
-
-	if (serial_paranoia_check(info, tty->name, "cy_chars_in_buffer"))
-		return 0;
-
-#ifdef Z_EXT_CHARS_IN_BUFFER
-	if (!cy_is_Z(info->card)) {
-#endif				/* Z_EXT_CHARS_IN_BUFFER */
-#ifdef CY_DEBUG_IO
-		printk(KERN_DEBUG "cyc:cy_chars_in_buffer ttyC%d %d\n",
-			info->line, info->xmit_cnt);
-#endif
-		return info->xmit_cnt;
-#ifdef Z_EXT_CHARS_IN_BUFFER
-	} else {
-		struct BUF_CTRL __iomem *buf_ctrl = info->u.cyz.buf_ctrl;
-		int char_count;
-		__u32 tx_put, tx_get, tx_bufsize;
-
-		tx_get = readl(&buf_ctrl->tx_get);
-		tx_put = readl(&buf_ctrl->tx_put);
-		tx_bufsize = readl(&buf_ctrl->tx_bufsize);
-		if (tx_put >= tx_get)
-			char_count = tx_put - tx_get;
-		else
-			char_count = tx_put - tx_get + tx_bufsize;
-#ifdef CY_DEBUG_IO
-		printk(KERN_DEBUG "cyc:cy_chars_in_buffer ttyC%d %d\n",
-			info->line, info->xmit_cnt + char_count);
-#endif
-		return info->xmit_cnt + char_count;
-	}
-#endif				/* Z_EXT_CHARS_IN_BUFFER */
-}				/* cy_chars_in_buffer */
-
-/*
- * ------------------------------------------------------------
- * cy_ioctl() and friends
- * ------------------------------------------------------------
- */
-
-static void cyy_baud_calc(struct cyclades_port *info, __u32 baud)
-{
-	int co, co_val, bpr;
-	__u32 cy_clock = ((info->chip_rev >= CD1400_REV_J) ? 60000000 :
-			25000000);
-
-	if (baud == 0) {
-		info->tbpr = info->tco = info->rbpr = info->rco = 0;
-		return;
-	}
-
-	/* determine which prescaler to use */
-	for (co = 4, co_val = 2048; co; co--, co_val >>= 2) {
-		if (cy_clock / co_val / baud > 63)
-			break;
-	}
-
-	bpr = (cy_clock / co_val * 2 / baud + 1) / 2;
-	if (bpr > 255)
-		bpr = 255;
-
-	info->tbpr = info->rbpr = bpr;
-	info->tco = info->rco = co;
-}
-
-/*
- * This routine finds or computes the various line characteristics.
- * It used to be called config_setup
- */
-static void cy_set_line_char(struct cyclades_port *info, struct tty_struct *tty)
-{
-	struct cyclades_card *card;
-	unsigned long flags;
-	int channel;
-	unsigned cflag, iflag;
-	int baud, baud_rate = 0;
-	int i;
-
-	if (info->line == -1)
-		return;
-
-	cflag = tty->termios.c_cflag;
-	iflag = tty->termios.c_iflag;
-
-	card = info->card;
-	channel = info->line - card->first_line;
-
-	if (!cy_is_Z(card)) {
-		u32 cflags;
-
-		/* baud rate */
-		baud = tty_get_baud_rate(tty);
-		if (baud == 38400 && (info->port.flags & ASYNC_SPD_MASK) ==
-				ASYNC_SPD_CUST) {
-			if (info->custom_divisor)
-				baud_rate = info->baud / info->custom_divisor;
-			else
-				baud_rate = info->baud;
-		} else if (baud > CD1400_MAX_SPEED) {
-			baud = CD1400_MAX_SPEED;
-		}
-		/* find the baud index */
-		for (i = 0; i < 20; i++) {
-			if (baud == baud_table[i])
-				break;
-		}
-		if (i == 20)
-			i = 19;	/* CD1400_MAX_SPEED */
-
-		if (baud == 38400 && (info->port.flags & ASYNC_SPD_MASK) ==
-				ASYNC_SPD_CUST) {
-			cyy_baud_calc(info, baud_rate);
-		} else {
-			if (info->chip_rev >= CD1400_REV_J) {
-				/* It is a CD1400 rev. J or later */
-				info->tbpr = baud_bpr_60[i];	/* Tx BPR */
-				info->tco = baud_co_60[i];	/* Tx CO */
-				info->rbpr = baud_bpr_60[i];	/* Rx BPR */
-				info->rco = baud_co_60[i];	/* Rx CO */
-			} else {
-				info->tbpr = baud_bpr_25[i];	/* Tx BPR */
-				info->tco = baud_co_25[i];	/* Tx CO */
-				info->rbpr = baud_bpr_25[i];	/* Rx BPR */
-				info->rco = baud_co_25[i];	/* Rx CO */
-			}
-		}
-		if (baud_table[i] == 134) {
-			/* get it right for 134.5 baud */
-			info->timeout = (info->xmit_fifo_size * HZ * 30 / 269) +
-					2;
-		} else if (baud == 38400 && (info->port.flags & ASYNC_SPD_MASK) ==
-				ASYNC_SPD_CUST) {
-			info->timeout = (info->xmit_fifo_size * HZ * 15 /
-					baud_rate) + 2;
-		} else if (baud_table[i]) {
-			info->timeout = (info->xmit_fifo_size * HZ * 15 /
-					baud_table[i]) + 2;
-			/* this needs to be propagated into the card info */
-		} else {
-			info->timeout = 0;
-		}
-		/* By tradition (is it a standard?) a baud rate of zero
-		   implies the line should be/has been closed.  A bit
-		   later in this routine such a test is performed. */
-
-		/* byte size and parity */
-		info->cor5 = 0;
-		info->cor4 = 0;
-		/* receive threshold */
-		info->cor3 = (info->default_threshold ?
-				info->default_threshold : baud_cor3[i]);
-		info->cor2 = CyETC;
-		switch (cflag & CSIZE) {
-		case CS5:
-			info->cor1 = Cy_5_BITS;
-			break;
-		case CS6:
-			info->cor1 = Cy_6_BITS;
-			break;
-		case CS7:
-			info->cor1 = Cy_7_BITS;
-			break;
-		case CS8:
-			info->cor1 = Cy_8_BITS;
-			break;
-		}
-		if (cflag & CSTOPB)
-			info->cor1 |= Cy_2_STOP;
-
-		if (cflag & PARENB) {
-			if (cflag & PARODD)
-				info->cor1 |= CyPARITY_O;
-			else
-				info->cor1 |= CyPARITY_E;
-		} else
-			info->cor1 |= CyPARITY_NONE;
-
-		/* CTS flow control flag */
-		tty_port_set_cts_flow(&info->port, cflag & CRTSCTS);
-		if (cflag & CRTSCTS)
-			info->cor2 |= CyCtsAE;
-		else
-			info->cor2 &= ~CyCtsAE;
-		tty_port_set_check_carrier(&info->port, ~cflag & CLOCAL);
-
-	 /***********************************************
-	    The hardware option, CyRtsAO, presents RTS when
-	    the chip has characters to send.  Since most modems
-	    use RTS as reverse (inbound) flow control, this
-	    option is not used.  If inbound flow control is
-	    necessary, DTR can be programmed to provide the
-	    appropriate signals for use with a non-standard
-	    cable.  Contact Marcio Saito for details.
-	 ***********************************************/
-
-		channel &= 0x03;
-
-		spin_lock_irqsave(&card->card_lock, flags);
-		cyy_writeb(info, CyCAR, channel);
-
-		/* tx and rx baud rate */
-
-		cyy_writeb(info, CyTCOR, info->tco);
-		cyy_writeb(info, CyTBPR, info->tbpr);
-		cyy_writeb(info, CyRCOR, info->rco);
-		cyy_writeb(info, CyRBPR, info->rbpr);
-
-		/* set line characteristics  according configuration */
-
-		cyy_writeb(info, CySCHR1, START_CHAR(tty));
-		cyy_writeb(info, CySCHR2, STOP_CHAR(tty));
-		cyy_writeb(info, CyCOR1, info->cor1);
-		cyy_writeb(info, CyCOR2, info->cor2);
-		cyy_writeb(info, CyCOR3, info->cor3);
-		cyy_writeb(info, CyCOR4, info->cor4);
-		cyy_writeb(info, CyCOR5, info->cor5);
-
-		cyy_issue_cmd(info, CyCOR_CHANGE | CyCOR1ch | CyCOR2ch |
-				CyCOR3ch);
-
-		/* !!! Is this needed? */
-		cyy_writeb(info, CyCAR, channel);
-		cyy_writeb(info, CyRTPR,
-			(info->default_timeout ? info->default_timeout : 0x02));
-		/* 10ms rx timeout */
-
-		cflags = CyCTS;
-		if (!C_CLOCAL(tty))
-			cflags |= CyDSR | CyRI | CyDCD;
-		/* without modem intr */
-		cyy_writeb(info, CySRER, cyy_readb(info, CySRER) | CyMdmCh);
-		/* act on 1->0 modem transitions */
-		if ((cflag & CRTSCTS) && info->rflow)
-			cyy_writeb(info, CyMCOR1, cflags | rflow_thr[i]);
-		else
-			cyy_writeb(info, CyMCOR1, cflags);
-		/* act on 0->1 modem transitions */
-		cyy_writeb(info, CyMCOR2, cflags);
-
-		if (i == 0)	/* baud rate is zero, turn off line */
-			cyy_change_rts_dtr(info, 0, TIOCM_DTR);
-		else
-			cyy_change_rts_dtr(info, TIOCM_DTR, 0);
-
-		clear_bit(TTY_IO_ERROR, &tty->flags);
-		spin_unlock_irqrestore(&card->card_lock, flags);
-
-	} else {
-		struct CH_CTRL __iomem *ch_ctrl = info->u.cyz.ch_ctrl;
-		__u32 sw_flow;
-		int retval;
-
-		if (!cyz_is_loaded(card))
-			return;
-
-		/* baud rate */
-		baud = tty_get_baud_rate(tty);
-		if (baud == 38400 && (info->port.flags & ASYNC_SPD_MASK) ==
-				ASYNC_SPD_CUST) {
-			if (info->custom_divisor)
-				baud_rate = info->baud / info->custom_divisor;
-			else
-				baud_rate = info->baud;
-		} else if (baud > CYZ_MAX_SPEED) {
-			baud = CYZ_MAX_SPEED;
-		}
-		cy_writel(&ch_ctrl->comm_baud, baud);
-
-		if (baud == 134) {
-			/* get it right for 134.5 baud */
-			info->timeout = (info->xmit_fifo_size * HZ * 30 / 269) +
-					2;
-		} else if (baud == 38400 && (info->port.flags & ASYNC_SPD_MASK) ==
-				ASYNC_SPD_CUST) {
-			info->timeout = (info->xmit_fifo_size * HZ * 15 /
-					baud_rate) + 2;
-		} else if (baud) {
-			info->timeout = (info->xmit_fifo_size * HZ * 15 /
-					baud) + 2;
-			/* this needs to be propagated into the card info */
-		} else {
-			info->timeout = 0;
-		}
-
-		/* byte size and parity */
-		switch (cflag & CSIZE) {
-		case CS5:
-			cy_writel(&ch_ctrl->comm_data_l, C_DL_CS5);
-			break;
-		case CS6:
-			cy_writel(&ch_ctrl->comm_data_l, C_DL_CS6);
-			break;
-		case CS7:
-			cy_writel(&ch_ctrl->comm_data_l, C_DL_CS7);
-			break;
-		case CS8:
-			cy_writel(&ch_ctrl->comm_data_l, C_DL_CS8);
-			break;
-		}
-		if (cflag & CSTOPB) {
-			cy_writel(&ch_ctrl->comm_data_l,
-				  readl(&ch_ctrl->comm_data_l) | C_DL_2STOP);
-		} else {
-			cy_writel(&ch_ctrl->comm_data_l,
-				  readl(&ch_ctrl->comm_data_l) | C_DL_1STOP);
-		}
-		if (cflag & PARENB) {
-			if (cflag & PARODD)
-				cy_writel(&ch_ctrl->comm_parity, C_PR_ODD);
-			else
-				cy_writel(&ch_ctrl->comm_parity, C_PR_EVEN);
-		} else
-			cy_writel(&ch_ctrl->comm_parity, C_PR_NONE);
-
-		/* CTS flow control flag */
-		if (cflag & CRTSCTS) {
-			cy_writel(&ch_ctrl->hw_flow,
-				readl(&ch_ctrl->hw_flow) | C_RS_CTS | C_RS_RTS);
-		} else {
-			cy_writel(&ch_ctrl->hw_flow, readl(&ch_ctrl->hw_flow) &
-					~(C_RS_CTS | C_RS_RTS));
-		}
-		/* As the HW flow control is done in firmware, the driver
-		   doesn't need to care about it */
-		tty_port_set_cts_flow(&info->port, 0);
-
-		/* XON/XOFF/XANY flow control flags */
-		sw_flow = 0;
-		if (iflag & IXON) {
-			sw_flow |= C_FL_OXX;
-			if (iflag & IXANY)
-				sw_flow |= C_FL_OIXANY;
-		}
-		cy_writel(&ch_ctrl->sw_flow, sw_flow);
-
-		retval = cyz_issue_cmd(card, channel, C_CM_IOCTL, 0L);
-		if (retval != 0) {
-			printk(KERN_ERR "cyc:set_line_char retval on ttyC%d "
-				"was %x\n", info->line, retval);
-		}
-
-		/* CD sensitivity */
-		tty_port_set_check_carrier(&info->port, ~cflag & CLOCAL);
-
-		if (baud == 0) {	/* baud rate is zero, turn off line */
-			cy_writel(&ch_ctrl->rs_control,
-				  readl(&ch_ctrl->rs_control) & ~C_RS_DTR);
-#ifdef CY_DEBUG_DTR
-			printk(KERN_DEBUG "cyc:set_line_char dropping Z DTR\n");
-#endif
-		} else {
-			cy_writel(&ch_ctrl->rs_control,
-				  readl(&ch_ctrl->rs_control) | C_RS_DTR);
-#ifdef CY_DEBUG_DTR
-			printk(KERN_DEBUG "cyc:set_line_char raising Z DTR\n");
-#endif
-		}
-
-		retval = cyz_issue_cmd(card, channel, C_CM_IOCTLM, 0L);
-		if (retval != 0) {
-			printk(KERN_ERR "cyc:set_line_char(2) retval on ttyC%d "
-				"was %x\n", info->line, retval);
-		}
-
-		clear_bit(TTY_IO_ERROR, &tty->flags);
-	}
-}				/* set_line_char */
-
-static int cy_get_serial_info(struct tty_struct *tty,
-				struct serial_struct *ss)
-{
-	struct cyclades_port *info = tty->driver_data;
-	struct cyclades_card *cinfo = info->card;
-
-	if (serial_paranoia_check(info, tty->name, "cy_ioctl"))
-		return -ENODEV;
-	ss->type = info->type;
-	ss->line = info->line;
-	ss->port = (info->card - cy_card) * 0x100 + info->line -
-			cinfo->first_line;
-	ss->irq = cinfo->irq;
-	ss->flags = info->port.flags;
-	ss->close_delay = info->port.close_delay;
-	ss->closing_wait = info->port.closing_wait;
-	ss->baud_base = info->baud;
-	ss->custom_divisor = info->custom_divisor;
-	return 0;
-}
-
-static int cy_set_serial_info(struct tty_struct *tty,
-				struct serial_struct *ss)
-{
-	struct cyclades_port *info = tty->driver_data;
-	int old_flags;
-	int ret;
-
-	if (serial_paranoia_check(info, tty->name, "cy_ioctl"))
-		return -ENODEV;
-
-	mutex_lock(&info->port.mutex);
-
-	old_flags = info->port.flags;
-
-	if (!capable(CAP_SYS_ADMIN)) {
-		if (ss->close_delay != info->port.close_delay ||
-				ss->baud_base != info->baud ||
-				(ss->flags & ASYNC_FLAGS &
-					~ASYNC_USR_MASK) !=
-				(info->port.flags & ASYNC_FLAGS & ~ASYNC_USR_MASK))
-		{
-			mutex_unlock(&info->port.mutex);
-			return -EPERM;
-		}
-		info->port.flags = (info->port.flags & ~ASYNC_USR_MASK) |
-				(ss->flags & ASYNC_USR_MASK);
-		info->baud = ss->baud_base;
-		info->custom_divisor = ss->custom_divisor;
-		goto check_and_exit;
-	}
-
-	/*
-	 * OK, past this point, all the error checking has been done.
-	 * At this point, we start making changes.....
-	 */
-
-	info->baud = ss->baud_base;
-	info->custom_divisor = ss->custom_divisor;
-	info->port.flags = (info->port.flags & ~ASYNC_FLAGS) |
-			(ss->flags & ASYNC_FLAGS);
-	info->port.close_delay = ss->close_delay * HZ / 100;
-	info->port.closing_wait = ss->closing_wait * HZ / 100;
-
-check_and_exit:
-	if (tty_port_initialized(&info->port)) {
-		if ((ss->flags ^ old_flags) & ASYNC_SPD_MASK) {
-			/* warn about deprecation unless clearing */
-			if (ss->flags & ASYNC_SPD_MASK)
-				dev_warn_ratelimited(tty->dev, "use of SPD flags is deprecated\n");
-		}
-		cy_set_line_char(info, tty);
-		ret = 0;
-	} else {
-		ret = cy_startup(info, tty);
-	}
-	mutex_unlock(&info->port.mutex);
-	return ret;
-}				/* set_serial_info */
-
-/*
- * get_lsr_info - get line status register info
- *
- * Purpose: Let user call ioctl() to get info when the UART physically
- *	    is emptied.  On bus types like RS485, the transmitter must
- *	    release the bus after transmitting. This must be done when
- *	    the transmit shift register is empty, not be done when the
- *	    transmit holding register is empty.  This functionality
- *	    allows an RS485 driver to be written in user space.
- */
-static int get_lsr_info(struct cyclades_port *info, unsigned int __user *value)
-{
-	struct cyclades_card *card = info->card;
-	unsigned int result;
-	unsigned long flags;
-	u8 status;
-
-	if (!cy_is_Z(card)) {
-		spin_lock_irqsave(&card->card_lock, flags);
-		status = cyy_readb(info, CySRER) & (CyTxRdy | CyTxMpty);
-		spin_unlock_irqrestore(&card->card_lock, flags);
-		result = (status ? 0 : TIOCSER_TEMT);
-	} else {
-		/* Not supported yet */
-		return -EINVAL;
-	}
-	return put_user(result, value);
-}
-
-static int cy_tiocmget(struct tty_struct *tty)
-{
-	struct cyclades_port *info = tty->driver_data;
-	struct cyclades_card *card;
-	int result;
-
-	if (serial_paranoia_check(info, tty->name, __func__))
-		return -ENODEV;
-
-	card = info->card;
-
-	if (!cy_is_Z(card)) {
-		unsigned long flags;
-		int channel = info->line - card->first_line;
-		u8 status;
-
-		spin_lock_irqsave(&card->card_lock, flags);
-		cyy_writeb(info, CyCAR, channel & 0x03);
-		status = cyy_readb(info, CyMSVR1);
-		status |= cyy_readb(info, CyMSVR2);
-		spin_unlock_irqrestore(&card->card_lock, flags);
-
-		if (info->rtsdtr_inv) {
-			result = ((status & CyRTS) ? TIOCM_DTR : 0) |
-				((status & CyDTR) ? TIOCM_RTS : 0);
-		} else {
-			result = ((status & CyRTS) ? TIOCM_RTS : 0) |
-				((status & CyDTR) ? TIOCM_DTR : 0);
-		}
-		result |= ((status & CyDCD) ? TIOCM_CAR : 0) |
-			((status & CyRI) ? TIOCM_RNG : 0) |
-			((status & CyDSR) ? TIOCM_DSR : 0) |
-			((status & CyCTS) ? TIOCM_CTS : 0);
-	} else {
-		u32 lstatus;
-
-		if (!cyz_is_loaded(card)) {
-			result = -ENODEV;
-			goto end;
-		}
-
-		lstatus = readl(&info->u.cyz.ch_ctrl->rs_status);
-		result = ((lstatus & C_RS_RTS) ? TIOCM_RTS : 0) |
-			((lstatus & C_RS_DTR) ? TIOCM_DTR : 0) |
-			((lstatus & C_RS_DCD) ? TIOCM_CAR : 0) |
-			((lstatus & C_RS_RI) ? TIOCM_RNG : 0) |
-			((lstatus & C_RS_DSR) ? TIOCM_DSR : 0) |
-			((lstatus & C_RS_CTS) ? TIOCM_CTS : 0);
-	}
-end:
-	return result;
-}				/* cy_tiomget */
-
-static int
-cy_tiocmset(struct tty_struct *tty,
-		unsigned int set, unsigned int clear)
-{
-	struct cyclades_port *info = tty->driver_data;
-	struct cyclades_card *card;
-	unsigned long flags;
-
-	if (serial_paranoia_check(info, tty->name, __func__))
-		return -ENODEV;
-
-	card = info->card;
-	if (!cy_is_Z(card)) {
-		spin_lock_irqsave(&card->card_lock, flags);
-		cyy_change_rts_dtr(info, set, clear);
-		spin_unlock_irqrestore(&card->card_lock, flags);
-	} else {
-		struct CH_CTRL __iomem *ch_ctrl = info->u.cyz.ch_ctrl;
-		int retval, channel = info->line - card->first_line;
-		u32 rs;
-
-		if (!cyz_is_loaded(card))
-			return -ENODEV;
-
-		spin_lock_irqsave(&card->card_lock, flags);
-		rs = readl(&ch_ctrl->rs_control);
-		if (set & TIOCM_RTS)
-			rs |= C_RS_RTS;
-		if (clear & TIOCM_RTS)
-			rs &= ~C_RS_RTS;
-		if (set & TIOCM_DTR) {
-			rs |= C_RS_DTR;
-#ifdef CY_DEBUG_DTR
-			printk(KERN_DEBUG "cyc:set_modem_info raising Z DTR\n");
-#endif
-		}
-		if (clear & TIOCM_DTR) {
-			rs &= ~C_RS_DTR;
-#ifdef CY_DEBUG_DTR
-			printk(KERN_DEBUG "cyc:set_modem_info clearing "
-				"Z DTR\n");
-#endif
-		}
-		cy_writel(&ch_ctrl->rs_control, rs);
-		retval = cyz_issue_cmd(card, channel, C_CM_IOCTLM, 0L);
-		spin_unlock_irqrestore(&card->card_lock, flags);
-		if (retval != 0) {
-			printk(KERN_ERR "cyc:set_modem_info retval on ttyC%d "
-				"was %x\n", info->line, retval);
-		}
-	}
-	return 0;
-}
-
-/*
- * cy_break() --- routine which turns the break handling on or off
- */
-static int cy_break(struct tty_struct *tty, int break_state)
-{
-	struct cyclades_port *info = tty->driver_data;
-	struct cyclades_card *card;
-	unsigned long flags;
-	int retval = 0;
-
-	if (serial_paranoia_check(info, tty->name, "cy_break"))
-		return -EINVAL;
-
-	card = info->card;
-
-	spin_lock_irqsave(&card->card_lock, flags);
-	if (!cy_is_Z(card)) {
-		/* Let the transmit ISR take care of this (since it
-		   requires stuffing characters into the output stream).
-		 */
-		if (break_state == -1) {
-			if (!info->breakon) {
-				info->breakon = 1;
-				if (!info->xmit_cnt) {
-					spin_unlock_irqrestore(&card->card_lock, flags);
-					start_xmit(info);
-					spin_lock_irqsave(&card->card_lock, flags);
-				}
-			}
-		} else {
-			if (!info->breakoff) {
-				info->breakoff = 1;
-				if (!info->xmit_cnt) {
-					spin_unlock_irqrestore(&card->card_lock, flags);
-					start_xmit(info);
-					spin_lock_irqsave(&card->card_lock, flags);
-				}
-			}
-		}
-	} else {
-		if (break_state == -1) {
-			retval = cyz_issue_cmd(card,
-				info->line - card->first_line,
-				C_CM_SET_BREAK, 0L);
-			if (retval != 0) {
-				printk(KERN_ERR "cyc:cy_break (set) retval on "
-					"ttyC%d was %x\n", info->line, retval);
-			}
-		} else {
-			retval = cyz_issue_cmd(card,
-				info->line - card->first_line,
-				C_CM_CLR_BREAK, 0L);
-			if (retval != 0) {
-				printk(KERN_DEBUG "cyc:cy_break (clr) retval "
-					"on ttyC%d was %x\n", info->line,
-					retval);
-			}
-		}
-	}
-	spin_unlock_irqrestore(&card->card_lock, flags);
-	return retval;
-}				/* cy_break */
-
-static int set_threshold(struct cyclades_port *info, unsigned long value)
-{
-	struct cyclades_card *card = info->card;
-	unsigned long flags;
-
-	if (!cy_is_Z(card)) {
-		info->cor3 &= ~CyREC_FIFO;
-		info->cor3 |= value & CyREC_FIFO;
-
-		spin_lock_irqsave(&card->card_lock, flags);
-		cyy_writeb(info, CyCOR3, info->cor3);
-		cyy_issue_cmd(info, CyCOR_CHANGE | CyCOR3ch);
-		spin_unlock_irqrestore(&card->card_lock, flags);
-	}
-	return 0;
-}				/* set_threshold */
-
-static int get_threshold(struct cyclades_port *info,
-						unsigned long __user *value)
-{
-	struct cyclades_card *card = info->card;
-
-	if (!cy_is_Z(card)) {
-		u8 tmp = cyy_readb(info, CyCOR3) & CyREC_FIFO;
-		return put_user(tmp, value);
-	}
-	return 0;
-}				/* get_threshold */
-
-static int set_timeout(struct cyclades_port *info, unsigned long value)
-{
-	struct cyclades_card *card = info->card;
-	unsigned long flags;
-
-	if (!cy_is_Z(card)) {
-		spin_lock_irqsave(&card->card_lock, flags);
-		cyy_writeb(info, CyRTPR, value & 0xff);
-		spin_unlock_irqrestore(&card->card_lock, flags);
-	}
-	return 0;
-}				/* set_timeout */
-
-static int get_timeout(struct cyclades_port *info,
-						unsigned long __user *value)
-{
-	struct cyclades_card *card = info->card;
-
-	if (!cy_is_Z(card)) {
-		u8 tmp = cyy_readb(info, CyRTPR);
-		return put_user(tmp, value);
-	}
-	return 0;
-}				/* get_timeout */
-
-static int cy_cflags_changed(struct cyclades_port *info, unsigned long arg,
-		struct cyclades_icount *cprev)
-{
-	struct cyclades_icount cnow;
-	unsigned long flags;
-	int ret;
-
-	spin_lock_irqsave(&info->card->card_lock, flags);
-	cnow = info->icount;	/* atomic copy */
-	spin_unlock_irqrestore(&info->card->card_lock, flags);
-
-	ret =	((arg & TIOCM_RNG) && (cnow.rng != cprev->rng)) ||
-		((arg & TIOCM_DSR) && (cnow.dsr != cprev->dsr)) ||
-		((arg & TIOCM_CD)  && (cnow.dcd != cprev->dcd)) ||
-		((arg & TIOCM_CTS) && (cnow.cts != cprev->cts));
-
-	*cprev = cnow;
-
-	return ret;
-}
-
-/*
- * This routine allows the tty driver to implement device-
- * specific ioctl's.  If the ioctl number passed in cmd is
- * not recognized by the driver, it should return ENOIOCTLCMD.
- */
-static int
-cy_ioctl(struct tty_struct *tty,
-	 unsigned int cmd, unsigned long arg)
-{
-	struct cyclades_port *info = tty->driver_data;
-	struct cyclades_icount cnow;	/* kernel counter temps */
-	int ret_val = 0;
-	unsigned long flags;
-	void __user *argp = (void __user *)arg;
-
-	if (serial_paranoia_check(info, tty->name, "cy_ioctl"))
-		return -ENODEV;
-
-#ifdef CY_DEBUG_OTHER
-	printk(KERN_DEBUG "cyc:cy_ioctl ttyC%d, cmd = %x arg = %lx\n",
-		info->line, cmd, arg);
-#endif
-
-	switch (cmd) {
-	case CYGETMON:
-		if (copy_to_user(argp, &info->mon, sizeof(info->mon))) {
-			ret_val = -EFAULT;
-			break;
-		}
-		memset(&info->mon, 0, sizeof(info->mon));
-		break;
-	case CYGETTHRESH:
-		ret_val = get_threshold(info, argp);
-		break;
-	case CYSETTHRESH:
-		ret_val = set_threshold(info, arg);
-		break;
-	case CYGETDEFTHRESH:
-		ret_val = put_user(info->default_threshold,
-				(unsigned long __user *)argp);
-		break;
-	case CYSETDEFTHRESH:
-		info->default_threshold = arg & 0x0f;
-		break;
-	case CYGETTIMEOUT:
-		ret_val = get_timeout(info, argp);
-		break;
-	case CYSETTIMEOUT:
-		ret_val = set_timeout(info, arg);
-		break;
-	case CYGETDEFTIMEOUT:
-		ret_val = put_user(info->default_timeout,
-				(unsigned long __user *)argp);
-		break;
-	case CYSETDEFTIMEOUT:
-		info->default_timeout = arg & 0xff;
-		break;
-	case CYSETRFLOW:
-		info->rflow = (int)arg;
-		break;
-	case CYGETRFLOW:
-		ret_val = info->rflow;
-		break;
-	case CYSETRTSDTR_INV:
-		info->rtsdtr_inv = (int)arg;
-		break;
-	case CYGETRTSDTR_INV:
-		ret_val = info->rtsdtr_inv;
-		break;
-	case CYGETCD1400VER:
-		ret_val = info->chip_rev;
-		break;
-#ifndef CONFIG_CYZ_INTR
-	case CYZSETPOLLCYCLE:
-		if (arg > LONG_MAX / HZ)
-			return -ENODEV;
-		cyz_polling_cycle = (arg * HZ) / 1000;
-		break;
-	case CYZGETPOLLCYCLE:
-		ret_val = (cyz_polling_cycle * 1000) / HZ;
-		break;
-#endif				/* CONFIG_CYZ_INTR */
-	case CYSETWAIT:
-		info->port.closing_wait = (unsigned short)arg * HZ / 100;
-		break;
-	case CYGETWAIT:
-		ret_val = info->port.closing_wait / (HZ / 100);
-		break;
-	case TIOCSERGETLSR:	/* Get line status register */
-		ret_val = get_lsr_info(info, argp);
-		break;
-		/*
-		 * Wait for any of the 4 modem inputs (DCD,RI,DSR,CTS) to change
-		 * - mask passed in arg for lines of interest
-		 *   (use |'ed TIOCM_RNG/DSR/CD/CTS for masking)
-		 * Caller should use TIOCGICOUNT to see which one it was
-		 */
-	case TIOCMIWAIT:
-		spin_lock_irqsave(&info->card->card_lock, flags);
-		/* note the counters on entry */
-		cnow = info->icount;
-		spin_unlock_irqrestore(&info->card->card_lock, flags);
-		ret_val = wait_event_interruptible(info->port.delta_msr_wait,
-				cy_cflags_changed(info, arg, &cnow));
-		break;
-
-		/*
-		 * Get counter of input serial line interrupts (DCD,RI,DSR,CTS)
-		 * Return: write counters to the user passed counter struct
-		 * NB: both 1->0 and 0->1 transitions are counted except for
-		 *     RI where only 0->1 is counted.
-		 */
-	default:
-		ret_val = -ENOIOCTLCMD;
-	}
-
-#ifdef CY_DEBUG_OTHER
-	printk(KERN_DEBUG "cyc:cy_ioctl done\n");
-#endif
-	return ret_val;
-}				/* cy_ioctl */
-
-static int cy_get_icount(struct tty_struct *tty,
-				struct serial_icounter_struct *sic)
-{
-	struct cyclades_port *info = tty->driver_data;
-	struct cyclades_icount cnow;	/* Used to snapshot */
-	unsigned long flags;
-
-	spin_lock_irqsave(&info->card->card_lock, flags);
-	cnow = info->icount;
-	spin_unlock_irqrestore(&info->card->card_lock, flags);
-
-	sic->cts = cnow.cts;
-	sic->dsr = cnow.dsr;
-	sic->rng = cnow.rng;
-	sic->dcd = cnow.dcd;
-	sic->rx = cnow.rx;
-	sic->tx = cnow.tx;
-	sic->frame = cnow.frame;
-	sic->overrun = cnow.overrun;
-	sic->parity = cnow.parity;
-	sic->brk = cnow.brk;
-	sic->buf_overrun = cnow.buf_overrun;
-	return 0;
-}
-
-/*
- * This routine allows the tty driver to be notified when
- * device's termios settings have changed.  Note that a
- * well-designed tty driver should be prepared to accept the case
- * where old == NULL, and try to do something rational.
- */
-static void cy_set_termios(struct tty_struct *tty, struct ktermios *old_termios)
-{
-	struct cyclades_port *info = tty->driver_data;
-
-#ifdef CY_DEBUG_OTHER
-	printk(KERN_DEBUG "cyc:cy_set_termios ttyC%d\n", info->line);
-#endif
-
-	cy_set_line_char(info, tty);
-
-	if ((old_termios->c_cflag & CRTSCTS) && !C_CRTSCTS(tty)) {
-		tty->hw_stopped = 0;
-		cy_start(tty);
-	}
-#if 0
-	/*
-	 * No need to wake up processes in open wait, since they
-	 * sample the CLOCAL flag once, and don't recheck it.
-	 * XXX  It's not clear whether the current behavior is correct
-	 * or not.  Hence, this may change.....
-	 */
-	if (!(old_termios->c_cflag & CLOCAL) && C_CLOCAL(tty))
-		wake_up_interruptible(&info->port.open_wait);
-#endif
-}				/* cy_set_termios */
-
-/* This function is used to send a high-priority XON/XOFF character to
-   the device.
-*/
-static void cy_send_xchar(struct tty_struct *tty, char ch)
-{
-	struct cyclades_port *info = tty->driver_data;
-	struct cyclades_card *card;
-	int channel;
-
-	if (serial_paranoia_check(info, tty->name, "cy_send_xchar"))
-		return;
-
-	info->x_char = ch;
-
-	if (ch)
-		cy_start(tty);
-
-	card = info->card;
-	channel = info->line - card->first_line;
-
-	if (cy_is_Z(card)) {
-		if (ch == STOP_CHAR(tty))
-			cyz_issue_cmd(card, channel, C_CM_SENDXOFF, 0L);
-		else if (ch == START_CHAR(tty))
-			cyz_issue_cmd(card, channel, C_CM_SENDXON, 0L);
-	}
-}
-
-/* This routine is called by the upper-layer tty layer to signal
-   that incoming characters should be throttled because the input
-   buffers are close to full.
- */
-static void cy_throttle(struct tty_struct *tty)
-{
-	struct cyclades_port *info = tty->driver_data;
-	struct cyclades_card *card;
-	unsigned long flags;
-
-#ifdef CY_DEBUG_THROTTLE
-	printk(KERN_DEBUG "cyc:throttle %s ...ttyC%d\n", tty_name(tty),
-			 info->line);
-#endif
-
-	if (serial_paranoia_check(info, tty->name, "cy_throttle"))
-		return;
-
-	card = info->card;
-
-	if (I_IXOFF(tty)) {
-		if (!cy_is_Z(card))
-			cy_send_xchar(tty, STOP_CHAR(tty));
-		else
-			info->throttle = 1;
-	}
-
-	if (C_CRTSCTS(tty)) {
-		if (!cy_is_Z(card)) {
-			spin_lock_irqsave(&card->card_lock, flags);
-			cyy_change_rts_dtr(info, 0, TIOCM_RTS);
-			spin_unlock_irqrestore(&card->card_lock, flags);
-		} else {
-			info->throttle = 1;
-		}
-	}
-}				/* cy_throttle */
-
-/*
- * This routine notifies the tty driver that it should signal
- * that characters can now be sent to the tty without fear of
- * overrunning the input buffers of the line disciplines.
- */
-static void cy_unthrottle(struct tty_struct *tty)
-{
-	struct cyclades_port *info = tty->driver_data;
-	struct cyclades_card *card;
-	unsigned long flags;
-
-#ifdef CY_DEBUG_THROTTLE
-	printk(KERN_DEBUG "cyc:unthrottle %s ...ttyC%d\n",
-		tty_name(tty), info->line);
-#endif
-
-	if (serial_paranoia_check(info, tty->name, "cy_unthrottle"))
-		return;
-
-	if (I_IXOFF(tty)) {
-		if (info->x_char)
-			info->x_char = 0;
-		else
-			cy_send_xchar(tty, START_CHAR(tty));
-	}
-
-	if (C_CRTSCTS(tty)) {
-		card = info->card;
-		if (!cy_is_Z(card)) {
-			spin_lock_irqsave(&card->card_lock, flags);
-			cyy_change_rts_dtr(info, TIOCM_RTS, 0);
-			spin_unlock_irqrestore(&card->card_lock, flags);
-		} else {
-			info->throttle = 0;
-		}
-	}
-}				/* cy_unthrottle */
-
-/* cy_start and cy_stop provide software output flow control as a
-   function of XON/XOFF, software CTS, and other such stuff.
-*/
-static void cy_stop(struct tty_struct *tty)
-{
-	struct cyclades_card *cinfo;
-	struct cyclades_port *info = tty->driver_data;
-	int channel;
-	unsigned long flags;
-
-#ifdef CY_DEBUG_OTHER
-	printk(KERN_DEBUG "cyc:cy_stop ttyC%d\n", info->line);
-#endif
-
-	if (serial_paranoia_check(info, tty->name, "cy_stop"))
-		return;
-
-	cinfo = info->card;
-	channel = info->line - cinfo->first_line;
-	if (!cy_is_Z(cinfo)) {
-		spin_lock_irqsave(&cinfo->card_lock, flags);
-		cyy_writeb(info, CyCAR, channel & 0x03);
-		cyy_writeb(info, CySRER, cyy_readb(info, CySRER) & ~CyTxRdy);
-		spin_unlock_irqrestore(&cinfo->card_lock, flags);
-	}
-}				/* cy_stop */
-
-static void cy_start(struct tty_struct *tty)
-{
-	struct cyclades_card *cinfo;
-	struct cyclades_port *info = tty->driver_data;
-	int channel;
-	unsigned long flags;
-
-#ifdef CY_DEBUG_OTHER
-	printk(KERN_DEBUG "cyc:cy_start ttyC%d\n", info->line);
-#endif
-
-	if (serial_paranoia_check(info, tty->name, "cy_start"))
-		return;
-
-	cinfo = info->card;
-	channel = info->line - cinfo->first_line;
-	if (!cy_is_Z(cinfo)) {
-		spin_lock_irqsave(&cinfo->card_lock, flags);
-		cyy_writeb(info, CyCAR, channel & 0x03);
-		cyy_writeb(info, CySRER, cyy_readb(info, CySRER) | CyTxRdy);
-		spin_unlock_irqrestore(&cinfo->card_lock, flags);
-	}
-}				/* cy_start */
-
-/*
- * cy_hangup() --- called by tty_hangup() when a hangup is signaled.
- */
-static void cy_hangup(struct tty_struct *tty)
-{
-	struct cyclades_port *info = tty->driver_data;
-
-#ifdef CY_DEBUG_OTHER
-	printk(KERN_DEBUG "cyc:cy_hangup ttyC%d\n", info->line);
-#endif
-
-	if (serial_paranoia_check(info, tty->name, "cy_hangup"))
-		return;
-
-	cy_flush_buffer(tty);
-	cy_shutdown(info, tty);
-	tty_port_hangup(&info->port);
-}				/* cy_hangup */
-
-static int cyy_carrier_raised(struct tty_port *port)
-{
-	struct cyclades_port *info = container_of(port, struct cyclades_port,
-			port);
-	struct cyclades_card *cinfo = info->card;
-	unsigned long flags;
-	int channel = info->line - cinfo->first_line;
-	u32 cd;
-
-	spin_lock_irqsave(&cinfo->card_lock, flags);
-	cyy_writeb(info, CyCAR, channel & 0x03);
-	cd = cyy_readb(info, CyMSVR1) & CyDCD;
-	spin_unlock_irqrestore(&cinfo->card_lock, flags);
-
-	return cd;
-}
-
-static void cyy_dtr_rts(struct tty_port *port, int raise)
-{
-	struct cyclades_port *info = container_of(port, struct cyclades_port,
-			port);
-	struct cyclades_card *cinfo = info->card;
-	unsigned long flags;
-
-	spin_lock_irqsave(&cinfo->card_lock, flags);
-	cyy_change_rts_dtr(info, raise ? TIOCM_RTS | TIOCM_DTR : 0,
-			raise ? 0 : TIOCM_RTS | TIOCM_DTR);
-	spin_unlock_irqrestore(&cinfo->card_lock, flags);
-}
-
-static int cyz_carrier_raised(struct tty_port *port)
-{
-	struct cyclades_port *info = container_of(port, struct cyclades_port,
-			port);
-
-	return readl(&info->u.cyz.ch_ctrl->rs_status) & C_RS_DCD;
-}
-
-static void cyz_dtr_rts(struct tty_port *port, int raise)
-{
-	struct cyclades_port *info = container_of(port, struct cyclades_port,
-			port);
-	struct cyclades_card *cinfo = info->card;
-	struct CH_CTRL __iomem *ch_ctrl = info->u.cyz.ch_ctrl;
-	int ret, channel = info->line - cinfo->first_line;
-	u32 rs;
-
-	rs = readl(&ch_ctrl->rs_control);
-	if (raise)
-		rs |= C_RS_RTS | C_RS_DTR;
-	else
-		rs &= ~(C_RS_RTS | C_RS_DTR);
-	cy_writel(&ch_ctrl->rs_control, rs);
-	ret = cyz_issue_cmd(cinfo, channel, C_CM_IOCTLM, 0L);
-	if (ret != 0)
-		printk(KERN_ERR "%s: retval on ttyC%d was %x\n",
-				__func__, info->line, ret);
-#ifdef CY_DEBUG_DTR
-	printk(KERN_DEBUG "%s: raising Z DTR\n", __func__);
-#endif
-}
-
-static const struct tty_port_operations cyy_port_ops = {
-	.carrier_raised = cyy_carrier_raised,
-	.dtr_rts = cyy_dtr_rts,
-	.shutdown = cy_do_close,
-};
-
-static const struct tty_port_operations cyz_port_ops = {
-	.carrier_raised = cyz_carrier_raised,
-	.dtr_rts = cyz_dtr_rts,
-	.shutdown = cy_do_close,
-};
-
-/*
- * ---------------------------------------------------------------------
- * cy_init() and friends
- *
- * cy_init() is called at boot-time to initialize the serial driver.
- * ---------------------------------------------------------------------
- */
-
-static int cy_init_card(struct cyclades_card *cinfo)
-{
-	struct cyclades_port *info;
-	unsigned int channel, port;
-
-	spin_lock_init(&cinfo->card_lock);
-	cinfo->intr_enabled = 0;
-
-	cinfo->ports = kcalloc(cinfo->nports, sizeof(*cinfo->ports),
-			GFP_KERNEL);
-	if (cinfo->ports == NULL) {
-		printk(KERN_ERR "Cyclades: cannot allocate ports\n");
-		return -ENOMEM;
-	}
-
-	for (channel = 0, port = cinfo->first_line; channel < cinfo->nports;
-			channel++, port++) {
-		info = &cinfo->ports[channel];
-		tty_port_init(&info->port);
-		info->magic = CYCLADES_MAGIC;
-		info->card = cinfo;
-		info->line = port;
-
-		info->port.closing_wait = CLOSING_WAIT_DELAY;
-		info->port.close_delay = 5 * HZ / 10;
-		init_completion(&info->shutdown_wait);
-
-		if (cy_is_Z(cinfo)) {
-			struct FIRM_ID *firm_id = cinfo->base_addr + ID_ADDRESS;
-			struct ZFW_CTRL *zfw_ctrl;
-
-			info->port.ops = &cyz_port_ops;
-			info->type = PORT_STARTECH;
-
-			zfw_ctrl = cinfo->base_addr +
-				(readl(&firm_id->zfwctrl_addr) & 0xfffff);
-			info->u.cyz.ch_ctrl = &zfw_ctrl->ch_ctrl[channel];
-			info->u.cyz.buf_ctrl = &zfw_ctrl->buf_ctrl[channel];
-
-			if (cinfo->hw_ver == ZO_V1)
-				info->xmit_fifo_size = CYZ_FIFO_SIZE;
-			else
-				info->xmit_fifo_size = 4 * CYZ_FIFO_SIZE;
-#ifdef CONFIG_CYZ_INTR
-			timer_setup(&info->rx_full_timer, cyz_rx_restart, 0);
-#endif
-		} else {
-			unsigned short chip_number;
-			int index = cinfo->bus_index;
-
-			info->port.ops = &cyy_port_ops;
-			info->type = PORT_CIRRUS;
-			info->xmit_fifo_size = CyMAX_CHAR_FIFO;
-			info->cor1 = CyPARITY_NONE | Cy_1_STOP | Cy_8_BITS;
-			info->cor2 = CyETC;
-			info->cor3 = 0x08;	/* _very_ small rcv threshold */
-
-			chip_number = channel / CyPORTS_PER_CHIP;
-			info->u.cyy.base_addr = cinfo->base_addr +
-				(cy_chip_offset[chip_number] << index);
-			info->chip_rev = cyy_readb(info, CyGFRCR);
-
-			if (info->chip_rev >= CD1400_REV_J) {
-				/* It is a CD1400 rev. J or later */
-				info->tbpr = baud_bpr_60[13];	/* Tx BPR */
-				info->tco = baud_co_60[13];	/* Tx CO */
-				info->rbpr = baud_bpr_60[13];	/* Rx BPR */
-				info->rco = baud_co_60[13];	/* Rx CO */
-				info->rtsdtr_inv = 1;
-			} else {
-				info->tbpr = baud_bpr_25[13];	/* Tx BPR */
-				info->tco = baud_co_25[13];	/* Tx CO */
-				info->rbpr = baud_bpr_25[13];	/* Rx BPR */
-				info->rco = baud_co_25[13];	/* Rx CO */
-				info->rtsdtr_inv = 0;
-			}
-			info->read_status_mask = CyTIMEOUT | CySPECHAR |
-				CyBREAK | CyPARITY | CyFRAME | CyOVERRUN;
-		}
-
-	}
-
-#ifndef CONFIG_CYZ_INTR
-	if (cy_is_Z(cinfo) && !timer_pending(&cyz_timerlist)) {
-		mod_timer(&cyz_timerlist, jiffies + 1);
-#ifdef CY_PCI_DEBUG
-		printk(KERN_DEBUG "Cyclades-Z polling initialized\n");
-#endif
-	}
-#endif
-	return 0;
-}
-
-/* initialize chips on Cyclom-Y card -- return number of valid
-   chips (which is number of ports/4) */
-static unsigned short cyy_init_card(void __iomem *true_base_addr,
-		int index)
-{
-	unsigned int chip_number;
-	void __iomem *base_addr;
-
-	cy_writeb(true_base_addr + (Cy_HwReset << index), 0);
-	/* Cy_HwReset is 0x1400 */
-	cy_writeb(true_base_addr + (Cy_ClrIntr << index), 0);
-	/* Cy_ClrIntr is 0x1800 */
-	udelay(500L);
-
-	for (chip_number = 0; chip_number < CyMAX_CHIPS_PER_CARD;
-							chip_number++) {
-		base_addr =
-		    true_base_addr + (cy_chip_offset[chip_number] << index);
-		mdelay(1);
-		if (readb(base_addr + (CyCCR << index)) != 0x00) {
-			/*************
-			printk(" chip #%d at %#6lx is never idle (CCR != 0)\n",
-			chip_number, (unsigned long)base_addr);
-			*************/
-			return chip_number;
-		}
-
-		cy_writeb(base_addr + (CyGFRCR << index), 0);
-		udelay(10L);
-
-		/* The Cyclom-16Y does not decode address bit 9 and therefore
-		   cannot distinguish between references to chip 0 and a non-
-		   existent chip 4.  If the preceding clearing of the supposed
-		   chip 4 GFRCR register appears at chip 0, there is no chip 4
-		   and this must be a Cyclom-16Y, not a Cyclom-32Ye.
-		 */
-		if (chip_number == 4 && readb(true_base_addr +
-				(cy_chip_offset[0] << index) +
-				(CyGFRCR << index)) == 0) {
-			return chip_number;
-		}
-
-		cy_writeb(base_addr + (CyCCR << index), CyCHIP_RESET);
-		mdelay(1);
-
-		if (readb(base_addr + (CyGFRCR << index)) == 0x00) {
-			/*
-			   printk(" chip #%d at %#6lx is not responding ",
-			   chip_number, (unsigned long)base_addr);
-			   printk("(GFRCR stayed 0)\n",
-			 */
-			return chip_number;
-		}
-		if ((0xf0 & (readb(base_addr + (CyGFRCR << index)))) !=
-				0x40) {
-			/*
-			printk(" chip #%d at %#6lx is not valid (GFRCR == "
-					"%#2x)\n",
-					chip_number, (unsigned long)base_addr,
-					base_addr[CyGFRCR<<index]);
-			 */
-			return chip_number;
-		}
-		cy_writeb(base_addr + (CyGCR << index), CyCH0_SERIAL);
-		if (readb(base_addr + (CyGFRCR << index)) >= CD1400_REV_J) {
-			/* It is a CD1400 rev. J or later */
-			/* Impossible to reach 5ms with this chip.
-			   Changed to 2ms instead (f = 500 Hz). */
-			cy_writeb(base_addr + (CyPPR << index), CyCLOCK_60_2MS);
-		} else {
-			/* f = 200 Hz */
-			cy_writeb(base_addr + (CyPPR << index), CyCLOCK_25_5MS);
-		}
-
-		/*
-		   printk(" chip #%d at %#6lx is rev 0x%2x\n",
-		   chip_number, (unsigned long)base_addr,
-		   readb(base_addr+(CyGFRCR<<index)));
-		 */
-	}
-	return chip_number;
-}				/* cyy_init_card */
-
-/*
- * ---------------------------------------------------------------------
- * cy_detect_isa() - Probe for Cyclom-Y/ISA boards.
- * sets global variables and return the number of ISA boards found.
- * ---------------------------------------------------------------------
- */
-static int __init cy_detect_isa(void)
-{
-#ifdef CONFIG_ISA
-	struct cyclades_card *card;
-	unsigned short cy_isa_irq, nboard;
-	void __iomem *cy_isa_address;
-	unsigned short i, j, k, cy_isa_nchan;
-	int isparam = 0;
-
-	nboard = 0;
-
-	/* Check for module parameters */
-	for (i = 0; i < NR_CARDS; i++) {
-		if (maddr[i] || i) {
-			isparam = 1;
-			cy_isa_addresses[i] = maddr[i];
-		}
-		if (!maddr[i])
-			break;
-	}
-
-	/* scan the address table probing for Cyclom-Y/ISA boards */
-	for (i = 0; i < NR_ISA_ADDRS; i++) {
-		unsigned int isa_address = cy_isa_addresses[i];
-		if (isa_address == 0x0000)
-			return nboard;
-
-		/* probe for CD1400... */
-		cy_isa_address = ioremap(isa_address, CyISA_Ywin);
-		if (cy_isa_address == NULL) {
-			printk(KERN_ERR "Cyclom-Y/ISA: can't remap base "
-					"address\n");
-			continue;
-		}
-		cy_isa_nchan = CyPORTS_PER_CHIP *
-			cyy_init_card(cy_isa_address, 0);
-		if (cy_isa_nchan == 0) {
-			iounmap(cy_isa_address);
-			continue;
-		}
-
-		if (isparam && i < NR_CARDS && irq[i])
-			cy_isa_irq = irq[i];
-		else
-			/* find out the board's irq by probing */
-			cy_isa_irq = detect_isa_irq(cy_isa_address);
-		if (cy_isa_irq == 0) {
-			printk(KERN_ERR "Cyclom-Y/ISA found at 0x%lx, but the "
-				"IRQ could not be detected.\n",
-				(unsigned long)cy_isa_address);
-			iounmap(cy_isa_address);
-			continue;
-		}
-
-		if ((cy_next_channel + cy_isa_nchan) > NR_PORTS) {
-			printk(KERN_ERR "Cyclom-Y/ISA found at 0x%lx, but no "
-				"more channels are available. Change NR_PORTS "
-				"in cyclades.c and recompile kernel.\n",
-				(unsigned long)cy_isa_address);
-			iounmap(cy_isa_address);
-			return nboard;
-		}
-		/* fill the next cy_card structure available */
-		for (j = 0; j < NR_CARDS; j++) {
-			card = &cy_card[j];
-			if (card->base_addr == NULL)
-				break;
-		}
-		if (j == NR_CARDS) {	/* no more cy_cards available */
-			printk(KERN_ERR "Cyclom-Y/ISA found at 0x%lx, but no "
-				"more cards can be used. Change NR_CARDS in "
-				"cyclades.c and recompile kernel.\n",
-				(unsigned long)cy_isa_address);
-			iounmap(cy_isa_address);
-			return nboard;
-		}
-
-		/* allocate IRQ */
-		if (request_irq(cy_isa_irq, cyy_interrupt,
-				0, "Cyclom-Y", card)) {
-			printk(KERN_ERR "Cyclom-Y/ISA found at 0x%lx, but "
-				"could not allocate IRQ#%d.\n",
-				(unsigned long)cy_isa_address, cy_isa_irq);
-			iounmap(cy_isa_address);
-			return nboard;
-		}
-
-		/* set cy_card */
-		card->base_addr = cy_isa_address;
-		card->ctl_addr.p9050 = NULL;
-		card->irq = (int)cy_isa_irq;
-		card->bus_index = 0;
-		card->first_line = cy_next_channel;
-		card->num_chips = cy_isa_nchan / CyPORTS_PER_CHIP;
-		card->nports = cy_isa_nchan;
-		if (cy_init_card(card)) {
-			card->base_addr = NULL;
-			free_irq(cy_isa_irq, card);
-			iounmap(cy_isa_address);
-			continue;
-		}
-		nboard++;
-
-		printk(KERN_INFO "Cyclom-Y/ISA #%d: 0x%lx-0x%lx, IRQ%d found: "
-			"%d channels starting from port %d\n",
-			j + 1, (unsigned long)cy_isa_address,
-			(unsigned long)(cy_isa_address + (CyISA_Ywin - 1)),
-			cy_isa_irq, cy_isa_nchan, cy_next_channel);
-
-		for (k = 0, j = cy_next_channel;
-				j < cy_next_channel + cy_isa_nchan; j++, k++)
-			tty_port_register_device(&card->ports[k].port,
-					cy_serial_driver, j, NULL);
-		cy_next_channel += cy_isa_nchan;
-	}
-	return nboard;
-#else
-	return 0;
-#endif				/* CONFIG_ISA */
-}				/* cy_detect_isa */
-
-#ifdef CONFIG_PCI
-static inline int cyc_isfwstr(const char *str, unsigned int size)
-{
-	unsigned int a;
-
-	for (a = 0; a < size && *str; a++, str++)
-		if (*str & 0x80)
-			return -EINVAL;
-
-	for (; a < size; a++, str++)
-		if (*str)
-			return -EINVAL;
-
-	return 0;
-}
-
-static inline void cyz_fpga_copy(void __iomem *fpga, const u8 *data,
-		unsigned int size)
-{
-	for (; size > 0; size--) {
-		cy_writel(fpga, *data++);
-		udelay(10);
-	}
-}
-
-static void plx_init(struct pci_dev *pdev, int irq,
-		struct RUNTIME_9060 __iomem *addr)
-{
-	/* Reset PLX */
-	cy_writel(&addr->init_ctrl, readl(&addr->init_ctrl) | 0x40000000);
-	udelay(100L);
-	cy_writel(&addr->init_ctrl, readl(&addr->init_ctrl) & ~0x40000000);
-
-	/* Reload Config. Registers from EEPROM */
-	cy_writel(&addr->init_ctrl, readl(&addr->init_ctrl) | 0x20000000);
-	udelay(100L);
-	cy_writel(&addr->init_ctrl, readl(&addr->init_ctrl) & ~0x20000000);
-
-	/* For some yet unknown reason, once the PLX9060 reloads the EEPROM,
-	 * the IRQ is lost and, thus, we have to re-write it to the PCI config.
-	 * registers. This will remain here until we find a permanent fix.
-	 */
-	pci_write_config_byte(pdev, PCI_INTERRUPT_LINE, irq);
-}
-
-static int __cyz_load_fw(const struct firmware *fw,
-		const char *name, const u32 mailbox, void __iomem *base,
-		void __iomem *fpga)
-{
-	const void *ptr = fw->data;
-	const struct zfile_header *h = ptr;
-	const struct zfile_config *c, *cs;
-	const struct zfile_block *b, *bs;
-	unsigned int a, tmp, len = fw->size;
-#define BAD_FW KERN_ERR "Bad firmware: "
-	if (len < sizeof(*h)) {
-		printk(BAD_FW "too short: %u<%zu\n", len, sizeof(*h));
-		return -EINVAL;
-	}
-
-	cs = ptr + h->config_offset;
-	bs = ptr + h->block_offset;
-
-	if ((void *)(cs + h->n_config) > ptr + len ||
-			(void *)(bs + h->n_blocks) > ptr + len) {
-		printk(BAD_FW "too short");
-		return  -EINVAL;
-	}
-
-	if (cyc_isfwstr(h->name, sizeof(h->name)) ||
-			cyc_isfwstr(h->date, sizeof(h->date))) {
-		printk(BAD_FW "bad formatted header string\n");
-		return -EINVAL;
-	}
-
-	if (strncmp(name, h->name, sizeof(h->name))) {
-		printk(BAD_FW "bad name '%s' (expected '%s')\n", h->name, name);
-		return -EINVAL;
-	}
-
-	tmp = 0;
-	for (c = cs; c < cs + h->n_config; c++) {
-		for (a = 0; a < c->n_blocks; a++)
-			if (c->block_list[a] > h->n_blocks) {
-				printk(BAD_FW "bad block ref number in cfgs\n");
-				return -EINVAL;
-			}
-		if (c->mailbox == mailbox && c->function == 0) /* 0 is normal */
-			tmp++;
-	}
-	if (!tmp) {
-		printk(BAD_FW "nothing appropriate\n");
-		return -EINVAL;
-	}
-
-	for (b = bs; b < bs + h->n_blocks; b++)
-		if (b->file_offset + b->size > len) {
-			printk(BAD_FW "bad block data offset\n");
-			return -EINVAL;
-		}
-
-	/* everything is OK, let's seek'n'load it */
-	for (c = cs; c < cs + h->n_config; c++)
-		if (c->mailbox == mailbox && c->function == 0)
-			break;
-
-	for (a = 0; a < c->n_blocks; a++) {
-		b = &bs[c->block_list[a]];
-		if (b->type == ZBLOCK_FPGA) {
-			if (fpga != NULL)
-				cyz_fpga_copy(fpga, ptr + b->file_offset,
-						b->size);
-		} else {
-			if (base != NULL)
-				memcpy_toio(base + b->ram_offset,
-					       ptr + b->file_offset, b->size);
-		}
-	}
-#undef BAD_FW
-	return 0;
-}
-
-static int cyz_load_fw(struct pci_dev *pdev, void __iomem *base_addr,
-		struct RUNTIME_9060 __iomem *ctl_addr, int irq)
-{
-	const struct firmware *fw;
-	struct FIRM_ID __iomem *fid = base_addr + ID_ADDRESS;
-	struct CUSTOM_REG __iomem *cust = base_addr;
-	struct ZFW_CTRL __iomem *pt_zfwctrl;
-	void __iomem *tmp;
-	u32 mailbox, status, nchan;
-	unsigned int i;
-	int retval;
-
-	retval = request_firmware(&fw, "cyzfirm.bin", &pdev->dev);
-	if (retval) {
-		dev_err(&pdev->dev, "can't get firmware\n");
-		goto err;
-	}
-
-	/* Check whether the firmware is already loaded and running. If
-	   positive, skip this board */
-	if (__cyz_fpga_loaded(ctl_addr) && readl(&fid->signature) == ZFIRM_ID) {
-		u32 cntval = readl(base_addr + 0x190);
-
-		udelay(100);
-		if (cntval != readl(base_addr + 0x190)) {
-			/* FW counter is working, FW is running */
-			dev_dbg(&pdev->dev, "Cyclades-Z FW already loaded. "
-					"Skipping board.\n");
-			retval = 0;
-			goto err_rel;
-		}
-	}
-
-	/* start boot */
-	cy_writel(&ctl_addr->intr_ctrl_stat, readl(&ctl_addr->intr_ctrl_stat) &
-			~0x00030800UL);
-
-	mailbox = readl(&ctl_addr->mail_box_0);
-
-	if (mailbox == 0 || __cyz_fpga_loaded(ctl_addr)) {
-		/* stops CPU and set window to beginning of RAM */
-		cy_writel(&ctl_addr->loc_addr_base, WIN_CREG);
-		cy_writel(&cust->cpu_stop, 0);
-		cy_writel(&ctl_addr->loc_addr_base, WIN_RAM);
-		udelay(100);
-	}
-
-	plx_init(pdev, irq, ctl_addr);
-
-	if (mailbox != 0) {
-		/* load FPGA */
-		retval = __cyz_load_fw(fw, "Cyclom-Z", mailbox, NULL,
-				base_addr);
-		if (retval)
-			goto err_rel;
-		if (!__cyz_fpga_loaded(ctl_addr)) {
-			dev_err(&pdev->dev, "fw upload successful, but fw is "
-					"not loaded\n");
-			goto err_rel;
-		}
-	}
-
-	/* stops CPU and set window to beginning of RAM */
-	cy_writel(&ctl_addr->loc_addr_base, WIN_CREG);
-	cy_writel(&cust->cpu_stop, 0);
-	cy_writel(&ctl_addr->loc_addr_base, WIN_RAM);
-	udelay(100);
-
-	/* clear memory */
-	for (tmp = base_addr; tmp < base_addr + RAM_SIZE; tmp++)
-		cy_writeb(tmp, 255);
-	if (mailbox != 0) {
-		/* set window to last 512K of RAM */
-		cy_writel(&ctl_addr->loc_addr_base, WIN_RAM + RAM_SIZE);
-		for (tmp = base_addr; tmp < base_addr + RAM_SIZE; tmp++)
-			cy_writeb(tmp, 255);
-		/* set window to beginning of RAM */
-		cy_writel(&ctl_addr->loc_addr_base, WIN_RAM);
-	}
-
-	retval = __cyz_load_fw(fw, "Cyclom-Z", mailbox, base_addr, NULL);
-	release_firmware(fw);
-	if (retval)
-		goto err;
-
-	/* finish boot and start boards */
-	cy_writel(&ctl_addr->loc_addr_base, WIN_CREG);
-	cy_writel(&cust->cpu_start, 0);
-	cy_writel(&ctl_addr->loc_addr_base, WIN_RAM);
-	i = 0;
-	while ((status = readl(&fid->signature)) != ZFIRM_ID && i++ < 40)
-		msleep(100);
-	if (status != ZFIRM_ID) {
-		if (status == ZFIRM_HLT) {
-			dev_err(&pdev->dev, "you need an external power supply "
-				"for this number of ports. Firmware halted and "
-				"board reset.\n");
-			retval = -EIO;
-			goto err;
-		}
-		dev_warn(&pdev->dev, "fid->signature = 0x%x... Waiting "
-				"some more time\n", status);
-		while ((status = readl(&fid->signature)) != ZFIRM_ID &&
-				i++ < 200)
-			msleep(100);
-		if (status != ZFIRM_ID) {
-			dev_err(&pdev->dev, "Board not started in 20 seconds! "
-					"Giving up. (fid->signature = 0x%x)\n",
-					status);
-			dev_info(&pdev->dev, "*** Warning ***: if you are "
-				"upgrading the FW, please power cycle the "
-				"system before loading the new FW to the "
-				"Cyclades-Z.\n");
-
-			if (__cyz_fpga_loaded(ctl_addr))
-				plx_init(pdev, irq, ctl_addr);
-
-			retval = -EIO;
-			goto err;
-		}
-		dev_dbg(&pdev->dev, "Firmware started after %d seconds.\n",
-				i / 10);
-	}
-	pt_zfwctrl = base_addr + readl(&fid->zfwctrl_addr);
-
-	dev_dbg(&pdev->dev, "fid=> %p, zfwctrl_addr=> %x, npt_zfwctrl=> %p\n",
-			base_addr + ID_ADDRESS, readl(&fid->zfwctrl_addr),
-			base_addr + readl(&fid->zfwctrl_addr));
-
-	nchan = readl(&pt_zfwctrl->board_ctrl.n_channel);
-	dev_info(&pdev->dev, "Cyclades-Z FW loaded: version = %x, ports = %u\n",
-		readl(&pt_zfwctrl->board_ctrl.fw_version), nchan);
-
-	if (nchan == 0) {
-		dev_warn(&pdev->dev, "no Cyclades-Z ports were found. Please "
-			"check the connection between the Z host card and the "
-			"serial expanders.\n");
-
-		if (__cyz_fpga_loaded(ctl_addr))
-			plx_init(pdev, irq, ctl_addr);
-
-		dev_info(&pdev->dev, "Null number of ports detected. Board "
-				"reset.\n");
-		retval = 0;
-		goto err;
-	}
-
-	cy_writel(&pt_zfwctrl->board_ctrl.op_system, C_OS_LINUX);
-	cy_writel(&pt_zfwctrl->board_ctrl.dr_version, DRIVER_VERSION);
-
-	/*
-	   Early firmware failed to start looking for commands.
-	   This enables firmware interrupts for those commands.
-	 */
-	cy_writel(&ctl_addr->intr_ctrl_stat, readl(&ctl_addr->intr_ctrl_stat) |
-			(1 << 17));
-	cy_writel(&ctl_addr->intr_ctrl_stat, readl(&ctl_addr->intr_ctrl_stat) |
-			0x00030800UL);
-
-	return nchan;
-err_rel:
-	release_firmware(fw);
-err:
-	return retval;
-}
-
-static int cy_pci_probe(struct pci_dev *pdev,
-		const struct pci_device_id *ent)
-{
-	struct cyclades_card *card;
-	void __iomem *addr0 = NULL, *addr2 = NULL;
-	char *card_name = NULL;
-	u32 mailbox;
-	unsigned int device_id, nchan = 0, card_no, i, j;
-	unsigned char plx_ver;
-	int retval, irq;
-
-	retval = pci_enable_device(pdev);
-	if (retval) {
-		dev_err(&pdev->dev, "cannot enable device\n");
-		goto err;
-	}
-
-	/* read PCI configuration area */
-	irq = pdev->irq;
-	device_id = pdev->device & ~PCI_DEVICE_ID_MASK;
-
-#if defined(__alpha__)
-	if (device_id == PCI_DEVICE_ID_CYCLOM_Y_Lo) {	/* below 1M? */
-		dev_err(&pdev->dev, "Cyclom-Y/PCI not supported for low "
-			"addresses on Alpha systems.\n");
-		retval = -EIO;
-		goto err_dis;
-	}
-#endif
-	if (device_id == PCI_DEVICE_ID_CYCLOM_Z_Lo) {
-		dev_err(&pdev->dev, "Cyclades-Z/PCI not supported for low "
-			"addresses\n");
-		retval = -EIO;
-		goto err_dis;
-	}
-
-	if (pci_resource_flags(pdev, 2) & IORESOURCE_IO) {
-		dev_warn(&pdev->dev, "PCI I/O bit incorrectly set. Ignoring "
-				"it...\n");
-		pdev->resource[2].flags &= ~IORESOURCE_IO;
-	}
-
-	retval = pci_request_regions(pdev, "cyclades");
-	if (retval) {
-		dev_err(&pdev->dev, "failed to reserve resources\n");
-		goto err_dis;
-	}
-
-	retval = -EIO;
-	if (device_id == PCI_DEVICE_ID_CYCLOM_Y_Lo ||
-			device_id == PCI_DEVICE_ID_CYCLOM_Y_Hi) {
-		card_name = "Cyclom-Y";
-
-		addr0 = ioremap(pci_resource_start(pdev, 0),
-				CyPCI_Yctl);
-		if (addr0 == NULL) {
-			dev_err(&pdev->dev, "can't remap ctl region\n");
-			goto err_reg;
-		}
-		addr2 = ioremap(pci_resource_start(pdev, 2),
-				CyPCI_Ywin);
-		if (addr2 == NULL) {
-			dev_err(&pdev->dev, "can't remap base region\n");
-			goto err_unmap;
-		}
-
-		nchan = CyPORTS_PER_CHIP * cyy_init_card(addr2, 1);
-		if (nchan == 0) {
-			dev_err(&pdev->dev, "Cyclom-Y PCI host card with no "
-					"Serial-Modules\n");
-			goto err_unmap;
-		}
-	} else if (device_id == PCI_DEVICE_ID_CYCLOM_Z_Hi) {
-		struct RUNTIME_9060 __iomem *ctl_addr;
-
-		ctl_addr = addr0 = ioremap(pci_resource_start(pdev, 0),
-				CyPCI_Zctl);
-		if (addr0 == NULL) {
-			dev_err(&pdev->dev, "can't remap ctl region\n");
-			goto err_reg;
-		}
-
-		/* Disable interrupts on the PLX before resetting it */
-		cy_writew(&ctl_addr->intr_ctrl_stat,
-				readw(&ctl_addr->intr_ctrl_stat) & ~0x0900);
-
-		plx_init(pdev, irq, addr0);
-
-		mailbox = readl(&ctl_addr->mail_box_0);
-
-		addr2 = ioremap(pci_resource_start(pdev, 2),
-				mailbox == ZE_V1 ? CyPCI_Ze_win : CyPCI_Zwin);
-		if (addr2 == NULL) {
-			dev_err(&pdev->dev, "can't remap base region\n");
-			goto err_unmap;
-		}
-
-		if (mailbox == ZE_V1) {
-			card_name = "Cyclades-Ze";
-		} else {
-			card_name = "Cyclades-8Zo";
-#ifdef CY_PCI_DEBUG
-			if (mailbox == ZO_V1) {
-				cy_writel(&ctl_addr->loc_addr_base, WIN_CREG);
-				dev_info(&pdev->dev, "Cyclades-8Zo/PCI: FPGA "
-					"id %lx, ver %lx\n", (ulong)(0xff &
-					readl(&((struct CUSTOM_REG *)addr2)->
-						fpga_id)), (ulong)(0xff &
-					readl(&((struct CUSTOM_REG *)addr2)->
-						fpga_version)));
-				cy_writel(&ctl_addr->loc_addr_base, WIN_RAM);
-			} else {
-				dev_info(&pdev->dev, "Cyclades-Z/PCI: New "
-					"Cyclades-Z board.  FPGA not loaded\n");
-			}
-#endif
-			/* The following clears the firmware id word.  This
-			   ensures that the driver will not attempt to talk to
-			   the board until it has been properly initialized.
-			 */
-			if ((mailbox == ZO_V1) || (mailbox == ZO_V2))
-				cy_writel(addr2 + ID_ADDRESS, 0L);
-		}
-
-		retval = cyz_load_fw(pdev, addr2, addr0, irq);
-		if (retval <= 0)
-			goto err_unmap;
-		nchan = retval;
-	}
-
-	if ((cy_next_channel + nchan) > NR_PORTS) {
-		dev_err(&pdev->dev, "Cyclades-8Zo/PCI found, but no "
-			"channels are available. Change NR_PORTS in "
-			"cyclades.c and recompile kernel.\n");
-		goto err_unmap;
-	}
-	/* fill the next cy_card structure available */
-	for (card_no = 0; card_no < NR_CARDS; card_no++) {
-		card = &cy_card[card_no];
-		if (card->base_addr == NULL)
-			break;
-	}
-	if (card_no == NR_CARDS) {	/* no more cy_cards available */
-		dev_err(&pdev->dev, "Cyclades-8Zo/PCI found, but no "
-			"more cards can be used. Change NR_CARDS in "
-			"cyclades.c and recompile kernel.\n");
-		goto err_unmap;
-	}
-
-	if (device_id == PCI_DEVICE_ID_CYCLOM_Y_Lo ||
-			device_id == PCI_DEVICE_ID_CYCLOM_Y_Hi) {
-		/* allocate IRQ */
-		retval = request_irq(irq, cyy_interrupt,
-				IRQF_SHARED, "Cyclom-Y", card);
-		if (retval) {
-			dev_err(&pdev->dev, "could not allocate IRQ\n");
-			goto err_unmap;
-		}
-		card->num_chips = nchan / CyPORTS_PER_CHIP;
-	} else {
-		struct FIRM_ID __iomem *firm_id = addr2 + ID_ADDRESS;
-		struct ZFW_CTRL __iomem *zfw_ctrl;
-
-		zfw_ctrl = addr2 + (readl(&firm_id->zfwctrl_addr) & 0xfffff);
-
-		card->hw_ver = mailbox;
-		card->num_chips = (unsigned int)-1;
-		card->board_ctrl = &zfw_ctrl->board_ctrl;
-#ifdef CONFIG_CYZ_INTR
-		/* allocate IRQ only if board has an IRQ */
-		if (irq != 0 && irq != 255) {
-			retval = request_irq(irq, cyz_interrupt,
-					IRQF_SHARED, "Cyclades-Z", card);
-			if (retval) {
-				dev_err(&pdev->dev, "could not allocate IRQ\n");
-				goto err_unmap;
-			}
-		}
-#endif				/* CONFIG_CYZ_INTR */
-	}
-
-	/* set cy_card */
-	card->base_addr = addr2;
-	card->ctl_addr.p9050 = addr0;
-	card->irq = irq;
-	card->bus_index = 1;
-	card->first_line = cy_next_channel;
-	card->nports = nchan;
-	retval = cy_init_card(card);
-	if (retval)
-		goto err_null;
-
-	pci_set_drvdata(pdev, card);
-
-	if (device_id == PCI_DEVICE_ID_CYCLOM_Y_Lo ||
-			device_id == PCI_DEVICE_ID_CYCLOM_Y_Hi) {
-		/* enable interrupts in the PCI interface */
-		plx_ver = readb(addr2 + CyPLX_VER) & 0x0f;
-		switch (plx_ver) {
-		case PLX_9050:
-			cy_writeb(addr0 + 0x4c, 0x43);
-			break;
-
-		case PLX_9060:
-		case PLX_9080:
-		default:	/* Old boards, use PLX_9060 */
-		{
-			struct RUNTIME_9060 __iomem *ctl_addr = addr0;
-			plx_init(pdev, irq, ctl_addr);
-			cy_writew(&ctl_addr->intr_ctrl_stat,
-				readw(&ctl_addr->intr_ctrl_stat) | 0x0900);
-			break;
-		}
-		}
-	}
-
-	dev_info(&pdev->dev, "%s/PCI #%d found: %d channels starting from "
-		"port %d.\n", card_name, card_no + 1, nchan, cy_next_channel);
-	for (j = 0, i = cy_next_channel; i < cy_next_channel + nchan; i++, j++)
-		tty_port_register_device(&card->ports[j].port,
-				cy_serial_driver, i, &pdev->dev);
-	cy_next_channel += nchan;
-
-	return 0;
-err_null:
-	card->base_addr = NULL;
-	free_irq(irq, card);
-err_unmap:
-	iounmap(addr0);
-	if (addr2)
-		iounmap(addr2);
-err_reg:
-	pci_release_regions(pdev);
-err_dis:
-	pci_disable_device(pdev);
-err:
-	return retval;
-}
-
-static void cy_pci_remove(struct pci_dev *pdev)
-{
-	struct cyclades_card *cinfo = pci_get_drvdata(pdev);
-	unsigned int i, channel;
-
-	/* non-Z with old PLX */
-	if (!cy_is_Z(cinfo) && (readb(cinfo->base_addr + CyPLX_VER) & 0x0f) ==
-			PLX_9050)
-		cy_writeb(cinfo->ctl_addr.p9050 + 0x4c, 0);
-	else
-#ifndef CONFIG_CYZ_INTR
-		if (!cy_is_Z(cinfo))
-#endif
-		cy_writew(&cinfo->ctl_addr.p9060->intr_ctrl_stat,
-			readw(&cinfo->ctl_addr.p9060->intr_ctrl_stat) &
-			~0x0900);
-
-	iounmap(cinfo->base_addr);
-	if (cinfo->ctl_addr.p9050)
-		iounmap(cinfo->ctl_addr.p9050);
-	if (cinfo->irq
-#ifndef CONFIG_CYZ_INTR
-		&& !cy_is_Z(cinfo)
-#endif /* CONFIG_CYZ_INTR */
-		)
-		free_irq(cinfo->irq, cinfo);
-	pci_release_regions(pdev);
-
-	cinfo->base_addr = NULL;
-	for (channel = 0, i = cinfo->first_line; i < cinfo->first_line +
-			cinfo->nports; i++, channel++) {
-		tty_unregister_device(cy_serial_driver, i);
-		tty_port_destroy(&cinfo->ports[channel].port);
-	}
-	cinfo->nports = 0;
-	kfree(cinfo->ports);
-}
-
-static struct pci_driver cy_pci_driver = {
-	.name = "cyclades",
-	.id_table = cy_pci_dev_id,
-	.probe = cy_pci_probe,
-	.remove = cy_pci_remove
-};
-#endif
-
-static int cyclades_proc_show(struct seq_file *m, void *v)
-{
-	struct cyclades_port *info;
-	unsigned int i, j;
-	__u32 cur_jifs = jiffies;
-
-	seq_puts(m, "Dev TimeOpen   BytesOut  IdleOut    BytesIn   "
-			"IdleIn  Overruns  Ldisc\n");
-
-	/* Output one line for each known port */
-	for (i = 0; i < NR_CARDS; i++)
-		for (j = 0; j < cy_card[i].nports; j++) {
-			info = &cy_card[i].ports[j];
-
-			if (info->port.count) {
-				/* XXX is the ldisc num worth this? */
-				struct tty_struct *tty;
-				struct tty_ldisc *ld;
-				int num = 0;
-				tty = tty_port_tty_get(&info->port);
-				if (tty) {
-					ld = tty_ldisc_ref(tty);
-					if (ld) {
-						num = ld->ops->num;
-						tty_ldisc_deref(ld);
-					}
-					tty_kref_put(tty);
-				}
-				seq_printf(m, "%3d %8lu %10lu %8lu "
-					"%10lu %8lu %9lu %6d\n", info->line,
-					(cur_jifs - info->idle_stats.in_use) /
-					HZ, info->idle_stats.xmit_bytes,
-					(cur_jifs - info->idle_stats.xmit_idle)/
-					HZ, info->idle_stats.recv_bytes,
-					(cur_jifs - info->idle_stats.recv_idle)/
-					HZ, info->idle_stats.overruns,
-					num);
-			} else
-				seq_printf(m, "%3d %8lu %10lu %8lu "
-					"%10lu %8lu %9lu %6ld\n",
-					info->line, 0L, 0L, 0L, 0L, 0L, 0L, 0L);
-		}
-	return 0;
-}
-
-/* The serial driver boot-time initialization code!
-    Hardware I/O ports are mapped to character special devices on a
-    first found, first allocated manner.  That is, this code searches
-    for Cyclom cards in the system.  As each is found, it is probed
-    to discover how many chips (and thus how many ports) are present.
-    These ports are mapped to the tty ports 32 and upward in monotonic
-    fashion.  If an 8-port card is replaced with a 16-port card, the
-    port mapping on a following card will shift.
-
-    This approach is different from what is used in the other serial
-    device driver because the Cyclom is more properly a multiplexer,
-    not just an aggregation of serial ports on one card.
-
-    If there are more cards with more ports than have been
-    statically allocated above, a warning is printed and the
-    extra ports are ignored.
- */
-
-static const struct tty_operations cy_ops = {
-	.open = cy_open,
-	.close = cy_close,
-	.write = cy_write,
-	.put_char = cy_put_char,
-	.flush_chars = cy_flush_chars,
-	.write_room = cy_write_room,
-	.chars_in_buffer = cy_chars_in_buffer,
-	.flush_buffer = cy_flush_buffer,
-	.ioctl = cy_ioctl,
-	.throttle = cy_throttle,
-	.unthrottle = cy_unthrottle,
-	.set_termios = cy_set_termios,
-	.stop = cy_stop,
-	.start = cy_start,
-	.hangup = cy_hangup,
-	.break_ctl = cy_break,
-	.wait_until_sent = cy_wait_until_sent,
-	.tiocmget = cy_tiocmget,
-	.tiocmset = cy_tiocmset,
-	.get_icount = cy_get_icount,
-	.set_serial = cy_set_serial_info,
-	.get_serial = cy_get_serial_info,
-	.proc_show = cyclades_proc_show,
-};
-
-static int __init cy_init(void)
-{
-	unsigned int nboards;
-	int retval = -ENOMEM;
-
-	cy_serial_driver = alloc_tty_driver(NR_PORTS);
-	if (!cy_serial_driver)
-		goto err;
-
-	printk(KERN_INFO "Cyclades driver " CY_VERSION "\n");
-
-	/* Initialize the tty_driver structure */
-
-	cy_serial_driver->driver_name = "cyclades";
-	cy_serial_driver->name = "ttyC";
-	cy_serial_driver->major = CYCLADES_MAJOR;
-	cy_serial_driver->minor_start = 0;
-	cy_serial_driver->type = TTY_DRIVER_TYPE_SERIAL;
-	cy_serial_driver->subtype = SERIAL_TYPE_NORMAL;
-	cy_serial_driver->init_termios = tty_std_termios;
-	cy_serial_driver->init_termios.c_cflag =
-	    B9600 | CS8 | CREAD | HUPCL | CLOCAL;
-	cy_serial_driver->flags = TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV;
-	tty_set_operations(cy_serial_driver, &cy_ops);
-
-	retval = tty_register_driver(cy_serial_driver);
-	if (retval) {
-		printk(KERN_ERR "Couldn't register Cyclades serial driver\n");
-		goto err_frtty;
-	}
-
-	/* the code below is responsible to find the boards. Each different
-	   type of board has its own detection routine. If a board is found,
-	   the next cy_card structure available is set by the detection
-	   routine. These functions are responsible for checking the
-	   availability of cy_card and cy_port data structures and updating
-	   the cy_next_channel. */
-
-	/* look for isa boards */
-	nboards = cy_detect_isa();
-
-#ifdef CONFIG_PCI
-	/* look for pci boards */
-	retval = pci_register_driver(&cy_pci_driver);
-	if (retval && !nboards) {
-		tty_unregister_driver(cy_serial_driver);
-		goto err_frtty;
-	}
-#endif
-
-	return 0;
-err_frtty:
-	put_tty_driver(cy_serial_driver);
-err:
-	return retval;
-}				/* cy_init */
-
-static void __exit cy_cleanup_module(void)
-{
-	struct cyclades_card *card;
-	unsigned int i, e1;
-
-#ifndef CONFIG_CYZ_INTR
-	del_timer_sync(&cyz_timerlist);
-#endif /* CONFIG_CYZ_INTR */
-
-	e1 = tty_unregister_driver(cy_serial_driver);
-	if (e1)
-		printk(KERN_ERR "failed to unregister Cyclades serial "
-				"driver(%d)\n", e1);
-
-#ifdef CONFIG_PCI
-	pci_unregister_driver(&cy_pci_driver);
-#endif
-
-	for (i = 0; i < NR_CARDS; i++) {
-		card = &cy_card[i];
-		if (card->base_addr) {
-			/* clear interrupt */
-			cy_writeb(card->base_addr + Cy_ClrIntr, 0);
-			iounmap(card->base_addr);
-			if (card->ctl_addr.p9050)
-				iounmap(card->ctl_addr.p9050);
-			if (card->irq
-#ifndef CONFIG_CYZ_INTR
-				&& !cy_is_Z(card)
-#endif /* CONFIG_CYZ_INTR */
-				)
-				free_irq(card->irq, card);
-			for (e1 = card->first_line; e1 < card->first_line +
-					card->nports; e1++)
-				tty_unregister_device(cy_serial_driver, e1);
-			kfree(card->ports);
-		}
-	}
-
-	put_tty_driver(cy_serial_driver);
-} /* cy_cleanup_module */
-
-module_init(cy_init);
-module_exit(cy_cleanup_module);
-
-MODULE_LICENSE("GPL");
-MODULE_VERSION(CY_VERSION);
-MODULE_ALIAS_CHARDEV_MAJOR(CYCLADES_MAJOR);
-MODULE_FIRMWARE("cyzfirm.bin");
diff --git a/drivers/tty/serial/8250/Kconfig b/drivers/tty/serial/8250/Kconfig
index 603137da4736..7ec05fdb1fc3 100644
--- a/drivers/tty/serial/8250/Kconfig
+++ b/drivers/tty/serial/8250/Kconfig
@@ -15,8 +15,7 @@ config SERIAL_8250
 	  here are those that are setting up dedicated Ethernet WWW/FTP
 	  servers, or users that have one of the various bus mice instead of a
 	  serial mouse and don't intend to use their machine's standard serial
-	  port for anything.  (Note that the Cyclades multi serial port driver
-	  does not need this driver built in for it to work.)
+	  port for anything.
 
 	  To compile this driver as a module, choose M here: the
 	  module will be called 8250.
@@ -226,7 +225,7 @@ config SERIAL_8250_MANY_PORTS
 	  serial port hardware which acts similar to standard serial port
 	  hardware. If you only use the standard COM 1/2/3/4 ports, you can
 	  say N here to save some memory. You can also say Y if you have an
-	  "intelligent" multiport card such as Cyclades, Digiboards, etc.
+	  "intelligent" multiport card such as Digiboards, etc.
 
 #
 # Multi-port serial cards
diff --git a/include/linux/cyclades.h b/include/linux/cyclades.h
deleted file mode 100644
index 05ee0f19448a..000000000000
--- a/include/linux/cyclades.h
+++ /dev/null
@@ -1,364 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* $Revision: 3.0 $$Date: 1998/11/02 14:20:59 $
- * linux/include/linux/cyclades.h
- *
- * This file was initially written by
- * Randolph Bentson <bentson@grieg.seaslug.org> and is maintained by
- * Ivan Passos <ivan@cyclades.com>.
- *
- * This file contains the general definitions for the cyclades.c driver
- *$Log: cyclades.h,v $
- *Revision 3.1  2002/01/29 11:36:16  henrique
- *added throttle field on struct cyclades_port to indicate whether the
- *port is throttled or not
- *
- *Revision 3.1  2000/04/19 18:52:52  ivan
- *converted address fields to unsigned long and added fields for physical
- *addresses on cyclades_card structure;
- *
- *Revision 3.0  1998/11/02 14:20:59  ivan
- *added nports field on cyclades_card structure;
- *
- *Revision 2.5  1998/08/03 16:57:01  ivan
- *added cyclades_idle_stats structure;
- * 
- *Revision 2.4  1998/06/01 12:09:53  ivan
- *removed closing_wait2 from cyclades_port structure;
- *
- *Revision 2.3  1998/03/16 18:01:12  ivan
- *changes in the cyclades_port structure to get it closer to the 
- *standard serial port structure;
- *added constants for new ioctls;
- *
- *Revision 2.2  1998/02/17 16:50:00  ivan
- *changes in the cyclades_port structure (addition of shutdown_wait and 
- *chip_rev variables);
- *added constants for new ioctls and for CD1400 rev. numbers.
- *
- *Revision 2.1	1997/10/24 16:03:00  ivan
- *added rflow (which allows enabling the CD1400 special flow control 
- *feature) and rtsdtr_inv (which allows DTR/RTS pin inversion) to 
- *cyclades_port structure;
- *added Alpha support
- *
- *Revision 2.0  1997/06/30 10:30:00  ivan
- *added some new doorbell command constants related to IOCTLW and
- *UART error signaling
- *
- *Revision 1.8  1997/06/03 15:30:00  ivan
- *added constant ZFIRM_HLT
- *added constant CyPCI_Ze_win ( = 2 * Cy_PCI_Zwin)
- *
- *Revision 1.7  1997/03/26 10:30:00  daniel
- *new entries at the end of cyclades_port struct to reallocate
- *variables illegally allocated within card memory.
- *
- *Revision 1.6  1996/09/09 18:35:30  bentson
- *fold in changes for Cyclom-Z -- including structures for
- *communicating with board as well modest changes to original
- *structures to support new features.
- *
- *Revision 1.5  1995/11/13 21:13:31  bentson
- *changes suggested by Michael Chastain <mec@duracef.shout.net>
- *to support use of this file in non-kernel applications
- *
- *
- */
-#ifndef _LINUX_CYCLADES_H
-#define _LINUX_CYCLADES_H
-
-#include <uapi/linux/cyclades.h>
-
-
-/* Per card data structure */
-struct cyclades_card {
-	void __iomem *base_addr;
-	union {
-		void __iomem *p9050;
-		struct RUNTIME_9060 __iomem *p9060;
-	} ctl_addr;
-	struct BOARD_CTRL __iomem *board_ctrl;	/* cyz specific */
-	int irq;
-	unsigned int num_chips;	/* 0 if card absent, -1 if Z/PCI, else Y */
-	unsigned int first_line;	/* minor number of first channel on card */
-	unsigned int nports;	/* Number of ports in the card */
-	int bus_index;		/* address shift - 0 for ISA, 1 for PCI */
-	int intr_enabled;		/* FW Interrupt flag - 0 disabled, 1 enabled */
-	u32 hw_ver;
-	spinlock_t card_lock;
-	struct cyclades_port *ports;
-};
-
-/***************************************
- * Memory access functions/macros      *
- * (required to support Alpha systems) *
- ***************************************/
-
-#define cy_writeb(port,val)     do { writeb((val), (port)); mb(); } while (0)
-#define cy_writew(port,val)     do { writew((val), (port)); mb(); } while (0)
-#define cy_writel(port,val)     do { writel((val), (port)); mb(); } while (0)
-
-/*
- * Statistics counters
- */
-struct cyclades_icount {
-	__u32	cts, dsr, rng, dcd, tx, rx;
-	__u32	frame, parity, overrun, brk;
-	__u32	buf_overrun;
-};
-
-/*
- * This is our internal structure for each serial port's state.
- * 
- * Many fields are paralleled by the structure used by the serial_struct
- * structure.
- *
- * For definitions of the flags field, see tty.h
- */
-
-struct cyclades_port {
-	int                     magic;
-	struct tty_port		port;
-	struct cyclades_card	*card;
-	union {
-		struct {
-			void __iomem *base_addr;
-		} cyy;
-		struct {
-			struct CH_CTRL __iomem	*ch_ctrl;
-			struct BUF_CTRL __iomem	*buf_ctrl;
-		} cyz;
-	} u;
-	int			line;
-	int			flags; 		/* defined in tty.h */
-	int                     type;		/* UART type */
-	int			read_status_mask;
-	int			ignore_status_mask;
-	int			timeout;
-	int			xmit_fifo_size;
-	int                     cor1,cor2,cor3,cor4,cor5;
-	int                     tbpr,tco,rbpr,rco;
-	int			baud;
-	int			rflow;
-	int			rtsdtr_inv;
-	int			chip_rev;
-	int			custom_divisor;
-	u8			x_char; /* to be pushed out ASAP */
-	int                     breakon;
-	int                     breakoff;
-	int			xmit_head;
-	int			xmit_tail;
-	int			xmit_cnt;
-        int                     default_threshold;
-        int                     default_timeout;
-	unsigned long		rflush_count;
-	struct cyclades_monitor	mon;
-	struct cyclades_idle_stats	idle_stats;
-	struct cyclades_icount	icount;
-	struct completion       shutdown_wait;
-	int throttle;
-#ifdef CONFIG_CYZ_INTR
-	struct timer_list	rx_full_timer;
-#endif
-};
-
-#define	CLOSING_WAIT_DELAY	30*HZ
-#define CY_CLOSING_WAIT_NONE	ASYNC_CLOSING_WAIT_NONE
-#define CY_CLOSING_WAIT_INF	ASYNC_CLOSING_WAIT_INF
-
-
-#define CyMAX_CHIPS_PER_CARD	8
-#define CyMAX_CHAR_FIFO		12
-#define CyPORTS_PER_CHIP	4
-#define	CD1400_MAX_SPEED	115200
-
-#define	CyISA_Ywin	0x2000
-
-#define CyPCI_Ywin 	0x4000
-#define CyPCI_Yctl 	0x80
-#define CyPCI_Zctl 	CTRL_WINDOW_SIZE
-#define CyPCI_Zwin 	0x80000
-#define CyPCI_Ze_win 	(2 * CyPCI_Zwin)
-
-#define PCI_DEVICE_ID_MASK	0x06
-
-/**** CD1400 registers ****/
-
-#define CD1400_REV_G	0x46
-#define CD1400_REV_J	0x48
-
-#define CyRegSize  	0x0400
-#define Cy_HwReset 	0x1400
-#define Cy_ClrIntr 	0x1800
-#define Cy_EpldRev 	0x1e00
-
-/* Global Registers */
-
-#define CyGFRCR		(0x40*2)
-#define      CyRevE		(44)
-#define CyCAR		(0x68*2)
-#define      CyCHAN_0		(0x00)
-#define      CyCHAN_1		(0x01)
-#define      CyCHAN_2		(0x02)
-#define      CyCHAN_3		(0x03)
-#define CyGCR		(0x4B*2)
-#define      CyCH0_SERIAL	(0x00)
-#define      CyCH0_PARALLEL	(0x80)
-#define CySVRR		(0x67*2)
-#define      CySRModem		(0x04)
-#define      CySRTransmit	(0x02)
-#define      CySRReceive	(0x01)
-#define CyRICR		(0x44*2)
-#define CyTICR		(0x45*2)
-#define CyMICR		(0x46*2)
-#define      CyICR0		(0x00)
-#define      CyICR1		(0x01)
-#define      CyICR2		(0x02)
-#define      CyICR3		(0x03)
-#define CyRIR		(0x6B*2)
-#define CyTIR		(0x6A*2)
-#define CyMIR		(0x69*2)
-#define      CyIRDirEq		(0x80)
-#define      CyIRBusy		(0x40)
-#define      CyIRUnfair		(0x20)
-#define      CyIRContext	(0x1C)
-#define      CyIRChannel	(0x03)
-#define CyPPR 		(0x7E*2)
-#define      CyCLOCK_20_1MS	(0x27)
-#define      CyCLOCK_25_1MS	(0x31)
-#define      CyCLOCK_25_5MS	(0xf4)
-#define      CyCLOCK_60_1MS	(0x75)
-#define      CyCLOCK_60_2MS	(0xea)
-
-/* Virtual Registers */
-
-#define CyRIVR		(0x43*2)
-#define CyTIVR		(0x42*2)
-#define CyMIVR		(0x41*2)
-#define      CyIVRMask (0x07)
-#define      CyIVRRxEx (0x07)
-#define      CyIVRRxOK (0x03)
-#define      CyIVRTxOK (0x02)
-#define      CyIVRMdmOK (0x01)
-#define CyTDR		(0x63*2)
-#define CyRDSR		(0x62*2)
-#define      CyTIMEOUT		(0x80)
-#define      CySPECHAR		(0x70)
-#define      CyBREAK		(0x08)
-#define      CyPARITY		(0x04)
-#define      CyFRAME		(0x02)
-#define      CyOVERRUN		(0x01)
-#define CyMISR		(0x4C*2)
-/* see CyMCOR_ and CyMSVR_ for bits*/
-#define CyEOSRR		(0x60*2)
-
-/* Channel Registers */
-
-#define CyLIVR		(0x18*2)
-#define      CyMscsr		(0x01)
-#define      CyTdsr		(0x02)
-#define      CyRgdsr		(0x03)
-#define      CyRedsr		(0x07)
-#define CyCCR		(0x05*2)
-/* Format 1 */
-#define      CyCHAN_RESET	(0x80)
-#define      CyCHIP_RESET	(0x81)
-#define      CyFlushTransFIFO	(0x82)
-/* Format 2 */
-#define      CyCOR_CHANGE	(0x40)
-#define      CyCOR1ch		(0x02)
-#define      CyCOR2ch		(0x04)
-#define      CyCOR3ch		(0x08)
-/* Format 3 */
-#define      CySEND_SPEC_1	(0x21)
-#define      CySEND_SPEC_2	(0x22)
-#define      CySEND_SPEC_3	(0x23)
-#define      CySEND_SPEC_4	(0x24)
-/* Format 4 */
-#define      CyCHAN_CTL		(0x10)
-#define      CyDIS_RCVR		(0x01)
-#define      CyENB_RCVR		(0x02)
-#define      CyDIS_XMTR		(0x04)
-#define      CyENB_XMTR		(0x08)
-#define CySRER		(0x06*2)
-#define      CyMdmCh		(0x80)
-#define      CyRxData		(0x10)
-#define      CyTxRdy		(0x04)
-#define      CyTxMpty		(0x02)
-#define      CyNNDT		(0x01)
-#define CyCOR1		(0x08*2)
-#define      CyPARITY_NONE	(0x00)
-#define      CyPARITY_0		(0x20)
-#define      CyPARITY_1		(0xA0)
-#define      CyPARITY_E		(0x40)
-#define      CyPARITY_O		(0xC0)
-#define      Cy_1_STOP		(0x00)
-#define      Cy_1_5_STOP	(0x04)
-#define      Cy_2_STOP		(0x08)
-#define      Cy_5_BITS		(0x00)
-#define      Cy_6_BITS		(0x01)
-#define      Cy_7_BITS		(0x02)
-#define      Cy_8_BITS		(0x03)
-#define CyCOR2		(0x09*2)
-#define      CyIXM		(0x80)
-#define      CyTxIBE		(0x40)
-#define      CyETC		(0x20)
-#define      CyAUTO_TXFL	(0x60)
-#define      CyLLM		(0x10)
-#define      CyRLM		(0x08)
-#define      CyRtsAO		(0x04)
-#define      CyCtsAE		(0x02)
-#define      CyDsrAE		(0x01)
-#define CyCOR3		(0x0A*2)
-#define      CySPL_CH_DRANGE	(0x80)  /* special character detect range */
-#define      CySPL_CH_DET1	(0x40)  /* enable special character detection
-                                                               on SCHR4-SCHR3 */
-#define      CyFL_CTRL_TRNSP	(0x20)  /* Flow Control Transparency */
-#define      CySPL_CH_DET2	(0x10)  /* Enable special character detection
-                                                               on SCHR2-SCHR1 */
-#define      CyREC_FIFO		(0x0F)  /* Receive FIFO threshold */
-#define CyCOR4		(0x1E*2)
-#define CyCOR5		(0x1F*2)
-#define CyCCSR		(0x0B*2)
-#define      CyRxEN		(0x80)
-#define      CyRxFloff		(0x40)
-#define      CyRxFlon		(0x20)
-#define      CyTxEN		(0x08)
-#define      CyTxFloff		(0x04)
-#define      CyTxFlon		(0x02)
-#define CyRDCR		(0x0E*2)
-#define CySCHR1		(0x1A*2)
-#define CySCHR2 	(0x1B*2)
-#define CySCHR3		(0x1C*2)
-#define CySCHR4		(0x1D*2)
-#define CySCRL		(0x22*2)
-#define CySCRH		(0x23*2)
-#define CyLNC		(0x24*2)
-#define CyMCOR1 	(0x15*2)
-#define CyMCOR2		(0x16*2)
-#define CyRTPR		(0x21*2)
-#define CyMSVR1		(0x6C*2)
-#define CyMSVR2		(0x6D*2)
-#define      CyANY_DELTA	(0xF0)
-#define      CyDSR		(0x80)
-#define      CyCTS		(0x40)
-#define      CyRI		(0x20)
-#define      CyDCD		(0x10)
-#define      CyDTR              (0x02)
-#define      CyRTS              (0x01)
-#define CyPVSR		(0x6F*2)
-#define CyRBPR		(0x78*2)
-#define CyRCOR		(0x7C*2)
-#define CyTBPR		(0x72*2)
-#define CyTCOR		(0x76*2)
-
-/* Custom Registers */
-
-#define	CyPLX_VER	(0x3400)
-#define	PLX_9050	0x0b
-#define	PLX_9060	0x0c
-#define	PLX_9080	0x0d
-
-/***************************************************************************/
-
-#endif /* _LINUX_CYCLADES_H */
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 8a18517696c1..056d2074f07a 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1711,14 +1711,6 @@
 #define PCI_DEVICE_ID_CRP16INTF		0x0903
 
 #define PCI_VENDOR_ID_CYCLADES		0x120e
-#define PCI_DEVICE_ID_CYCLOM_Y_Lo	0x0100
-#define PCI_DEVICE_ID_CYCLOM_Y_Hi	0x0101
-#define PCI_DEVICE_ID_CYCLOM_4Y_Lo	0x0102
-#define PCI_DEVICE_ID_CYCLOM_4Y_Hi	0x0103
-#define PCI_DEVICE_ID_CYCLOM_8Y_Lo	0x0104
-#define PCI_DEVICE_ID_CYCLOM_8Y_Hi	0x0105
-#define PCI_DEVICE_ID_CYCLOM_Z_Lo	0x0200
-#define PCI_DEVICE_ID_CYCLOM_Z_Hi	0x0201
 #define PCI_DEVICE_ID_PC300_RX_2	0x0300
 #define PCI_DEVICE_ID_PC300_RX_1	0x0301
 #define PCI_DEVICE_ID_PC300_TE_2	0x0310
diff --git a/include/uapi/linux/cyclades.h b/include/uapi/linux/cyclades.h
deleted file mode 100644
index fc0add2194a9..000000000000
--- a/include/uapi/linux/cyclades.h
+++ /dev/null
@@ -1,494 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/* $Revision: 3.0 $$Date: 1998/11/02 14:20:59 $
- * linux/include/linux/cyclades.h
- *
- * This file was initially written by
- * Randolph Bentson <bentson@grieg.seaslug.org> and is maintained by
- * Ivan Passos <ivan@cyclades.com>.
- *
- * This file contains the general definitions for the cyclades.c driver
- *$Log: cyclades.h,v $
- *Revision 3.1  2002/01/29 11:36:16  henrique
- *added throttle field on struct cyclades_port to indicate whether the
- *port is throttled or not
- *
- *Revision 3.1  2000/04/19 18:52:52  ivan
- *converted address fields to unsigned long and added fields for physical
- *addresses on cyclades_card structure;
- *
- *Revision 3.0  1998/11/02 14:20:59  ivan
- *added nports field on cyclades_card structure;
- *
- *Revision 2.5  1998/08/03 16:57:01  ivan
- *added cyclades_idle_stats structure;
- * 
- *Revision 2.4  1998/06/01 12:09:53  ivan
- *removed closing_wait2 from cyclades_port structure;
- *
- *Revision 2.3  1998/03/16 18:01:12  ivan
- *changes in the cyclades_port structure to get it closer to the 
- *standard serial port structure;
- *added constants for new ioctls;
- *
- *Revision 2.2  1998/02/17 16:50:00  ivan
- *changes in the cyclades_port structure (addition of shutdown_wait and 
- *chip_rev variables);
- *added constants for new ioctls and for CD1400 rev. numbers.
- *
- *Revision 2.1	1997/10/24 16:03:00  ivan
- *added rflow (which allows enabling the CD1400 special flow control 
- *feature) and rtsdtr_inv (which allows DTR/RTS pin inversion) to 
- *cyclades_port structure;
- *added Alpha support
- *
- *Revision 2.0  1997/06/30 10:30:00  ivan
- *added some new doorbell command constants related to IOCTLW and
- *UART error signaling
- *
- *Revision 1.8  1997/06/03 15:30:00  ivan
- *added constant ZFIRM_HLT
- *added constant CyPCI_Ze_win ( = 2 * Cy_PCI_Zwin)
- *
- *Revision 1.7  1997/03/26 10:30:00  daniel
- *new entries at the end of cyclades_port struct to reallocate
- *variables illegally allocated within card memory.
- *
- *Revision 1.6  1996/09/09 18:35:30  bentson
- *fold in changes for Cyclom-Z -- including structures for
- *communicating with board as well modest changes to original
- *structures to support new features.
- *
- *Revision 1.5  1995/11/13 21:13:31  bentson
- *changes suggested by Michael Chastain <mec@duracef.shout.net>
- *to support use of this file in non-kernel applications
- *
- *
- */
-
-#ifndef _UAPI_LINUX_CYCLADES_H
-#define _UAPI_LINUX_CYCLADES_H
-
-#include <linux/types.h>
-
-struct cyclades_monitor {
-        unsigned long           int_count;
-        unsigned long           char_count;
-        unsigned long           char_max;
-        unsigned long           char_last;
-};
-
-/*
- * These stats all reflect activity since the device was last initialized.
- * (i.e., since the port was opened with no other processes already having it
- * open)
- */
-struct cyclades_idle_stats {
-    __kernel_old_time_t in_use;	/* Time device has been in use (secs) */
-    __kernel_old_time_t recv_idle; /* Time since last char received (secs) */
-    __kernel_old_time_t xmit_idle; /* Time since last char transmitted (secs) */
-    unsigned long  recv_bytes;	/* Bytes received */
-    unsigned long  xmit_bytes;	/* Bytes transmitted */
-    unsigned long  overruns;	/* Input overruns */
-    unsigned long  frame_errs;	/* Input framing errors */
-    unsigned long  parity_errs;	/* Input parity errors */
-};
-
-#define CYCLADES_MAGIC  0x4359
-
-#define CYGETMON                0x435901
-#define CYGETTHRESH             0x435902
-#define CYSETTHRESH             0x435903
-#define CYGETDEFTHRESH          0x435904
-#define CYSETDEFTHRESH          0x435905
-#define CYGETTIMEOUT            0x435906
-#define CYSETTIMEOUT            0x435907
-#define CYGETDEFTIMEOUT         0x435908
-#define CYSETDEFTIMEOUT         0x435909
-#define CYSETRFLOW		0x43590a
-#define CYGETRFLOW		0x43590b
-#define CYSETRTSDTR_INV		0x43590c
-#define CYGETRTSDTR_INV		0x43590d
-#define CYZSETPOLLCYCLE		0x43590e
-#define CYZGETPOLLCYCLE		0x43590f
-#define CYGETCD1400VER		0x435910
-#define	CYSETWAIT		0x435912
-#define	CYGETWAIT		0x435913
-
-/*************** CYCLOM-Z ADDITIONS ***************/
-
-#define CZIOC           ('M' << 8)
-#define CZ_NBOARDS      (CZIOC|0xfa)
-#define CZ_BOOT_START   (CZIOC|0xfb)
-#define CZ_BOOT_DATA    (CZIOC|0xfc)
-#define CZ_BOOT_END     (CZIOC|0xfd)
-#define CZ_TEST         (CZIOC|0xfe)
-
-#define CZ_DEF_POLL	(HZ/25)
-
-#define MAX_BOARD       4       /* Max number of boards */
-#define MAX_DEV         256     /* Max number of ports total */
-#define	CYZ_MAX_SPEED	921600
-
-#define	CYZ_FIFO_SIZE	16
-
-#define CYZ_BOOT_NWORDS 0x100
-struct CYZ_BOOT_CTRL {
-        unsigned short  nboard;
-        int             status[MAX_BOARD];
-        int             nchannel[MAX_BOARD];
-        int             fw_rev[MAX_BOARD];
-        unsigned long   offset;
-        unsigned long   data[CYZ_BOOT_NWORDS];
-};
-
-
-#ifndef DP_WINDOW_SIZE
-/*
- *	Memory Window Sizes
- */
-
-#define	DP_WINDOW_SIZE		(0x00080000)	/* window size 512 Kb */
-#define	ZE_DP_WINDOW_SIZE	(0x00100000)	/* window size 1 Mb (Ze and
-						  8Zo V.2 */
-#define	CTRL_WINDOW_SIZE	(0x00000080)	/* runtime regs 128 bytes */
-
-/*
- *	CUSTOM_REG - Cyclom-Z/PCI Custom Registers Set. The driver
- *	normally will access only interested on the fpga_id, fpga_version,
- *	start_cpu and stop_cpu.
- */
-
-struct	CUSTOM_REG {
-	__u32	fpga_id;		/* FPGA Identification Register */
-	__u32	fpga_version;		/* FPGA Version Number Register */
-	__u32	cpu_start;		/* CPU start Register (write) */
-	__u32	cpu_stop;		/* CPU stop Register (write) */
-	__u32	misc_reg;		/* Miscellaneous Register */
-	__u32	idt_mode;		/* IDT mode Register */
-	__u32	uart_irq_status;	/* UART IRQ status Register */
-	__u32	clear_timer0_irq;	/* Clear timer interrupt Register */
-	__u32	clear_timer1_irq;	/* Clear timer interrupt Register */
-	__u32	clear_timer2_irq;	/* Clear timer interrupt Register */
-	__u32	test_register;		/* Test Register */
-	__u32	test_count;		/* Test Count Register */
-	__u32	timer_select;		/* Timer select register */
-	__u32	pr_uart_irq_status;	/* Prioritized UART IRQ stat Reg */
-	__u32	ram_wait_state;		/* RAM wait-state Register */
-	__u32	uart_wait_state;	/* UART wait-state Register */
-	__u32	timer_wait_state;	/* timer wait-state Register */
-	__u32	ack_wait_state;		/* ACK wait State Register */
-};
-
-/*
- *	RUNTIME_9060 - PLX PCI9060ES local configuration and shared runtime
- *	registers. This structure can be used to access the 9060 registers
- *	(memory mapped).
- */
-
-struct RUNTIME_9060 {
-	__u32	loc_addr_range;	/* 00h - Local Address Range */
-	__u32	loc_addr_base;	/* 04h - Local Address Base */
-	__u32	loc_arbitr;	/* 08h - Local Arbitration */
-	__u32	endian_descr;	/* 0Ch - Big/Little Endian Descriptor */
-	__u32	loc_rom_range;	/* 10h - Local ROM Range */
-	__u32	loc_rom_base;	/* 14h - Local ROM Base */
-	__u32	loc_bus_descr;	/* 18h - Local Bus descriptor */
-	__u32	loc_range_mst;	/* 1Ch - Local Range for Master to PCI */
-	__u32	loc_base_mst;	/* 20h - Local Base for Master PCI */
-	__u32	loc_range_io;	/* 24h - Local Range for Master IO */
-	__u32	pci_base_mst;	/* 28h - PCI Base for Master PCI */
-	__u32	pci_conf_io;	/* 2Ch - PCI configuration for Master IO */
-	__u32	filler1;	/* 30h */
-	__u32	filler2;	/* 34h */
-	__u32	filler3;	/* 38h */
-	__u32	filler4;	/* 3Ch */
-	__u32	mail_box_0;	/* 40h - Mail Box 0 */
-	__u32	mail_box_1;	/* 44h - Mail Box 1 */
-	__u32	mail_box_2;	/* 48h - Mail Box 2 */
-	__u32	mail_box_3;	/* 4Ch - Mail Box 3 */
-	__u32	filler5;	/* 50h */
-	__u32	filler6;	/* 54h */
-	__u32	filler7;	/* 58h */
-	__u32	filler8;	/* 5Ch */
-	__u32	pci_doorbell;	/* 60h - PCI to Local Doorbell */
-	__u32	loc_doorbell;	/* 64h - Local to PCI Doorbell */
-	__u32	intr_ctrl_stat;	/* 68h - Interrupt Control/Status */
-	__u32	init_ctrl;	/* 6Ch - EEPROM control, Init Control, etc */
-};
-
-/* Values for the Local Base Address re-map register */
-
-#define	WIN_RAM		0x00000001L	/* set the sliding window to RAM */
-#define	WIN_CREG	0x14000001L	/* set the window to custom Registers */
-
-/* Values timer select registers */
-
-#define	TIMER_BY_1M	0x00		/* clock divided by 1M */
-#define	TIMER_BY_256K	0x01		/* clock divided by 256k */
-#define	TIMER_BY_128K	0x02		/* clock divided by 128k */
-#define	TIMER_BY_32K	0x03		/* clock divided by 32k */
-
-/****************** ****************** *******************/
-#endif
-
-#ifndef ZFIRM_ID
-/* #include "zfwint.h" */
-/****************** ****************** *******************/
-/*
- *	This file contains the definitions for interfacing with the
- *	Cyclom-Z ZFIRM Firmware.
- */
-
-/* General Constant definitions */
-
-#define	MAX_CHAN	64		/* max number of channels per board */
-
-/* firmware id structure (set after boot) */
-
-#define ID_ADDRESS	0x00000180L	/* signature/pointer address */
-#define	ZFIRM_ID	0x5557465AL	/* ZFIRM/U signature */
-#define	ZFIRM_HLT	0x59505B5CL	/* ZFIRM needs external power supply */
-#define	ZFIRM_RST	0x56040674L	/* RST signal (due to FW reset) */
-
-#define	ZF_TINACT_DEF	1000		/* default inactivity timeout 
-					   (1000 ms) */
-#define	ZF_TINACT	ZF_TINACT_DEF
-
-struct	FIRM_ID {
-	__u32	signature;		/* ZFIRM/U signature */
-	__u32	zfwctrl_addr;		/* pointer to ZFW_CTRL structure */
-};
-
-/* Op. System id */
-
-#define	C_OS_LINUX	0x00000030	/* generic Linux system */
-
-/* channel op_mode */
-
-#define	C_CH_DISABLE	0x00000000	/* channel is disabled */
-#define	C_CH_TXENABLE	0x00000001	/* channel Tx enabled */
-#define	C_CH_RXENABLE	0x00000002	/* channel Rx enabled */
-#define	C_CH_ENABLE	0x00000003	/* channel Tx/Rx enabled */
-#define	C_CH_LOOPBACK	0x00000004	/* Loopback mode */
-
-/* comm_parity - parity */
-
-#define	C_PR_NONE	0x00000000	/* None */
-#define	C_PR_ODD	0x00000001	/* Odd */
-#define C_PR_EVEN	0x00000002	/* Even */
-#define C_PR_MARK	0x00000004	/* Mark */
-#define C_PR_SPACE	0x00000008	/* Space */
-#define C_PR_PARITY	0x000000ff
-
-#define	C_PR_DISCARD	0x00000100	/* discard char with frame/par error */
-#define C_PR_IGNORE	0x00000200	/* ignore frame/par error */
-
-/* comm_data_l - data length and stop bits */
-
-#define C_DL_CS5	0x00000001
-#define C_DL_CS6	0x00000002
-#define C_DL_CS7	0x00000004
-#define C_DL_CS8	0x00000008
-#define	C_DL_CS		0x0000000f
-#define C_DL_1STOP	0x00000010
-#define C_DL_15STOP	0x00000020
-#define C_DL_2STOP	0x00000040
-#define	C_DL_STOP	0x000000f0
-
-/* interrupt enabling/status */
-
-#define	C_IN_DISABLE	0x00000000	/* zero, disable interrupts */
-#define	C_IN_TXBEMPTY	0x00000001	/* tx buffer empty */
-#define	C_IN_TXLOWWM	0x00000002	/* tx buffer below LWM */
-#define	C_IN_RXHIWM	0x00000010	/* rx buffer above HWM */
-#define	C_IN_RXNNDT	0x00000020	/* rx no new data timeout */
-#define	C_IN_MDCD	0x00000100	/* modem DCD change */
-#define	C_IN_MDSR	0x00000200	/* modem DSR change */
-#define	C_IN_MRI	0x00000400	/* modem RI change */
-#define	C_IN_MCTS	0x00000800	/* modem CTS change */
-#define	C_IN_RXBRK	0x00001000	/* Break received */
-#define	C_IN_PR_ERROR	0x00002000	/* parity error */
-#define	C_IN_FR_ERROR	0x00004000	/* frame error */
-#define C_IN_OVR_ERROR  0x00008000      /* overrun error */
-#define C_IN_RXOFL	0x00010000      /* RX buffer overflow */
-#define C_IN_IOCTLW	0x00020000      /* I/O control w/ wait */
-#define C_IN_MRTS	0x00040000	/* modem RTS drop */
-#define C_IN_ICHAR	0x00080000
- 
-/* flow control */
-
-#define	C_FL_OXX	0x00000001	/* output Xon/Xoff flow control */
-#define	C_FL_IXX	0x00000002	/* output Xon/Xoff flow control */
-#define C_FL_OIXANY	0x00000004	/* output Xon/Xoff (any xon) */
-#define	C_FL_SWFLOW	0x0000000f
-
-/* flow status */
-
-#define	C_FS_TXIDLE	0x00000000	/* no Tx data in the buffer or UART */
-#define	C_FS_SENDING	0x00000001	/* UART is sending data */
-#define	C_FS_SWFLOW	0x00000002	/* Tx is stopped by received Xoff */
-
-/* rs_control/rs_status RS-232 signals */
-
-#define C_RS_PARAM	0x80000000	/* Indicates presence of parameter in 
-					   IOCTLM command */
-#define	C_RS_RTS	0x00000001	/* RTS */
-#define	C_RS_DTR	0x00000004	/* DTR */
-#define	C_RS_DCD	0x00000100	/* CD */
-#define	C_RS_DSR	0x00000200	/* DSR */
-#define	C_RS_RI		0x00000400	/* RI */
-#define	C_RS_CTS	0x00000800	/* CTS */
-
-/* commands Host <-> Board */
-
-#define	C_CM_RESET	0x01		/* reset/flush buffers */
-#define	C_CM_IOCTL	0x02		/* re-read CH_CTRL */
-#define	C_CM_IOCTLW	0x03		/* re-read CH_CTRL, intr when done */
-#define	C_CM_IOCTLM	0x04		/* RS-232 outputs change */
-#define	C_CM_SENDXOFF	0x10		/* send Xoff */
-#define	C_CM_SENDXON	0x11		/* send Xon */
-#define C_CM_CLFLOW	0x12		/* Clear flow control (resume) */
-#define	C_CM_SENDBRK	0x41		/* send break */
-#define	C_CM_INTBACK	0x42		/* Interrupt back */
-#define	C_CM_SET_BREAK	0x43		/* Tx break on */
-#define	C_CM_CLR_BREAK	0x44		/* Tx break off */
-#define	C_CM_CMD_DONE	0x45		/* Previous command done */
-#define C_CM_INTBACK2	0x46		/* Alternate Interrupt back */
-#define	C_CM_TINACT	0x51		/* set inactivity detection */
-#define	C_CM_IRQ_ENBL	0x52		/* enable generation of interrupts */
-#define	C_CM_IRQ_DSBL	0x53		/* disable generation of interrupts */
-#define	C_CM_ACK_ENBL	0x54		/* enable acknowledged interrupt mode */
-#define	C_CM_ACK_DSBL	0x55		/* disable acknowledged intr mode */
-#define	C_CM_FLUSH_RX	0x56		/* flushes Rx buffer */
-#define	C_CM_FLUSH_TX	0x57		/* flushes Tx buffer */
-#define C_CM_Q_ENABLE	0x58		/* enables queue access from the 
-					   driver */
-#define C_CM_Q_DISABLE  0x59            /* disables queue access from the 
-					   driver */
-
-#define	C_CM_TXBEMPTY	0x60		/* Tx buffer is empty */
-#define	C_CM_TXLOWWM	0x61		/* Tx buffer low water mark */
-#define	C_CM_RXHIWM	0x62		/* Rx buffer high water mark */
-#define	C_CM_RXNNDT	0x63		/* rx no new data timeout */
-#define	C_CM_TXFEMPTY	0x64
-#define	C_CM_ICHAR	0x65
-#define	C_CM_MDCD	0x70		/* modem DCD change */
-#define	C_CM_MDSR	0x71		/* modem DSR change */
-#define	C_CM_MRI	0x72		/* modem RI change */
-#define	C_CM_MCTS	0x73		/* modem CTS change */
-#define C_CM_MRTS	0x74		/* modem RTS drop */
-#define	C_CM_RXBRK	0x84		/* Break received */
-#define	C_CM_PR_ERROR	0x85		/* Parity error */
-#define	C_CM_FR_ERROR	0x86		/* Frame error */
-#define C_CM_OVR_ERROR  0x87            /* Overrun error */
-#define C_CM_RXOFL	0x88            /* RX buffer overflow */
-#define	C_CM_CMDERROR	0x90		/* command error */
-#define	C_CM_FATAL	0x91		/* fatal error */
-#define	C_CM_HW_RESET	0x92		/* reset board */
-
-/*
- *	CH_CTRL - This per port structure contains all parameters
- *	that control an specific port. It can be seen as the
- *	configuration registers of a "super-serial-controller".
- */
-
-struct CH_CTRL {
-	__u32	op_mode;	/* operation mode */
-	__u32	intr_enable;	/* interrupt masking */
-	__u32	sw_flow;	/* SW flow control */
-	__u32	flow_status;	/* output flow status */
-	__u32	comm_baud;	/* baud rate  - numerically specified */
-	__u32	comm_parity;	/* parity */
-	__u32	comm_data_l;	/* data length/stop */
-	__u32	comm_flags;	/* other flags */
-	__u32	hw_flow;	/* HW flow control */
-	__u32	rs_control;	/* RS-232 outputs */
-	__u32	rs_status;	/* RS-232 inputs */
-	__u32	flow_xon;	/* xon char */
-	__u32	flow_xoff;	/* xoff char */
-	__u32	hw_overflow;	/* hw overflow counter */
-	__u32	sw_overflow;	/* sw overflow counter */
-	__u32	comm_error;	/* frame/parity error counter */
-	__u32 ichar;
-	__u32 filler[7];
-};
-
-
-/*
- *	BUF_CTRL - This per channel structure contains
- *	all Tx and Rx buffer control for a given channel.
- */
-
-struct	BUF_CTRL	{
-	__u32	flag_dma;	/* buffers are in Host memory */
-	__u32	tx_bufaddr;	/* address of the tx buffer */
-	__u32	tx_bufsize;	/* tx buffer size */
-	__u32	tx_threshold;	/* tx low water mark */
-	__u32	tx_get;		/* tail index tx buf */
-	__u32	tx_put;		/* head index tx buf */
-	__u32	rx_bufaddr;	/* address of the rx buffer */
-	__u32	rx_bufsize;	/* rx buffer size */
-	__u32	rx_threshold;	/* rx high water mark */
-	__u32	rx_get;		/* tail index rx buf */
-	__u32	rx_put;		/* head index rx buf */
-	__u32	filler[5];	/* filler to align structures */
-};
-
-/*
- *	BOARD_CTRL - This per board structure contains all global 
- *	control fields related to the board.
- */
-
-struct BOARD_CTRL {
-
-	/* static info provided by the on-board CPU */
-	__u32	n_channel;	/* number of channels */
-	__u32	fw_version;	/* firmware version */
-
-	/* static info provided by the driver */
-	__u32	op_system;	/* op_system id */
-	__u32	dr_version;	/* driver version */
-
-	/* board control area */
-	__u32	inactivity;	/* inactivity control */
-
-	/* host to FW commands */
-	__u32	hcmd_channel;	/* channel number */
-	__u32	hcmd_param;	/* pointer to parameters */
-
-	/* FW to Host commands */
-	__u32	fwcmd_channel;	/* channel number */
-	__u32	fwcmd_param;	/* pointer to parameters */
-	__u32	zf_int_queue_addr; /* offset for INT_QUEUE structure */
-
-	/* filler so the structures are aligned */
-	__u32	filler[6];
-};
-
-/* Host Interrupt Queue */
-
-#define QUEUE_SIZE	(10*MAX_CHAN)
-
-struct	INT_QUEUE {
-	unsigned char	intr_code[QUEUE_SIZE];
-	unsigned long	channel[QUEUE_SIZE];
-	unsigned long	param[QUEUE_SIZE];
-	unsigned long	put;
-	unsigned long	get;
-};
-
-/*
- *	ZFW_CTRL - This is the data structure that includes all other
- *	data structures used by the Firmware.
- */
- 
-struct ZFW_CTRL {
-	struct BOARD_CTRL	board_ctrl;
-	struct CH_CTRL		ch_ctrl[MAX_CHAN];
-	struct BUF_CTRL		buf_ctrl[MAX_CHAN];
-};
-
-/****************** ****************** *******************/
-#endif
-
-#endif /* _UAPI_LINUX_CYCLADES_H */
diff --git a/include/uapi/linux/major.h b/include/uapi/linux/major.h
index 7e5fa8e15c43..4e5f2b3a3d54 100644
--- a/include/uapi/linux/major.h
+++ b/include/uapi/linux/major.h
@@ -34,8 +34,6 @@
 #define GOLDSTAR_CDROM_MAJOR	16
 #define OPTICS_CDROM_MAJOR	17
 #define SANYO_CDROM_MAJOR	18
-#define CYCLADES_MAJOR		19
-#define CYCLADESAUX_MAJOR	20
 #define MITSUMI_X_CDROM_MAJOR	20
 #define MFM_ACORN_MAJOR		21	/* ARM Linux /dev/mfm */
 #define SCSI_GENERIC_MAJOR	21
diff --git a/include/uapi/linux/serial.h b/include/uapi/linux/serial.h
index 93eb3c496ff1..fa6b16e5fdd8 100644
--- a/include/uapi/linux/serial.h
+++ b/include/uapi/linux/serial.h
@@ -52,11 +52,11 @@ struct serial_struct {
 #define PORT_16450	2
 #define PORT_16550	3
 #define PORT_16550A	4
-#define PORT_CIRRUS     5	/* usurped by cyclades.c */
+#define PORT_CIRRUS     5
 #define PORT_16650	6
 #define PORT_16650V2	7
 #define PORT_16750	8
-#define PORT_STARTECH	9	/* usurped by cyclades.c */
+#define PORT_STARTECH	9
 #define PORT_16C950	10	/* Oxford Semiconductor */
 #define PORT_16654	11
 #define PORT_16850	12
-- 
cgit v1.2.3


From 67b1544a55c94b62f68488d5fcbc93cca293dc32 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Tue, 2 Mar 2021 07:21:36 +0100
Subject: tty: isicom, remove this orphan

The Isicom driver was orphaned by commit d86b3001a1a6 (MAINTAINERS:
orphan isicom) 10 years ago. No one stepped up to take care of it and
to fix all the issues the driver has.

So it's time to drop the driver with all its traces.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20210302062214.29627-6-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/admin-guide/devices.txt              |    2 +-
 Documentation/process/magic-number.rst             |    1 -
 .../translations/it_IT/process/magic-number.rst    |    1 -
 .../translations/zh_CN/process/magic-number.rst    |    1 -
 MAINTAINERS                                        |    5 -
 drivers/tty/Kconfig                                |   10 -
 drivers/tty/Makefile                               |    1 -
 drivers/tty/isicom.c                               | 1699 --------------------
 include/linux/isicom.h                             |   85 -
 9 files changed, 1 insertion(+), 1804 deletions(-)
 delete mode 100644 drivers/tty/isicom.c
 delete mode 100644 include/linux/isicom.h

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/devices.txt b/Documentation/admin-guide/devices.txt
index b5bd9d46e031..ef41f77cb979 100644
--- a/Documentation/admin-guide/devices.txt
+++ b/Documentation/admin-guide/devices.txt
@@ -289,7 +289,7 @@
 		152 = /dev/kpoll	Kernel Poll Driver
 		153 = /dev/mergemem	Memory merge device
 		154 = /dev/pmu		Macintosh PowerBook power manager
-		155 = /dev/isictl	MultiTech ISICom serial control
+		155 =
 		156 = /dev/lcd		Front panel LCD display
 		157 = /dev/ac		Applicom Intl Profibus card
 		158 = /dev/nwbutton	Netwinder external button
diff --git a/Documentation/process/magic-number.rst b/Documentation/process/magic-number.rst
index d4a30c09bd03..c36f21eecefb 100644
--- a/Documentation/process/magic-number.rst
+++ b/Documentation/process/magic-number.rst
@@ -77,7 +77,6 @@ DB_MAGIC              0x4442           fc_info                  ``drivers/net/ip
 DL_MAGIC              0x444d           fc_info                  ``drivers/net/iph5526_novram.c``
 FASYNC_MAGIC          0x4601           fasync_struct            ``include/linux/fs.h``
 FF_MAGIC              0x4646           fc_info                  ``drivers/net/iph5526_novram.c``
-ISICOM_MAGIC          0x4d54           isi_port                 ``include/linux/isicom.h``
 PTY_MAGIC             0x5001                                    ``drivers/char/pty.c``
 PPP_MAGIC             0x5002           ppp                      ``include/linux/if_pppvar.h``
 SSTATE_MAGIC          0x5302           serial_state             ``include/linux/serial.h``
diff --git a/Documentation/translations/it_IT/process/magic-number.rst b/Documentation/translations/it_IT/process/magic-number.rst
index 0df2e7e32cd8..440087f9f402 100644
--- a/Documentation/translations/it_IT/process/magic-number.rst
+++ b/Documentation/translations/it_IT/process/magic-number.rst
@@ -83,7 +83,6 @@ DB_MAGIC              0x4442           fc_info                  ``drivers/net/ip
 DL_MAGIC              0x444d           fc_info                  ``drivers/net/iph5526_novram.c``
 FASYNC_MAGIC          0x4601           fasync_struct            ``include/linux/fs.h``
 FF_MAGIC              0x4646           fc_info                  ``drivers/net/iph5526_novram.c``
-ISICOM_MAGIC          0x4d54           isi_port                 ``include/linux/isicom.h``
 PTY_MAGIC             0x5001                                    ``drivers/char/pty.c``
 PPP_MAGIC             0x5002           ppp                      ``include/linux/if_pppvar.h``
 SSTATE_MAGIC          0x5302           serial_state             ``include/linux/serial.h``
diff --git a/Documentation/translations/zh_CN/process/magic-number.rst b/Documentation/translations/zh_CN/process/magic-number.rst
index 82d62f6a4406..e91bec4ec156 100644
--- a/Documentation/translations/zh_CN/process/magic-number.rst
+++ b/Documentation/translations/zh_CN/process/magic-number.rst
@@ -66,7 +66,6 @@ DB_MAGIC              0x4442           fc_info                  ``drivers/net/ip
 DL_MAGIC              0x444d           fc_info                  ``drivers/net/iph5526_novram.c``
 FASYNC_MAGIC          0x4601           fasync_struct            ``include/linux/fs.h``
 FF_MAGIC              0x4646           fc_info                  ``drivers/net/iph5526_novram.c``
-ISICOM_MAGIC          0x4d54           isi_port                 ``include/linux/isicom.h``
 PTY_MAGIC             0x5001                                    ``drivers/char/pty.c``
 PPP_MAGIC             0x5002           ppp                      ``include/linux/if_pppvar.h``
 SSTATE_MAGIC          0x5302           serial_state             ``include/linux/serial.h``
diff --git a/MAINTAINERS b/MAINTAINERS
index 29f20a97d73d..f62df0494d11 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -12224,11 +12224,6 @@ F:	drivers/mux/
 F:	include/dt-bindings/mux/
 F:	include/linux/mux/
 
-MULTITECH MULTIPORT CARD (ISICOM)
-S:	Orphan
-F:	drivers/tty/isicom.c
-F:	include/linux/isicom.h
-
 MUSB MULTIPOINT HIGH SPEED DUAL-ROLE CONTROLLER
 M:	Bin Liu <b-liu@ti.com>
 L:	linux-usb@vger.kernel.org
diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig
index 397523a8095e..0031aa8f8b16 100644
--- a/drivers/tty/Kconfig
+++ b/drivers/tty/Kconfig
@@ -238,16 +238,6 @@ config SYNCLINK_GT
 	  synchronous and asynchronous serial adapters
 	  manufactured by Microgate Systems, Ltd. (www.microgate.com)
 
-config ISI
-	tristate "Multi-Tech multiport card support"
-	depends on SERIAL_NONSTANDARD && PCI
-	select FW_LOADER
-	help
-	  This is a driver for the Multi-Tech cards which provide several
-	  serial ports.  The driver is experimental and can currently only be
-	  built as a module. The module will be called isicom.
-	  If you want to do that, choose M here.
-
 config N_HDLC
 	tristate "HDLC line discipline support"
 	depends on SERIAL_NONSTANDARD
diff --git a/drivers/tty/Makefile b/drivers/tty/Makefile
index 94eb2bf75763..a34055bc8b7a 100644
--- a/drivers/tty/Makefile
+++ b/drivers/tty/Makefile
@@ -18,7 +18,6 @@ obj-$(CONFIG_SERIAL_DEV_BUS)	+= serdev/
 
 # tty drivers
 obj-$(CONFIG_AMIGA_BUILTIN_SERIAL) += amiserial.o
-obj-$(CONFIG_ISI)		+= isicom.o
 obj-$(CONFIG_MOXA_INTELLIO)	+= moxa.o
 obj-$(CONFIG_MOXA_SMARTIO)	+= mxser.o
 obj-$(CONFIG_NOZOMI)		+= nozomi.o
diff --git a/drivers/tty/isicom.c b/drivers/tty/isicom.c
deleted file mode 100644
index 3b2f9fb01aa0..000000000000
--- a/drivers/tty/isicom.c
+++ /dev/null
@@ -1,1699 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0+
-/*
- *	Original driver code supplied by Multi-Tech
- *
- *	Changes
- *	1/9/98	alan@lxorguk.ukuu.org.uk
- *					Merge to 2.0.x kernel tree
- *					Obtain and use official major/minors
- *					Loader switched to a misc device
- *					(fixed range check bug as a side effect)
- *					Printk clean up
- *	9/12/98	alan@lxorguk.ukuu.org.uk
- *					Rough port to 2.1.x
- *
- *	10/6/99 sameer			Merged the ISA and PCI drivers to
- *					a new unified driver.
- *
- *	3/9/99	sameer			Added support for ISI4616 cards.
- *
- *	16/9/99	sameer			We do not force RTS low anymore.
- *					This is to prevent the firmware
- *					from getting confused.
- *
- *	26/10/99 sameer			Cosmetic changes:The driver now
- *					dumps the Port Count information
- *					along with I/O address and IRQ.
- *
- *	13/12/99 sameer			Fixed the problem with IRQ sharing.
- *
- *	10/5/00  sameer			Fixed isicom_shutdown_board()
- *					to not lower DTR on all the ports
- *					when the last port on the card is
- *					closed.
- *
- *	10/5/00  sameer			Signal mask setup command added
- *					to  isicom_setup_port and
- *					isicom_shutdown_port.
- *
- *	24/5/00  sameer			The driver is now SMP aware.
- *
- *
- *	27/11/00 Vinayak P Risbud	Fixed the Driver Crash Problem
- *
- *
- *	03/01/01  anil .s		Added support for resetting the
- *					internal modems on ISI cards.
- *
- *	08/02/01  anil .s		Upgraded the driver for kernel
- *					2.4.x
- *
- *	11/04/01  Kevin			Fixed firmware load problem with
- *					ISIHP-4X card
- *
- *	30/04/01  anil .s		Fixed the remote login through
- *					ISI port problem. Now the link
- *					does not go down before password
- *					prompt.
- *
- *	03/05/01  anil .s		Fixed the problem with IRQ sharing
- *					among ISI-PCI cards.
- *
- *	03/05/01  anil .s		Added support to display the version
- *					info during insmod as well as module
- *					listing by lsmod.
- *
- *	10/05/01  anil .s		Done the modifications to the source
- *					file and Install script so that the
- *					same installation can be used for
- *					2.2.x and 2.4.x kernel.
- *
- *	06/06/01  anil .s		Now we drop both dtr and rts during
- *					shutdown_port as well as raise them
- *					during isicom_config_port.
- *
- *	09/06/01 acme@conectiva.com.br	use capable, not suser, do
- *					restore_flags on failure in
- *					isicom_send_break, verify put_user
- *					result
- *
- *	11/02/03  ranjeeth		Added support for 230 Kbps and 460 Kbps
- *					Baud index extended to 21
- *
- *	20/03/03  ranjeeth		Made to work for Linux Advanced server.
- *					Taken care of license warning.
- *
- *	10/12/03  Ravindra		Made to work for Fedora Core 1 of
- *					Red Hat Distribution
- *
- *	06/01/05  Alan Cox 		Merged the ISI and base kernel strands
- *					into a single 2.6 driver
- *
- *	***********************************************************
- *
- *	To use this driver you also need the support package. You
- *	can find this in RPM format on
- *		ftp://ftp.linux.org.uk/pub/linux/alan
- *
- *	You can find the original tools for this direct from Multitech
- *		ftp://ftp.multitech.com/ISI-Cards/
- *
- *	Having installed the cards the module options (/etc/modprobe.d/)
- *
- *	options isicom   io=card1,card2,card3,card4 irq=card1,card2,card3,card4
- *
- *	Omit those entries for boards you don't have installed.
- *
- *	TODO
- *		Merge testing
- *		64-bit verification
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/module.h>
-#include <linux/firmware.h>
-#include <linux/kernel.h>
-#include <linux/tty.h>
-#include <linux/tty_flip.h>
-#include <linux/termios.h>
-#include <linux/fs.h>
-#include <linux/sched.h>
-#include <linux/serial.h>
-#include <linux/mm.h>
-#include <linux/interrupt.h>
-#include <linux/timer.h>
-#include <linux/delay.h>
-#include <linux/ioport.h>
-#include <linux/slab.h>
-
-#include <linux/uaccess.h>
-#include <linux/io.h>
-
-#include <linux/pci.h>
-
-#include <linux/isicom.h>
-
-#define InterruptTheCard(base) outw(0, (base) + 0xc)
-#define ClearInterrupt(base) inw((base) + 0x0a)
-
-#ifdef DEBUG
-#define isicom_paranoia_check(a, b, c) __isicom_paranoia_check((a), (b), (c))
-#else
-#define isicom_paranoia_check(a, b, c) 0
-#endif
-
-static int isicom_probe(struct pci_dev *, const struct pci_device_id *);
-static void isicom_remove(struct pci_dev *);
-
-static const struct pci_device_id isicom_pci_tbl[] = {
-	{ PCI_DEVICE(VENDOR_ID, 0x2028) },
-	{ PCI_DEVICE(VENDOR_ID, 0x2051) },
-	{ PCI_DEVICE(VENDOR_ID, 0x2052) },
-	{ PCI_DEVICE(VENDOR_ID, 0x2053) },
-	{ PCI_DEVICE(VENDOR_ID, 0x2054) },
-	{ PCI_DEVICE(VENDOR_ID, 0x2055) },
-	{ PCI_DEVICE(VENDOR_ID, 0x2056) },
-	{ PCI_DEVICE(VENDOR_ID, 0x2057) },
-	{ PCI_DEVICE(VENDOR_ID, 0x2058) },
-	{ 0 }
-};
-MODULE_DEVICE_TABLE(pci, isicom_pci_tbl);
-
-static struct pci_driver isicom_driver = {
-	.name		= "isicom",
-	.id_table	= isicom_pci_tbl,
-	.probe		= isicom_probe,
-	.remove		= isicom_remove
-};
-
-static int prev_card = 3;	/*	start servicing isi_card[0]	*/
-static struct tty_driver *isicom_normal;
-
-static void isicom_tx(struct timer_list *unused);
-static void isicom_start(struct tty_struct *tty);
-
-static DEFINE_TIMER(tx, isicom_tx);
-
-/*   baud index mappings from linux defns to isi */
-
-static signed char linuxb_to_isib[] = {
-	-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 13, 15, 16, 17, 18, 19, 20, 21
-};
-
-struct	isi_board {
-	unsigned long		base;
-	int			irq;
-	unsigned char		port_count;
-	unsigned short		status;
-	unsigned short		port_status; /* each bit for each port */
-	unsigned short		shift_count;
-	struct isi_port		*ports;
-	signed char		count;
-	spinlock_t		card_lock; /* Card wide lock 11/5/00 -sameer */
-	unsigned long		flags;
-	unsigned int		index;
-};
-
-struct	isi_port {
-	unsigned short		magic;
-	struct tty_port		port;
-	u16			channel;
-	u16			status;
-	struct isi_board	*card;
-	unsigned char		*xmit_buf;
-	int			xmit_head;
-	int			xmit_tail;
-	int			xmit_cnt;
-};
-
-static struct isi_board isi_card[BOARD_COUNT];
-static struct isi_port  isi_ports[PORT_COUNT];
-
-/*
- *	Locking functions for card level locking. We need to own both
- *	the kernel lock for the card and have the card in a position that
- *	it wants to talk.
- */
-
-static int WaitTillCardIsFree(unsigned long base)
-{
-	unsigned int count = 0;
-
-	while (!(inw(base + 0xe) & 0x1) && count++ < 100)
-		mdelay(1);
-
-	return !(inw(base + 0xe) & 0x1);
-}
-
-static int lock_card(struct isi_board *card)
-{
-	unsigned long base = card->base;
-	unsigned int retries, a;
-
-	for (retries = 0; retries < 10; retries++) {
-		spin_lock_irqsave(&card->card_lock, card->flags);
-		for (a = 0; a < 10; a++) {
-			if (inw(base + 0xe) & 0x1)
-				return 1;
-			udelay(10);
-		}
-		spin_unlock_irqrestore(&card->card_lock, card->flags);
-		msleep(10);
-	}
-	pr_warn("Failed to lock Card (0x%lx)\n", card->base);
-
-	return 0;	/* Failed to acquire the card! */
-}
-
-static void unlock_card(struct isi_board *card)
-{
-	spin_unlock_irqrestore(&card->card_lock, card->flags);
-}
-
-/*
- *  ISI Card specific ops ...
- */
-
-/* card->lock HAS to be held */
-static void raise_dtr(struct isi_port *port)
-{
-	struct isi_board *card = port->card;
-	unsigned long base = card->base;
-	u16 channel = port->channel;
-
-	if (WaitTillCardIsFree(base))
-		return;
-
-	outw(0x8000 | (channel << card->shift_count) | 0x02, base);
-	outw(0x0504, base);
-	InterruptTheCard(base);
-	port->status |= ISI_DTR;
-}
-
-/* card->lock HAS to be held */
-static void drop_dtr(struct isi_port *port)
-{
-	struct isi_board *card = port->card;
-	unsigned long base = card->base;
-	u16 channel = port->channel;
-
-	if (WaitTillCardIsFree(base))
-		return;
-
-	outw(0x8000 | (channel << card->shift_count) | 0x02, base);
-	outw(0x0404, base);
-	InterruptTheCard(base);
-	port->status &= ~ISI_DTR;
-}
-
-/* card->lock HAS to be held */
-static inline void raise_rts(struct isi_port *port)
-{
-	struct isi_board *card = port->card;
-	unsigned long base = card->base;
-	u16 channel = port->channel;
-
-	if (WaitTillCardIsFree(base))
-		return;
-
-	outw(0x8000 | (channel << card->shift_count) | 0x02, base);
-	outw(0x0a04, base);
-	InterruptTheCard(base);
-	port->status |= ISI_RTS;
-}
-
-/* card->lock HAS to be held */
-static inline void drop_rts(struct isi_port *port)
-{
-	struct isi_board *card = port->card;
-	unsigned long base = card->base;
-	u16 channel = port->channel;
-
-	if (WaitTillCardIsFree(base))
-		return;
-
-	outw(0x8000 | (channel << card->shift_count) | 0x02, base);
-	outw(0x0804, base);
-	InterruptTheCard(base);
-	port->status &= ~ISI_RTS;
-}
-
-/* card->lock MUST NOT be held */
-
-static void isicom_dtr_rts(struct tty_port *port, int on)
-{
-	struct isi_port *ip = container_of(port, struct isi_port, port);
-	struct isi_board *card = ip->card;
-	unsigned long base = card->base;
-	u16 channel = ip->channel;
-
-	if (!lock_card(card))
-		return;
-
-	if (on) {
-		outw(0x8000 | (channel << card->shift_count) | 0x02, base);
-		outw(0x0f04, base);
-		InterruptTheCard(base);
-		ip->status |= (ISI_DTR | ISI_RTS);
-	} else {
-		outw(0x8000 | (channel << card->shift_count) | 0x02, base);
-		outw(0x0C04, base);
-		InterruptTheCard(base);
-		ip->status &= ~(ISI_DTR | ISI_RTS);
-	}
-	unlock_card(card);
-}
-
-/* card->lock HAS to be held */
-static void drop_dtr_rts(struct isi_port *port)
-{
-	struct isi_board *card = port->card;
-	unsigned long base = card->base;
-	u16 channel = port->channel;
-
-	if (WaitTillCardIsFree(base))
-		return;
-
-	outw(0x8000 | (channel << card->shift_count) | 0x02, base);
-	outw(0x0c04, base);
-	InterruptTheCard(base);
-	port->status &= ~(ISI_RTS | ISI_DTR);
-}
-
-/*
- *	ISICOM Driver specific routines ...
- *
- */
-
-static inline int __isicom_paranoia_check(struct isi_port const *port,
-	char *name, const char *routine)
-{
-	if (!port) {
-		pr_warn("Warning: bad isicom magic for dev %s in %s\n",
-			name, routine);
-		return 1;
-	}
-	if (port->magic != ISICOM_MAGIC) {
-		pr_warn("Warning: NULL isicom port for dev %s in %s\n",
-			name, routine);
-		return 1;
-	}
-
-	return 0;
-}
-
-/*
- *	Transmitter.
- *
- *	We shovel data into the card buffers on a regular basis. The card
- *	will do the rest of the work for us.
- */
-
-static void isicom_tx(struct timer_list *unused)
-{
-	unsigned long flags, base;
-	unsigned int retries;
-	short count = (BOARD_COUNT-1), card;
-	short txcount, wrd, residue, word_count, cnt;
-	struct isi_port *port;
-	struct tty_struct *tty;
-
-	/*	find next active board	*/
-	card = (prev_card + 1) & 0x0003;
-	while (count-- > 0) {
-		if (isi_card[card].status & BOARD_ACTIVE)
-			break;
-		card = (card + 1) & 0x0003;
-	}
-	if (!(isi_card[card].status & BOARD_ACTIVE))
-		goto sched_again;
-
-	prev_card = card;
-
-	count = isi_card[card].port_count;
-	port = isi_card[card].ports;
-	base = isi_card[card].base;
-
-	spin_lock_irqsave(&isi_card[card].card_lock, flags);
-	for (retries = 0; retries < 100; retries++) {
-		if (inw(base + 0xe) & 0x1)
-			break;
-		udelay(2);
-	}
-	if (retries >= 100)
-		goto unlock;
-
-	tty = tty_port_tty_get(&port->port);
-	if (tty == NULL)
-		goto put_unlock;
-
-	for (; count > 0; count--, port++) {
-		/* port not active or tx disabled to force flow control */
-		if (!tty_port_initialized(&port->port) ||
-			!(port->status & ISI_TXOK))
-			continue;
-
-		txcount = min_t(short, TX_SIZE, port->xmit_cnt);
-		if (txcount <= 0 || tty->stopped || tty->hw_stopped)
-			continue;
-
-		if (!(inw(base + 0x02) & (1 << port->channel)))
-			continue;
-
-		pr_debug("txing %d bytes, port%d.\n",
-			 txcount, port->channel + 1);
-		outw((port->channel << isi_card[card].shift_count) | txcount,
-			base);
-		residue = NO;
-		wrd = 0;
-		while (1) {
-			cnt = min_t(int, txcount, (SERIAL_XMIT_SIZE
-					- port->xmit_tail));
-			if (residue == YES) {
-				residue = NO;
-				if (cnt > 0) {
-					wrd |= (port->port.xmit_buf[port->xmit_tail]
-									<< 8);
-					port->xmit_tail = (port->xmit_tail + 1)
-						& (SERIAL_XMIT_SIZE - 1);
-					port->xmit_cnt--;
-					txcount--;
-					cnt--;
-					outw(wrd, base);
-				} else {
-					outw(wrd, base);
-					break;
-				}
-			}
-			if (cnt <= 0)
-				break;
-			word_count = cnt >> 1;
-			outsw(base, port->port.xmit_buf+port->xmit_tail, word_count);
-			port->xmit_tail = (port->xmit_tail
-				+ (word_count << 1)) & (SERIAL_XMIT_SIZE - 1);
-			txcount -= (word_count << 1);
-			port->xmit_cnt -= (word_count << 1);
-			if (cnt & 0x0001) {
-				residue = YES;
-				wrd = port->port.xmit_buf[port->xmit_tail];
-				port->xmit_tail = (port->xmit_tail + 1)
-					& (SERIAL_XMIT_SIZE - 1);
-				port->xmit_cnt--;
-				txcount--;
-			}
-		}
-
-		InterruptTheCard(base);
-		if (port->xmit_cnt <= 0)
-			port->status &= ~ISI_TXOK;
-		if (port->xmit_cnt <= WAKEUP_CHARS)
-			tty_wakeup(tty);
-	}
-
-put_unlock:
-	tty_kref_put(tty);
-unlock:
-	spin_unlock_irqrestore(&isi_card[card].card_lock, flags);
-	/*	schedule another tx for hopefully in about 10ms	*/
-sched_again:
-	mod_timer(&tx, jiffies + msecs_to_jiffies(10));
-}
-
-/*
- *	Main interrupt handler routine
- */
-
-static irqreturn_t isicom_interrupt(int irq, void *dev_id)
-{
-	struct isi_board *card = dev_id;
-	struct isi_port *port;
-	struct tty_struct *tty;
-	unsigned long base;
-	u16 header, word_count, count, channel;
-	short byte_count;
-	unsigned char *rp;
-
-	if (!card || !(card->status & FIRMWARE_LOADED))
-		return IRQ_NONE;
-
-	base = card->base;
-
-	/* did the card interrupt us? */
-	if (!(inw(base + 0x0e) & 0x02))
-		return IRQ_NONE;
-
-	spin_lock(&card->card_lock);
-
-	/*
-	 * disable any interrupts from the PCI card and lower the
-	 * interrupt line
-	 */
-	outw(0x8000, base+0x04);
-	ClearInterrupt(base);
-
-	inw(base);		/* get the dummy word out */
-	header = inw(base);
-	channel = (header & 0x7800) >> card->shift_count;
-	byte_count = header & 0xff;
-
-	if (channel + 1 > card->port_count) {
-		pr_warn("%s(0x%lx): %d(channel) > port_count\n",
-			__func__, base, channel + 1);
-		outw(0x0000, base+0x04); /* enable interrupts */
-		spin_unlock(&card->card_lock);
-		return IRQ_HANDLED;
-	}
-	port = card->ports + channel;
-	if (!tty_port_initialized(&port->port)) {
-		outw(0x0000, base+0x04); /* enable interrupts */
-		spin_unlock(&card->card_lock);
-		return IRQ_HANDLED;
-	}
-
-	tty = tty_port_tty_get(&port->port);
-	if (tty == NULL) {
-		while (byte_count > 1) {
-			inw(base);
-			byte_count -= 2;
-		}
-		if (byte_count & 0x01)
-			inw(base);
-		outw(0x0000, base+0x04); /* enable interrupts */
-		spin_unlock(&card->card_lock);
-		return IRQ_HANDLED;
-	}
-
-	if (header & 0x8000) {		/* Status Packet */
-		header = inw(base);
-		switch (header & 0xff) {
-		case 0:	/* Change in EIA signals */
-			if (tty_port_check_carrier(&port->port)) {
-				if (port->status & ISI_DCD) {
-					if (!(header & ISI_DCD)) {
-					/* Carrier has been lost  */
-						pr_debug("%s: DCD->low.\n",
-							 __func__);
-						port->status &= ~ISI_DCD;
-						tty_hangup(tty);
-					}
-				} else if (header & ISI_DCD) {
-				/* Carrier has been detected */
-					pr_debug("%s: DCD->high.\n",
-						__func__);
-					port->status |= ISI_DCD;
-					wake_up_interruptible(&port->port.open_wait);
-				}
-			} else {
-				if (header & ISI_DCD)
-					port->status |= ISI_DCD;
-				else
-					port->status &= ~ISI_DCD;
-			}
-
-			if (tty_port_cts_enabled(&port->port)) {
-				if (tty->hw_stopped) {
-					if (header & ISI_CTS) {
-						tty->hw_stopped = 0;
-						/* start tx ing */
-						port->status |= (ISI_TXOK
-							| ISI_CTS);
-						tty_wakeup(tty);
-					}
-				} else if (!(header & ISI_CTS)) {
-					tty->hw_stopped = 1;
-					/* stop tx ing */
-					port->status &= ~(ISI_TXOK | ISI_CTS);
-				}
-			} else {
-				if (header & ISI_CTS)
-					port->status |= ISI_CTS;
-				else
-					port->status &= ~ISI_CTS;
-			}
-
-			if (header & ISI_DSR)
-				port->status |= ISI_DSR;
-			else
-				port->status &= ~ISI_DSR;
-
-			if (header & ISI_RI)
-				port->status |= ISI_RI;
-			else
-				port->status &= ~ISI_RI;
-
-			break;
-
-		case 1:	/* Received Break !!! */
-			tty_insert_flip_char(&port->port, 0, TTY_BREAK);
-			if (port->port.flags & ASYNC_SAK)
-				do_SAK(tty);
-			tty_flip_buffer_push(&port->port);
-			break;
-
-		case 2:	/* Statistics		 */
-			pr_debug("%s: stats!!!\n", __func__);
-			break;
-
-		default:
-			pr_debug("%s: Unknown code in status packet.\n",
-				 __func__);
-			break;
-		}
-	} else {				/* Data   Packet */
-		count = tty_prepare_flip_string(&port->port, &rp,
-				byte_count & ~1);
-		pr_debug("%s: Can rx %d of %d bytes.\n",
-			 __func__, count, byte_count);
-		word_count = count >> 1;
-		insw(base, rp, word_count);
-		byte_count -= (word_count << 1);
-		if (count & 0x0001) {
-			tty_insert_flip_char(&port->port, inw(base) & 0xff,
-				TTY_NORMAL);
-			byte_count -= 2;
-		}
-		if (byte_count > 0) {
-			pr_debug("%s(0x%lx:%d): Flip buffer overflow! dropping bytes...\n",
-				 __func__, base, channel + 1);
-		/* drain out unread xtra data */
-		while (byte_count > 0) {
-				inw(base);
-				byte_count -= 2;
-			}
-		}
-		tty_flip_buffer_push(&port->port);
-	}
-	outw(0x0000, base+0x04); /* enable interrupts */
-	spin_unlock(&card->card_lock);
-	tty_kref_put(tty);
-
-	return IRQ_HANDLED;
-}
-
-static void isicom_config_port(struct tty_struct *tty)
-{
-	struct isi_port *port = tty->driver_data;
-	struct isi_board *card = port->card;
-	unsigned long baud;
-	unsigned long base = card->base;
-	u16 channel_setup, channel = port->channel,
-		shift_count = card->shift_count;
-	unsigned char flow_ctrl;
-
-	/* FIXME: Switch to new tty baud API */
-	baud = C_BAUD(tty);
-	if (baud & CBAUDEX) {
-		baud &= ~CBAUDEX;
-
-		/*  if CBAUDEX bit is on and the baud is set to either 50 or 75
-		 *  then the card is programmed for 57.6Kbps or 115Kbps
-		 *  respectively.
-		 */
-
-		/* 1,2,3,4 => 57.6, 115.2, 230, 460 kbps resp. */
-		if (baud < 1 || baud > 4)
-			tty->termios.c_cflag &= ~CBAUDEX;
-		else
-			baud += 15;
-	}
-	if (baud == 15) {
-
-		/*  the ASYNC_SPD_HI and ASYNC_SPD_VHI options are set
-		 *  by the set_serial_info ioctl ... this is done by
-		 *  the 'setserial' utility.
-		 */
-
-		if ((port->port.flags & ASYNC_SPD_MASK) == ASYNC_SPD_HI)
-			baud++; /*  57.6 Kbps */
-		if ((port->port.flags & ASYNC_SPD_MASK) == ASYNC_SPD_VHI)
-			baud += 2; /*  115  Kbps */
-		if ((port->port.flags & ASYNC_SPD_MASK) == ASYNC_SPD_SHI)
-			baud += 3; /* 230 kbps*/
-		if ((port->port.flags & ASYNC_SPD_MASK) == ASYNC_SPD_WARP)
-			baud += 4; /* 460 kbps*/
-	}
-	if (linuxb_to_isib[baud] == -1) {
-		/* hang up */
-		drop_dtr(port);
-		return;
-	} else
-		raise_dtr(port);
-
-	if (WaitTillCardIsFree(base) == 0) {
-		outw(0x8000 | (channel << shift_count) | 0x03, base);
-		outw(linuxb_to_isib[baud] << 8 | 0x03, base);
-		channel_setup = 0;
-		switch (C_CSIZE(tty)) {
-		case CS5:
-			channel_setup |= ISICOM_CS5;
-			break;
-		case CS6:
-			channel_setup |= ISICOM_CS6;
-			break;
-		case CS7:
-			channel_setup |= ISICOM_CS7;
-			break;
-		case CS8:
-			channel_setup |= ISICOM_CS8;
-			break;
-		}
-
-		if (C_CSTOPB(tty))
-			channel_setup |= ISICOM_2SB;
-		if (C_PARENB(tty)) {
-			channel_setup |= ISICOM_EVPAR;
-			if (C_PARODD(tty))
-				channel_setup |= ISICOM_ODPAR;
-		}
-		outw(channel_setup, base);
-		InterruptTheCard(base);
-	}
-	tty_port_set_check_carrier(&port->port, !C_CLOCAL(tty));
-
-	/* flow control settings ...*/
-	flow_ctrl = 0;
-	tty_port_set_cts_flow(&port->port, C_CRTSCTS(tty));
-	if (C_CRTSCTS(tty))
-		flow_ctrl |= ISICOM_CTSRTS;
-	if (I_IXON(tty))
-		flow_ctrl |= ISICOM_RESPOND_XONXOFF;
-	if (I_IXOFF(tty))
-		flow_ctrl |= ISICOM_INITIATE_XONXOFF;
-
-	if (WaitTillCardIsFree(base) == 0) {
-		outw(0x8000 | (channel << shift_count) | 0x04, base);
-		outw(flow_ctrl << 8 | 0x05, base);
-		outw((STOP_CHAR(tty)) << 8 | (START_CHAR(tty)), base);
-		InterruptTheCard(base);
-	}
-
-	/*	rx enabled -> enable port for rx on the card	*/
-	if (C_CREAD(tty)) {
-		card->port_status |= (1 << channel);
-		outw(card->port_status, base + 0x02);
-	}
-}
-
-/* open et all */
-
-static inline void isicom_setup_board(struct isi_board *bp)
-{
-	int channel;
-	struct isi_port *port;
-
-	bp->count++;
-	if (!(bp->status & BOARD_INIT)) {
-		port = bp->ports;
-		for (channel = 0; channel < bp->port_count; channel++, port++)
-			drop_dtr_rts(port);
-	}
-	bp->status |= BOARD_ACTIVE | BOARD_INIT;
-}
-
-/* Activate and thus setup board are protected from races against shutdown
-   by the tty_port mutex */
-
-static int isicom_activate(struct tty_port *tport, struct tty_struct *tty)
-{
-	struct isi_port *port = container_of(tport, struct isi_port, port);
-	struct isi_board *card = port->card;
-	unsigned long flags;
-
-	if (tty_port_alloc_xmit_buf(tport) < 0)
-		return -ENOMEM;
-
-	spin_lock_irqsave(&card->card_lock, flags);
-	isicom_setup_board(card);
-
-	port->xmit_cnt = port->xmit_head = port->xmit_tail = 0;
-
-	/*	discard any residual data	*/
-	if (WaitTillCardIsFree(card->base) == 0) {
-		outw(0x8000 | (port->channel << card->shift_count) | 0x02,
-				card->base);
-		outw(((ISICOM_KILLTX | ISICOM_KILLRX) << 8) | 0x06, card->base);
-		InterruptTheCard(card->base);
-	}
-	isicom_config_port(tty);
-	spin_unlock_irqrestore(&card->card_lock, flags);
-
-	return 0;
-}
-
-static int isicom_carrier_raised(struct tty_port *port)
-{
-	struct isi_port *ip = container_of(port, struct isi_port, port);
-	return (ip->status & ISI_DCD)?1 : 0;
-}
-
-static struct tty_port *isicom_find_port(struct tty_struct *tty)
-{
-	struct isi_port *port;
-	struct isi_board *card;
-	unsigned int board;
-	int line = tty->index;
-
-	board = BOARD(line);
-	card = &isi_card[board];
-
-	if (!(card->status & FIRMWARE_LOADED))
-		return NULL;
-
-	/*  open on a port greater than the port count for the card !!! */
-	if (line > ((board * 16) + card->port_count - 1))
-		return NULL;
-
-	port = &isi_ports[line];
-	if (isicom_paranoia_check(port, tty->name, "isicom_open"))
-		return NULL;
-
-	return &port->port;
-}
-
-static int isicom_open(struct tty_struct *tty, struct file *filp)
-{
-	struct isi_port *port;
-	struct tty_port *tport;
-
-	tport = isicom_find_port(tty);
-	if (tport == NULL)
-		return -ENODEV;
-	port = container_of(tport, struct isi_port, port);
-
-	tty->driver_data = port;
-	return tty_port_open(tport, tty, filp);
-}
-
-/* close et all */
-
-/* card->lock HAS to be held */
-static void isicom_shutdown_port(struct isi_port *port)
-{
-	struct isi_board *card = port->card;
-
-	if (--card->count < 0) {
-		pr_debug("%s: bad board(0x%lx) count %d.\n",
-			 __func__, card->base, card->count);
-		card->count = 0;
-	}
-	/* last port was closed, shutdown that board too */
-	if (!card->count)
-		card->status &= BOARD_ACTIVE;
-}
-
-static void isicom_flush_buffer(struct tty_struct *tty)
-{
-	struct isi_port *port = tty->driver_data;
-	struct isi_board *card = port->card;
-	unsigned long flags;
-
-	if (isicom_paranoia_check(port, tty->name, "isicom_flush_buffer"))
-		return;
-
-	spin_lock_irqsave(&card->card_lock, flags);
-	port->xmit_cnt = port->xmit_head = port->xmit_tail = 0;
-	spin_unlock_irqrestore(&card->card_lock, flags);
-
-	tty_wakeup(tty);
-}
-
-static void isicom_shutdown(struct tty_port *port)
-{
-	struct isi_port *ip = container_of(port, struct isi_port, port);
-	struct isi_board *card = ip->card;
-	unsigned long flags;
-
-	/* indicate to the card that no more data can be received
-	   on this port */
-	spin_lock_irqsave(&card->card_lock, flags);
-	card->port_status &= ~(1 << ip->channel);
-	outw(card->port_status, card->base + 0x02);
-	isicom_shutdown_port(ip);
-	spin_unlock_irqrestore(&card->card_lock, flags);
-	tty_port_free_xmit_buf(port);
-}
-
-static void isicom_close(struct tty_struct *tty, struct file *filp)
-{
-	struct isi_port *ip = tty->driver_data;
-	struct tty_port *port;
-
-	if (ip == NULL)
-		return;
-
-	port = &ip->port;
-	if (isicom_paranoia_check(ip, tty->name, "isicom_close"))
-		return;
-	tty_port_close(port, tty, filp);
-}
-
-/* write et all */
-static int isicom_write(struct tty_struct *tty,	const unsigned char *buf,
-	int count)
-{
-	struct isi_port *port = tty->driver_data;
-	struct isi_board *card = port->card;
-	unsigned long flags;
-	int cnt, total = 0;
-
-	if (isicom_paranoia_check(port, tty->name, "isicom_write"))
-		return 0;
-
-	spin_lock_irqsave(&card->card_lock, flags);
-
-	while (1) {
-		cnt = min_t(int, count, min(SERIAL_XMIT_SIZE - port->xmit_cnt
-				- 1, SERIAL_XMIT_SIZE - port->xmit_head));
-		if (cnt <= 0)
-			break;
-
-		memcpy(port->port.xmit_buf + port->xmit_head, buf, cnt);
-		port->xmit_head = (port->xmit_head + cnt) & (SERIAL_XMIT_SIZE
-			- 1);
-		port->xmit_cnt += cnt;
-		buf += cnt;
-		count -= cnt;
-		total += cnt;
-	}
-	if (port->xmit_cnt && !tty->stopped && !tty->hw_stopped)
-		port->status |= ISI_TXOK;
-	spin_unlock_irqrestore(&card->card_lock, flags);
-	return total;
-}
-
-/* put_char et all */
-static int isicom_put_char(struct tty_struct *tty, unsigned char ch)
-{
-	struct isi_port *port = tty->driver_data;
-	struct isi_board *card = port->card;
-	unsigned long flags;
-
-	if (isicom_paranoia_check(port, tty->name, "isicom_put_char"))
-		return 0;
-
-	spin_lock_irqsave(&card->card_lock, flags);
-	if (port->xmit_cnt >= SERIAL_XMIT_SIZE - 1) {
-		spin_unlock_irqrestore(&card->card_lock, flags);
-		return 0;
-	}
-
-	port->port.xmit_buf[port->xmit_head++] = ch;
-	port->xmit_head &= (SERIAL_XMIT_SIZE - 1);
-	port->xmit_cnt++;
-	spin_unlock_irqrestore(&card->card_lock, flags);
-	return 1;
-}
-
-/* flush_chars et all */
-static void isicom_flush_chars(struct tty_struct *tty)
-{
-	struct isi_port *port = tty->driver_data;
-
-	if (isicom_paranoia_check(port, tty->name, "isicom_flush_chars"))
-		return;
-
-	if (port->xmit_cnt <= 0 || tty->stopped || tty->hw_stopped ||
-			!port->port.xmit_buf)
-		return;
-
-	/* this tells the transmitter to consider this port for
-	   data output to the card ... that's the best we can do. */
-	port->status |= ISI_TXOK;
-}
-
-/* write_room et all */
-static int isicom_write_room(struct tty_struct *tty)
-{
-	struct isi_port *port = tty->driver_data;
-	int free;
-
-	if (isicom_paranoia_check(port, tty->name, "isicom_write_room"))
-		return 0;
-
-	free = SERIAL_XMIT_SIZE - port->xmit_cnt - 1;
-	if (free < 0)
-		free = 0;
-	return free;
-}
-
-/* chars_in_buffer et all */
-static int isicom_chars_in_buffer(struct tty_struct *tty)
-{
-	struct isi_port *port = tty->driver_data;
-	if (isicom_paranoia_check(port, tty->name, "isicom_chars_in_buffer"))
-		return 0;
-	return port->xmit_cnt;
-}
-
-/* ioctl et all */
-static int isicom_send_break(struct tty_struct *tty, int length)
-{
-	struct isi_port *port = tty->driver_data;
-	struct isi_board *card = port->card;
-	unsigned long base = card->base;
-
-	if (length == -1)
-		return -EOPNOTSUPP;
-
-	if (!lock_card(card))
-		return -EINVAL;
-
-	outw(0x8000 | ((port->channel) << (card->shift_count)) | 0x3, base);
-	outw((length & 0xff) << 8 | 0x00, base);
-	outw((length & 0xff00u), base);
-	InterruptTheCard(base);
-
-	unlock_card(card);
-	return 0;
-}
-
-static int isicom_tiocmget(struct tty_struct *tty)
-{
-	struct isi_port *port = tty->driver_data;
-	/* just send the port status */
-	u16 status = port->status;
-
-	if (isicom_paranoia_check(port, tty->name, "isicom_ioctl"))
-		return -ENODEV;
-
-	return  ((status & ISI_RTS) ? TIOCM_RTS : 0) |
-		((status & ISI_DTR) ? TIOCM_DTR : 0) |
-		((status & ISI_DCD) ? TIOCM_CAR : 0) |
-		((status & ISI_DSR) ? TIOCM_DSR : 0) |
-		((status & ISI_CTS) ? TIOCM_CTS : 0) |
-		((status & ISI_RI ) ? TIOCM_RI  : 0);
-}
-
-static int isicom_tiocmset(struct tty_struct *tty,
-					unsigned int set, unsigned int clear)
-{
-	struct isi_port *port = tty->driver_data;
-	unsigned long flags;
-
-	if (isicom_paranoia_check(port, tty->name, "isicom_ioctl"))
-		return -ENODEV;
-
-	spin_lock_irqsave(&port->card->card_lock, flags);
-	if (set & TIOCM_RTS)
-		raise_rts(port);
-	if (set & TIOCM_DTR)
-		raise_dtr(port);
-
-	if (clear & TIOCM_RTS)
-		drop_rts(port);
-	if (clear & TIOCM_DTR)
-		drop_dtr(port);
-	spin_unlock_irqrestore(&port->card->card_lock, flags);
-
-	return 0;
-}
-
-static int isicom_set_serial_info(struct tty_struct *tty,
-					struct serial_struct *ss)
-{
-	struct isi_port *port = tty->driver_data;
-	int reconfig_port;
-
-	if (isicom_paranoia_check(port, tty->name, "isicom_ioctl"))
-		return -ENODEV;
-
-	mutex_lock(&port->port.mutex);
-	reconfig_port = ((port->port.flags & ASYNC_SPD_MASK) !=
-		(ss->flags & ASYNC_SPD_MASK));
-
-	if (!capable(CAP_SYS_ADMIN)) {
-		if ((ss->close_delay != port->port.close_delay) ||
-				(ss->closing_wait != port->port.closing_wait) ||
-				((ss->flags & ~ASYNC_USR_MASK) !=
-				(port->port.flags & ~ASYNC_USR_MASK))) {
-			mutex_unlock(&port->port.mutex);
-			return -EPERM;
-		}
-		port->port.flags = ((port->port.flags & ~ASYNC_USR_MASK) |
-				(ss->flags & ASYNC_USR_MASK));
-	} else {
-		port->port.close_delay = ss->close_delay;
-		port->port.closing_wait = ss->closing_wait;
-		port->port.flags = ((port->port.flags & ~ASYNC_FLAGS) |
-				(ss->flags & ASYNC_FLAGS));
-	}
-	if (reconfig_port) {
-		unsigned long flags;
-		spin_lock_irqsave(&port->card->card_lock, flags);
-		isicom_config_port(tty);
-		spin_unlock_irqrestore(&port->card->card_lock, flags);
-	}
-	mutex_unlock(&port->port.mutex);
-	return 0;
-}
-
-static int isicom_get_serial_info(struct tty_struct *tty,
-	struct serial_struct *ss)
-{
-	struct isi_port *port = tty->driver_data;
-
-	if (isicom_paranoia_check(port, tty->name, "isicom_ioctl"))
-		return -ENODEV;
-
-	mutex_lock(&port->port.mutex);
-/*	ss->type = ? */
-	ss->line = port - isi_ports;
-	ss->port = port->card->base;
-	ss->irq = port->card->irq;
-	ss->flags = port->port.flags;
-/*	ss->baud_base = ? */
-	ss->close_delay = port->port.close_delay;
-	ss->closing_wait = port->port.closing_wait;
-	mutex_unlock(&port->port.mutex);
-	return 0;
-}
-
-/* set_termios et all */
-static void isicom_set_termios(struct tty_struct *tty,
-	struct ktermios *old_termios)
-{
-	struct isi_port *port = tty->driver_data;
-	unsigned long flags;
-
-	if (isicom_paranoia_check(port, tty->name, "isicom_set_termios"))
-		return;
-
-	if (tty->termios.c_cflag == old_termios->c_cflag &&
-			tty->termios.c_iflag == old_termios->c_iflag)
-		return;
-
-	spin_lock_irqsave(&port->card->card_lock, flags);
-	isicom_config_port(tty);
-	spin_unlock_irqrestore(&port->card->card_lock, flags);
-
-	if ((old_termios->c_cflag & CRTSCTS) && !C_CRTSCTS(tty)) {
-		tty->hw_stopped = 0;
-		isicom_start(tty);
-	}
-}
-
-/* throttle et all */
-static void isicom_throttle(struct tty_struct *tty)
-{
-	struct isi_port *port = tty->driver_data;
-	struct isi_board *card = port->card;
-
-	if (isicom_paranoia_check(port, tty->name, "isicom_throttle"))
-		return;
-
-	/* tell the card that this port cannot handle any more data for now */
-	card->port_status &= ~(1 << port->channel);
-	outw(card->port_status, card->base + 0x02);
-}
-
-/* unthrottle et all */
-static void isicom_unthrottle(struct tty_struct *tty)
-{
-	struct isi_port *port = tty->driver_data;
-	struct isi_board *card = port->card;
-
-	if (isicom_paranoia_check(port, tty->name, "isicom_unthrottle"))
-		return;
-
-	/* tell the card that this port is ready to accept more data */
-	card->port_status |= (1 << port->channel);
-	outw(card->port_status, card->base + 0x02);
-}
-
-/* stop et all */
-static void isicom_stop(struct tty_struct *tty)
-{
-	struct isi_port *port = tty->driver_data;
-
-	if (isicom_paranoia_check(port, tty->name, "isicom_stop"))
-		return;
-
-	/* this tells the transmitter not to consider this port for
-	   data output to the card. */
-	port->status &= ~ISI_TXOK;
-}
-
-/* start et all */
-static void isicom_start(struct tty_struct *tty)
-{
-	struct isi_port *port = tty->driver_data;
-
-	if (isicom_paranoia_check(port, tty->name, "isicom_start"))
-		return;
-
-	/* this tells the transmitter to consider this port for
-	   data output to the card. */
-	port->status |= ISI_TXOK;
-}
-
-static void isicom_hangup(struct tty_struct *tty)
-{
-	struct isi_port *port = tty->driver_data;
-
-	if (isicom_paranoia_check(port, tty->name, "isicom_hangup"))
-		return;
-	tty_port_hangup(&port->port);
-}
-
-
-/*
- * Driver init and deinit functions
- */
-
-static const struct tty_operations isicom_ops = {
-	.open			= isicom_open,
-	.close			= isicom_close,
-	.write			= isicom_write,
-	.put_char		= isicom_put_char,
-	.flush_chars		= isicom_flush_chars,
-	.write_room		= isicom_write_room,
-	.chars_in_buffer	= isicom_chars_in_buffer,
-	.set_termios		= isicom_set_termios,
-	.throttle		= isicom_throttle,
-	.unthrottle		= isicom_unthrottle,
-	.stop			= isicom_stop,
-	.start			= isicom_start,
-	.hangup			= isicom_hangup,
-	.flush_buffer		= isicom_flush_buffer,
-	.tiocmget		= isicom_tiocmget,
-	.tiocmset		= isicom_tiocmset,
-	.break_ctl		= isicom_send_break,
-	.get_serial		= isicom_get_serial_info,
-	.set_serial		= isicom_set_serial_info,
-};
-
-static const struct tty_port_operations isicom_port_ops = {
-	.carrier_raised		= isicom_carrier_raised,
-	.dtr_rts		= isicom_dtr_rts,
-	.activate		= isicom_activate,
-	.shutdown		= isicom_shutdown,
-};
-
-static int reset_card(struct pci_dev *pdev,
-	const unsigned int card, unsigned int *signature)
-{
-	struct isi_board *board = pci_get_drvdata(pdev);
-	unsigned long base = board->base;
-	unsigned int sig, portcount = 0;
-	int retval = 0;
-
-	dev_dbg(&pdev->dev, "ISILoad:Resetting Card%d at 0x%lx\n", card + 1,
-		base);
-
-	inw(base + 0x8);
-
-	msleep(10);
-
-	outw(0, base + 0x8); /* Reset */
-
-	msleep(1000);
-
-	sig = inw(base + 0x4) & 0xff;
-
-	if (sig != 0xa5 && sig != 0xbb && sig != 0xcc && sig != 0xdd &&
-			sig != 0xee) {
-		dev_warn(&pdev->dev, "ISILoad:Card%u reset failure (Possible "
-			"bad I/O Port Address 0x%lx).\n", card + 1, base);
-		dev_dbg(&pdev->dev, "Sig=0x%x\n", sig);
-		retval = -EIO;
-		goto end;
-	}
-
-	msleep(10);
-
-	portcount = inw(base + 0x2);
-	if (!(inw(base + 0xe) & 0x1) || (portcount != 0 && portcount != 4 &&
-				portcount != 8 && portcount != 16)) {
-		dev_err(&pdev->dev, "ISILoad:PCI Card%d reset failure.\n",
-			card + 1);
-		retval = -EIO;
-		goto end;
-	}
-
-	switch (sig) {
-	case 0xa5:
-	case 0xbb:
-	case 0xdd:
-		board->port_count = (portcount == 4) ? 4 : 8;
-		board->shift_count = 12;
-		break;
-	case 0xcc:
-	case 0xee:
-		board->port_count = 16;
-		board->shift_count = 11;
-		break;
-	}
-	dev_info(&pdev->dev, "-Done\n");
-	*signature = sig;
-
-end:
-	return retval;
-}
-
-static int load_firmware(struct pci_dev *pdev,
-	const unsigned int index, const unsigned int signature)
-{
-	struct isi_board *board = pci_get_drvdata(pdev);
-	const struct firmware *fw;
-	unsigned long base = board->base;
-	unsigned int a;
-	u16 word_count, status;
-	int retval = -EIO;
-	char *name;
-	u8 *data;
-
-	struct stframe {
-		u16	addr;
-		u16	count;
-		u8	data[0];
-	} *frame;
-
-	switch (signature) {
-	case 0xa5:
-		name = "isi608.bin";
-		break;
-	case 0xbb:
-		name = "isi608em.bin";
-		break;
-	case 0xcc:
-		name = "isi616em.bin";
-		break;
-	case 0xdd:
-		name = "isi4608.bin";
-		break;
-	case 0xee:
-		name = "isi4616.bin";
-		break;
-	default:
-		dev_err(&pdev->dev, "Unknown signature.\n");
-		goto end;
-	}
-
-	retval = request_firmware(&fw, name, &pdev->dev);
-	if (retval)
-		goto end;
-
-	retval = -EIO;
-
-	for (frame = (struct stframe *)fw->data;
-			frame < (struct stframe *)(fw->data + fw->size);
-			frame = (struct stframe *)((u8 *)(frame + 1) +
-				frame->count)) {
-		if (WaitTillCardIsFree(base))
-			goto errrelfw;
-
-		outw(0xf0, base);	/* start upload sequence */
-		outw(0x00, base);
-		outw(frame->addr, base); /* lsb of address */
-
-		word_count = frame->count / 2 + frame->count % 2;
-		outw(word_count, base);
-		InterruptTheCard(base);
-
-		udelay(100); /* 0x2f */
-
-		if (WaitTillCardIsFree(base))
-			goto errrelfw;
-
-		status = inw(base + 0x4);
-		if (status != 0) {
-			dev_warn(&pdev->dev, "Card%d rejected load header:\n"
-				 "Address:0x%x\n"
-				 "Count:0x%x\n"
-				 "Status:0x%x\n",
-				 index + 1, frame->addr, frame->count, status);
-			goto errrelfw;
-		}
-		outsw(base, frame->data, word_count);
-
-		InterruptTheCard(base);
-
-		udelay(50); /* 0x0f */
-
-		if (WaitTillCardIsFree(base))
-			goto errrelfw;
-
-		status = inw(base + 0x4);
-		if (status != 0) {
-			dev_err(&pdev->dev, "Card%d got out of sync.Card "
-				"Status:0x%x\n", index + 1, status);
-			goto errrelfw;
-		}
-	}
-
-/* XXX: should we test it by reading it back and comparing with original like
- * in load firmware package? */
-	for (frame = (struct stframe *)fw->data;
-			frame < (struct stframe *)(fw->data + fw->size);
-			frame = (struct stframe *)((u8 *)(frame + 1) +
-				frame->count)) {
-		if (WaitTillCardIsFree(base))
-			goto errrelfw;
-
-		outw(0xf1, base); /* start download sequence */
-		outw(0x00, base);
-		outw(frame->addr, base); /* lsb of address */
-
-		word_count = (frame->count >> 1) + frame->count % 2;
-		outw(word_count + 1, base);
-		InterruptTheCard(base);
-
-		udelay(50); /* 0xf */
-
-		if (WaitTillCardIsFree(base))
-			goto errrelfw;
-
-		status = inw(base + 0x4);
-		if (status != 0) {
-			dev_warn(&pdev->dev, "Card%d rejected verify header:\n"
-				 "Address:0x%x\n"
-				 "Count:0x%x\n"
-				 "Status: 0x%x\n",
-				 index + 1, frame->addr, frame->count, status);
-			goto errrelfw;
-		}
-
-		data = kmalloc_array(word_count, 2, GFP_KERNEL);
-		if (data == NULL) {
-			dev_err(&pdev->dev, "Card%d, firmware upload "
-				"failed, not enough memory\n", index + 1);
-			goto errrelfw;
-		}
-		inw(base);
-		insw(base, data, word_count);
-		InterruptTheCard(base);
-
-		for (a = 0; a < frame->count; a++)
-			if (data[a] != frame->data[a]) {
-				kfree(data);
-				dev_err(&pdev->dev, "Card%d, firmware upload "
-					"failed\n", index + 1);
-				goto errrelfw;
-			}
-		kfree(data);
-
-		udelay(50); /* 0xf */
-
-		if (WaitTillCardIsFree(base))
-			goto errrelfw;
-
-		status = inw(base + 0x4);
-		if (status != 0) {
-			dev_err(&pdev->dev, "Card%d verify got out of sync. "
-				"Card Status:0x%x\n", index + 1, status);
-			goto errrelfw;
-		}
-	}
-
-	/* xfer ctrl */
-	if (WaitTillCardIsFree(base))
-		goto errrelfw;
-
-	outw(0xf2, base);
-	outw(0x800, base);
-	outw(0x0, base);
-	outw(0x0, base);
-	InterruptTheCard(base);
-	outw(0x0, base + 0x4); /* for ISI4608 cards */
-
-	board->status |= FIRMWARE_LOADED;
-	retval = 0;
-
-errrelfw:
-	release_firmware(fw);
-end:
-	return retval;
-}
-
-/*
- *	Insmod can set static symbols so keep these static
- */
-static unsigned int card_count;
-
-static int isicom_probe(struct pci_dev *pdev,
-	const struct pci_device_id *ent)
-{
-	unsigned int signature, index;
-	int retval = -EPERM;
-	struct isi_board *board = NULL;
-
-	if (card_count >= BOARD_COUNT)
-		goto err;
-
-	retval = pci_enable_device(pdev);
-	if (retval) {
-		dev_err(&pdev->dev, "failed to enable\n");
-		goto err;
-	}
-
-	dev_info(&pdev->dev, "ISI PCI Card(Device ID 0x%x)\n", ent->device);
-
-	/* allot the first empty slot in the array */
-	for (index = 0; index < BOARD_COUNT; index++) {
-		if (isi_card[index].base == 0) {
-			board = &isi_card[index];
-			break;
-		}
-	}
-	if (index == BOARD_COUNT) {
-		retval = -ENODEV;
-		goto err_disable;
-	}
-
-	board->index = index;
-	board->base = pci_resource_start(pdev, 3);
-	board->irq = pdev->irq;
-	card_count++;
-
-	pci_set_drvdata(pdev, board);
-
-	retval = pci_request_region(pdev, 3, ISICOM_NAME);
-	if (retval) {
-		dev_err(&pdev->dev, "I/O Region 0x%lx-0x%lx is busy. Card%d "
-			"will be disabled.\n", board->base, board->base + 15,
-			index + 1);
-		retval = -EBUSY;
-		goto errdec;
-	}
-
-	retval = request_irq(board->irq, isicom_interrupt,
-			IRQF_SHARED, ISICOM_NAME, board);
-	if (retval < 0) {
-		dev_err(&pdev->dev, "Could not install handler at Irq %d. "
-			"Card%d will be disabled.\n", board->irq, index + 1);
-		goto errunrr;
-	}
-
-	retval = reset_card(pdev, index, &signature);
-	if (retval < 0)
-		goto errunri;
-
-	retval = load_firmware(pdev, index, signature);
-	if (retval < 0)
-		goto errunri;
-
-	for (index = 0; index < board->port_count; index++) {
-		struct tty_port *tport = &board->ports[index].port;
-		tty_port_init(tport);
-		tport->ops = &isicom_port_ops;
-		tport->close_delay = 50 * HZ/100;
-		tport->closing_wait = 3000 * HZ/100;
-		tty_port_register_device(tport, isicom_normal,
-				board->index * 16 + index, &pdev->dev);
-	}
-
-	return 0;
-
-errunri:
-	free_irq(board->irq, board);
-errunrr:
-	pci_release_region(pdev, 3);
-errdec:
-	board->base = 0;
-	card_count--;
-err_disable:
-	pci_disable_device(pdev);
-err:
-	return retval;
-}
-
-static void isicom_remove(struct pci_dev *pdev)
-{
-	struct isi_board *board = pci_get_drvdata(pdev);
-	unsigned int i;
-
-	for (i = 0; i < board->port_count; i++) {
-		tty_unregister_device(isicom_normal, board->index * 16 + i);
-		tty_port_destroy(&board->ports[i].port);
-	}
-
-	free_irq(board->irq, board);
-	pci_release_region(pdev, 3);
-	board->base = 0;
-	card_count--;
-	pci_disable_device(pdev);
-}
-
-static int __init isicom_init(void)
-{
-	int retval, idx, channel;
-	struct isi_port *port;
-
-	for (idx = 0; idx < BOARD_COUNT; idx++) {
-		port = &isi_ports[idx * 16];
-		isi_card[idx].ports = port;
-		spin_lock_init(&isi_card[idx].card_lock);
-		for (channel = 0; channel < 16; channel++, port++) {
-			port->magic = ISICOM_MAGIC;
-			port->card = &isi_card[idx];
-			port->channel = channel;
-			port->status = 0;
-			/*  . . .  */
-		}
-		isi_card[idx].base = 0;
-		isi_card[idx].irq = 0;
-	}
-
-	/* tty driver structure initialization */
-	isicom_normal = alloc_tty_driver(PORT_COUNT);
-	if (!isicom_normal) {
-		retval = -ENOMEM;
-		goto error;
-	}
-
-	isicom_normal->name 			= "ttyM";
-	isicom_normal->major			= ISICOM_NMAJOR;
-	isicom_normal->minor_start		= 0;
-	isicom_normal->type			= TTY_DRIVER_TYPE_SERIAL;
-	isicom_normal->subtype			= SERIAL_TYPE_NORMAL;
-	isicom_normal->init_termios		= tty_std_termios;
-	isicom_normal->init_termios.c_cflag	= B9600 | CS8 | CREAD | HUPCL |
-		CLOCAL;
-	isicom_normal->flags			= TTY_DRIVER_REAL_RAW |
-		TTY_DRIVER_DYNAMIC_DEV | TTY_DRIVER_HARDWARE_BREAK;
-	tty_set_operations(isicom_normal, &isicom_ops);
-
-	retval = tty_register_driver(isicom_normal);
-	if (retval) {
-		pr_debug("Couldn't register the dialin driver\n");
-		goto err_puttty;
-	}
-
-	retval = pci_register_driver(&isicom_driver);
-	if (retval < 0) {
-		pr_err("Unable to register pci driver.\n");
-		goto err_unrtty;
-	}
-
-	mod_timer(&tx, jiffies + 1);
-
-	return 0;
-err_unrtty:
-	tty_unregister_driver(isicom_normal);
-err_puttty:
-	put_tty_driver(isicom_normal);
-error:
-	return retval;
-}
-
-static void __exit isicom_exit(void)
-{
-	del_timer_sync(&tx);
-
-	pci_unregister_driver(&isicom_driver);
-	tty_unregister_driver(isicom_normal);
-	put_tty_driver(isicom_normal);
-}
-
-module_init(isicom_init);
-module_exit(isicom_exit);
-
-MODULE_AUTHOR("MultiTech");
-MODULE_DESCRIPTION("Driver for the ISI series of cards by MultiTech");
-MODULE_LICENSE("GPL");
-MODULE_FIRMWARE("isi608.bin");
-MODULE_FIRMWARE("isi608em.bin");
-MODULE_FIRMWARE("isi616em.bin");
-MODULE_FIRMWARE("isi4608.bin");
-MODULE_FIRMWARE("isi4616.bin");
diff --git a/include/linux/isicom.h b/include/linux/isicom.h
deleted file mode 100644
index 7de6822d7b1a..000000000000
--- a/include/linux/isicom.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_ISICOM_H
-#define _LINUX_ISICOM_H
-
-#define		YES	1
-#define		NO	0
-
-/*
- *  ISICOM Driver definitions ...
- *
- */
-
-#define		ISICOM_NAME	"ISICom"
-
-/*
- *      PCI definitions
- */
-
-#define		DEVID_COUNT	9
-#define		VENDOR_ID	0x10b5
-
-/*
- *	These are now officially allocated numbers
- */
-
-#define		ISICOM_NMAJOR	112	/* normal  */
-#define		ISICOM_CMAJOR	113	/* callout */
-#define		ISICOM_MAGIC	(('M' << 8) | 'T')
-
-#define		WAKEUP_CHARS	256	/* hard coded for now	*/
-#define		TX_SIZE		254
-
-#define		BOARD_COUNT	4
-#define		PORT_COUNT	(BOARD_COUNT*16)
-
-/*   character sizes  */
-
-#define		ISICOM_CS5		0x0000
-#define		ISICOM_CS6		0x0001
-#define		ISICOM_CS7		0x0002
-#define		ISICOM_CS8		0x0003
-
-/* stop bits */
-
-#define		ISICOM_1SB		0x0000
-#define		ISICOM_2SB		0x0004
-
-/* parity */
-
-#define		ISICOM_NOPAR		0x0000
-#define		ISICOM_ODPAR		0x0008
-#define		ISICOM_EVPAR		0x0018
-
-/* flow control */
-
-#define		ISICOM_CTSRTS		0x03
-#define		ISICOM_INITIATE_XONXOFF	0x04
-#define		ISICOM_RESPOND_XONXOFF	0x08
-
-#define	BOARD(line)  (((line) >> 4) & 0x3)
-
-	/*	isi kill queue bitmap	*/
-
-#define		ISICOM_KILLTX		0x01
-#define		ISICOM_KILLRX		0x02
-
-	/* isi_board status bitmap */
-
-#define		FIRMWARE_LOADED		0x0001
-#define		BOARD_ACTIVE		0x0002
-#define		BOARD_INIT		0x0004
-
- 	/* isi_port status bitmap  */
-
-#define		ISI_CTS			0x1000
-#define		ISI_DSR			0x2000
-#define		ISI_RI			0x4000
-#define		ISI_DCD			0x8000
-#define		ISI_DTR			0x0100
-#define		ISI_RTS			0x0200
-
-
-#define		ISI_TXOK		0x0001
-
-#endif	/*	ISICOM_H	*/
-- 
cgit v1.2.3


From 3b00b6af7a5bd7fd7e5189ccaad0e0cfb7dc7785 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Tue, 2 Mar 2021 07:21:37 +0100
Subject: tty: rocket, remove the driver

While the driver is still marked as maintained in MAINTAINERS, Comtrol
does not really care about this ancient driver. They are still
manufacturing serial devices, but those are controlled only by
out-of-tree drivers.

Comtrol didn't answer my pings, so this driver is apparently
unmaintained.  Aside from that, the driver was untouched for years, only
whole-tree changes happened during the past years. The driver needs much
more care, so drop it for now. If someone steps up to reintroduce it,
they need to clean it up first.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20210302062214.29627-7-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/driver-api/serial/rocket.rst         |  185 --
 Documentation/process/magic-number.rst             |    1 -
 .../translations/it_IT/process/magic-number.rst    |    1 -
 .../translations/zh_CN/process/magic-number.rst    |    1 -
 MAINTAINERS                                        |    6 -
 drivers/tty/Kconfig                                |   15 -
 drivers/tty/Makefile                               |    1 -
 drivers/tty/rocket.c                               | 3127 --------------------
 drivers/tty/rocket.h                               |  111 -
 drivers/tty/rocket_int.h                           | 1214 --------
 include/linux/pci_ids.h                            |   21 -
 11 files changed, 4683 deletions(-)
 delete mode 100644 Documentation/driver-api/serial/rocket.rst
 delete mode 100644 drivers/tty/rocket.c
 delete mode 100644 drivers/tty/rocket.h
 delete mode 100644 drivers/tty/rocket_int.h

(limited to 'include/linux')

diff --git a/Documentation/driver-api/serial/rocket.rst b/Documentation/driver-api/serial/rocket.rst
deleted file mode 100644
index 23761eae4282..000000000000
--- a/Documentation/driver-api/serial/rocket.rst
+++ /dev/null
@@ -1,185 +0,0 @@
-================================================
-Comtrol(tm) RocketPort(R)/RocketModem(TM) Series
-================================================
-
-Device Driver for the Linux Operating System
-============================================
-
-Product overview
-----------------
-
-This driver provides a loadable kernel driver for the Comtrol RocketPort
-and RocketModem PCI boards. These boards provide, 2, 4, 8, 16, or 32
-high-speed serial ports or modems.  This driver supports up to a combination
-of four RocketPort or RocketModems boards in one machine simultaneously.
-This file assumes that you are using the RocketPort driver which is
-integrated into the kernel sources.
-
-The driver can also be installed as an external module using the usual
-"make;make install" routine.  This external module driver, obtainable
-from the Comtrol website listed below, is useful for updating the driver
-or installing it into kernels which do not have the driver configured
-into them.  Installations instructions for the external module
-are in the included README and HW_INSTALL files.
-
-RocketPort ISA and RocketModem II PCI boards currently are only supported by
-this driver in module form.
-
-The RocketPort ISA board requires I/O ports to be configured by the DIP
-switches on the board.  See the section "ISA Rocketport Boards" below for
-information on how to set the DIP switches.
-
-You pass the I/O port to the driver using the following module parameters:
-
-board1:
-	I/O port for the first ISA board
-board2:
-	I/O port for the second ISA board
-board3:
-	I/O port for the third ISA board
-board4:
-	I/O port for the fourth ISA board
-
-There is a set of utilities and scripts provided with the external driver
-(downloadable from http://www.comtrol.com) that ease the configuration and
-setup of the ISA cards.
-
-The RocketModem II PCI boards require firmware to be loaded into the card
-before it will function.  The driver has only been tested as a module for this
-board.
-
-Installation Procedures
------------------------
-
-RocketPort/RocketModem PCI cards require no driver configuration, they are
-automatically detected and configured.
-
-The RocketPort driver can be installed as a module (recommended) or built
-into the kernel. This is selected, as for other drivers, through the `make config`
-command from the root of the Linux source tree during the kernel build process.
-
-The RocketPort/RocketModem serial ports installed by this driver are assigned
-device major number 46, and will be named /dev/ttyRx, where x is the port number
-starting at zero (ex. /dev/ttyR0, /devttyR1, ...).  If you have multiple cards
-installed in the system, the mapping of port names to serial ports is displayed
-in the system log at /var/log/messages.
-
-If installed as a module, the module must be loaded.  This can be done
-manually by entering "modprobe rocket".  To have the module loaded automatically
-upon system boot, edit a `/etc/modprobe.d/*.conf` file and add the line
-"alias char-major-46 rocket".
-
-In order to use the ports, their device names (nodes) must be created with mknod.
-This is only required once, the system will retain the names once created.  To
-create the RocketPort/RocketModem device names, use the command
-"mknod /dev/ttyRx c 46 x" where x is the port number starting at zero.
-
-For example::
-
-	> mknod /dev/ttyR0 c 46 0
-	> mknod /dev/ttyR1 c 46 1
-	> mknod /dev/ttyR2 c 46 2
-
-The Linux script MAKEDEV will create the first 16 ttyRx device names (nodes)
-for you::
-
-	>/dev/MAKEDEV ttyR
-
-ISA Rocketport Boards
----------------------
-
-You must assign and configure the I/O addresses used by the ISA Rocketport
-card before installing and using it.  This is done by setting a set of DIP
-switches on the Rocketport board.
-
-
-Setting the I/O address
------------------------
-
-Before installing RocketPort(R) or RocketPort RA boards, you must find
-a range of I/O addresses for it to use. The first RocketPort card
-requires a 68-byte contiguous block of I/O addresses, starting at one
-of the following: 0x100h, 0x140h, 0x180h, 0x200h, 0x240h, 0x280h,
-0x300h, 0x340h, 0x380h.  This I/O address must be reflected in the DIP
-switches of *all* of the Rocketport cards.
-
-The second, third, and fourth RocketPort cards require a 64-byte
-contiguous block of I/O addresses, starting at one of the following
-I/O addresses: 0x100h, 0x140h, 0x180h, 0x1C0h, 0x200h, 0x240h, 0x280h,
-0x2C0h, 0x300h, 0x340h, 0x380h, 0x3C0h.  The I/O address used by the
-second, third, and fourth Rocketport cards (if present) are set via
-software control.  The DIP switch settings for the I/O address must be
-set to the value of the first Rocketport cards.
-
-In order to distinguish each of the card from the others, each card
-must have a unique board ID set on the dip switches.  The first
-Rocketport board must be set with the DIP switches corresponding to
-the first board, the second board must be set with the DIP switches
-corresponding to the second board, etc.  IMPORTANT: The board ID is
-the only place where the DIP switch settings should differ between the
-various Rocketport boards in a system.
-
-The I/O address range used by any of the RocketPort cards must not
-conflict with any other cards in the system, including other
-RocketPort cards.  Below, you will find a list of commonly used I/O
-address ranges which may be in use by other devices in your system.
-On a Linux system, "cat /proc/ioports" will also be helpful in
-identifying what I/O addresses are being used by devices on your
-system.
-
-Remember, the FIRST RocketPort uses 68 I/O addresses.  So, if you set it
-for 0x100, it will occupy 0x100 to 0x143.  This would mean that you
-CAN NOT set the second, third or fourth board for address 0x140 since
-the first 4 bytes of that range are used by the first board.  You would
-need to set the second, third, or fourth board to one of the next available
-blocks such as 0x180.
-
-RocketPort and RocketPort RA SW1 Settings::
-
-            +-------------------------------+
-            | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 |
-            +-------+-------+---------------+
-            | Unused| Card  | I/O Port Block|
-            +-------------------------------+
-
-  DIP Switches                             DIP Switches
-  7    8                                   6    5
-  ===================                      ===================
-  On   On   UNUSED, MUST BE ON.            On   On   First Card    <==== Default
-                                           On   Off  Second Card
-                                           Off  On   Third Card
-                                           Off  Off  Fourth Card
-
-  DIP Switches         I/O Address Range
-  4    3    2    1     Used by the First Card
-  =====================================
-  On   Off  On   Off   100-143
-  On   Off  Off  On    140-183
-  On   Off  Off  Off   180-1C3       <==== Default
-  Off  On   On   Off   200-243
-  Off  On   Off  On    240-283
-  Off  On   Off  Off   280-2C3
-  Off  Off  On   Off   300-343
-  Off  Off  Off  On    340-383
-  Off  Off  Off  Off   380-3C3
-
-Reporting Bugs
---------------
-
-For technical support, please provide the following
-information: Driver version, kernel release, distribution of
-kernel, and type of board you are using. Error messages and log
-printouts port configuration details are especially helpful.
-
-USA:
-    :Phone: (612) 494-4100
-    :FAX: (612) 494-4199
-    :email: support@comtrol.com
-
-Comtrol Europe:
-    :Phone: +44 (0) 1 869 323-220
-    :FAX: +44 (0) 1 869 323-211
-    :email: support@comtrol.co.uk
-
-Web:	http://www.comtrol.com
-FTP:	ftp.comtrol.com
diff --git a/Documentation/process/magic-number.rst b/Documentation/process/magic-number.rst
index c36f21eecefb..89992fe4863f 100644
--- a/Documentation/process/magic-number.rst
+++ b/Documentation/process/magic-number.rst
@@ -95,7 +95,6 @@ USB_BLUETOOTH_MAGIC   0x6d02           usb_bluetooth            ``drivers/usb/cl
 RFCOMM_TTY_MAGIC      0x6d02                                    ``net/bluetooth/rfcomm/tty.c``
 USB_SERIAL_PORT_MAGIC 0x7301           usb_serial_port          ``drivers/usb/serial/usb-serial.h``
 CG_MAGIC              0x00090255       ufs_cylinder_group       ``include/linux/ufs_fs.h``
-RPORT_MAGIC           0x00525001       r_port                   ``drivers/char/rocket_int.h``
 LSEMAGIC              0x05091998       lse                      ``drivers/fc4/fc.c``
 RIEBL_MAGIC           0x09051990                                ``drivers/net/atarilance.c``
 NBD_REQUEST_MAGIC     0x12560953       nbd_request              ``include/linux/nbd.h``
diff --git a/Documentation/translations/it_IT/process/magic-number.rst b/Documentation/translations/it_IT/process/magic-number.rst
index 440087f9f402..9be170ec0d02 100644
--- a/Documentation/translations/it_IT/process/magic-number.rst
+++ b/Documentation/translations/it_IT/process/magic-number.rst
@@ -101,7 +101,6 @@ USB_BLUETOOTH_MAGIC   0x6d02           usb_bluetooth            ``drivers/usb/cl
 RFCOMM_TTY_MAGIC      0x6d02                                    ``net/bluetooth/rfcomm/tty.c``
 USB_SERIAL_PORT_MAGIC 0x7301           usb_serial_port          ``drivers/usb/serial/usb-serial.h``
 CG_MAGIC              0x00090255       ufs_cylinder_group       ``include/linux/ufs_fs.h``
-RPORT_MAGIC           0x00525001       r_port                   ``drivers/char/rocket_int.h``
 LSEMAGIC              0x05091998       lse                      ``drivers/fc4/fc.c``
 GDTIOCTL_MAGIC        0x06030f07       gdth_iowr_str            ``drivers/scsi/gdth_ioctl.h``
 RIEBL_MAGIC           0x09051990                                ``drivers/net/atarilance.c``
diff --git a/Documentation/translations/zh_CN/process/magic-number.rst b/Documentation/translations/zh_CN/process/magic-number.rst
index e91bec4ec156..191d705349ef 100644
--- a/Documentation/translations/zh_CN/process/magic-number.rst
+++ b/Documentation/translations/zh_CN/process/magic-number.rst
@@ -84,7 +84,6 @@ USB_BLUETOOTH_MAGIC   0x6d02           usb_bluetooth            ``drivers/usb/cl
 RFCOMM_TTY_MAGIC      0x6d02                                    ``net/bluetooth/rfcomm/tty.c``
 USB_SERIAL_PORT_MAGIC 0x7301           usb_serial_port          ``drivers/usb/serial/usb-serial.h``
 CG_MAGIC              0x00090255       ufs_cylinder_group       ``include/linux/ufs_fs.h``
-RPORT_MAGIC           0x00525001       r_port                   ``drivers/char/rocket_int.h``
 LSEMAGIC              0x05091998       lse                      ``drivers/fc4/fc.c``
 GDTIOCTL_MAGIC        0x06030f07       gdth_iowr_str            ``drivers/scsi/gdth_ioctl.h``
 RIEBL_MAGIC           0x09051990                                ``drivers/net/atarilance.c``
diff --git a/MAINTAINERS b/MAINTAINERS
index f62df0494d11..2c92d4a55d7f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -15394,12 +15394,6 @@ L:	netdev@vger.kernel.org
 S:	Supported
 F:	drivers/net/ethernet/rocker/
 
-ROCKETPORT DRIVER
-S:	Maintained
-W:	http://www.comtrol.com
-F:	Documentation/driver-api/serial/rocket.rst
-F:	drivers/tty/rocket*
-
 ROCKETPORT EXPRESS/INFINITY DRIVER
 M:	Kevin Cernekee <cernekee@gmail.com>
 L:	linux-serial@vger.kernel.org
diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig
index 0031aa8f8b16..1d30add862af 100644
--- a/drivers/tty/Kconfig
+++ b/drivers/tty/Kconfig
@@ -192,21 +192,6 @@ config SERIAL_NONSTANDARD
 
 	  Most people can say N here.
 
-config ROCKETPORT
-	tristate "Comtrol RocketPort support"
-	depends on SERIAL_NONSTANDARD && (ISA || EISA || PCI)
-	help
-	  This driver supports Comtrol RocketPort and RocketModem PCI boards.   
-	  These boards provide 2, 4, 8, 16, or 32 high-speed serial ports or
-	  modems.  For information about the RocketPort/RocketModem  boards
-	  and this driver read <file:Documentation/driver-api/serial/rocket.rst>.
-
-	  To compile this driver as a module, choose M here: the
-	  module will be called rocket.
-
-	  If you want to compile this driver into the kernel, say Y here.  If
-	  you don't have a Comtrol RocketPort/RocketModem card installed, say N.
-
 config MOXA_INTELLIO
 	tristate "Moxa Intellio support"
 	depends on SERIAL_NONSTANDARD && (ISA || EISA || PCI)
diff --git a/drivers/tty/Makefile b/drivers/tty/Makefile
index a34055bc8b7a..c7054f5117c3 100644
--- a/drivers/tty/Makefile
+++ b/drivers/tty/Makefile
@@ -22,7 +22,6 @@ obj-$(CONFIG_MOXA_INTELLIO)	+= moxa.o
 obj-$(CONFIG_MOXA_SMARTIO)	+= mxser.o
 obj-$(CONFIG_NOZOMI)		+= nozomi.o
 obj-$(CONFIG_NULL_TTY)	        += ttynull.o
-obj-$(CONFIG_ROCKETPORT)	+= rocket.o
 obj-$(CONFIG_SYNCLINK_GT)	+= synclink_gt.o
 obj-$(CONFIG_PPC_EPAPR_HV_BYTECHAN) += ehv_bytechan.o
 obj-$(CONFIG_GOLDFISH_TTY)	+= goldfish.o
diff --git a/drivers/tty/rocket.c b/drivers/tty/rocket.c
deleted file mode 100644
index 2540b2e4c8e8..000000000000
--- a/drivers/tty/rocket.c
+++ /dev/null
@@ -1,3127 +0,0 @@
-// SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause)
-/*
- * RocketPort device driver for Linux
- *
- * Written by Theodore Ts'o, 1995, 1996, 1997, 1998, 1999, 2000.
- * 
- * Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2003 by Comtrol, Inc.
- */
-
-/*
- * Kernel Synchronization:
- *
- * This driver has 2 kernel control paths - exception handlers (calls into the driver
- * from user mode) and the timer bottom half (tasklet).  This is a polled driver, interrupts
- * are not used.
- *
- * Critical data: 
- * -  rp_table[], accessed through passed "info" pointers, is a global (static) array of 
- *    serial port state information and the xmit_buf circular buffer.  Protected by 
- *    a per port spinlock.
- * -  xmit_flags[], an array of ints indexed by line (port) number, indicating that there
- *    is data to be transmitted.  Protected by atomic bit operations.
- * -  rp_num_ports, int indicating number of open ports, protected by atomic operations.
- * 
- * rp_write() and rp_write_char() functions use a per port semaphore to protect against
- * simultaneous access to the same port by more than one process.
- */
-
-/****** Defines ******/
-#define ROCKET_PARANOIA_CHECK
-#define ROCKET_DISABLE_SIMUSAGE
-
-#undef ROCKET_SOFT_FLOW
-#undef ROCKET_DEBUG_OPEN
-#undef ROCKET_DEBUG_INTR
-#undef ROCKET_DEBUG_WRITE
-#undef ROCKET_DEBUG_FLOW
-#undef ROCKET_DEBUG_THROTTLE
-#undef ROCKET_DEBUG_WAIT_UNTIL_SENT
-#undef ROCKET_DEBUG_RECEIVE
-#undef ROCKET_DEBUG_HANGUP
-#undef REV_PCI_ORDER
-#undef ROCKET_DEBUG_IO
-
-#define POLL_PERIOD (HZ/100)	/*  Polling period .01 seconds (10ms) */
-
-/****** Kernel includes ******/
-
-#include <linux/module.h>
-#include <linux/errno.h>
-#include <linux/major.h>
-#include <linux/kernel.h>
-#include <linux/signal.h>
-#include <linux/slab.h>
-#include <linux/mm.h>
-#include <linux/sched.h>
-#include <linux/timer.h>
-#include <linux/interrupt.h>
-#include <linux/tty.h>
-#include <linux/tty_driver.h>
-#include <linux/tty_flip.h>
-#include <linux/serial.h>
-#include <linux/string.h>
-#include <linux/fcntl.h>
-#include <linux/ptrace.h>
-#include <linux/mutex.h>
-#include <linux/ioport.h>
-#include <linux/delay.h>
-#include <linux/completion.h>
-#include <linux/wait.h>
-#include <linux/pci.h>
-#include <linux/uaccess.h>
-#include <linux/atomic.h>
-#include <asm/unaligned.h>
-#include <linux/bitops.h>
-#include <linux/spinlock.h>
-#include <linux/init.h>
-
-/****** RocketPort includes ******/
-
-#include "rocket_int.h"
-#include "rocket.h"
-
-#define ROCKET_VERSION "2.09"
-#define ROCKET_DATE "12-June-2003"
-
-/****** RocketPort Local Variables ******/
-
-static void rp_do_poll(struct timer_list *unused);
-
-static struct tty_driver *rocket_driver;
-
-static struct rocket_version driver_version = {	
-	ROCKET_VERSION, ROCKET_DATE
-};
-
-static struct r_port *rp_table[MAX_RP_PORTS];	       /*  The main repository of serial port state information. */
-static unsigned int xmit_flags[NUM_BOARDS];	       /*  Bit significant, indicates port had data to transmit. */
-						       /*  eg.  Bit 0 indicates port 0 has xmit data, ...        */
-static atomic_t rp_num_ports_open;	               /*  Number of serial ports open                           */
-static DEFINE_TIMER(rocket_timer, rp_do_poll);
-
-static unsigned long board1;	                       /* ISA addresses, retrieved from rocketport.conf          */
-static unsigned long board2;
-static unsigned long board3;
-static unsigned long board4;
-static unsigned long controller;
-static bool support_low_speed;
-static unsigned long modem1;
-static unsigned long modem2;
-static unsigned long modem3;
-static unsigned long modem4;
-static unsigned long pc104_1[8];
-static unsigned long pc104_2[8];
-static unsigned long pc104_3[8];
-static unsigned long pc104_4[8];
-static unsigned long *pc104[4] = { pc104_1, pc104_2, pc104_3, pc104_4 };
-
-static int rp_baud_base[NUM_BOARDS];	               /*  Board config info (Someday make a per-board structure)  */
-static unsigned long rcktpt_io_addr[NUM_BOARDS];
-static int rcktpt_type[NUM_BOARDS];
-static int is_PCI[NUM_BOARDS];
-static rocketModel_t rocketModel[NUM_BOARDS];
-static int max_board;
-static const struct tty_port_operations rocket_port_ops;
-
-/*
- * The following arrays define the interrupt bits corresponding to each AIOP.
- * These bits are different between the ISA and regular PCI boards and the
- * Universal PCI boards.
- */
-
-static Word_t aiop_intr_bits[AIOP_CTL_SIZE] = {
-	AIOP_INTR_BIT_0,
-	AIOP_INTR_BIT_1,
-	AIOP_INTR_BIT_2,
-	AIOP_INTR_BIT_3
-};
-
-#ifdef CONFIG_PCI
-static Word_t upci_aiop_intr_bits[AIOP_CTL_SIZE] = {
-	UPCI_AIOP_INTR_BIT_0,
-	UPCI_AIOP_INTR_BIT_1,
-	UPCI_AIOP_INTR_BIT_2,
-	UPCI_AIOP_INTR_BIT_3
-};
-#endif
-
-static Byte_t RData[RDATASIZE] = {
-	0x00, 0x09, 0xf6, 0x82,
-	0x02, 0x09, 0x86, 0xfb,
-	0x04, 0x09, 0x00, 0x0a,
-	0x06, 0x09, 0x01, 0x0a,
-	0x08, 0x09, 0x8a, 0x13,
-	0x0a, 0x09, 0xc5, 0x11,
-	0x0c, 0x09, 0x86, 0x85,
-	0x0e, 0x09, 0x20, 0x0a,
-	0x10, 0x09, 0x21, 0x0a,
-	0x12, 0x09, 0x41, 0xff,
-	0x14, 0x09, 0x82, 0x00,
-	0x16, 0x09, 0x82, 0x7b,
-	0x18, 0x09, 0x8a, 0x7d,
-	0x1a, 0x09, 0x88, 0x81,
-	0x1c, 0x09, 0x86, 0x7a,
-	0x1e, 0x09, 0x84, 0x81,
-	0x20, 0x09, 0x82, 0x7c,
-	0x22, 0x09, 0x0a, 0x0a
-};
-
-static Byte_t RRegData[RREGDATASIZE] = {
-	0x00, 0x09, 0xf6, 0x82,	/* 00: Stop Rx processor */
-	0x08, 0x09, 0x8a, 0x13,	/* 04: Tx software flow control */
-	0x0a, 0x09, 0xc5, 0x11,	/* 08: XON char */
-	0x0c, 0x09, 0x86, 0x85,	/* 0c: XANY */
-	0x12, 0x09, 0x41, 0xff,	/* 10: Rx mask char */
-	0x14, 0x09, 0x82, 0x00,	/* 14: Compare/Ignore #0 */
-	0x16, 0x09, 0x82, 0x7b,	/* 18: Compare #1 */
-	0x18, 0x09, 0x8a, 0x7d,	/* 1c: Compare #2 */
-	0x1a, 0x09, 0x88, 0x81,	/* 20: Interrupt #1 */
-	0x1c, 0x09, 0x86, 0x7a,	/* 24: Ignore/Replace #1 */
-	0x1e, 0x09, 0x84, 0x81,	/* 28: Interrupt #2 */
-	0x20, 0x09, 0x82, 0x7c,	/* 2c: Ignore/Replace #2 */
-	0x22, 0x09, 0x0a, 0x0a	/* 30: Rx FIFO Enable */
-};
-
-static CONTROLLER_T sController[CTL_SIZE] = {
-	{-1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, {0, 0, 0, 0},
-	 {0, 0, 0, 0}, {-1, -1, -1, -1}, {0, 0, 0, 0}},
-	{-1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, {0, 0, 0, 0},
-	 {0, 0, 0, 0}, {-1, -1, -1, -1}, {0, 0, 0, 0}},
-	{-1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, {0, 0, 0, 0},
-	 {0, 0, 0, 0}, {-1, -1, -1, -1}, {0, 0, 0, 0}},
-	{-1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, {0, 0, 0, 0},
-	 {0, 0, 0, 0}, {-1, -1, -1, -1}, {0, 0, 0, 0}}
-};
-
-static Byte_t sBitMapClrTbl[8] = {
-	0xfe, 0xfd, 0xfb, 0xf7, 0xef, 0xdf, 0xbf, 0x7f
-};
-
-static Byte_t sBitMapSetTbl[8] = {
-	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
-};
-
-static int sClockPrescale = 0x14;
-
-/*
- *  Line number is the ttySIx number (x), the Minor number.  We 
- *  assign them sequentially, starting at zero.  The following 
- *  array keeps track of the line number assigned to a given board/aiop/channel.
- */
-static unsigned char lineNumbers[MAX_RP_PORTS];
-static unsigned long nextLineNumber;
-
-/*****  RocketPort Static Prototypes   *********/
-static int __init init_ISA(int i);
-static void rp_wait_until_sent(struct tty_struct *tty, int timeout);
-static void rp_flush_buffer(struct tty_struct *tty);
-static unsigned char GetLineNumber(int ctrl, int aiop, int ch);
-static unsigned char SetLineNumber(int ctrl, int aiop, int ch);
-static void rp_start(struct tty_struct *tty);
-static int sInitChan(CONTROLLER_T * CtlP, CHANNEL_T * ChP, int AiopNum,
-		     int ChanNum);
-static void sSetInterfaceMode(CHANNEL_T * ChP, Byte_t mode);
-static void sFlushRxFIFO(CHANNEL_T * ChP);
-static void sFlushTxFIFO(CHANNEL_T * ChP);
-static void sEnInterrupts(CHANNEL_T * ChP, Word_t Flags);
-static void sDisInterrupts(CHANNEL_T * ChP, Word_t Flags);
-static void sModemReset(CONTROLLER_T * CtlP, int chan, int on);
-static void sPCIModemReset(CONTROLLER_T * CtlP, int chan, int on);
-static int sWriteTxPrioByte(CHANNEL_T * ChP, Byte_t Data);
-static int sInitController(CONTROLLER_T * CtlP, int CtlNum, ByteIO_t MudbacIO,
-			   ByteIO_t * AiopIOList, int AiopIOListSize,
-			   int IRQNum, Byte_t Frequency, int PeriodicOnly);
-static int sReadAiopID(ByteIO_t io);
-static int sReadAiopNumChan(WordIO_t io);
-
-MODULE_AUTHOR("Theodore Ts'o");
-MODULE_DESCRIPTION("Comtrol RocketPort driver");
-module_param_hw(board1, ulong, ioport, 0);
-MODULE_PARM_DESC(board1, "I/O port for (ISA) board #1");
-module_param_hw(board2, ulong, ioport, 0);
-MODULE_PARM_DESC(board2, "I/O port for (ISA) board #2");
-module_param_hw(board3, ulong, ioport, 0);
-MODULE_PARM_DESC(board3, "I/O port for (ISA) board #3");
-module_param_hw(board4, ulong, ioport, 0);
-MODULE_PARM_DESC(board4, "I/O port for (ISA) board #4");
-module_param_hw(controller, ulong, ioport, 0);
-MODULE_PARM_DESC(controller, "I/O port for (ISA) rocketport controller");
-module_param(support_low_speed, bool, 0);
-MODULE_PARM_DESC(support_low_speed, "1 means support 50 baud, 0 means support 460400 baud");
-module_param(modem1, ulong, 0);
-MODULE_PARM_DESC(modem1, "1 means (ISA) board #1 is a RocketModem");
-module_param(modem2, ulong, 0);
-MODULE_PARM_DESC(modem2, "1 means (ISA) board #2 is a RocketModem");
-module_param(modem3, ulong, 0);
-MODULE_PARM_DESC(modem3, "1 means (ISA) board #3 is a RocketModem");
-module_param(modem4, ulong, 0);
-MODULE_PARM_DESC(modem4, "1 means (ISA) board #4 is a RocketModem");
-module_param_array(pc104_1, ulong, NULL, 0);
-MODULE_PARM_DESC(pc104_1, "set interface types for ISA(PC104) board #1 (e.g. pc104_1=232,232,485,485,...");
-module_param_array(pc104_2, ulong, NULL, 0);
-MODULE_PARM_DESC(pc104_2, "set interface types for ISA(PC104) board #2 (e.g. pc104_2=232,232,485,485,...");
-module_param_array(pc104_3, ulong, NULL, 0);
-MODULE_PARM_DESC(pc104_3, "set interface types for ISA(PC104) board #3 (e.g. pc104_3=232,232,485,485,...");
-module_param_array(pc104_4, ulong, NULL, 0);
-MODULE_PARM_DESC(pc104_4, "set interface types for ISA(PC104) board #4 (e.g. pc104_4=232,232,485,485,...");
-
-static int __init rp_init(void);
-static void rp_cleanup_module(void);
-
-module_init(rp_init);
-module_exit(rp_cleanup_module);
-
-
-MODULE_LICENSE("Dual BSD/GPL");
-
-/*************************************************************************/
-/*                     Module code starts here                           */
-
-static inline int rocket_paranoia_check(struct r_port *info,
-					const char *routine)
-{
-#ifdef ROCKET_PARANOIA_CHECK
-	if (!info)
-		return 1;
-	if (info->magic != RPORT_MAGIC) {
-		printk(KERN_WARNING "Warning: bad magic number for rocketport "
-				"struct in %s\n", routine);
-		return 1;
-	}
-#endif
-	return 0;
-}
-
-
-/*  Serial port receive data function.  Called (from timer poll) when an AIOPIC signals 
- *  that receive data is present on a serial port.  Pulls data from FIFO, moves it into the 
- *  tty layer.  
- */
-static void rp_do_receive(struct r_port *info, CHANNEL_t *cp,
-		unsigned int ChanStatus)
-{
-	unsigned int CharNStat;
-	int ToRecv, wRecv, space;
-	unsigned char *cbuf;
-
-	ToRecv = sGetRxCnt(cp);
-#ifdef ROCKET_DEBUG_INTR
-	printk(KERN_INFO "rp_do_receive(%d)...\n", ToRecv);
-#endif
-	if (ToRecv == 0)
-		return;
-
-	/*
-	 * if status indicates there are errored characters in the
-	 * FIFO, then enter status mode (a word in FIFO holds
-	 * character and status).
-	 */
-	if (ChanStatus & (RXFOVERFL | RXBREAK | RXFRAME | RXPARITY)) {
-		if (!(ChanStatus & STATMODE)) {
-#ifdef ROCKET_DEBUG_RECEIVE
-			printk(KERN_INFO "Entering STATMODE...\n");
-#endif
-			ChanStatus |= STATMODE;
-			sEnRxStatusMode(cp);
-		}
-	}
-
-	/* 
-	 * if we previously entered status mode, then read down the
-	 * FIFO one word at a time, pulling apart the character and
-	 * the status.  Update error counters depending on status
-	 */
-	if (ChanStatus & STATMODE) {
-#ifdef ROCKET_DEBUG_RECEIVE
-		printk(KERN_INFO "Ignore %x, read %x...\n",
-			info->ignore_status_mask, info->read_status_mask);
-#endif
-		while (ToRecv) {
-			char flag;
-
-			CharNStat = sInW(sGetTxRxDataIO(cp));
-#ifdef ROCKET_DEBUG_RECEIVE
-			printk(KERN_INFO "%x...\n", CharNStat);
-#endif
-			if (CharNStat & STMBREAKH)
-				CharNStat &= ~(STMFRAMEH | STMPARITYH);
-			if (CharNStat & info->ignore_status_mask) {
-				ToRecv--;
-				continue;
-			}
-			CharNStat &= info->read_status_mask;
-			if (CharNStat & STMBREAKH)
-				flag = TTY_BREAK;
-			else if (CharNStat & STMPARITYH)
-				flag = TTY_PARITY;
-			else if (CharNStat & STMFRAMEH)
-				flag = TTY_FRAME;
-			else if (CharNStat & STMRCVROVRH)
-				flag = TTY_OVERRUN;
-			else
-				flag = TTY_NORMAL;
-			tty_insert_flip_char(&info->port, CharNStat & 0xff,
-					flag);
-			ToRecv--;
-		}
-
-		/*
-		 * after we've emptied the FIFO in status mode, turn
-		 * status mode back off
-		 */
-		if (sGetRxCnt(cp) == 0) {
-#ifdef ROCKET_DEBUG_RECEIVE
-			printk(KERN_INFO "Status mode off.\n");
-#endif
-			sDisRxStatusMode(cp);
-		}
-	} else {
-		/*
-		 * we aren't in status mode, so read down the FIFO two
-		 * characters at time by doing repeated word IO
-		 * transfer.
-		 */
-		space = tty_prepare_flip_string(&info->port, &cbuf, ToRecv);
-		if (space < ToRecv) {
-#ifdef ROCKET_DEBUG_RECEIVE
-			printk(KERN_INFO "rp_do_receive:insufficient space ToRecv=%d space=%d\n", ToRecv, space);
-#endif
-			if (space <= 0)
-				return;
-			ToRecv = space;
-		}
-		wRecv = ToRecv >> 1;
-		if (wRecv)
-			sInStrW(sGetTxRxDataIO(cp), (unsigned short *) cbuf, wRecv);
-		if (ToRecv & 1)
-			cbuf[ToRecv - 1] = sInB(sGetTxRxDataIO(cp));
-	}
-	/*  Push the data up to the tty layer */
-	tty_flip_buffer_push(&info->port);
-}
-
-/*
- *  Serial port transmit data function.  Called from the timer polling loop as a 
- *  result of a bit set in xmit_flags[], indicating data (from the tty layer) is ready
- *  to be sent out the serial port.  Data is buffered in rp_table[line].xmit_buf, it is 
- *  moved to the port's xmit FIFO.  *info is critical data, protected by spinlocks.
- */
-static void rp_do_transmit(struct r_port *info)
-{
-	int c;
-	CHANNEL_t *cp = &info->channel;
-	struct tty_struct *tty;
-	unsigned long flags;
-
-#ifdef ROCKET_DEBUG_INTR
-	printk(KERN_DEBUG "%s\n", __func__);
-#endif
-	if (!info)
-		return;
-	tty = tty_port_tty_get(&info->port);
-
-	if (tty == NULL) {
-		printk(KERN_WARNING "rp: WARNING %s called with tty==NULL\n", __func__);
-		clear_bit((info->aiop * 8) + info->chan, (void *) &xmit_flags[info->board]);
-		return;
-	}
-
-	spin_lock_irqsave(&info->slock, flags);
-	info->xmit_fifo_room = TXFIFO_SIZE - sGetTxCnt(cp);
-
-	/*  Loop sending data to FIFO until done or FIFO full */
-	while (1) {
-		if (tty->stopped)
-			break;
-		c = min(info->xmit_fifo_room, info->xmit_cnt);
-		c = min(c, XMIT_BUF_SIZE - info->xmit_tail);
-		if (c <= 0 || info->xmit_fifo_room <= 0)
-			break;
-		sOutStrW(sGetTxRxDataIO(cp), (unsigned short *) (info->xmit_buf + info->xmit_tail), c / 2);
-		if (c & 1)
-			sOutB(sGetTxRxDataIO(cp), info->xmit_buf[info->xmit_tail + c - 1]);
-		info->xmit_tail += c;
-		info->xmit_tail &= XMIT_BUF_SIZE - 1;
-		info->xmit_cnt -= c;
-		info->xmit_fifo_room -= c;
-#ifdef ROCKET_DEBUG_INTR
-		printk(KERN_INFO "tx %d chars...\n", c);
-#endif
-	}
-
-	if (info->xmit_cnt == 0)
-		clear_bit((info->aiop * 8) + info->chan, (void *) &xmit_flags[info->board]);
-
-	if (info->xmit_cnt < WAKEUP_CHARS) {
-		tty_wakeup(tty);
-#ifdef ROCKETPORT_HAVE_POLL_WAIT
-		wake_up_interruptible(&tty->poll_wait);
-#endif
-	}
-
-	spin_unlock_irqrestore(&info->slock, flags);
-	tty_kref_put(tty);
-
-#ifdef ROCKET_DEBUG_INTR
-	printk(KERN_DEBUG "(%d,%d,%d,%d)...\n", info->xmit_cnt, info->xmit_head,
-	       info->xmit_tail, info->xmit_fifo_room);
-#endif
-}
-
-/*
- *  Called when a serial port signals it has read data in it's RX FIFO.
- *  It checks what interrupts are pending and services them, including
- *  receiving serial data.  
- */
-static void rp_handle_port(struct r_port *info)
-{
-	CHANNEL_t *cp;
-	unsigned int IntMask, ChanStatus;
-
-	if (!info)
-		return;
-
-	if (!tty_port_initialized(&info->port)) {
-		printk(KERN_WARNING "rp: WARNING: rp_handle_port called with "
-				"info->flags & NOT_INIT\n");
-		return;
-	}
-
-	cp = &info->channel;
-
-	IntMask = sGetChanIntID(cp) & info->intmask;
-#ifdef ROCKET_DEBUG_INTR
-	printk(KERN_INFO "rp_interrupt %02x...\n", IntMask);
-#endif
-	ChanStatus = sGetChanStatus(cp);
-	if (IntMask & RXF_TRIG) {	/* Rx FIFO trigger level */
-		rp_do_receive(info, cp, ChanStatus);
-	}
-	if (IntMask & DELTA_CD) {	/* CD change  */
-#if (defined(ROCKET_DEBUG_OPEN) || defined(ROCKET_DEBUG_INTR) || defined(ROCKET_DEBUG_HANGUP))
-		printk(KERN_INFO "ttyR%d CD now %s...\n", info->line,
-		       (ChanStatus & CD_ACT) ? "on" : "off");
-#endif
-		if (!(ChanStatus & CD_ACT) && info->cd_status) {
-#ifdef ROCKET_DEBUG_HANGUP
-			printk(KERN_INFO "CD drop, calling hangup.\n");
-#endif
-			tty_port_tty_hangup(&info->port, false);
-		}
-		info->cd_status = (ChanStatus & CD_ACT) ? 1 : 0;
-		wake_up_interruptible(&info->port.open_wait);
-	}
-#ifdef ROCKET_DEBUG_INTR
-	if (IntMask & DELTA_CTS) {	/* CTS change */
-		printk(KERN_INFO "CTS change...\n");
-	}
-	if (IntMask & DELTA_DSR) {	/* DSR change */
-		printk(KERN_INFO "DSR change...\n");
-	}
-#endif
-}
-
-/*
- *  The top level polling routine.  Repeats every 1/100 HZ (10ms).
- */
-static void rp_do_poll(struct timer_list *unused)
-{
-	CONTROLLER_t *ctlp;
-	int ctrl, aiop, ch, line;
-	unsigned int xmitmask, i;
-	unsigned int CtlMask;
-	unsigned char AiopMask;
-	Word_t bit;
-
-	/*  Walk through all the boards (ctrl's) */
-	for (ctrl = 0; ctrl < max_board; ctrl++) {
-		if (rcktpt_io_addr[ctrl] <= 0)
-			continue;
-
-		/*  Get a ptr to the board's control struct */
-		ctlp = sCtlNumToCtlPtr(ctrl);
-
-		/*  Get the interrupt status from the board */
-#ifdef CONFIG_PCI
-		if (ctlp->BusType == isPCI)
-			CtlMask = sPCIGetControllerIntStatus(ctlp);
-		else
-#endif
-			CtlMask = sGetControllerIntStatus(ctlp);
-
-		/*  Check if any AIOP read bits are set */
-		for (aiop = 0; CtlMask; aiop++) {
-			bit = ctlp->AiopIntrBits[aiop];
-			if (CtlMask & bit) {
-				CtlMask &= ~bit;
-				AiopMask = sGetAiopIntStatus(ctlp, aiop);
-
-				/*  Check if any port read bits are set */
-				for (ch = 0; AiopMask;  AiopMask >>= 1, ch++) {
-					if (AiopMask & 1) {
-
-						/*  Get the line number (/dev/ttyRx number). */
-						/*  Read the data from the port. */
-						line = GetLineNumber(ctrl, aiop, ch);
-						rp_handle_port(rp_table[line]);
-					}
-				}
-			}
-		}
-
-		xmitmask = xmit_flags[ctrl];
-
-		/*
-		 *  xmit_flags contains bit-significant flags, indicating there is data
-		 *  to xmit on the port. Bit 0 is port 0 on this board, bit 1 is port 
-		 *  1, ... (32 total possible).  The variable i has the aiop and ch 
-		 *  numbers encoded in it (port 0-7 are aiop0, 8-15 are aiop1, etc).
-		 */
-		if (xmitmask) {
-			for (i = 0; i < rocketModel[ctrl].numPorts; i++) {
-				if (xmitmask & (1 << i)) {
-					aiop = (i & 0x18) >> 3;
-					ch = i & 0x07;
-					line = GetLineNumber(ctrl, aiop, ch);
-					rp_do_transmit(rp_table[line]);
-				}
-			}
-		}
-	}
-
-	/*
-	 * Reset the timer so we get called at the next clock tick (10ms).
-	 */
-	if (atomic_read(&rp_num_ports_open))
-		mod_timer(&rocket_timer, jiffies + POLL_PERIOD);
-}
-
-/*
- *  Initializes the r_port structure for a port, as well as enabling the port on 
- *  the board.  
- *  Inputs:  board, aiop, chan numbers
- */
-static void __init
-init_r_port(int board, int aiop, int chan, struct pci_dev *pci_dev)
-{
-	unsigned rocketMode;
-	struct r_port *info;
-	int line;
-	CONTROLLER_T *ctlp;
-
-	/*  Get the next available line number */
-	line = SetLineNumber(board, aiop, chan);
-
-	ctlp = sCtlNumToCtlPtr(board);
-
-	/*  Get a r_port struct for the port, fill it in and save it globally, indexed by line number */
-	info = kzalloc(sizeof (struct r_port), GFP_KERNEL);
-	if (!info) {
-		printk(KERN_ERR "Couldn't allocate info struct for line #%d\n",
-				line);
-		return;
-	}
-
-	info->magic = RPORT_MAGIC;
-	info->line = line;
-	info->ctlp = ctlp;
-	info->board = board;
-	info->aiop = aiop;
-	info->chan = chan;
-	tty_port_init(&info->port);
-	info->port.ops = &rocket_port_ops;
-	info->flags &= ~ROCKET_MODE_MASK;
-	if (board < ARRAY_SIZE(pc104) && line < ARRAY_SIZE(pc104_1))
-		switch (pc104[board][line]) {
-		case 422:
-			info->flags |= ROCKET_MODE_RS422;
-			break;
-		case 485:
-			info->flags |= ROCKET_MODE_RS485;
-			break;
-		case 232:
-		default:
-			info->flags |= ROCKET_MODE_RS232;
-			break;
-		}
-	else
-		info->flags |= ROCKET_MODE_RS232;
-
-	info->intmask = RXF_TRIG | TXFIFO_MT | SRC_INT | DELTA_CD | DELTA_CTS | DELTA_DSR;
-	if (sInitChan(ctlp, &info->channel, aiop, chan) == 0) {
-		printk(KERN_ERR "RocketPort sInitChan(%d, %d, %d) failed!\n",
-				board, aiop, chan);
-		tty_port_destroy(&info->port);
-		kfree(info);
-		return;
-	}
-
-	rocketMode = info->flags & ROCKET_MODE_MASK;
-
-	if ((info->flags & ROCKET_RTS_TOGGLE) || (rocketMode == ROCKET_MODE_RS485))
-		sEnRTSToggle(&info->channel);
-	else
-		sDisRTSToggle(&info->channel);
-
-	if (ctlp->boardType == ROCKET_TYPE_PC104) {
-		switch (rocketMode) {
-		case ROCKET_MODE_RS485:
-			sSetInterfaceMode(&info->channel, InterfaceModeRS485);
-			break;
-		case ROCKET_MODE_RS422:
-			sSetInterfaceMode(&info->channel, InterfaceModeRS422);
-			break;
-		case ROCKET_MODE_RS232:
-		default:
-			if (info->flags & ROCKET_RTS_TOGGLE)
-				sSetInterfaceMode(&info->channel, InterfaceModeRS232T);
-			else
-				sSetInterfaceMode(&info->channel, InterfaceModeRS232);
-			break;
-		}
-	}
-	spin_lock_init(&info->slock);
-	mutex_init(&info->write_mtx);
-	rp_table[line] = info;
-	tty_port_register_device(&info->port, rocket_driver, line,
-			pci_dev ? &pci_dev->dev : NULL);
-}
-
-/*
- *  Configures a rocketport port according to its termio settings.  Called from 
- *  user mode into the driver (exception handler).  *info CD manipulation is spinlock protected.
- */
-static void configure_r_port(struct tty_struct *tty, struct r_port *info,
-			     struct ktermios *old_termios)
-{
-	unsigned cflag;
-	unsigned long flags;
-	unsigned rocketMode;
-	int bits, baud, divisor;
-	CHANNEL_t *cp;
-	struct ktermios *t = &tty->termios;
-
-	cp = &info->channel;
-	cflag = t->c_cflag;
-
-	/* Byte size and parity */
-	if ((cflag & CSIZE) == CS8) {
-		sSetData8(cp);
-		bits = 10;
-	} else {
-		sSetData7(cp);
-		bits = 9;
-	}
-	if (cflag & CSTOPB) {
-		sSetStop2(cp);
-		bits++;
-	} else {
-		sSetStop1(cp);
-	}
-
-	if (cflag & PARENB) {
-		sEnParity(cp);
-		bits++;
-		if (cflag & PARODD) {
-			sSetOddParity(cp);
-		} else {
-			sSetEvenParity(cp);
-		}
-	} else {
-		sDisParity(cp);
-	}
-
-	/* baud rate */
-	baud = tty_get_baud_rate(tty);
-	if (!baud)
-		baud = 9600;
-	divisor = ((rp_baud_base[info->board] + (baud >> 1)) / baud) - 1;
-	if ((divisor >= 8192 || divisor < 0) && old_termios) {
-		baud = tty_termios_baud_rate(old_termios);
-		if (!baud)
-			baud = 9600;
-		divisor = (rp_baud_base[info->board] / baud) - 1;
-	}
-	if (divisor >= 8192 || divisor < 0) {
-		baud = 9600;
-		divisor = (rp_baud_base[info->board] / baud) - 1;
-	}
-	info->cps = baud / bits;
-	sSetBaud(cp, divisor);
-
-	/* FIXME: Should really back compute a baud rate from the divisor */
-	tty_encode_baud_rate(tty, baud, baud);
-
-	if (cflag & CRTSCTS) {
-		info->intmask |= DELTA_CTS;
-		sEnCTSFlowCtl(cp);
-	} else {
-		info->intmask &= ~DELTA_CTS;
-		sDisCTSFlowCtl(cp);
-	}
-	if (cflag & CLOCAL) {
-		info->intmask &= ~DELTA_CD;
-	} else {
-		spin_lock_irqsave(&info->slock, flags);
-		if (sGetChanStatus(cp) & CD_ACT)
-			info->cd_status = 1;
-		else
-			info->cd_status = 0;
-		info->intmask |= DELTA_CD;
-		spin_unlock_irqrestore(&info->slock, flags);
-	}
-
-	/*
-	 * Handle software flow control in the board
-	 */
-#ifdef ROCKET_SOFT_FLOW
-	if (I_IXON(tty)) {
-		sEnTxSoftFlowCtl(cp);
-		if (I_IXANY(tty)) {
-			sEnIXANY(cp);
-		} else {
-			sDisIXANY(cp);
-		}
-		sSetTxXONChar(cp, START_CHAR(tty));
-		sSetTxXOFFChar(cp, STOP_CHAR(tty));
-	} else {
-		sDisTxSoftFlowCtl(cp);
-		sDisIXANY(cp);
-		sClrTxXOFF(cp);
-	}
-#endif
-
-	/*
-	 * Set up ignore/read mask words
-	 */
-	info->read_status_mask = STMRCVROVRH | 0xFF;
-	if (I_INPCK(tty))
-		info->read_status_mask |= STMFRAMEH | STMPARITYH;
-	if (I_BRKINT(tty) || I_PARMRK(tty))
-		info->read_status_mask |= STMBREAKH;
-
-	/*
-	 * Characters to ignore
-	 */
-	info->ignore_status_mask = 0;
-	if (I_IGNPAR(tty))
-		info->ignore_status_mask |= STMFRAMEH | STMPARITYH;
-	if (I_IGNBRK(tty)) {
-		info->ignore_status_mask |= STMBREAKH;
-		/*
-		 * If we're ignoring parity and break indicators,
-		 * ignore overruns too.  (For real raw support).
-		 */
-		if (I_IGNPAR(tty))
-			info->ignore_status_mask |= STMRCVROVRH;
-	}
-
-	rocketMode = info->flags & ROCKET_MODE_MASK;
-
-	if ((info->flags & ROCKET_RTS_TOGGLE)
-	    || (rocketMode == ROCKET_MODE_RS485))
-		sEnRTSToggle(cp);
-	else
-		sDisRTSToggle(cp);
-
-	sSetRTS(&info->channel);
-
-	if (cp->CtlP->boardType == ROCKET_TYPE_PC104) {
-		switch (rocketMode) {
-		case ROCKET_MODE_RS485:
-			sSetInterfaceMode(cp, InterfaceModeRS485);
-			break;
-		case ROCKET_MODE_RS422:
-			sSetInterfaceMode(cp, InterfaceModeRS422);
-			break;
-		case ROCKET_MODE_RS232:
-		default:
-			if (info->flags & ROCKET_RTS_TOGGLE)
-				sSetInterfaceMode(cp, InterfaceModeRS232T);
-			else
-				sSetInterfaceMode(cp, InterfaceModeRS232);
-			break;
-		}
-	}
-}
-
-static int carrier_raised(struct tty_port *port)
-{
-	struct r_port *info = container_of(port, struct r_port, port);
-	return (sGetChanStatusLo(&info->channel) & CD_ACT) ? 1 : 0;
-}
-
-static void dtr_rts(struct tty_port *port, int on)
-{
-	struct r_port *info = container_of(port, struct r_port, port);
-	if (on) {
-		sSetDTR(&info->channel);
-		sSetRTS(&info->channel);
-	} else {
-		sClrDTR(&info->channel);
-		sClrRTS(&info->channel);
-	}
-}
-
-/*
- *  Exception handler that opens a serial port.  Creates xmit_buf storage, fills in 
- *  port's r_port struct.  Initializes the port hardware.  
- */
-static int rp_open(struct tty_struct *tty, struct file *filp)
-{
-	struct r_port *info;
-	struct tty_port *port;
-	int retval;
-	CHANNEL_t *cp;
-	unsigned long page;
-
-	info = rp_table[tty->index];
-	if (info == NULL)
-		return -ENXIO;
-	port = &info->port;
-	
-	page = __get_free_page(GFP_KERNEL);
-	if (!page)
-		return -ENOMEM;
-
-	/*
-	 * We must not sleep from here until the port is marked fully in use.
-	 */
-	if (info->xmit_buf)
-		free_page(page);
-	else
-		info->xmit_buf = (unsigned char *) page;
-
-	tty->driver_data = info;
-	tty_port_tty_set(port, tty);
-
-	if (port->count++ == 0) {
-		atomic_inc(&rp_num_ports_open);
-
-#ifdef ROCKET_DEBUG_OPEN
-		printk(KERN_INFO "rocket mod++ = %d...\n",
-				atomic_read(&rp_num_ports_open));
-#endif
-	}
-#ifdef ROCKET_DEBUG_OPEN
-	printk(KERN_INFO "rp_open ttyR%d, count=%d\n", info->line, info->port.count);
-#endif
-
-	/*
-	 * Info->count is now 1; so it's safe to sleep now.
-	 */
-	if (!tty_port_initialized(port)) {
-		cp = &info->channel;
-		sSetRxTrigger(cp, TRIG_1);
-		if (sGetChanStatus(cp) & CD_ACT)
-			info->cd_status = 1;
-		else
-			info->cd_status = 0;
-		sDisRxStatusMode(cp);
-		sFlushRxFIFO(cp);
-		sFlushTxFIFO(cp);
-
-		sEnInterrupts(cp, (TXINT_EN | MCINT_EN | RXINT_EN | SRCINT_EN | CHANINT_EN));
-		sSetRxTrigger(cp, TRIG_1);
-
-		sGetChanStatus(cp);
-		sDisRxStatusMode(cp);
-		sClrTxXOFF(cp);
-
-		sDisCTSFlowCtl(cp);
-		sDisTxSoftFlowCtl(cp);
-
-		sEnRxFIFO(cp);
-		sEnTransmit(cp);
-
-		tty_port_set_initialized(&info->port, 1);
-
-		configure_r_port(tty, info, NULL);
-		if (C_BAUD(tty)) {
-			sSetDTR(cp);
-			sSetRTS(cp);
-		}
-	}
-	/*  Starts (or resets) the maint polling loop */
-	mod_timer(&rocket_timer, jiffies + POLL_PERIOD);
-
-	retval = tty_port_block_til_ready(port, tty, filp);
-	if (retval) {
-#ifdef ROCKET_DEBUG_OPEN
-		printk(KERN_INFO "rp_open returning after block_til_ready with %d\n", retval);
-#endif
-		return retval;
-	}
-	return 0;
-}
-
-/*
- *  Exception handler that closes a serial port. info->port.count is considered critical.
- */
-static void rp_close(struct tty_struct *tty, struct file *filp)
-{
-	struct r_port *info = tty->driver_data;
-	struct tty_port *port = &info->port;
-	int timeout;
-	CHANNEL_t *cp;
-	
-	if (rocket_paranoia_check(info, "rp_close"))
-		return;
-
-#ifdef ROCKET_DEBUG_OPEN
-	printk(KERN_INFO "rp_close ttyR%d, count = %d\n", info->line, info->port.count);
-#endif
-
-	if (tty_port_close_start(port, tty, filp) == 0)
-		return;
-
-	mutex_lock(&port->mutex);
-	cp = &info->channel;
-	/*
-	 * Before we drop DTR, make sure the UART transmitter
-	 * has completely drained; this is especially
-	 * important if there is a transmit FIFO!
-	 */
-	timeout = (sGetTxCnt(cp) + 1) * HZ / info->cps;
-	if (timeout == 0)
-		timeout = 1;
-	rp_wait_until_sent(tty, timeout);
-	clear_bit((info->aiop * 8) + info->chan, (void *) &xmit_flags[info->board]);
-
-	sDisTransmit(cp);
-	sDisInterrupts(cp, (TXINT_EN | MCINT_EN | RXINT_EN | SRCINT_EN | CHANINT_EN));
-	sDisCTSFlowCtl(cp);
-	sDisTxSoftFlowCtl(cp);
-	sClrTxXOFF(cp);
-	sFlushRxFIFO(cp);
-	sFlushTxFIFO(cp);
-	sClrRTS(cp);
-	if (C_HUPCL(tty))
-		sClrDTR(cp);
-
-	rp_flush_buffer(tty);
-		
-	tty_ldisc_flush(tty);
-
-	clear_bit((info->aiop * 8) + info->chan, (void *) &xmit_flags[info->board]);
-
-	/* We can't yet use tty_port_close_end as the buffer handling in this
-	   driver is a bit different to the usual */
-
-	if (port->blocked_open) {
-		if (port->close_delay) {
-			msleep_interruptible(jiffies_to_msecs(port->close_delay));
-		}
-		wake_up_interruptible(&port->open_wait);
-	} else {
-		if (info->xmit_buf) {
-			free_page((unsigned long) info->xmit_buf);
-			info->xmit_buf = NULL;
-		}
-	}
-	spin_lock_irq(&port->lock);
-	tty->closing = 0;
-	spin_unlock_irq(&port->lock);
-	tty_port_set_initialized(port, 0);
-	tty_port_set_active(port, 0);
-	mutex_unlock(&port->mutex);
-	tty_port_tty_set(port, NULL);
-
-	atomic_dec(&rp_num_ports_open);
-
-#ifdef ROCKET_DEBUG_OPEN
-	printk(KERN_INFO "rocket mod-- = %d...\n",
-			atomic_read(&rp_num_ports_open));
-	printk(KERN_INFO "rp_close ttyR%d complete shutdown\n", info->line);
-#endif
-
-}
-
-static void rp_set_termios(struct tty_struct *tty,
-			   struct ktermios *old_termios)
-{
-	struct r_port *info = tty->driver_data;
-	CHANNEL_t *cp;
-	unsigned cflag;
-
-	if (rocket_paranoia_check(info, "rp_set_termios"))
-		return;
-
-	cflag = tty->termios.c_cflag;
-
-	/*
-	 * This driver doesn't support CS5 or CS6
-	 */
-	if (((cflag & CSIZE) == CS5) || ((cflag & CSIZE) == CS6))
-		tty->termios.c_cflag =
-		    ((cflag & ~CSIZE) | (old_termios->c_cflag & CSIZE));
-	/* Or CMSPAR */
-	tty->termios.c_cflag &= ~CMSPAR;
-
-	configure_r_port(tty, info, old_termios);
-
-	cp = &info->channel;
-
-	/* Handle transition to B0 status */
-	if ((old_termios->c_cflag & CBAUD) && !C_BAUD(tty)) {
-		sClrDTR(cp);
-		sClrRTS(cp);
-	}
-
-	/* Handle transition away from B0 status */
-	if (!(old_termios->c_cflag & CBAUD) && C_BAUD(tty)) {
-		sSetRTS(cp);
-		sSetDTR(cp);
-	}
-
-	if ((old_termios->c_cflag & CRTSCTS) && !C_CRTSCTS(tty))
-		rp_start(tty);
-}
-
-static int rp_break(struct tty_struct *tty, int break_state)
-{
-	struct r_port *info = tty->driver_data;
-	unsigned long flags;
-
-	if (rocket_paranoia_check(info, "rp_break"))
-		return -EINVAL;
-
-	spin_lock_irqsave(&info->slock, flags);
-	if (break_state == -1)
-		sSendBreak(&info->channel);
-	else
-		sClrBreak(&info->channel);
-	spin_unlock_irqrestore(&info->slock, flags);
-	return 0;
-}
-
-/*
- * sGetChanRI used to be a macro in rocket_int.h. When the functionality for
- * the UPCI boards was added, it was decided to make this a function because
- * the macro was getting too complicated. All cases except the first one
- * (UPCIRingInd) are taken directly from the original macro.
- */
-static int sGetChanRI(CHANNEL_T * ChP)
-{
-	CONTROLLER_t *CtlP = ChP->CtlP;
-	int ChanNum = ChP->ChanNum;
-	int RingInd = 0;
-
-	if (CtlP->UPCIRingInd)
-		RingInd = !(sInB(CtlP->UPCIRingInd) & sBitMapSetTbl[ChanNum]);
-	else if (CtlP->AltChanRingIndicator)
-		RingInd = sInB((ByteIO_t) (ChP->ChanStat + 8)) & DSR_ACT;
-	else if (CtlP->boardType == ROCKET_TYPE_PC104)
-		RingInd = !(sInB(CtlP->AiopIO[3]) & sBitMapSetTbl[ChanNum]);
-
-	return RingInd;
-}
-
-/********************************************************************************************/
-/*  Here are the routines used by rp_ioctl.  These are all called from exception handlers.  */
-
-/*
- *  Returns the state of the serial modem control lines.  These next 2 functions 
- *  are the way kernel versions > 2.5 handle modem control lines rather than IOCTLs.
- */
-static int rp_tiocmget(struct tty_struct *tty)
-{
-	struct r_port *info = tty->driver_data;
-	unsigned int control, result, ChanStatus;
-
-	ChanStatus = sGetChanStatusLo(&info->channel);
-	control = info->channel.TxControl[3];
-	result = ((control & SET_RTS) ? TIOCM_RTS : 0) | 
-		((control & SET_DTR) ?  TIOCM_DTR : 0) |
-		((ChanStatus & CD_ACT) ? TIOCM_CAR : 0) |
-		(sGetChanRI(&info->channel) ? TIOCM_RNG : 0) |
-		((ChanStatus & DSR_ACT) ? TIOCM_DSR : 0) |
-		((ChanStatus & CTS_ACT) ? TIOCM_CTS : 0);
-
-	return result;
-}
-
-/* 
- *  Sets the modem control lines
- */
-static int rp_tiocmset(struct tty_struct *tty,
-				unsigned int set, unsigned int clear)
-{
-	struct r_port *info = tty->driver_data;
-
-	if (set & TIOCM_RTS)
-		info->channel.TxControl[3] |= SET_RTS;
-	if (set & TIOCM_DTR)
-		info->channel.TxControl[3] |= SET_DTR;
-	if (clear & TIOCM_RTS)
-		info->channel.TxControl[3] &= ~SET_RTS;
-	if (clear & TIOCM_DTR)
-		info->channel.TxControl[3] &= ~SET_DTR;
-
-	out32(info->channel.IndexAddr, info->channel.TxControl);
-	return 0;
-}
-
-static int get_config(struct r_port *info, struct rocket_config __user *retinfo)
-{
-	struct rocket_config tmp;
-
-	memset(&tmp, 0, sizeof (tmp));
-	mutex_lock(&info->port.mutex);
-	tmp.line = info->line;
-	tmp.flags = info->flags;
-	tmp.close_delay = info->port.close_delay;
-	tmp.closing_wait = info->port.closing_wait;
-	tmp.port = rcktpt_io_addr[(info->line >> 5) & 3];
-	mutex_unlock(&info->port.mutex);
-
-	if (copy_to_user(retinfo, &tmp, sizeof (*retinfo)))
-		return -EFAULT;
-	return 0;
-}
-
-static int set_config(struct tty_struct *tty, struct r_port *info,
-					struct rocket_config __user *new_info)
-{
-	struct rocket_config new_serial;
-
-	if (copy_from_user(&new_serial, new_info, sizeof (new_serial)))
-		return -EFAULT;
-
-	mutex_lock(&info->port.mutex);
-	if (!capable(CAP_SYS_ADMIN))
-	{
-		if ((new_serial.flags & ~ROCKET_USR_MASK) != (info->flags & ~ROCKET_USR_MASK)) {
-			mutex_unlock(&info->port.mutex);
-			return -EPERM;
-		}
-		info->flags = ((info->flags & ~ROCKET_USR_MASK) | (new_serial.flags & ROCKET_USR_MASK));
-		mutex_unlock(&info->port.mutex);
-		return 0;
-	}
-
-	if ((new_serial.flags ^ info->flags) & ROCKET_SPD_MASK) {
-		/* warn about deprecation, unless clearing */
-		if (new_serial.flags & ROCKET_SPD_MASK)
-			dev_warn_ratelimited(tty->dev, "use of SPD flags is deprecated\n");
-	}
-
-	info->flags = ((info->flags & ~ROCKET_FLAGS) | (new_serial.flags & ROCKET_FLAGS));
-	info->port.close_delay = new_serial.close_delay;
-	info->port.closing_wait = new_serial.closing_wait;
-
-	mutex_unlock(&info->port.mutex);
-
-	configure_r_port(tty, info, NULL);
-	return 0;
-}
-
-/*
- *  This function fills in a rocket_ports struct with information
- *  about what boards/ports are in the system.  This info is passed
- *  to user space.  See setrocket.c where the info is used to create
- *  the /dev/ttyRx ports.
- */
-static int get_ports(struct r_port *info, struct rocket_ports __user *retports)
-{
-	struct rocket_ports *tmp;
-	int board, ret = 0;
-
-	tmp = kzalloc(sizeof(*tmp), GFP_KERNEL);
-	if (!tmp)
-		return -ENOMEM;
-
-	tmp->tty_major = rocket_driver->major;
-
-	for (board = 0; board < 4; board++) {
-		tmp->rocketModel[board].model = rocketModel[board].model;
-		strcpy(tmp->rocketModel[board].modelString,
-		       rocketModel[board].modelString);
-		tmp->rocketModel[board].numPorts = rocketModel[board].numPorts;
-		tmp->rocketModel[board].loadrm2 = rocketModel[board].loadrm2;
-		tmp->rocketModel[board].startingPortNumber =
-			rocketModel[board].startingPortNumber;
-	}
-	if (copy_to_user(retports, tmp, sizeof(*retports)))
-		ret = -EFAULT;
-	kfree(tmp);
-	return ret;
-}
-
-static int reset_rm2(struct r_port *info, void __user *arg)
-{
-	int reset;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	if (copy_from_user(&reset, arg, sizeof (int)))
-		return -EFAULT;
-	if (reset)
-		reset = 1;
-
-	if (rcktpt_type[info->board] != ROCKET_TYPE_MODEMII &&
-            rcktpt_type[info->board] != ROCKET_TYPE_MODEMIII)
-		return -EINVAL;
-
-	if (info->ctlp->BusType == isISA)
-		sModemReset(info->ctlp, info->chan, reset);
-	else
-		sPCIModemReset(info->ctlp, info->chan, reset);
-
-	return 0;
-}
-
-static int get_version(struct r_port *info, struct rocket_version __user *retvers)
-{
-	if (copy_to_user(retvers, &driver_version, sizeof (*retvers)))
-		return -EFAULT;
-	return 0;
-}
-
-/*  IOCTL call handler into the driver */
-static int rp_ioctl(struct tty_struct *tty,
-		    unsigned int cmd, unsigned long arg)
-{
-	struct r_port *info = tty->driver_data;
-	void __user *argp = (void __user *)arg;
-	int ret = 0;
-
-	if (cmd != RCKP_GET_PORTS && rocket_paranoia_check(info, "rp_ioctl"))
-		return -ENXIO;
-
-	switch (cmd) {
-	case RCKP_GET_CONFIG:
-		dev_warn_ratelimited(tty->dev,
-					"RCKP_GET_CONFIG option is deprecated\n");
-		ret = get_config(info, argp);
-		break;
-	case RCKP_SET_CONFIG:
-		dev_warn_ratelimited(tty->dev,
-					"RCKP_SET_CONFIG option is deprecated\n");
-		ret = set_config(tty, info, argp);
-		break;
-	case RCKP_GET_PORTS:
-		dev_warn_ratelimited(tty->dev,
-					"RCKP_GET_PORTS option is deprecated\n");
-		ret = get_ports(info, argp);
-		break;
-	case RCKP_RESET_RM2:
-		dev_warn_ratelimited(tty->dev,
-					"RCKP_RESET_RM2 option is deprecated\n");
-		ret = reset_rm2(info, argp);
-		break;
-	case RCKP_GET_VERSION:
-		dev_warn_ratelimited(tty->dev,
-					"RCKP_GET_VERSION option is deprecated\n");
-		ret = get_version(info, argp);
-		break;
-	default:
-		ret = -ENOIOCTLCMD;
-	}
-	return ret;
-}
-
-static void rp_send_xchar(struct tty_struct *tty, char ch)
-{
-	struct r_port *info = tty->driver_data;
-	CHANNEL_t *cp;
-
-	if (rocket_paranoia_check(info, "rp_send_xchar"))
-		return;
-
-	cp = &info->channel;
-	if (sGetTxCnt(cp))
-		sWriteTxPrioByte(cp, ch);
-	else
-		sWriteTxByte(sGetTxRxDataIO(cp), ch);
-}
-
-static void rp_throttle(struct tty_struct *tty)
-{
-	struct r_port *info = tty->driver_data;
-
-#ifdef ROCKET_DEBUG_THROTTLE
-	printk(KERN_INFO "throttle %s ....\n", tty->name);
-#endif
-
-	if (rocket_paranoia_check(info, "rp_throttle"))
-		return;
-
-	if (I_IXOFF(tty))
-		rp_send_xchar(tty, STOP_CHAR(tty));
-
-	sClrRTS(&info->channel);
-}
-
-static void rp_unthrottle(struct tty_struct *tty)
-{
-	struct r_port *info = tty->driver_data;
-#ifdef ROCKET_DEBUG_THROTTLE
-	printk(KERN_INFO "unthrottle %s ....\n", tty->name);
-#endif
-
-	if (rocket_paranoia_check(info, "rp_unthrottle"))
-		return;
-
-	if (I_IXOFF(tty))
-		rp_send_xchar(tty, START_CHAR(tty));
-
-	sSetRTS(&info->channel);
-}
-
-/*
- * ------------------------------------------------------------
- * rp_stop() and rp_start()
- *
- * This routines are called before setting or resetting tty->stopped.
- * They enable or disable transmitter interrupts, as necessary.
- * ------------------------------------------------------------
- */
-static void rp_stop(struct tty_struct *tty)
-{
-	struct r_port *info = tty->driver_data;
-
-#ifdef ROCKET_DEBUG_FLOW
-	printk(KERN_INFO "stop %s: %d %d....\n", tty->name,
-	       info->xmit_cnt, info->xmit_fifo_room);
-#endif
-
-	if (rocket_paranoia_check(info, "rp_stop"))
-		return;
-
-	if (sGetTxCnt(&info->channel))
-		sDisTransmit(&info->channel);
-}
-
-static void rp_start(struct tty_struct *tty)
-{
-	struct r_port *info = tty->driver_data;
-
-#ifdef ROCKET_DEBUG_FLOW
-	printk(KERN_INFO "start %s: %d %d....\n", tty->name,
-	       info->xmit_cnt, info->xmit_fifo_room);
-#endif
-
-	if (rocket_paranoia_check(info, "rp_stop"))
-		return;
-
-	sEnTransmit(&info->channel);
-	set_bit((info->aiop * 8) + info->chan,
-		(void *) &xmit_flags[info->board]);
-}
-
-/*
- * rp_wait_until_sent() --- wait until the transmitter is empty
- */
-static void rp_wait_until_sent(struct tty_struct *tty, int timeout)
-{
-	struct r_port *info = tty->driver_data;
-	CHANNEL_t *cp;
-	unsigned long orig_jiffies;
-	int check_time, exit_time;
-	int txcnt;
-
-	if (rocket_paranoia_check(info, "rp_wait_until_sent"))
-		return;
-
-	cp = &info->channel;
-
-	orig_jiffies = jiffies;
-#ifdef ROCKET_DEBUG_WAIT_UNTIL_SENT
-	printk(KERN_INFO "In %s(%d) (jiff=%lu)...\n", __func__, timeout,
-	       jiffies);
-	printk(KERN_INFO "cps=%d...\n", info->cps);
-#endif
-	while (1) {
-		txcnt = sGetTxCnt(cp);
-		if (!txcnt) {
-			if (sGetChanStatusLo(cp) & TXSHRMT)
-				break;
-			check_time = (HZ / info->cps) / 5;
-		} else {
-			check_time = HZ * txcnt / info->cps;
-		}
-		if (timeout) {
-			exit_time = orig_jiffies + timeout - jiffies;
-			if (exit_time <= 0)
-				break;
-			if (exit_time < check_time)
-				check_time = exit_time;
-		}
-		if (check_time == 0)
-			check_time = 1;
-#ifdef ROCKET_DEBUG_WAIT_UNTIL_SENT
-		printk(KERN_INFO "txcnt = %d (jiff=%lu,check=%d)...\n", txcnt,
-				jiffies, check_time);
-#endif
-		msleep_interruptible(jiffies_to_msecs(check_time));
-		if (signal_pending(current))
-			break;
-	}
-	__set_current_state(TASK_RUNNING);
-#ifdef ROCKET_DEBUG_WAIT_UNTIL_SENT
-	printk(KERN_INFO "txcnt = %d (jiff=%lu)...done\n", txcnt, jiffies);
-#endif
-}
-
-/*
- * rp_hangup() --- called by tty_hangup() when a hangup is signaled.
- */
-static void rp_hangup(struct tty_struct *tty)
-{
-	CHANNEL_t *cp;
-	struct r_port *info = tty->driver_data;
-	unsigned long flags;
-
-	if (rocket_paranoia_check(info, "rp_hangup"))
-		return;
-
-#if (defined(ROCKET_DEBUG_OPEN) || defined(ROCKET_DEBUG_HANGUP))
-	printk(KERN_INFO "rp_hangup of ttyR%d...\n", info->line);
-#endif
-	rp_flush_buffer(tty);
-	spin_lock_irqsave(&info->port.lock, flags);
-	if (info->port.count)
-		atomic_dec(&rp_num_ports_open);
-	clear_bit((info->aiop * 8) + info->chan, (void *) &xmit_flags[info->board]);
-	spin_unlock_irqrestore(&info->port.lock, flags);
-
-	tty_port_hangup(&info->port);
-
-	cp = &info->channel;
-	sDisRxFIFO(cp);
-	sDisTransmit(cp);
-	sDisInterrupts(cp, (TXINT_EN | MCINT_EN | RXINT_EN | SRCINT_EN | CHANINT_EN));
-	sDisCTSFlowCtl(cp);
-	sDisTxSoftFlowCtl(cp);
-	sClrTxXOFF(cp);
-	tty_port_set_initialized(&info->port, 0);
-
-	wake_up_interruptible(&info->port.open_wait);
-}
-
-/*
- *  Exception handler - write char routine.  The RocketPort driver uses a
- *  double-buffering strategy, with the twist that if the in-memory CPU
- *  buffer is empty, and there's space in the transmit FIFO, the
- *  writing routines will write directly to transmit FIFO.
- *  Write buffer and counters protected by spinlocks
- */
-static int rp_put_char(struct tty_struct *tty, unsigned char ch)
-{
-	struct r_port *info = tty->driver_data;
-	CHANNEL_t *cp;
-	unsigned long flags;
-
-	if (rocket_paranoia_check(info, "rp_put_char"))
-		return 0;
-
-	/*
-	 * Grab the port write mutex, locking out other processes that try to
-	 * write to this port
-	 */
-	mutex_lock(&info->write_mtx);
-
-#ifdef ROCKET_DEBUG_WRITE
-	printk(KERN_INFO "rp_put_char %c...\n", ch);
-#endif
-
-	spin_lock_irqsave(&info->slock, flags);
-	cp = &info->channel;
-
-	if (!tty->stopped && info->xmit_fifo_room == 0)
-		info->xmit_fifo_room = TXFIFO_SIZE - sGetTxCnt(cp);
-
-	if (tty->stopped || info->xmit_fifo_room == 0 || info->xmit_cnt != 0) {
-		info->xmit_buf[info->xmit_head++] = ch;
-		info->xmit_head &= XMIT_BUF_SIZE - 1;
-		info->xmit_cnt++;
-		set_bit((info->aiop * 8) + info->chan, (void *) &xmit_flags[info->board]);
-	} else {
-		sOutB(sGetTxRxDataIO(cp), ch);
-		info->xmit_fifo_room--;
-	}
-	spin_unlock_irqrestore(&info->slock, flags);
-	mutex_unlock(&info->write_mtx);
-	return 1;
-}
-
-/*
- *  Exception handler - write routine, called when user app writes to the device.
- *  A per port write mutex is used to protect from another process writing to
- *  this port at the same time.  This other process could be running on the other CPU
- *  or get control of the CPU if the copy_from_user() blocks due to a page fault (swapped out). 
- *  Spinlocks protect the info xmit members.
- */
-static int rp_write(struct tty_struct *tty,
-		    const unsigned char *buf, int count)
-{
-	struct r_port *info = tty->driver_data;
-	CHANNEL_t *cp;
-	const unsigned char *b;
-	int c, retval = 0;
-	unsigned long flags;
-
-	if (count <= 0 || rocket_paranoia_check(info, "rp_write"))
-		return 0;
-
-	if (mutex_lock_interruptible(&info->write_mtx))
-		return -ERESTARTSYS;
-
-#ifdef ROCKET_DEBUG_WRITE
-	printk(KERN_INFO "rp_write %d chars...\n", count);
-#endif
-	cp = &info->channel;
-
-	if (!tty->stopped && info->xmit_fifo_room < count)
-		info->xmit_fifo_room = TXFIFO_SIZE - sGetTxCnt(cp);
-
-        /*
-	 *  If the write queue for the port is empty, and there is FIFO space, stuff bytes 
-	 *  into FIFO.  Use the write queue for temp storage.
-         */
-	if (!tty->stopped && info->xmit_cnt == 0 && info->xmit_fifo_room > 0) {
-		c = min(count, info->xmit_fifo_room);
-		b = buf;
-
-		/*  Push data into FIFO, 2 bytes at a time */
-		sOutStrW(sGetTxRxDataIO(cp), (unsigned short *) b, c / 2);
-
-		/*  If there is a byte remaining, write it */
-		if (c & 1)
-			sOutB(sGetTxRxDataIO(cp), b[c - 1]);
-
-		retval += c;
-		buf += c;
-		count -= c;
-
-		spin_lock_irqsave(&info->slock, flags);
-		info->xmit_fifo_room -= c;
-		spin_unlock_irqrestore(&info->slock, flags);
-	}
-
-	/* If count is zero, we wrote it all and are done */
-	if (!count)
-		goto end;
-
-	/*  Write remaining data into the port's xmit_buf */
-	while (1) {
-		/* Hung up ? */
-		if (!tty_port_active(&info->port))
-			goto end;
-		c = min(count, XMIT_BUF_SIZE - info->xmit_cnt - 1);
-		c = min(c, XMIT_BUF_SIZE - info->xmit_head);
-		if (c <= 0)
-			break;
-
-		b = buf;
-		memcpy(info->xmit_buf + info->xmit_head, b, c);
-
-		spin_lock_irqsave(&info->slock, flags);
-		info->xmit_head =
-		    (info->xmit_head + c) & (XMIT_BUF_SIZE - 1);
-		info->xmit_cnt += c;
-		spin_unlock_irqrestore(&info->slock, flags);
-
-		buf += c;
-		count -= c;
-		retval += c;
-	}
-
-	if ((retval > 0) && !tty->stopped)
-		set_bit((info->aiop * 8) + info->chan, (void *) &xmit_flags[info->board]);
-	
-end:
- 	if (info->xmit_cnt < WAKEUP_CHARS) {
- 		tty_wakeup(tty);
-#ifdef ROCKETPORT_HAVE_POLL_WAIT
-		wake_up_interruptible(&tty->poll_wait);
-#endif
-	}
-	mutex_unlock(&info->write_mtx);
-	return retval;
-}
-
-/*
- * Return the number of characters that can be sent.  We estimate
- * only using the in-memory transmit buffer only, and ignore the
- * potential space in the transmit FIFO.
- */
-static int rp_write_room(struct tty_struct *tty)
-{
-	struct r_port *info = tty->driver_data;
-	int ret;
-
-	if (rocket_paranoia_check(info, "rp_write_room"))
-		return 0;
-
-	ret = XMIT_BUF_SIZE - info->xmit_cnt - 1;
-	if (ret < 0)
-		ret = 0;
-#ifdef ROCKET_DEBUG_WRITE
-	printk(KERN_INFO "rp_write_room returns %d...\n", ret);
-#endif
-	return ret;
-}
-
-/*
- * Return the number of characters in the buffer.  Again, this only
- * counts those characters in the in-memory transmit buffer.
- */
-static int rp_chars_in_buffer(struct tty_struct *tty)
-{
-	struct r_port *info = tty->driver_data;
-
-	if (rocket_paranoia_check(info, "rp_chars_in_buffer"))
-		return 0;
-
-#ifdef ROCKET_DEBUG_WRITE
-	printk(KERN_INFO "rp_chars_in_buffer returns %d...\n", info->xmit_cnt);
-#endif
-	return info->xmit_cnt;
-}
-
-/*
- *  Flushes the TX fifo for a port, deletes data in the xmit_buf stored in the
- *  r_port struct for the port.  Note that spinlock are used to protect info members,
- *  do not call this function if the spinlock is already held.
- */
-static void rp_flush_buffer(struct tty_struct *tty)
-{
-	struct r_port *info = tty->driver_data;
-	CHANNEL_t *cp;
-	unsigned long flags;
-
-	if (rocket_paranoia_check(info, "rp_flush_buffer"))
-		return;
-
-	spin_lock_irqsave(&info->slock, flags);
-	info->xmit_cnt = info->xmit_head = info->xmit_tail = 0;
-	spin_unlock_irqrestore(&info->slock, flags);
-
-#ifdef ROCKETPORT_HAVE_POLL_WAIT
-	wake_up_interruptible(&tty->poll_wait);
-#endif
-	tty_wakeup(tty);
-
-	cp = &info->channel;
-	sFlushTxFIFO(cp);
-}
-
-#ifdef CONFIG_PCI
-
-static const struct pci_device_id rocket_pci_ids[] = {
-	{ PCI_DEVICE(PCI_VENDOR_ID_RP, PCI_DEVICE_ID_RP4QUAD) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_RP, PCI_DEVICE_ID_RP8OCTA) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_RP, PCI_DEVICE_ID_URP8OCTA) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_RP, PCI_DEVICE_ID_RP8INTF) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_RP, PCI_DEVICE_ID_URP8INTF) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_RP, PCI_DEVICE_ID_RP8J) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_RP, PCI_DEVICE_ID_RP4J) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_RP, PCI_DEVICE_ID_RP8SNI) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_RP, PCI_DEVICE_ID_RP16SNI) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_RP, PCI_DEVICE_ID_RP16INTF) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_RP, PCI_DEVICE_ID_URP16INTF) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_RP, PCI_DEVICE_ID_CRP16INTF) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_RP, PCI_DEVICE_ID_RP32INTF) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_RP, PCI_DEVICE_ID_URP32INTF) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_RP, PCI_DEVICE_ID_RPP4) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_RP, PCI_DEVICE_ID_RPP8) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_RP, PCI_DEVICE_ID_RP2_232) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_RP, PCI_DEVICE_ID_RP2_422) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_RP, PCI_DEVICE_ID_RP6M) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_RP, PCI_DEVICE_ID_RP4M) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_RP, PCI_DEVICE_ID_UPCI_RM3_8PORT) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_RP, PCI_DEVICE_ID_UPCI_RM3_4PORT) },
-	{ }
-};
-MODULE_DEVICE_TABLE(pci, rocket_pci_ids);
-
-/*  Resets the speaker controller on RocketModem II and III devices */
-static void rmSpeakerReset(CONTROLLER_T * CtlP, unsigned long model)
-{
-	ByteIO_t addr;
-
-	/* RocketModem II speaker control is at the 8th port location of offset 0x40 */
-	if ((model == MODEL_RP4M) || (model == MODEL_RP6M)) {
-		addr = CtlP->AiopIO[0] + 0x4F;
-		sOutB(addr, 0);
-	}
-
-	/* RocketModem III speaker control is at the 1st port location of offset 0x80 */
-	if ((model == MODEL_UPCI_RM3_8PORT)
-	    || (model == MODEL_UPCI_RM3_4PORT)) {
-		addr = CtlP->AiopIO[0] + 0x88;
-		sOutB(addr, 0);
-	}
-}
-
-/***************************************************************************
-Function: sPCIInitController
-Purpose:  Initialization of controller global registers and controller
-          structure.
-Call:     sPCIInitController(CtlP,CtlNum,AiopIOList,AiopIOListSize,
-                          IRQNum,Frequency,PeriodicOnly)
-          CONTROLLER_T *CtlP; Ptr to controller structure
-          int CtlNum; Controller number
-          ByteIO_t *AiopIOList; List of I/O addresses for each AIOP.
-             This list must be in the order the AIOPs will be found on the
-             controller.  Once an AIOP in the list is not found, it is
-             assumed that there are no more AIOPs on the controller.
-          int AiopIOListSize; Number of addresses in AiopIOList
-          int IRQNum; Interrupt Request number.  Can be any of the following:
-                         0: Disable global interrupts
-                         3: IRQ 3
-                         4: IRQ 4
-                         5: IRQ 5
-                         9: IRQ 9
-                         10: IRQ 10
-                         11: IRQ 11
-                         12: IRQ 12
-                         15: IRQ 15
-          Byte_t Frequency: A flag identifying the frequency
-                   of the periodic interrupt, can be any one of the following:
-                      FREQ_DIS - periodic interrupt disabled
-                      FREQ_137HZ - 137 Hertz
-                      FREQ_69HZ - 69 Hertz
-                      FREQ_34HZ - 34 Hertz
-                      FREQ_17HZ - 17 Hertz
-                      FREQ_9HZ - 9 Hertz
-                      FREQ_4HZ - 4 Hertz
-                   If IRQNum is set to 0 the Frequency parameter is
-                   overidden, it is forced to a value of FREQ_DIS.
-          int PeriodicOnly: 1 if all interrupts except the periodic
-                               interrupt are to be blocked.
-                            0 is both the periodic interrupt and
-                               other channel interrupts are allowed.
-                            If IRQNum is set to 0 the PeriodicOnly parameter is
-                               overidden, it is forced to a value of 0.
-Return:   int: Number of AIOPs on the controller, or CTLID_NULL if controller
-               initialization failed.
-
-Comments:
-          If periodic interrupts are to be disabled but AIOP interrupts
-          are allowed, set Frequency to FREQ_DIS and PeriodicOnly to 0.
-
-          If interrupts are to be completely disabled set IRQNum to 0.
-
-          Setting Frequency to FREQ_DIS and PeriodicOnly to 1 is an
-          invalid combination.
-
-          This function performs initialization of global interrupt modes,
-          but it does not actually enable global interrupts.  To enable
-          and disable global interrupts use functions sEnGlobalInt() and
-          sDisGlobalInt().  Enabling of global interrupts is normally not
-          done until all other initializations are complete.
-
-          Even if interrupts are globally enabled, they must also be
-          individually enabled for each channel that is to generate
-          interrupts.
-
-Warnings: No range checking on any of the parameters is done.
-
-          No context switches are allowed while executing this function.
-
-          After this function all AIOPs on the controller are disabled,
-          they can be enabled with sEnAiop().
-*/
-static int sPCIInitController(CONTROLLER_T * CtlP, int CtlNum,
-			      ByteIO_t * AiopIOList, int AiopIOListSize,
-			      WordIO_t ConfigIO, int IRQNum, Byte_t Frequency,
-			      int PeriodicOnly, int altChanRingIndicator,
-			      int UPCIRingInd)
-{
-	int i;
-	ByteIO_t io;
-
-	CtlP->AltChanRingIndicator = altChanRingIndicator;
-	CtlP->UPCIRingInd = UPCIRingInd;
-	CtlP->CtlNum = CtlNum;
-	CtlP->CtlID = CTLID_0001;	/* controller release 1 */
-	CtlP->BusType = isPCI;	/* controller release 1 */
-
-	if (ConfigIO) {
-		CtlP->isUPCI = 1;
-		CtlP->PCIIO = ConfigIO + _PCI_9030_INT_CTRL;
-		CtlP->PCIIO2 = ConfigIO + _PCI_9030_GPIO_CTRL;
-		CtlP->AiopIntrBits = upci_aiop_intr_bits;
-	} else {
-		CtlP->isUPCI = 0;
-		CtlP->PCIIO =
-		    (WordIO_t) ((ByteIO_t) AiopIOList[0] + _PCI_INT_FUNC);
-		CtlP->AiopIntrBits = aiop_intr_bits;
-	}
-
-	sPCIControllerEOI(CtlP);	/* clear EOI if warm init */
-	/* Init AIOPs */
-	CtlP->NumAiop = 0;
-	for (i = 0; i < AiopIOListSize; i++) {
-		io = AiopIOList[i];
-		CtlP->AiopIO[i] = (WordIO_t) io;
-		CtlP->AiopIntChanIO[i] = io + _INT_CHAN;
-
-		CtlP->AiopID[i] = sReadAiopID(io);	/* read AIOP ID */
-		if (CtlP->AiopID[i] == AIOPID_NULL)	/* if AIOP does not exist */
-			break;	/* done looking for AIOPs */
-
-		CtlP->AiopNumChan[i] = sReadAiopNumChan((WordIO_t) io);	/* num channels in AIOP */
-		sOutW((WordIO_t) io + _INDX_ADDR, _CLK_PRE);	/* clock prescaler */
-		sOutB(io + _INDX_DATA, sClockPrescale);
-		CtlP->NumAiop++;	/* bump count of AIOPs */
-	}
-
-	if (CtlP->NumAiop == 0)
-		return (-1);
-	else
-		return (CtlP->NumAiop);
-}
-
-/*
- *  Called when a PCI card is found.  Retrieves and stores model information,
- *  init's aiopic and serial port hardware.
- *  Inputs:  i is the board number (0-n)
- */
-static __init int register_PCI(int i, struct pci_dev *dev)
-{
-	int num_aiops, aiop, max_num_aiops, chan;
-	unsigned int aiopio[MAX_AIOPS_PER_BOARD];
-	CONTROLLER_t *ctlp;
-
-	int fast_clock = 0;
-	int altChanRingIndicator = 0;
-	int ports_per_aiop = 8;
-	WordIO_t ConfigIO = 0;
-	ByteIO_t UPCIRingInd = 0;
-
-	if (!dev || !pci_match_id(rocket_pci_ids, dev) ||
-	    pci_enable_device(dev) || i >= NUM_BOARDS)
-		return 0;
-
-	rcktpt_io_addr[i] = pci_resource_start(dev, 0);
-
-	rcktpt_type[i] = ROCKET_TYPE_NORMAL;
-	rocketModel[i].loadrm2 = 0;
-	rocketModel[i].startingPortNumber = nextLineNumber;
-
-	/*  Depending on the model, set up some config variables */
-	switch (dev->device) {
-	case PCI_DEVICE_ID_RP4QUAD:
-		max_num_aiops = 1;
-		ports_per_aiop = 4;
-		rocketModel[i].model = MODEL_RP4QUAD;
-		strcpy(rocketModel[i].modelString, "RocketPort 4 port w/quad cable");
-		rocketModel[i].numPorts = 4;
-		break;
-	case PCI_DEVICE_ID_RP8OCTA:
-		max_num_aiops = 1;
-		rocketModel[i].model = MODEL_RP8OCTA;
-		strcpy(rocketModel[i].modelString, "RocketPort 8 port w/octa cable");
-		rocketModel[i].numPorts = 8;
-		break;
-	case PCI_DEVICE_ID_URP8OCTA:
-		max_num_aiops = 1;
-		rocketModel[i].model = MODEL_UPCI_RP8OCTA;
-		strcpy(rocketModel[i].modelString, "RocketPort UPCI 8 port w/octa cable");
-		rocketModel[i].numPorts = 8;
-		break;
-	case PCI_DEVICE_ID_RP8INTF:
-		max_num_aiops = 1;
-		rocketModel[i].model = MODEL_RP8INTF;
-		strcpy(rocketModel[i].modelString, "RocketPort 8 port w/external I/F");
-		rocketModel[i].numPorts = 8;
-		break;
-	case PCI_DEVICE_ID_URP8INTF:
-		max_num_aiops = 1;
-		rocketModel[i].model = MODEL_UPCI_RP8INTF;
-		strcpy(rocketModel[i].modelString, "RocketPort UPCI 8 port w/external I/F");
-		rocketModel[i].numPorts = 8;
-		break;
-	case PCI_DEVICE_ID_RP8J:
-		max_num_aiops = 1;
-		rocketModel[i].model = MODEL_RP8J;
-		strcpy(rocketModel[i].modelString, "RocketPort 8 port w/RJ11 connectors");
-		rocketModel[i].numPorts = 8;
-		break;
-	case PCI_DEVICE_ID_RP4J:
-		max_num_aiops = 1;
-		ports_per_aiop = 4;
-		rocketModel[i].model = MODEL_RP4J;
-		strcpy(rocketModel[i].modelString, "RocketPort 4 port w/RJ45 connectors");
-		rocketModel[i].numPorts = 4;
-		break;
-	case PCI_DEVICE_ID_RP8SNI:
-		max_num_aiops = 1;
-		rocketModel[i].model = MODEL_RP8SNI;
-		strcpy(rocketModel[i].modelString, "RocketPort 8 port w/ custom DB78");
-		rocketModel[i].numPorts = 8;
-		break;
-	case PCI_DEVICE_ID_RP16SNI:
-		max_num_aiops = 2;
-		rocketModel[i].model = MODEL_RP16SNI;
-		strcpy(rocketModel[i].modelString, "RocketPort 16 port w/ custom DB78");
-		rocketModel[i].numPorts = 16;
-		break;
-	case PCI_DEVICE_ID_RP16INTF:
-		max_num_aiops = 2;
-		rocketModel[i].model = MODEL_RP16INTF;
-		strcpy(rocketModel[i].modelString, "RocketPort 16 port w/external I/F");
-		rocketModel[i].numPorts = 16;
-		break;
-	case PCI_DEVICE_ID_URP16INTF:
-		max_num_aiops = 2;
-		rocketModel[i].model = MODEL_UPCI_RP16INTF;
-		strcpy(rocketModel[i].modelString, "RocketPort UPCI 16 port w/external I/F");
-		rocketModel[i].numPorts = 16;
-		break;
-	case PCI_DEVICE_ID_CRP16INTF:
-		max_num_aiops = 2;
-		rocketModel[i].model = MODEL_CPCI_RP16INTF;
-		strcpy(rocketModel[i].modelString, "RocketPort Compact PCI 16 port w/external I/F");
-		rocketModel[i].numPorts = 16;
-		break;
-	case PCI_DEVICE_ID_RP32INTF:
-		max_num_aiops = 4;
-		rocketModel[i].model = MODEL_RP32INTF;
-		strcpy(rocketModel[i].modelString, "RocketPort 32 port w/external I/F");
-		rocketModel[i].numPorts = 32;
-		break;
-	case PCI_DEVICE_ID_URP32INTF:
-		max_num_aiops = 4;
-		rocketModel[i].model = MODEL_UPCI_RP32INTF;
-		strcpy(rocketModel[i].modelString, "RocketPort UPCI 32 port w/external I/F");
-		rocketModel[i].numPorts = 32;
-		break;
-	case PCI_DEVICE_ID_RPP4:
-		max_num_aiops = 1;
-		ports_per_aiop = 4;
-		altChanRingIndicator++;
-		fast_clock++;
-		rocketModel[i].model = MODEL_RPP4;
-		strcpy(rocketModel[i].modelString, "RocketPort Plus 4 port");
-		rocketModel[i].numPorts = 4;
-		break;
-	case PCI_DEVICE_ID_RPP8:
-		max_num_aiops = 2;
-		ports_per_aiop = 4;
-		altChanRingIndicator++;
-		fast_clock++;
-		rocketModel[i].model = MODEL_RPP8;
-		strcpy(rocketModel[i].modelString, "RocketPort Plus 8 port");
-		rocketModel[i].numPorts = 8;
-		break;
-	case PCI_DEVICE_ID_RP2_232:
-		max_num_aiops = 1;
-		ports_per_aiop = 2;
-		altChanRingIndicator++;
-		fast_clock++;
-		rocketModel[i].model = MODEL_RP2_232;
-		strcpy(rocketModel[i].modelString, "RocketPort Plus 2 port RS232");
-		rocketModel[i].numPorts = 2;
-		break;
-	case PCI_DEVICE_ID_RP2_422:
-		max_num_aiops = 1;
-		ports_per_aiop = 2;
-		altChanRingIndicator++;
-		fast_clock++;
-		rocketModel[i].model = MODEL_RP2_422;
-		strcpy(rocketModel[i].modelString, "RocketPort Plus 2 port RS422");
-		rocketModel[i].numPorts = 2;
-		break;
-	case PCI_DEVICE_ID_RP6M:
-
-		max_num_aiops = 1;
-		ports_per_aiop = 6;
-
-		/*  If revision is 1, the rocketmodem flash must be loaded.
-		 *  If it is 2 it is a "socketed" version. */
-		if (dev->revision == 1) {
-			rcktpt_type[i] = ROCKET_TYPE_MODEMII;
-			rocketModel[i].loadrm2 = 1;
-		} else {
-			rcktpt_type[i] = ROCKET_TYPE_MODEM;
-		}
-
-		rocketModel[i].model = MODEL_RP6M;
-		strcpy(rocketModel[i].modelString, "RocketModem 6 port");
-		rocketModel[i].numPorts = 6;
-		break;
-	case PCI_DEVICE_ID_RP4M:
-		max_num_aiops = 1;
-		ports_per_aiop = 4;
-		if (dev->revision == 1) {
-			rcktpt_type[i] = ROCKET_TYPE_MODEMII;
-			rocketModel[i].loadrm2 = 1;
-		} else {
-			rcktpt_type[i] = ROCKET_TYPE_MODEM;
-		}
-
-		rocketModel[i].model = MODEL_RP4M;
-		strcpy(rocketModel[i].modelString, "RocketModem 4 port");
-		rocketModel[i].numPorts = 4;
-		break;
-	default:
-		max_num_aiops = 0;
-		break;
-	}
-
-	/*
-	 * Check for UPCI boards.
-	 */
-
-	switch (dev->device) {
-	case PCI_DEVICE_ID_URP32INTF:
-	case PCI_DEVICE_ID_URP8INTF:
-	case PCI_DEVICE_ID_URP16INTF:
-	case PCI_DEVICE_ID_CRP16INTF:
-	case PCI_DEVICE_ID_URP8OCTA:
-		rcktpt_io_addr[i] = pci_resource_start(dev, 2);
-		ConfigIO = pci_resource_start(dev, 1);
-		if (dev->device == PCI_DEVICE_ID_URP8OCTA) {
-			UPCIRingInd = rcktpt_io_addr[i] + _PCI_9030_RING_IND;
-
-			/*
-			 * Check for octa or quad cable.
-			 */
-			if (!
-			    (sInW(ConfigIO + _PCI_9030_GPIO_CTRL) &
-			     PCI_GPIO_CTRL_8PORT)) {
-				ports_per_aiop = 4;
-				rocketModel[i].numPorts = 4;
-			}
-		}
-		break;
-	case PCI_DEVICE_ID_UPCI_RM3_8PORT:
-		max_num_aiops = 1;
-		rocketModel[i].model = MODEL_UPCI_RM3_8PORT;
-		strcpy(rocketModel[i].modelString, "RocketModem III 8 port");
-		rocketModel[i].numPorts = 8;
-		rcktpt_io_addr[i] = pci_resource_start(dev, 2);
-		UPCIRingInd = rcktpt_io_addr[i] + _PCI_9030_RING_IND;
-		ConfigIO = pci_resource_start(dev, 1);
-		rcktpt_type[i] = ROCKET_TYPE_MODEMIII;
-		break;
-	case PCI_DEVICE_ID_UPCI_RM3_4PORT:
-		max_num_aiops = 1;
-		rocketModel[i].model = MODEL_UPCI_RM3_4PORT;
-		strcpy(rocketModel[i].modelString, "RocketModem III 4 port");
-		rocketModel[i].numPorts = 4;
-		rcktpt_io_addr[i] = pci_resource_start(dev, 2);
-		UPCIRingInd = rcktpt_io_addr[i] + _PCI_9030_RING_IND;
-		ConfigIO = pci_resource_start(dev, 1);
-		rcktpt_type[i] = ROCKET_TYPE_MODEMIII;
-		break;
-	default:
-		break;
-	}
-
-	if (fast_clock) {
-		sClockPrescale = 0x12;	/* mod 2 (divide by 3) */
-		rp_baud_base[i] = 921600;
-	} else {
-		/*
-		 * If support_low_speed is set, use the slow clock
-		 * prescale, which supports 50 bps
-		 */
-		if (support_low_speed) {
-			/* mod 9 (divide by 10) prescale */
-			sClockPrescale = 0x19;
-			rp_baud_base[i] = 230400;
-		} else {
-			/* mod 4 (divide by 5) prescale */
-			sClockPrescale = 0x14;
-			rp_baud_base[i] = 460800;
-		}
-	}
-
-	for (aiop = 0; aiop < max_num_aiops; aiop++)
-		aiopio[aiop] = rcktpt_io_addr[i] + (aiop * 0x40);
-	ctlp = sCtlNumToCtlPtr(i);
-	num_aiops = sPCIInitController(ctlp, i, aiopio, max_num_aiops, ConfigIO, 0, FREQ_DIS, 0, altChanRingIndicator, UPCIRingInd);
-	for (aiop = 0; aiop < max_num_aiops; aiop++)
-		ctlp->AiopNumChan[aiop] = ports_per_aiop;
-
-	dev_info(&dev->dev, "comtrol PCI controller #%d found at "
-		"address %04lx, %d AIOP(s) (%s), creating ttyR%d - %ld\n",
-		i, rcktpt_io_addr[i], num_aiops, rocketModel[i].modelString,
-		rocketModel[i].startingPortNumber,
-		rocketModel[i].startingPortNumber + rocketModel[i].numPorts-1);
-
-	if (num_aiops <= 0) {
-		rcktpt_io_addr[i] = 0;
-		return (0);
-	}
-	is_PCI[i] = 1;
-
-	/*  Reset the AIOPIC, init the serial ports */
-	for (aiop = 0; aiop < num_aiops; aiop++) {
-		sResetAiopByNum(ctlp, aiop);
-		for (chan = 0; chan < ports_per_aiop; chan++)
-			init_r_port(i, aiop, chan, dev);
-	}
-
-	/*  Rocket modems must be reset */
-	if ((rcktpt_type[i] == ROCKET_TYPE_MODEM) ||
-	    (rcktpt_type[i] == ROCKET_TYPE_MODEMII) ||
-	    (rcktpt_type[i] == ROCKET_TYPE_MODEMIII)) {
-		for (chan = 0; chan < ports_per_aiop; chan++)
-			sPCIModemReset(ctlp, chan, 1);
-		msleep(500);
-		for (chan = 0; chan < ports_per_aiop; chan++)
-			sPCIModemReset(ctlp, chan, 0);
-		msleep(500);
-		rmSpeakerReset(ctlp, rocketModel[i].model);
-	}
-	return (1);
-}
-
-/*
- *  Probes for PCI cards, inits them if found
- *  Input:   board_found = number of ISA boards already found, or the
- *           starting board number
- *  Returns: Number of PCI boards found
- */
-static int __init init_PCI(int boards_found)
-{
-	struct pci_dev *dev = NULL;
-	int count = 0;
-
-	/*  Work through the PCI device list, pulling out ours */
-	while ((dev = pci_get_device(PCI_VENDOR_ID_RP, PCI_ANY_ID, dev))) {
-		if (register_PCI(count + boards_found, dev))
-			count++;
-	}
-	return (count);
-}
-
-#endif				/* CONFIG_PCI */
-
-/*
- *  Probes for ISA cards
- *  Input:   i = the board number to look for
- *  Returns: 1 if board found, 0 else
- */
-static int __init init_ISA(int i)
-{
-	int num_aiops, num_chan = 0, total_num_chan = 0;
-	int aiop, chan;
-	unsigned int aiopio[MAX_AIOPS_PER_BOARD];
-	CONTROLLER_t *ctlp;
-	char *type_string;
-
-	/*  If io_addr is zero, no board configured */
-	if (rcktpt_io_addr[i] == 0)
-		return (0);
-
-	/*  Reserve the IO region */
-	if (!request_region(rcktpt_io_addr[i], 64, "Comtrol RocketPort")) {
-		printk(KERN_ERR "Unable to reserve IO region for configured "
-				"ISA RocketPort at address 0x%lx, board not "
-				"installed...\n", rcktpt_io_addr[i]);
-		rcktpt_io_addr[i] = 0;
-		return (0);
-	}
-
-	ctlp = sCtlNumToCtlPtr(i);
-
-	ctlp->boardType = rcktpt_type[i];
-
-	switch (rcktpt_type[i]) {
-	case ROCKET_TYPE_PC104:
-		type_string = "(PC104)";
-		break;
-	case ROCKET_TYPE_MODEM:
-		type_string = "(RocketModem)";
-		break;
-	case ROCKET_TYPE_MODEMII:
-		type_string = "(RocketModem II)";
-		break;
-	default:
-		type_string = "";
-		break;
-	}
-
-	/*
-	 * If support_low_speed is set, use the slow clock prescale,
-	 * which supports 50 bps
-	 */
-	if (support_low_speed) {
-		sClockPrescale = 0x19;	/* mod 9 (divide by 10) prescale */
-		rp_baud_base[i] = 230400;
-	} else {
-		sClockPrescale = 0x14;	/* mod 4 (divide by 5) prescale */
-		rp_baud_base[i] = 460800;
-	}
-
-	for (aiop = 0; aiop < MAX_AIOPS_PER_BOARD; aiop++)
-		aiopio[aiop] = rcktpt_io_addr[i] + (aiop * 0x400);
-
-	num_aiops = sInitController(ctlp, i, controller + (i * 0x400), aiopio,  MAX_AIOPS_PER_BOARD, 0, FREQ_DIS, 0);
-
-	if (ctlp->boardType == ROCKET_TYPE_PC104) {
-		sEnAiop(ctlp, 2);	/* only one AIOPIC, but these */
-		sEnAiop(ctlp, 3);	/* CSels used for other stuff */
-	}
-
-	/*  If something went wrong initing the AIOP's release the ISA IO memory */
-	if (num_aiops <= 0) {
-		release_region(rcktpt_io_addr[i], 64);
-		rcktpt_io_addr[i] = 0;
-		return (0);
-	}
-  
-	rocketModel[i].startingPortNumber = nextLineNumber;
-
-	for (aiop = 0; aiop < num_aiops; aiop++) {
-		sResetAiopByNum(ctlp, aiop);
-		sEnAiop(ctlp, aiop);
-		num_chan = sGetAiopNumChan(ctlp, aiop);
-		total_num_chan += num_chan;
-		for (chan = 0; chan < num_chan; chan++)
-			init_r_port(i, aiop, chan, NULL);
-	}
-	is_PCI[i] = 0;
-	if ((rcktpt_type[i] == ROCKET_TYPE_MODEM) || (rcktpt_type[i] == ROCKET_TYPE_MODEMII)) {
-		num_chan = sGetAiopNumChan(ctlp, 0);
-		total_num_chan = num_chan;
-		for (chan = 0; chan < num_chan; chan++)
-			sModemReset(ctlp, chan, 1);
-		msleep(500);
-		for (chan = 0; chan < num_chan; chan++)
-			sModemReset(ctlp, chan, 0);
-		msleep(500);
-		strcpy(rocketModel[i].modelString, "RocketModem ISA");
-	} else {
-		strcpy(rocketModel[i].modelString, "RocketPort ISA");
-	}
-	rocketModel[i].numPorts = total_num_chan;
-	rocketModel[i].model = MODEL_ISA;
-
-	printk(KERN_INFO "RocketPort ISA card #%d found at 0x%lx - %d AIOPs %s\n", 
-	       i, rcktpt_io_addr[i], num_aiops, type_string);
-
-	printk(KERN_INFO "Installing %s, creating /dev/ttyR%d - %ld\n",
-	       rocketModel[i].modelString,
-	       rocketModel[i].startingPortNumber,
-	       rocketModel[i].startingPortNumber +
-	       rocketModel[i].numPorts - 1);
-
-	return (1);
-}
-
-static const struct tty_operations rocket_ops = {
-	.open = rp_open,
-	.close = rp_close,
-	.write = rp_write,
-	.put_char = rp_put_char,
-	.write_room = rp_write_room,
-	.chars_in_buffer = rp_chars_in_buffer,
-	.flush_buffer = rp_flush_buffer,
-	.ioctl = rp_ioctl,
-	.throttle = rp_throttle,
-	.unthrottle = rp_unthrottle,
-	.set_termios = rp_set_termios,
-	.stop = rp_stop,
-	.start = rp_start,
-	.hangup = rp_hangup,
-	.break_ctl = rp_break,
-	.send_xchar = rp_send_xchar,
-	.wait_until_sent = rp_wait_until_sent,
-	.tiocmget = rp_tiocmget,
-	.tiocmset = rp_tiocmset,
-};
-
-static const struct tty_port_operations rocket_port_ops = {
-	.carrier_raised = carrier_raised,
-	.dtr_rts = dtr_rts,
-};
-
-/*
- * The module "startup" routine; it's run when the module is loaded.
- */
-static int __init rp_init(void)
-{
-	int ret = -ENOMEM, pci_boards_found, isa_boards_found, i;
-
-	printk(KERN_INFO "RocketPort device driver module, version %s, %s\n",
-	       ROCKET_VERSION, ROCKET_DATE);
-
-	rocket_driver = alloc_tty_driver(MAX_RP_PORTS);
-	if (!rocket_driver)
-		goto err;
-
-	/*
-	 *  If board 1 is non-zero, there is at least one ISA configured.  If controller is 
-	 *  zero, use the default controller IO address of board1 + 0x40.
-	 */
-	if (board1) {
-		if (controller == 0)
-			controller = board1 + 0x40;
-	} else {
-		controller = 0;  /*  Used as a flag, meaning no ISA boards */
-	}
-
-	/*  If an ISA card is configured, reserve the 4 byte IO space for the Mudbac controller */
-	if (controller && (!request_region(controller, 4, "Comtrol RocketPort"))) {
-		printk(KERN_ERR "Unable to reserve IO region for first "
-			"configured ISA RocketPort controller 0x%lx.  "
-			"Driver exiting\n", controller);
-		ret = -EBUSY;
-		goto err_tty;
-	}
-
-	/*  Store ISA variable retrieved from command line or .conf file. */
-	rcktpt_io_addr[0] = board1;
-	rcktpt_io_addr[1] = board2;
-	rcktpt_io_addr[2] = board3;
-	rcktpt_io_addr[3] = board4;
-
-	rcktpt_type[0] = modem1 ? ROCKET_TYPE_MODEM : ROCKET_TYPE_NORMAL;
-	rcktpt_type[0] = pc104_1[0] ? ROCKET_TYPE_PC104 : rcktpt_type[0];
-	rcktpt_type[1] = modem2 ? ROCKET_TYPE_MODEM : ROCKET_TYPE_NORMAL;
-	rcktpt_type[1] = pc104_2[0] ? ROCKET_TYPE_PC104 : rcktpt_type[1];
-	rcktpt_type[2] = modem3 ? ROCKET_TYPE_MODEM : ROCKET_TYPE_NORMAL;
-	rcktpt_type[2] = pc104_3[0] ? ROCKET_TYPE_PC104 : rcktpt_type[2];
-	rcktpt_type[3] = modem4 ? ROCKET_TYPE_MODEM : ROCKET_TYPE_NORMAL;
-	rcktpt_type[3] = pc104_4[0] ? ROCKET_TYPE_PC104 : rcktpt_type[3];
-
-	/*
-	 * Set up the tty driver structure and then register this
-	 * driver with the tty layer.
-	 */
-
-	rocket_driver->flags = TTY_DRIVER_DYNAMIC_DEV;
-	rocket_driver->name = "ttyR";
-	rocket_driver->driver_name = "Comtrol RocketPort";
-	rocket_driver->major = TTY_ROCKET_MAJOR;
-	rocket_driver->minor_start = 0;
-	rocket_driver->type = TTY_DRIVER_TYPE_SERIAL;
-	rocket_driver->subtype = SERIAL_TYPE_NORMAL;
-	rocket_driver->init_termios = tty_std_termios;
-	rocket_driver->init_termios.c_cflag =
-	    B9600 | CS8 | CREAD | HUPCL | CLOCAL;
-	rocket_driver->init_termios.c_ispeed = 9600;
-	rocket_driver->init_termios.c_ospeed = 9600;
-#ifdef ROCKET_SOFT_FLOW
-	rocket_driver->flags |= TTY_DRIVER_REAL_RAW;
-#endif
-	tty_set_operations(rocket_driver, &rocket_ops);
-
-	ret = tty_register_driver(rocket_driver);
-	if (ret < 0) {
-		printk(KERN_ERR "Couldn't install tty RocketPort driver\n");
-		goto err_controller;
-	}
-
-#ifdef ROCKET_DEBUG_OPEN
-	printk(KERN_INFO "RocketPort driver is major %d\n", rocket_driver.major);
-#endif
-
-	/*
-	 *  OK, let's probe each of the controllers looking for boards.  Any boards found
-         *  will be initialized here.
-	 */
-	isa_boards_found = 0;
-	pci_boards_found = 0;
-
-	for (i = 0; i < NUM_BOARDS; i++) {
-		if (init_ISA(i))
-			isa_boards_found++;
-	}
-
-#ifdef CONFIG_PCI
-	if (isa_boards_found < NUM_BOARDS)
-		pci_boards_found = init_PCI(isa_boards_found);
-#endif
-
-	max_board = pci_boards_found + isa_boards_found;
-
-	if (max_board == 0) {
-		printk(KERN_ERR "No rocketport ports found; unloading driver\n");
-		ret = -ENXIO;
-		goto err_ttyu;
-	}
-
-	return 0;
-err_ttyu:
-	tty_unregister_driver(rocket_driver);
-err_controller:
-	if (controller)
-		release_region(controller, 4);
-err_tty:
-	put_tty_driver(rocket_driver);
-err:
-	return ret;
-}
-
-
-static void rp_cleanup_module(void)
-{
-	int retval;
-	int i;
-
-	del_timer_sync(&rocket_timer);
-
-	retval = tty_unregister_driver(rocket_driver);
-	if (retval)
-		printk(KERN_ERR "Error %d while trying to unregister "
-		       "rocketport driver\n", -retval);
-
-	for (i = 0; i < MAX_RP_PORTS; i++)
-		if (rp_table[i]) {
-			tty_unregister_device(rocket_driver, i);
-			tty_port_destroy(&rp_table[i]->port);
-			kfree(rp_table[i]);
-		}
-
-	put_tty_driver(rocket_driver);
-
-	for (i = 0; i < NUM_BOARDS; i++) {
-		if (rcktpt_io_addr[i] <= 0 || is_PCI[i])
-			continue;
-		release_region(rcktpt_io_addr[i], 64);
-	}
-	if (controller)
-		release_region(controller, 4);
-}
-
-/***************************************************************************
-Function: sInitController
-Purpose:  Initialization of controller global registers and controller
-          structure.
-Call:     sInitController(CtlP,CtlNum,MudbacIO,AiopIOList,AiopIOListSize,
-                          IRQNum,Frequency,PeriodicOnly)
-          CONTROLLER_T *CtlP; Ptr to controller structure
-          int CtlNum; Controller number
-          ByteIO_t MudbacIO; Mudbac base I/O address.
-          ByteIO_t *AiopIOList; List of I/O addresses for each AIOP.
-             This list must be in the order the AIOPs will be found on the
-             controller.  Once an AIOP in the list is not found, it is
-             assumed that there are no more AIOPs on the controller.
-          int AiopIOListSize; Number of addresses in AiopIOList
-          int IRQNum; Interrupt Request number.  Can be any of the following:
-                         0: Disable global interrupts
-                         3: IRQ 3
-                         4: IRQ 4
-                         5: IRQ 5
-                         9: IRQ 9
-                         10: IRQ 10
-                         11: IRQ 11
-                         12: IRQ 12
-                         15: IRQ 15
-          Byte_t Frequency: A flag identifying the frequency
-                   of the periodic interrupt, can be any one of the following:
-                      FREQ_DIS - periodic interrupt disabled
-                      FREQ_137HZ - 137 Hertz
-                      FREQ_69HZ - 69 Hertz
-                      FREQ_34HZ - 34 Hertz
-                      FREQ_17HZ - 17 Hertz
-                      FREQ_9HZ - 9 Hertz
-                      FREQ_4HZ - 4 Hertz
-                   If IRQNum is set to 0 the Frequency parameter is
-                   overidden, it is forced to a value of FREQ_DIS.
-          int PeriodicOnly: 1 if all interrupts except the periodic
-                               interrupt are to be blocked.
-                            0 is both the periodic interrupt and
-                               other channel interrupts are allowed.
-                            If IRQNum is set to 0 the PeriodicOnly parameter is
-                               overidden, it is forced to a value of 0.
-Return:   int: Number of AIOPs on the controller, or CTLID_NULL if controller
-               initialization failed.
-
-Comments:
-          If periodic interrupts are to be disabled but AIOP interrupts
-          are allowed, set Frequency to FREQ_DIS and PeriodicOnly to 0.
-
-          If interrupts are to be completely disabled set IRQNum to 0.
-
-          Setting Frequency to FREQ_DIS and PeriodicOnly to 1 is an
-          invalid combination.
-
-          This function performs initialization of global interrupt modes,
-          but it does not actually enable global interrupts.  To enable
-          and disable global interrupts use functions sEnGlobalInt() and
-          sDisGlobalInt().  Enabling of global interrupts is normally not
-          done until all other initializations are complete.
-
-          Even if interrupts are globally enabled, they must also be
-          individually enabled for each channel that is to generate
-          interrupts.
-
-Warnings: No range checking on any of the parameters is done.
-
-          No context switches are allowed while executing this function.
-
-          After this function all AIOPs on the controller are disabled,
-          they can be enabled with sEnAiop().
-*/
-static int sInitController(CONTROLLER_T * CtlP, int CtlNum, ByteIO_t MudbacIO,
-			   ByteIO_t * AiopIOList, int AiopIOListSize,
-			   int IRQNum, Byte_t Frequency, int PeriodicOnly)
-{
-	int i;
-	ByteIO_t io;
-	int done;
-
-	CtlP->AiopIntrBits = aiop_intr_bits;
-	CtlP->AltChanRingIndicator = 0;
-	CtlP->CtlNum = CtlNum;
-	CtlP->CtlID = CTLID_0001;	/* controller release 1 */
-	CtlP->BusType = isISA;
-	CtlP->MBaseIO = MudbacIO;
-	CtlP->MReg1IO = MudbacIO + 1;
-	CtlP->MReg2IO = MudbacIO + 2;
-	CtlP->MReg3IO = MudbacIO + 3;
-#if 1
-	CtlP->MReg2 = 0;	/* interrupt disable */
-	CtlP->MReg3 = 0;	/* no periodic interrupts */
-#else
-	if (sIRQMap[IRQNum] == 0) {	/* interrupts globally disabled */
-		CtlP->MReg2 = 0;	/* interrupt disable */
-		CtlP->MReg3 = 0;	/* no periodic interrupts */
-	} else {
-		CtlP->MReg2 = sIRQMap[IRQNum];	/* set IRQ number */
-		CtlP->MReg3 = Frequency;	/* set frequency */
-		if (PeriodicOnly) {	/* periodic interrupt only */
-			CtlP->MReg3 |= PERIODIC_ONLY;
-		}
-	}
-#endif
-	sOutB(CtlP->MReg2IO, CtlP->MReg2);
-	sOutB(CtlP->MReg3IO, CtlP->MReg3);
-	sControllerEOI(CtlP);	/* clear EOI if warm init */
-	/* Init AIOPs */
-	CtlP->NumAiop = 0;
-	for (i = done = 0; i < AiopIOListSize; i++) {
-		io = AiopIOList[i];
-		CtlP->AiopIO[i] = (WordIO_t) io;
-		CtlP->AiopIntChanIO[i] = io + _INT_CHAN;
-		sOutB(CtlP->MReg2IO, CtlP->MReg2 | (i & 0x03));	/* AIOP index */
-		sOutB(MudbacIO, (Byte_t) (io >> 6));	/* set up AIOP I/O in MUDBAC */
-		if (done)
-			continue;
-		sEnAiop(CtlP, i);	/* enable the AIOP */
-		CtlP->AiopID[i] = sReadAiopID(io);	/* read AIOP ID */
-		if (CtlP->AiopID[i] == AIOPID_NULL)	/* if AIOP does not exist */
-			done = 1;	/* done looking for AIOPs */
-		else {
-			CtlP->AiopNumChan[i] = sReadAiopNumChan((WordIO_t) io);	/* num channels in AIOP */
-			sOutW((WordIO_t) io + _INDX_ADDR, _CLK_PRE);	/* clock prescaler */
-			sOutB(io + _INDX_DATA, sClockPrescale);
-			CtlP->NumAiop++;	/* bump count of AIOPs */
-		}
-		sDisAiop(CtlP, i);	/* disable AIOP */
-	}
-
-	if (CtlP->NumAiop == 0)
-		return (-1);
-	else
-		return (CtlP->NumAiop);
-}
-
-/***************************************************************************
-Function: sReadAiopID
-Purpose:  Read the AIOP idenfication number directly from an AIOP.
-Call:     sReadAiopID(io)
-          ByteIO_t io: AIOP base I/O address
-Return:   int: Flag AIOPID_XXXX if a valid AIOP is found, where X
-                 is replace by an identifying number.
-          Flag AIOPID_NULL if no valid AIOP is found
-Warnings: No context switches are allowed while executing this function.
-
-*/
-static int sReadAiopID(ByteIO_t io)
-{
-	Byte_t AiopID;		/* ID byte from AIOP */
-
-	sOutB(io + _CMD_REG, RESET_ALL);	/* reset AIOP */
-	sOutB(io + _CMD_REG, 0x0);
-	AiopID = sInW(io + _CHN_STAT0) & 0x07;
-	if (AiopID == 0x06)
-		return (1);
-	else			/* AIOP does not exist */
-		return (-1);
-}
-
-/***************************************************************************
-Function: sReadAiopNumChan
-Purpose:  Read the number of channels available in an AIOP directly from
-          an AIOP.
-Call:     sReadAiopNumChan(io)
-          WordIO_t io: AIOP base I/O address
-Return:   int: The number of channels available
-Comments: The number of channels is determined by write/reads from identical
-          offsets within the SRAM address spaces for channels 0 and 4.
-          If the channel 4 space is mirrored to channel 0 it is a 4 channel
-          AIOP, otherwise it is an 8 channel.
-Warnings: No context switches are allowed while executing this function.
-*/
-static int sReadAiopNumChan(WordIO_t io)
-{
-	Word_t x;
-	static Byte_t R[4] = { 0x00, 0x00, 0x34, 0x12 };
-
-	/* write to chan 0 SRAM */
-	out32((DWordIO_t) io + _INDX_ADDR, R);
-	sOutW(io + _INDX_ADDR, 0);	/* read from SRAM, chan 0 */
-	x = sInW(io + _INDX_DATA);
-	sOutW(io + _INDX_ADDR, 0x4000);	/* read from SRAM, chan 4 */
-	if (x != sInW(io + _INDX_DATA))	/* if different must be 8 chan */
-		return (8);
-	else
-		return (4);
-}
-
-/***************************************************************************
-Function: sInitChan
-Purpose:  Initialization of a channel and channel structure
-Call:     sInitChan(CtlP,ChP,AiopNum,ChanNum)
-          CONTROLLER_T *CtlP; Ptr to controller structure
-          CHANNEL_T *ChP; Ptr to channel structure
-          int AiopNum; AIOP number within controller
-          int ChanNum; Channel number within AIOP
-Return:   int: 1 if initialization succeeded, 0 if it fails because channel
-               number exceeds number of channels available in AIOP.
-Comments: This function must be called before a channel can be used.
-Warnings: No range checking on any of the parameters is done.
-
-          No context switches are allowed while executing this function.
-*/
-static int sInitChan(CONTROLLER_T * CtlP, CHANNEL_T * ChP, int AiopNum,
-		     int ChanNum)
-{
-	int i;
-	WordIO_t AiopIO;
-	WordIO_t ChIOOff;
-	Byte_t *ChR;
-	Word_t ChOff;
-	static Byte_t R[4];
-	int brd9600;
-
-	if (ChanNum >= CtlP->AiopNumChan[AiopNum])
-		return 0;	/* exceeds num chans in AIOP */
-
-	/* Channel, AIOP, and controller identifiers */
-	ChP->CtlP = CtlP;
-	ChP->ChanID = CtlP->AiopID[AiopNum];
-	ChP->AiopNum = AiopNum;
-	ChP->ChanNum = ChanNum;
-
-	/* Global direct addresses */
-	AiopIO = CtlP->AiopIO[AiopNum];
-	ChP->Cmd = (ByteIO_t) AiopIO + _CMD_REG;
-	ChP->IntChan = (ByteIO_t) AiopIO + _INT_CHAN;
-	ChP->IntMask = (ByteIO_t) AiopIO + _INT_MASK;
-	ChP->IndexAddr = (DWordIO_t) AiopIO + _INDX_ADDR;
-	ChP->IndexData = AiopIO + _INDX_DATA;
-
-	/* Channel direct addresses */
-	ChIOOff = AiopIO + ChP->ChanNum * 2;
-	ChP->TxRxData = ChIOOff + _TD0;
-	ChP->ChanStat = ChIOOff + _CHN_STAT0;
-	ChP->TxRxCount = ChIOOff + _FIFO_CNT0;
-	ChP->IntID = (ByteIO_t) AiopIO + ChP->ChanNum + _INT_ID0;
-
-	/* Initialize the channel from the RData array */
-	for (i = 0; i < RDATASIZE; i += 4) {
-		R[0] = RData[i];
-		R[1] = RData[i + 1] + 0x10 * ChanNum;
-		R[2] = RData[i + 2];
-		R[3] = RData[i + 3];
-		out32(ChP->IndexAddr, R);
-	}
-
-	ChR = ChP->R;
-	for (i = 0; i < RREGDATASIZE; i += 4) {
-		ChR[i] = RRegData[i];
-		ChR[i + 1] = RRegData[i + 1] + 0x10 * ChanNum;
-		ChR[i + 2] = RRegData[i + 2];
-		ChR[i + 3] = RRegData[i + 3];
-	}
-
-	/* Indexed registers */
-	ChOff = (Word_t) ChanNum *0x1000;
-
-	if (sClockPrescale == 0x14)
-		brd9600 = 47;
-	else
-		brd9600 = 23;
-
-	ChP->BaudDiv[0] = (Byte_t) (ChOff + _BAUD);
-	ChP->BaudDiv[1] = (Byte_t) ((ChOff + _BAUD) >> 8);
-	ChP->BaudDiv[2] = (Byte_t) brd9600;
-	ChP->BaudDiv[3] = (Byte_t) (brd9600 >> 8);
-	out32(ChP->IndexAddr, ChP->BaudDiv);
-
-	ChP->TxControl[0] = (Byte_t) (ChOff + _TX_CTRL);
-	ChP->TxControl[1] = (Byte_t) ((ChOff + _TX_CTRL) >> 8);
-	ChP->TxControl[2] = 0;
-	ChP->TxControl[3] = 0;
-	out32(ChP->IndexAddr, ChP->TxControl);
-
-	ChP->RxControl[0] = (Byte_t) (ChOff + _RX_CTRL);
-	ChP->RxControl[1] = (Byte_t) ((ChOff + _RX_CTRL) >> 8);
-	ChP->RxControl[2] = 0;
-	ChP->RxControl[3] = 0;
-	out32(ChP->IndexAddr, ChP->RxControl);
-
-	ChP->TxEnables[0] = (Byte_t) (ChOff + _TX_ENBLS);
-	ChP->TxEnables[1] = (Byte_t) ((ChOff + _TX_ENBLS) >> 8);
-	ChP->TxEnables[2] = 0;
-	ChP->TxEnables[3] = 0;
-	out32(ChP->IndexAddr, ChP->TxEnables);
-
-	ChP->TxCompare[0] = (Byte_t) (ChOff + _TXCMP1);
-	ChP->TxCompare[1] = (Byte_t) ((ChOff + _TXCMP1) >> 8);
-	ChP->TxCompare[2] = 0;
-	ChP->TxCompare[3] = 0;
-	out32(ChP->IndexAddr, ChP->TxCompare);
-
-	ChP->TxReplace1[0] = (Byte_t) (ChOff + _TXREP1B1);
-	ChP->TxReplace1[1] = (Byte_t) ((ChOff + _TXREP1B1) >> 8);
-	ChP->TxReplace1[2] = 0;
-	ChP->TxReplace1[3] = 0;
-	out32(ChP->IndexAddr, ChP->TxReplace1);
-
-	ChP->TxReplace2[0] = (Byte_t) (ChOff + _TXREP2);
-	ChP->TxReplace2[1] = (Byte_t) ((ChOff + _TXREP2) >> 8);
-	ChP->TxReplace2[2] = 0;
-	ChP->TxReplace2[3] = 0;
-	out32(ChP->IndexAddr, ChP->TxReplace2);
-
-	ChP->TxFIFOPtrs = ChOff + _TXF_OUTP;
-	ChP->TxFIFO = ChOff + _TX_FIFO;
-
-	sOutB(ChP->Cmd, (Byte_t) ChanNum | RESTXFCNT);	/* apply reset Tx FIFO count */
-	sOutB(ChP->Cmd, (Byte_t) ChanNum);	/* remove reset Tx FIFO count */
-	sOutW((WordIO_t) ChP->IndexAddr, ChP->TxFIFOPtrs);	/* clear Tx in/out ptrs */
-	sOutW(ChP->IndexData, 0);
-	ChP->RxFIFOPtrs = ChOff + _RXF_OUTP;
-	ChP->RxFIFO = ChOff + _RX_FIFO;
-
-	sOutB(ChP->Cmd, (Byte_t) ChanNum | RESRXFCNT);	/* apply reset Rx FIFO count */
-	sOutB(ChP->Cmd, (Byte_t) ChanNum);	/* remove reset Rx FIFO count */
-	sOutW((WordIO_t) ChP->IndexAddr, ChP->RxFIFOPtrs);	/* clear Rx out ptr */
-	sOutW(ChP->IndexData, 0);
-	sOutW((WordIO_t) ChP->IndexAddr, ChP->RxFIFOPtrs + 2);	/* clear Rx in ptr */
-	sOutW(ChP->IndexData, 0);
-	ChP->TxPrioCnt = ChOff + _TXP_CNT;
-	sOutW((WordIO_t) ChP->IndexAddr, ChP->TxPrioCnt);
-	sOutB(ChP->IndexData, 0);
-	ChP->TxPrioPtr = ChOff + _TXP_PNTR;
-	sOutW((WordIO_t) ChP->IndexAddr, ChP->TxPrioPtr);
-	sOutB(ChP->IndexData, 0);
-	ChP->TxPrioBuf = ChOff + _TXP_BUF;
-	sEnRxProcessor(ChP);	/* start the Rx processor */
-
-	return 1;
-}
-
-/***************************************************************************
-Function: sStopRxProcessor
-Purpose:  Stop the receive processor from processing a channel.
-Call:     sStopRxProcessor(ChP)
-          CHANNEL_T *ChP; Ptr to channel structure
-
-Comments: The receive processor can be started again with sStartRxProcessor().
-          This function causes the receive processor to skip over the
-          stopped channel.  It does not stop it from processing other channels.
-
-Warnings: No context switches are allowed while executing this function.
-
-          Do not leave the receive processor stopped for more than one
-          character time.
-
-          After calling this function a delay of 4 uS is required to ensure
-          that the receive processor is no longer processing this channel.
-*/
-static void sStopRxProcessor(CHANNEL_T * ChP)
-{
-	Byte_t R[4];
-
-	R[0] = ChP->R[0];
-	R[1] = ChP->R[1];
-	R[2] = 0x0a;
-	R[3] = ChP->R[3];
-	out32(ChP->IndexAddr, R);
-}
-
-/***************************************************************************
-Function: sFlushRxFIFO
-Purpose:  Flush the Rx FIFO
-Call:     sFlushRxFIFO(ChP)
-          CHANNEL_T *ChP; Ptr to channel structure
-Return:   void
-Comments: To prevent data from being enqueued or dequeued in the Tx FIFO
-          while it is being flushed the receive processor is stopped
-          and the transmitter is disabled.  After these operations a
-          4 uS delay is done before clearing the pointers to allow
-          the receive processor to stop.  These items are handled inside
-          this function.
-Warnings: No context switches are allowed while executing this function.
-*/
-static void sFlushRxFIFO(CHANNEL_T * ChP)
-{
-	int i;
-	Byte_t Ch;		/* channel number within AIOP */
-	int RxFIFOEnabled;	/* 1 if Rx FIFO enabled */
-
-	if (sGetRxCnt(ChP) == 0)	/* Rx FIFO empty */
-		return;		/* don't need to flush */
-
-	RxFIFOEnabled = 0;
-	if (ChP->R[0x32] == 0x08) {	/* Rx FIFO is enabled */
-		RxFIFOEnabled = 1;
-		sDisRxFIFO(ChP);	/* disable it */
-		for (i = 0; i < 2000 / 200; i++)	/* delay 2 uS to allow proc to disable FIFO */
-			sInB(ChP->IntChan);	/* depends on bus i/o timing */
-	}
-	sGetChanStatus(ChP);	/* clear any pending Rx errors in chan stat */
-	Ch = (Byte_t) sGetChanNum(ChP);
-	sOutB(ChP->Cmd, Ch | RESRXFCNT);	/* apply reset Rx FIFO count */
-	sOutB(ChP->Cmd, Ch);	/* remove reset Rx FIFO count */
-	sOutW((WordIO_t) ChP->IndexAddr, ChP->RxFIFOPtrs);	/* clear Rx out ptr */
-	sOutW(ChP->IndexData, 0);
-	sOutW((WordIO_t) ChP->IndexAddr, ChP->RxFIFOPtrs + 2);	/* clear Rx in ptr */
-	sOutW(ChP->IndexData, 0);
-	if (RxFIFOEnabled)
-		sEnRxFIFO(ChP);	/* enable Rx FIFO */
-}
-
-/***************************************************************************
-Function: sFlushTxFIFO
-Purpose:  Flush the Tx FIFO
-Call:     sFlushTxFIFO(ChP)
-          CHANNEL_T *ChP; Ptr to channel structure
-Return:   void
-Comments: To prevent data from being enqueued or dequeued in the Tx FIFO
-          while it is being flushed the receive processor is stopped
-          and the transmitter is disabled.  After these operations a
-          4 uS delay is done before clearing the pointers to allow
-          the receive processor to stop.  These items are handled inside
-          this function.
-Warnings: No context switches are allowed while executing this function.
-*/
-static void sFlushTxFIFO(CHANNEL_T * ChP)
-{
-	int i;
-	Byte_t Ch;		/* channel number within AIOP */
-	int TxEnabled;		/* 1 if transmitter enabled */
-
-	if (sGetTxCnt(ChP) == 0)	/* Tx FIFO empty */
-		return;		/* don't need to flush */
-
-	TxEnabled = 0;
-	if (ChP->TxControl[3] & TX_ENABLE) {
-		TxEnabled = 1;
-		sDisTransmit(ChP);	/* disable transmitter */
-	}
-	sStopRxProcessor(ChP);	/* stop Rx processor */
-	for (i = 0; i < 4000 / 200; i++)	/* delay 4 uS to allow proc to stop */
-		sInB(ChP->IntChan);	/* depends on bus i/o timing */
-	Ch = (Byte_t) sGetChanNum(ChP);
-	sOutB(ChP->Cmd, Ch | RESTXFCNT);	/* apply reset Tx FIFO count */
-	sOutB(ChP->Cmd, Ch);	/* remove reset Tx FIFO count */
-	sOutW((WordIO_t) ChP->IndexAddr, ChP->TxFIFOPtrs);	/* clear Tx in/out ptrs */
-	sOutW(ChP->IndexData, 0);
-	if (TxEnabled)
-		sEnTransmit(ChP);	/* enable transmitter */
-	sStartRxProcessor(ChP);	/* restart Rx processor */
-}
-
-/***************************************************************************
-Function: sWriteTxPrioByte
-Purpose:  Write a byte of priority transmit data to a channel
-Call:     sWriteTxPrioByte(ChP,Data)
-          CHANNEL_T *ChP; Ptr to channel structure
-          Byte_t Data; The transmit data byte
-
-Return:   int: 1 if the bytes is successfully written, otherwise 0.
-
-Comments: The priority byte is transmitted before any data in the Tx FIFO.
-
-Warnings: No context switches are allowed while executing this function.
-*/
-static int sWriteTxPrioByte(CHANNEL_T * ChP, Byte_t Data)
-{
-	Byte_t DWBuf[4];	/* buffer for double word writes */
-	Word_t *WordPtr;	/* must be far because Win SS != DS */
-	register DWordIO_t IndexAddr;
-
-	if (sGetTxCnt(ChP) > 1) {	/* write it to Tx priority buffer */
-		IndexAddr = ChP->IndexAddr;
-		sOutW((WordIO_t) IndexAddr, ChP->TxPrioCnt);	/* get priority buffer status */
-		if (sInB((ByteIO_t) ChP->IndexData) & PRI_PEND)	/* priority buffer busy */
-			return (0);	/* nothing sent */
-
-		WordPtr = (Word_t *) (&DWBuf[0]);
-		*WordPtr = ChP->TxPrioBuf;	/* data byte address */
-
-		DWBuf[2] = Data;	/* data byte value */
-		out32(IndexAddr, DWBuf);	/* write it out */
-
-		*WordPtr = ChP->TxPrioCnt;	/* Tx priority count address */
-
-		DWBuf[2] = PRI_PEND + 1;	/* indicate 1 byte pending */
-		DWBuf[3] = 0;	/* priority buffer pointer */
-		out32(IndexAddr, DWBuf);	/* write it out */
-	} else {		/* write it to Tx FIFO */
-
-		sWriteTxByte(sGetTxRxDataIO(ChP), Data);
-	}
-	return (1);		/* 1 byte sent */
-}
-
-/***************************************************************************
-Function: sEnInterrupts
-Purpose:  Enable one or more interrupts for a channel
-Call:     sEnInterrupts(ChP,Flags)
-          CHANNEL_T *ChP; Ptr to channel structure
-          Word_t Flags: Interrupt enable flags, can be any combination
-             of the following flags:
-                TXINT_EN:   Interrupt on Tx FIFO empty
-                RXINT_EN:   Interrupt on Rx FIFO at trigger level (see
-                            sSetRxTrigger())
-                SRCINT_EN:  Interrupt on SRC (Special Rx Condition)
-                MCINT_EN:   Interrupt on modem input change
-                CHANINT_EN: Allow channel interrupt signal to the AIOP's
-                            Interrupt Channel Register.
-Return:   void
-Comments: If an interrupt enable flag is set in Flags, that interrupt will be
-          enabled.  If an interrupt enable flag is not set in Flags, that
-          interrupt will not be changed.  Interrupts can be disabled with
-          function sDisInterrupts().
-
-          This function sets the appropriate bit for the channel in the AIOP's
-          Interrupt Mask Register if the CHANINT_EN flag is set.  This allows
-          this channel's bit to be set in the AIOP's Interrupt Channel Register.
-
-          Interrupts must also be globally enabled before channel interrupts
-          will be passed on to the host.  This is done with function
-          sEnGlobalInt().
-
-          In some cases it may be desirable to disable interrupts globally but
-          enable channel interrupts.  This would allow the global interrupt
-          status register to be used to determine which AIOPs need service.
-*/
-static void sEnInterrupts(CHANNEL_T * ChP, Word_t Flags)
-{
-	Byte_t Mask;		/* Interrupt Mask Register */
-
-	ChP->RxControl[2] |=
-	    ((Byte_t) Flags & (RXINT_EN | SRCINT_EN | MCINT_EN));
-
-	out32(ChP->IndexAddr, ChP->RxControl);
-
-	ChP->TxControl[2] |= ((Byte_t) Flags & TXINT_EN);
-
-	out32(ChP->IndexAddr, ChP->TxControl);
-
-	if (Flags & CHANINT_EN) {
-		Mask = sInB(ChP->IntMask) | sBitMapSetTbl[ChP->ChanNum];
-		sOutB(ChP->IntMask, Mask);
-	}
-}
-
-/***************************************************************************
-Function: sDisInterrupts
-Purpose:  Disable one or more interrupts for a channel
-Call:     sDisInterrupts(ChP,Flags)
-          CHANNEL_T *ChP; Ptr to channel structure
-          Word_t Flags: Interrupt flags, can be any combination
-             of the following flags:
-                TXINT_EN:   Interrupt on Tx FIFO empty
-                RXINT_EN:   Interrupt on Rx FIFO at trigger level (see
-                            sSetRxTrigger())
-                SRCINT_EN:  Interrupt on SRC (Special Rx Condition)
-                MCINT_EN:   Interrupt on modem input change
-                CHANINT_EN: Disable channel interrupt signal to the
-                            AIOP's Interrupt Channel Register.
-Return:   void
-Comments: If an interrupt flag is set in Flags, that interrupt will be
-          disabled.  If an interrupt flag is not set in Flags, that
-          interrupt will not be changed.  Interrupts can be enabled with
-          function sEnInterrupts().
-
-          This function clears the appropriate bit for the channel in the AIOP's
-          Interrupt Mask Register if the CHANINT_EN flag is set.  This blocks
-          this channel's bit from being set in the AIOP's Interrupt Channel
-          Register.
-*/
-static void sDisInterrupts(CHANNEL_T * ChP, Word_t Flags)
-{
-	Byte_t Mask;		/* Interrupt Mask Register */
-
-	ChP->RxControl[2] &=
-	    ~((Byte_t) Flags & (RXINT_EN | SRCINT_EN | MCINT_EN));
-	out32(ChP->IndexAddr, ChP->RxControl);
-	ChP->TxControl[2] &= ~((Byte_t) Flags & TXINT_EN);
-	out32(ChP->IndexAddr, ChP->TxControl);
-
-	if (Flags & CHANINT_EN) {
-		Mask = sInB(ChP->IntMask) & sBitMapClrTbl[ChP->ChanNum];
-		sOutB(ChP->IntMask, Mask);
-	}
-}
-
-static void sSetInterfaceMode(CHANNEL_T * ChP, Byte_t mode)
-{
-	sOutB(ChP->CtlP->AiopIO[2], (mode & 0x18) | ChP->ChanNum);
-}
-
-/*
- *  Not an official SSCI function, but how to reset RocketModems.
- *  ISA bus version
- */
-static void sModemReset(CONTROLLER_T * CtlP, int chan, int on)
-{
-	ByteIO_t addr;
-	Byte_t val;
-
-	addr = CtlP->AiopIO[0] + 0x400;
-	val = sInB(CtlP->MReg3IO);
-	/* if AIOP[1] is not enabled, enable it */
-	if ((val & 2) == 0) {
-		val = sInB(CtlP->MReg2IO);
-		sOutB(CtlP->MReg2IO, (val & 0xfc) | (1 & 0x03));
-		sOutB(CtlP->MBaseIO, (unsigned char) (addr >> 6));
-	}
-
-	sEnAiop(CtlP, 1);
-	if (!on)
-		addr += 8;
-	sOutB(addr + chan, 0);	/* apply or remove reset */
-	sDisAiop(CtlP, 1);
-}
-
-/*
- *  Not an official SSCI function, but how to reset RocketModems.
- *  PCI bus version
- */
-static void sPCIModemReset(CONTROLLER_T * CtlP, int chan, int on)
-{
-	ByteIO_t addr;
-
-	addr = CtlP->AiopIO[0] + 0x40;	/* 2nd AIOP */
-	if (!on)
-		addr += 8;
-	sOutB(addr + chan, 0);	/* apply or remove reset */
-}
-
-/*  Returns the line number given the controller (board), aiop and channel number */
-static unsigned char GetLineNumber(int ctrl, int aiop, int ch)
-{
-	return lineNumbers[(ctrl << 5) | (aiop << 3) | ch];
-}
-
-/*
- *  Stores the line number associated with a given controller (board), aiop
- *  and channel number.  
- *  Returns:  The line number assigned 
- */
-static unsigned char SetLineNumber(int ctrl, int aiop, int ch)
-{
-	lineNumbers[(ctrl << 5) | (aiop << 3) | ch] = nextLineNumber++;
-	return (nextLineNumber - 1);
-}
diff --git a/drivers/tty/rocket.h b/drivers/tty/rocket.h
deleted file mode 100644
index d62ed6587f32..000000000000
--- a/drivers/tty/rocket.h
+++ /dev/null
@@ -1,111 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * rocket.h --- the exported interface of the rocket driver to its configuration program.
- *
- * Written by Theodore Ts'o, Copyright 1997.
- * Copyright 1997 Comtrol Corporation. 
- *
- */
-
-/*  Model Information Struct */
-typedef struct {
-	unsigned long model;
-	char modelString[80];
-	unsigned long numPorts;
-	int loadrm2;
-	int startingPortNumber;
-} rocketModel_t;
-
-struct rocket_config {
-	int line;
-	int flags;
-	int closing_wait;
-	int close_delay;
-	int port;
-	int reserved[32];
-};
-
-struct rocket_ports {
-	int tty_major;
-	int callout_major;
-	rocketModel_t rocketModel[8];
-};
-
-struct rocket_version {
-	char rocket_version[32];
-	char rocket_date[32];
-	char reserved[64];
-};
-
-/*
- * Rocketport flags
- */
-/*#define ROCKET_CALLOUT_NOHUP    0x00000001 */
-#define ROCKET_FORCE_CD		0x00000002
-#define ROCKET_HUP_NOTIFY	0x00000004
-#define ROCKET_SPLIT_TERMIOS	0x00000008
-#define ROCKET_SPD_MASK		0x00000070
-#define ROCKET_SPD_HI		0x00000010	/* Use 57600 instead of 38400 bps */
-#define ROCKET_SPD_VHI		0x00000020	/* Use 115200 instead of 38400 bps */
-#define ROCKET_SPD_SHI		0x00000030	/* Use 230400 instead of 38400 bps */
-#define ROCKET_SPD_WARP	        0x00000040	/* Use 460800 instead of 38400 bps */
-#define ROCKET_SAK		0x00000080
-#define ROCKET_SESSION_LOCKOUT	0x00000100
-#define ROCKET_PGRP_LOCKOUT	0x00000200
-#define ROCKET_RTS_TOGGLE	0x00000400
-#define ROCKET_MODE_MASK        0x00003000
-#define ROCKET_MODE_RS232       0x00000000
-#define ROCKET_MODE_RS485       0x00001000
-#define ROCKET_MODE_RS422       0x00002000
-#define ROCKET_FLAGS		0x00003FFF
-
-#define ROCKET_USR_MASK 0x0071	/* Legal flags that non-privileged
-				 * users can set or reset */
-
-/*
- * For closing_wait and closing_wait2
- */
-#define ROCKET_CLOSING_WAIT_NONE	ASYNC_CLOSING_WAIT_NONE
-#define ROCKET_CLOSING_WAIT_INF		ASYNC_CLOSING_WAIT_INF
-
-/*
- * Rocketport ioctls -- "RP"
- */
-#define RCKP_GET_CONFIG		0x00525002
-#define RCKP_SET_CONFIG		0x00525003
-#define RCKP_GET_PORTS		0x00525004
-#define RCKP_RESET_RM2		0x00525005
-#define RCKP_GET_VERSION	0x00525006
-
-/*  Rocketport Models */
-#define MODEL_RP32INTF        0x0001	/* RP 32 port w/external I/F   */
-#define MODEL_RP8INTF         0x0002	/* RP 8 port w/external I/F    */
-#define MODEL_RP16INTF        0x0003	/* RP 16 port w/external I/F   */
-#define MODEL_RP8OCTA         0x0005	/* RP 8 port w/octa cable      */
-#define MODEL_RP4QUAD         0x0004	/* RP 4 port w/quad cable      */
-#define MODEL_RP8J            0x0006	/* RP 8 port w/RJ11 connectors */
-#define MODEL_RP4J            0x0007	/* RP 4 port w/RJ45 connectors */
-#define MODEL_RP8SNI          0x0008	/* RP 8 port w/ DB78 SNI connector */
-#define MODEL_RP16SNI         0x0009	/* RP 16 port w/ DB78 SNI connector */
-#define MODEL_RPP4            0x000A	/* RP Plus 4 port              */
-#define MODEL_RPP8            0x000B	/* RP Plus 8 port              */
-#define MODEL_RP2_232         0x000E	/* RP Plus 2 port RS232        */
-#define MODEL_RP2_422         0x000F	/* RP Plus 2 port RS232        */
-
-/*  Rocketmodem II Models */
-#define MODEL_RP6M            0x000C	/* RM 6 port                   */
-#define MODEL_RP4M            0x000D	/* RM 4 port                   */
-
-/* Universal PCI boards */
-#define MODEL_UPCI_RP32INTF   0x0801	/* RP UPCI 32 port w/external I/F     */
-#define MODEL_UPCI_RP8INTF    0x0802	/* RP UPCI 8 port w/external I/F      */
-#define MODEL_UPCI_RP16INTF   0x0803	/* RP UPCI 16 port w/external I/F     */
-#define MODEL_UPCI_RP8OCTA    0x0805	/* RP UPCI 8 port w/octa cable        */ 
-#define MODEL_UPCI_RM3_8PORT  0x080C	/* RP UPCI Rocketmodem III 8 port     */
-#define MODEL_UPCI_RM3_4PORT  0x080C	/* RP UPCI Rocketmodem III 4 port     */
-
-/*  Compact PCI 16 port  */
-#define MODEL_CPCI_RP16INTF   0x0903	/* RP Compact PCI 16 port w/external I/F */
-
-/* All ISA boards */
-#define MODEL_ISA             0x1000
diff --git a/drivers/tty/rocket_int.h b/drivers/tty/rocket_int.h
deleted file mode 100644
index 727e50dbb92f..000000000000
--- a/drivers/tty/rocket_int.h
+++ /dev/null
@@ -1,1214 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * rocket_int.h --- internal header file for rocket.c
- *
- * Written by Theodore Ts'o, Copyright 1997.
- * Copyright 1997 Comtrol Corporation.  
- * 
- */
-
-/*
- * Definition of the types in rcktpt_type
- */
-#define ROCKET_TYPE_NORMAL	0
-#define ROCKET_TYPE_MODEM	1
-#define ROCKET_TYPE_MODEMII	2
-#define ROCKET_TYPE_MODEMIII	3
-#define ROCKET_TYPE_PC104       4
-
-#include <linux/mutex.h>
-
-#include <asm/io.h>
-#include <asm/byteorder.h>
-
-typedef unsigned char Byte_t;
-typedef unsigned int ByteIO_t;
-
-typedef unsigned int Word_t;
-typedef unsigned int WordIO_t;
-
-typedef unsigned int DWordIO_t;
-
-/*
- * Note!  Normally the Linux I/O macros already take care of
- * byte-swapping the I/O instructions.  However, all accesses using
- * sOutDW aren't really 32-bit accesses, but should be handled in byte
- * order.  Hence the use of the cpu_to_le32() macro to byte-swap
- * things to no-op the byte swapping done by the big-endian outl()
- * instruction.
- */
-
-static inline void sOutB(unsigned short port, unsigned char value)
-{
-#ifdef ROCKET_DEBUG_IO
-	printk(KERN_DEBUG "sOutB(%x, %x)...\n", port, value);
-#endif
-	outb_p(value, port);
-}
-
-static inline void sOutW(unsigned short port, unsigned short value)
-{
-#ifdef ROCKET_DEBUG_IO
-	printk(KERN_DEBUG "sOutW(%x, %x)...\n", port, value);
-#endif
-	outw_p(value, port);
-}
-
-static inline void out32(unsigned short port, Byte_t *p)
-{
-	u32 value = get_unaligned_le32(p);
-#ifdef ROCKET_DEBUG_IO
-	printk(KERN_DEBUG "out32(%x, %lx)...\n", port, value);
-#endif
-	outl_p(value, port);
-}
-
-static inline unsigned char sInB(unsigned short port)
-{
-	return inb_p(port);
-}
-
-static inline unsigned short sInW(unsigned short port)
-{
-	return inw_p(port);
-}
-
-/* This is used to move arrays of bytes so byte swapping isn't appropriate. */
-#define sOutStrW(port, addr, count) if (count) outsw(port, addr, count)
-#define sInStrW(port, addr, count) if (count) insw(port, addr, count)
-
-#define CTL_SIZE 8
-#define AIOP_CTL_SIZE 4
-#define CHAN_AIOP_SIZE 8
-#define MAX_PORTS_PER_AIOP 8
-#define MAX_AIOPS_PER_BOARD 4
-#define MAX_PORTS_PER_BOARD 32
-
-/* Bus type ID */
-#define	isISA	0
-#define	isPCI	1
-#define	isMC	2
-
-/* Controller ID numbers */
-#define CTLID_NULL  -1		/* no controller exists */
-#define CTLID_0001  0x0001	/* controller release 1 */
-
-/* AIOP ID numbers, identifies AIOP type implementing channel */
-#define AIOPID_NULL -1		/* no AIOP or channel exists */
-#define AIOPID_0001 0x0001	/* AIOP release 1 */
-
-/************************************************************************
- Global Register Offsets - Direct Access - Fixed values
-************************************************************************/
-
-#define _CMD_REG   0x38		/* Command Register            8    Write */
-#define _INT_CHAN  0x39		/* Interrupt Channel Register  8    Read */
-#define _INT_MASK  0x3A		/* Interrupt Mask Register     8    Read / Write */
-#define _UNUSED    0x3B		/* Unused                      8 */
-#define _INDX_ADDR 0x3C		/* Index Register Address      16   Write */
-#define _INDX_DATA 0x3E		/* Index Register Data         8/16 Read / Write */
-
-/************************************************************************
- Channel Register Offsets for 1st channel in AIOP - Direct Access
-************************************************************************/
-#define _TD0       0x00		/* Transmit Data               16   Write */
-#define _RD0       0x00		/* Receive Data                16   Read */
-#define _CHN_STAT0 0x20		/* Channel Status              8/16 Read / Write */
-#define _FIFO_CNT0 0x10		/* Transmit/Receive FIFO Count 16   Read */
-#define _INT_ID0   0x30		/* Interrupt Identification    8    Read */
-
-/************************************************************************
- Tx Control Register Offsets - Indexed - External - Fixed
-************************************************************************/
-#define _TX_ENBLS  0x980	/* Tx Processor Enables Register 8 Read / Write */
-#define _TXCMP1    0x988	/* Transmit Compare Value #1     8 Read / Write */
-#define _TXCMP2    0x989	/* Transmit Compare Value #2     8 Read / Write */
-#define _TXREP1B1  0x98A	/* Tx Replace Value #1 - Byte 1  8 Read / Write */
-#define _TXREP1B2  0x98B	/* Tx Replace Value #1 - Byte 2  8 Read / Write */
-#define _TXREP2    0x98C	/* Transmit Replace Value #2     8 Read / Write */
-
-/************************************************************************
-Memory Controller Register Offsets - Indexed - External - Fixed
-************************************************************************/
-#define _RX_FIFO    0x000	/* Rx FIFO */
-#define _TX_FIFO    0x800	/* Tx FIFO */
-#define _RXF_OUTP   0x990	/* Rx FIFO OUT pointer        16 Read / Write */
-#define _RXF_INP    0x992	/* Rx FIFO IN pointer         16 Read / Write */
-#define _TXF_OUTP   0x994	/* Tx FIFO OUT pointer        8  Read / Write */
-#define _TXF_INP    0x995	/* Tx FIFO IN pointer         8  Read / Write */
-#define _TXP_CNT    0x996	/* Tx Priority Count          8  Read / Write */
-#define _TXP_PNTR   0x997	/* Tx Priority Pointer        8  Read / Write */
-
-#define PRI_PEND    0x80	/* Priority data pending (bit7, Tx pri cnt) */
-#define TXFIFO_SIZE 255		/* size of Tx FIFO */
-#define RXFIFO_SIZE 1023	/* size of Rx FIFO */
-
-/************************************************************************
-Tx Priority Buffer - Indexed - External - Fixed
-************************************************************************/
-#define _TXP_BUF    0x9C0	/* Tx Priority Buffer  32  Bytes   Read / Write */
-#define TXP_SIZE    0x20	/* 32 bytes */
-
-/************************************************************************
-Channel Register Offsets - Indexed - Internal - Fixed
-************************************************************************/
-
-#define _TX_CTRL    0xFF0	/* Transmit Control               16  Write */
-#define _RX_CTRL    0xFF2	/* Receive Control                 8  Write */
-#define _BAUD       0xFF4	/* Baud Rate                      16  Write */
-#define _CLK_PRE    0xFF6	/* Clock Prescaler                 8  Write */
-
-#define STMBREAK   0x08		/* BREAK */
-#define STMFRAME   0x04		/* framing error */
-#define STMRCVROVR 0x02		/* receiver over run error */
-#define STMPARITY  0x01		/* parity error */
-#define STMERROR   (STMBREAK | STMFRAME | STMPARITY)
-#define STMBREAKH   0x800	/* BREAK */
-#define STMFRAMEH   0x400	/* framing error */
-#define STMRCVROVRH 0x200	/* receiver over run error */
-#define STMPARITYH  0x100	/* parity error */
-#define STMERRORH   (STMBREAKH | STMFRAMEH | STMPARITYH)
-
-#define CTS_ACT   0x20		/* CTS input asserted */
-#define DSR_ACT   0x10		/* DSR input asserted */
-#define CD_ACT    0x08		/* CD input asserted */
-#define TXFIFOMT  0x04		/* Tx FIFO is empty */
-#define TXSHRMT   0x02		/* Tx shift register is empty */
-#define RDA       0x01		/* Rx data available */
-#define DRAINED (TXFIFOMT | TXSHRMT)	/* indicates Tx is drained */
-
-#define STATMODE  0x8000	/* status mode enable bit */
-#define RXFOVERFL 0x2000	/* receive FIFO overflow */
-#define RX2MATCH  0x1000	/* receive compare byte 2 match */
-#define RX1MATCH  0x0800	/* receive compare byte 1 match */
-#define RXBREAK   0x0400	/* received BREAK */
-#define RXFRAME   0x0200	/* received framing error */
-#define RXPARITY  0x0100	/* received parity error */
-#define STATERROR (RXBREAK | RXFRAME | RXPARITY)
-
-#define CTSFC_EN  0x80		/* CTS flow control enable bit */
-#define RTSTOG_EN 0x40		/* RTS toggle enable bit */
-#define TXINT_EN  0x10		/* transmit interrupt enable */
-#define STOP2     0x08		/* enable 2 stop bits (0 = 1 stop) */
-#define PARITY_EN 0x04		/* enable parity (0 = no parity) */
-#define EVEN_PAR  0x02		/* even parity (0 = odd parity) */
-#define DATA8BIT  0x01		/* 8 bit data (0 = 7 bit data) */
-
-#define SETBREAK  0x10		/* send break condition (must clear) */
-#define LOCALLOOP 0x08		/* local loopback set for test */
-#define SET_DTR   0x04		/* assert DTR */
-#define SET_RTS   0x02		/* assert RTS */
-#define TX_ENABLE 0x01		/* enable transmitter */
-
-#define RTSFC_EN  0x40		/* RTS flow control enable */
-#define RXPROC_EN 0x20		/* receive processor enable */
-#define TRIG_NO   0x00		/* Rx FIFO trigger level 0 (no trigger) */
-#define TRIG_1    0x08		/* trigger level 1 char */
-#define TRIG_1_2  0x10		/* trigger level 1/2 */
-#define TRIG_7_8  0x18		/* trigger level 7/8 */
-#define TRIG_MASK 0x18		/* trigger level mask */
-#define SRCINT_EN 0x04		/* special Rx condition interrupt enable */
-#define RXINT_EN  0x02		/* Rx interrupt enable */
-#define MCINT_EN  0x01		/* modem change interrupt enable */
-
-#define RXF_TRIG  0x20		/* Rx FIFO trigger level interrupt */
-#define TXFIFO_MT 0x10		/* Tx FIFO empty interrupt */
-#define SRC_INT   0x08		/* special receive condition interrupt */
-#define DELTA_CD  0x04		/* CD change interrupt */
-#define DELTA_CTS 0x02		/* CTS change interrupt */
-#define DELTA_DSR 0x01		/* DSR change interrupt */
-
-#define REP1W2_EN 0x10		/* replace byte 1 with 2 bytes enable */
-#define IGN2_EN   0x08		/* ignore byte 2 enable */
-#define IGN1_EN   0x04		/* ignore byte 1 enable */
-#define COMP2_EN  0x02		/* compare byte 2 enable */
-#define COMP1_EN  0x01		/* compare byte 1 enable */
-
-#define RESET_ALL 0x80		/* reset AIOP (all channels) */
-#define TXOVERIDE 0x40		/* Transmit software off override */
-#define RESETUART 0x20		/* reset channel's UART */
-#define RESTXFCNT 0x10		/* reset channel's Tx FIFO count register */
-#define RESRXFCNT 0x08		/* reset channel's Rx FIFO count register */
-
-#define INTSTAT0  0x01		/* AIOP 0 interrupt status */
-#define INTSTAT1  0x02		/* AIOP 1 interrupt status */
-#define INTSTAT2  0x04		/* AIOP 2 interrupt status */
-#define INTSTAT3  0x08		/* AIOP 3 interrupt status */
-
-#define INTR_EN   0x08		/* allow interrupts to host */
-#define INT_STROB 0x04		/* strobe and clear interrupt line (EOI) */
-
-/**************************************************************************
- MUDBAC remapped for PCI
-**************************************************************************/
-
-#define _CFG_INT_PCI  0x40
-#define _PCI_INT_FUNC 0x3A
-
-#define PCI_STROB 0x2000	/* bit 13 of int aiop register */
-#define INTR_EN_PCI   0x0010	/* allow interrupts to host */
-
-/*
- * Definitions for Universal PCI board registers
- */
-#define _PCI_9030_INT_CTRL	0x4c          /* Offsets from BAR1 */
-#define _PCI_9030_GPIO_CTRL	0x54
-#define PCI_INT_CTRL_AIOP	0x0001
-#define PCI_GPIO_CTRL_8PORT	0x4000
-#define _PCI_9030_RING_IND	0xc0          /* Offsets from BAR1 */
-
-#define CHAN3_EN  0x08		/* enable AIOP 3 */
-#define CHAN2_EN  0x04		/* enable AIOP 2 */
-#define CHAN1_EN  0x02		/* enable AIOP 1 */
-#define CHAN0_EN  0x01		/* enable AIOP 0 */
-#define FREQ_DIS  0x00
-#define FREQ_274HZ 0x60
-#define FREQ_137HZ 0x50
-#define FREQ_69HZ  0x40
-#define FREQ_34HZ  0x30
-#define FREQ_17HZ  0x20
-#define FREQ_9HZ   0x10
-#define PERIODIC_ONLY 0x80	/* only PERIODIC interrupt */
-
-#define CHANINT_EN 0x0100	/* flags to enable/disable channel ints */
-
-#define RDATASIZE 72
-#define RREGDATASIZE 52
-
-/*
- * AIOP interrupt bits for ISA/PCI boards and UPCI boards.
- */
-#define AIOP_INTR_BIT_0		0x0001
-#define AIOP_INTR_BIT_1		0x0002
-#define AIOP_INTR_BIT_2		0x0004
-#define AIOP_INTR_BIT_3		0x0008
-
-#define AIOP_INTR_BITS ( \
-	AIOP_INTR_BIT_0 \
-	| AIOP_INTR_BIT_1 \
-	| AIOP_INTR_BIT_2 \
-	| AIOP_INTR_BIT_3)
-
-#define UPCI_AIOP_INTR_BIT_0	0x0004
-#define UPCI_AIOP_INTR_BIT_1	0x0020
-#define UPCI_AIOP_INTR_BIT_2	0x0100
-#define UPCI_AIOP_INTR_BIT_3	0x0800
-
-#define UPCI_AIOP_INTR_BITS ( \
-	UPCI_AIOP_INTR_BIT_0 \
-	| UPCI_AIOP_INTR_BIT_1 \
-	| UPCI_AIOP_INTR_BIT_2 \
-	| UPCI_AIOP_INTR_BIT_3)
-
-/* Controller level information structure */
-typedef struct {
-	int CtlID;
-	int CtlNum;
-	int BusType;
-	int boardType;
-	int isUPCI;
-	WordIO_t PCIIO;
-	WordIO_t PCIIO2;
-	ByteIO_t MBaseIO;
-	ByteIO_t MReg1IO;
-	ByteIO_t MReg2IO;
-	ByteIO_t MReg3IO;
-	Byte_t MReg2;
-	Byte_t MReg3;
-	int NumAiop;
-	int AltChanRingIndicator;
-	ByteIO_t UPCIRingInd;
-	WordIO_t AiopIO[AIOP_CTL_SIZE];
-	ByteIO_t AiopIntChanIO[AIOP_CTL_SIZE];
-	int AiopID[AIOP_CTL_SIZE];
-	int AiopNumChan[AIOP_CTL_SIZE];
-	Word_t *AiopIntrBits;
-} CONTROLLER_T;
-
-typedef CONTROLLER_T CONTROLLER_t;
-
-/* Channel level information structure */
-typedef struct {
-	CONTROLLER_T *CtlP;
-	int AiopNum;
-	int ChanID;
-	int ChanNum;
-	int rtsToggle;
-
-	ByteIO_t Cmd;
-	ByteIO_t IntChan;
-	ByteIO_t IntMask;
-	DWordIO_t IndexAddr;
-	WordIO_t IndexData;
-
-	WordIO_t TxRxData;
-	WordIO_t ChanStat;
-	WordIO_t TxRxCount;
-	ByteIO_t IntID;
-
-	Word_t TxFIFO;
-	Word_t TxFIFOPtrs;
-	Word_t RxFIFO;
-	Word_t RxFIFOPtrs;
-	Word_t TxPrioCnt;
-	Word_t TxPrioPtr;
-	Word_t TxPrioBuf;
-
-	Byte_t R[RREGDATASIZE];
-
-	Byte_t BaudDiv[4];
-	Byte_t TxControl[4];
-	Byte_t RxControl[4];
-	Byte_t TxEnables[4];
-	Byte_t TxCompare[4];
-	Byte_t TxReplace1[4];
-	Byte_t TxReplace2[4];
-} CHANNEL_T;
-
-typedef CHANNEL_T CHANNEL_t;
-typedef CHANNEL_T *CHANPTR_T;
-
-#define InterfaceModeRS232  0x00
-#define InterfaceModeRS422  0x08
-#define InterfaceModeRS485  0x10
-#define InterfaceModeRS232T 0x18
-
-/***************************************************************************
-Function: sClrBreak
-Purpose:  Stop sending a transmit BREAK signal
-Call:     sClrBreak(ChP)
-          CHANNEL_T *ChP; Ptr to channel structure
-*/
-#define sClrBreak(ChP) \
-do { \
-   (ChP)->TxControl[3] &= ~SETBREAK; \
-   out32((ChP)->IndexAddr,(ChP)->TxControl); \
-} while (0)
-
-/***************************************************************************
-Function: sClrDTR
-Purpose:  Clr the DTR output
-Call:     sClrDTR(ChP)
-          CHANNEL_T *ChP; Ptr to channel structure
-*/
-#define sClrDTR(ChP) \
-do { \
-   (ChP)->TxControl[3] &= ~SET_DTR; \
-   out32((ChP)->IndexAddr,(ChP)->TxControl); \
-} while (0)
-
-/***************************************************************************
-Function: sClrRTS
-Purpose:  Clr the RTS output
-Call:     sClrRTS(ChP)
-          CHANNEL_T *ChP; Ptr to channel structure
-*/
-#define sClrRTS(ChP) \
-do { \
-   if ((ChP)->rtsToggle) break; \
-   (ChP)->TxControl[3] &= ~SET_RTS; \
-   out32((ChP)->IndexAddr,(ChP)->TxControl); \
-} while (0)
-
-/***************************************************************************
-Function: sClrTxXOFF
-Purpose:  Clear any existing transmit software flow control off condition
-Call:     sClrTxXOFF(ChP)
-          CHANNEL_T *ChP; Ptr to channel structure
-*/
-#define sClrTxXOFF(ChP) \
-do { \
-   sOutB((ChP)->Cmd,TXOVERIDE | (Byte_t)(ChP)->ChanNum); \
-   sOutB((ChP)->Cmd,(Byte_t)(ChP)->ChanNum); \
-} while (0)
-
-/***************************************************************************
-Function: sCtlNumToCtlPtr
-Purpose:  Convert a controller number to controller structure pointer
-Call:     sCtlNumToCtlPtr(CtlNum)
-          int CtlNum; Controller number
-Return:   CONTROLLER_T *: Ptr to controller structure
-*/
-#define sCtlNumToCtlPtr(CTLNUM) &sController[CTLNUM]
-
-/***************************************************************************
-Function: sControllerEOI
-Purpose:  Strobe the MUDBAC's End Of Interrupt bit.
-Call:     sControllerEOI(CtlP)
-          CONTROLLER_T *CtlP; Ptr to controller structure
-*/
-#define sControllerEOI(CTLP) sOutB((CTLP)->MReg2IO,(CTLP)->MReg2 | INT_STROB)
-
-/***************************************************************************
-Function: sPCIControllerEOI
-Purpose:  Strobe the PCI End Of Interrupt bit.
-          For the UPCI boards, toggle the AIOP interrupt enable bit
-	  (this was taken from the Windows driver).
-Call:     sPCIControllerEOI(CtlP)
-          CONTROLLER_T *CtlP; Ptr to controller structure
-*/
-#define sPCIControllerEOI(CTLP) \
-do { \
-    if ((CTLP)->isUPCI) { \
-	Word_t w = sInW((CTLP)->PCIIO); \
-	sOutW((CTLP)->PCIIO, (w ^ PCI_INT_CTRL_AIOP)); \
-	sOutW((CTLP)->PCIIO, w); \
-    } \
-    else { \
-	sOutW((CTLP)->PCIIO, PCI_STROB); \
-    } \
-} while (0)
-
-/***************************************************************************
-Function: sDisAiop
-Purpose:  Disable I/O access to an AIOP
-Call:     sDisAiop(CltP)
-          CONTROLLER_T *CtlP; Ptr to controller structure
-          int AiopNum; Number of AIOP on controller
-*/
-#define sDisAiop(CTLP,AIOPNUM) \
-do { \
-   (CTLP)->MReg3 &= sBitMapClrTbl[AIOPNUM]; \
-   sOutB((CTLP)->MReg3IO,(CTLP)->MReg3); \
-} while (0)
-
-/***************************************************************************
-Function: sDisCTSFlowCtl
-Purpose:  Disable output flow control using CTS
-Call:     sDisCTSFlowCtl(ChP)
-          CHANNEL_T *ChP; Ptr to channel structure
-*/
-#define sDisCTSFlowCtl(ChP) \
-do { \
-   (ChP)->TxControl[2] &= ~CTSFC_EN; \
-   out32((ChP)->IndexAddr,(ChP)->TxControl); \
-} while (0)
-
-/***************************************************************************
-Function: sDisIXANY
-Purpose:  Disable IXANY Software Flow Control
-Call:     sDisIXANY(ChP)
-          CHANNEL_T *ChP; Ptr to channel structure
-*/
-#define sDisIXANY(ChP) \
-do { \
-   (ChP)->R[0x0e] = 0x86; \
-   out32((ChP)->IndexAddr,&(ChP)->R[0x0c]); \
-} while (0)
-
-/***************************************************************************
-Function: DisParity
-Purpose:  Disable parity
-Call:     sDisParity(ChP)
-          CHANNEL_T *ChP; Ptr to channel structure
-Comments: Function sSetParity() can be used in place of functions sEnParity(),
-          sDisParity(), sSetOddParity(), and sSetEvenParity().
-*/
-#define sDisParity(ChP) \
-do { \
-   (ChP)->TxControl[2] &= ~PARITY_EN; \
-   out32((ChP)->IndexAddr,(ChP)->TxControl); \
-} while (0)
-
-/***************************************************************************
-Function: sDisRTSToggle
-Purpose:  Disable RTS toggle
-Call:     sDisRTSToggle(ChP)
-          CHANNEL_T *ChP; Ptr to channel structure
-*/
-#define sDisRTSToggle(ChP) \
-do { \
-   (ChP)->TxControl[2] &= ~RTSTOG_EN; \
-   out32((ChP)->IndexAddr,(ChP)->TxControl); \
-   (ChP)->rtsToggle = 0; \
-} while (0)
-
-/***************************************************************************
-Function: sDisRxFIFO
-Purpose:  Disable Rx FIFO
-Call:     sDisRxFIFO(ChP)
-          CHANNEL_T *ChP; Ptr to channel structure
-*/
-#define sDisRxFIFO(ChP) \
-do { \
-   (ChP)->R[0x32] = 0x0a; \
-   out32((ChP)->IndexAddr,&(ChP)->R[0x30]); \
-} while (0)
-
-/***************************************************************************
-Function: sDisRxStatusMode
-Purpose:  Disable the Rx status mode
-Call:     sDisRxStatusMode(ChP)
-          CHANNEL_T *ChP; Ptr to channel structure
-Comments: This takes the channel out of the receive status mode.  All
-          subsequent reads of receive data using sReadRxWord() will return
-          two data bytes.
-*/
-#define sDisRxStatusMode(ChP) sOutW((ChP)->ChanStat,0)
-
-/***************************************************************************
-Function: sDisTransmit
-Purpose:  Disable transmit
-Call:     sDisTransmit(ChP)
-          CHANNEL_T *ChP; Ptr to channel structure
-          This disables movement of Tx data from the Tx FIFO into the 1 byte
-          Tx buffer.  Therefore there could be up to a 2 byte latency
-          between the time sDisTransmit() is called and the transmit buffer
-          and transmit shift register going completely empty.
-*/
-#define sDisTransmit(ChP) \
-do { \
-   (ChP)->TxControl[3] &= ~TX_ENABLE; \
-   out32((ChP)->IndexAddr,(ChP)->TxControl); \
-} while (0)
-
-/***************************************************************************
-Function: sDisTxSoftFlowCtl
-Purpose:  Disable Tx Software Flow Control
-Call:     sDisTxSoftFlowCtl(ChP)
-          CHANNEL_T *ChP; Ptr to channel structure
-*/
-#define sDisTxSoftFlowCtl(ChP) \
-do { \
-   (ChP)->R[0x06] = 0x8a; \
-   out32((ChP)->IndexAddr,&(ChP)->R[0x04]); \
-} while (0)
-
-/***************************************************************************
-Function: sEnAiop
-Purpose:  Enable I/O access to an AIOP
-Call:     sEnAiop(CltP)
-          CONTROLLER_T *CtlP; Ptr to controller structure
-          int AiopNum; Number of AIOP on controller
-*/
-#define sEnAiop(CTLP,AIOPNUM) \
-do { \
-   (CTLP)->MReg3 |= sBitMapSetTbl[AIOPNUM]; \
-   sOutB((CTLP)->MReg3IO,(CTLP)->MReg3); \
-} while (0)
-
-/***************************************************************************
-Function: sEnCTSFlowCtl
-Purpose:  Enable output flow control using CTS
-Call:     sEnCTSFlowCtl(ChP)
-          CHANNEL_T *ChP; Ptr to channel structure
-*/
-#define sEnCTSFlowCtl(ChP) \
-do { \
-   (ChP)->TxControl[2] |= CTSFC_EN; \
-   out32((ChP)->IndexAddr,(ChP)->TxControl); \
-} while (0)
-
-/***************************************************************************
-Function: sEnIXANY
-Purpose:  Enable IXANY Software Flow Control
-Call:     sEnIXANY(ChP)
-          CHANNEL_T *ChP; Ptr to channel structure
-*/
-#define sEnIXANY(ChP) \
-do { \
-   (ChP)->R[0x0e] = 0x21; \
-   out32((ChP)->IndexAddr,&(ChP)->R[0x0c]); \
-} while (0)
-
-/***************************************************************************
-Function: EnParity
-Purpose:  Enable parity
-Call:     sEnParity(ChP)
-          CHANNEL_T *ChP; Ptr to channel structure
-Comments: Function sSetParity() can be used in place of functions sEnParity(),
-          sDisParity(), sSetOddParity(), and sSetEvenParity().
-
-Warnings: Before enabling parity odd or even parity should be chosen using
-          functions sSetOddParity() or sSetEvenParity().
-*/
-#define sEnParity(ChP) \
-do { \
-   (ChP)->TxControl[2] |= PARITY_EN; \
-   out32((ChP)->IndexAddr,(ChP)->TxControl); \
-} while (0)
-
-/***************************************************************************
-Function: sEnRTSToggle
-Purpose:  Enable RTS toggle
-Call:     sEnRTSToggle(ChP)
-          CHANNEL_T *ChP; Ptr to channel structure
-Comments: This function will disable RTS flow control and clear the RTS
-          line to allow operation of RTS toggle.
-*/
-#define sEnRTSToggle(ChP) \
-do { \
-   (ChP)->RxControl[2] &= ~RTSFC_EN; \
-   out32((ChP)->IndexAddr,(ChP)->RxControl); \
-   (ChP)->TxControl[2] |= RTSTOG_EN; \
-   (ChP)->TxControl[3] &= ~SET_RTS; \
-   out32((ChP)->IndexAddr,(ChP)->TxControl); \
-   (ChP)->rtsToggle = 1; \
-} while (0)
-
-/***************************************************************************
-Function: sEnRxFIFO
-Purpose:  Enable Rx FIFO
-Call:     sEnRxFIFO(ChP)
-          CHANNEL_T *ChP; Ptr to channel structure
-*/
-#define sEnRxFIFO(ChP) \
-do { \
-   (ChP)->R[0x32] = 0x08; \
-   out32((ChP)->IndexAddr,&(ChP)->R[0x30]); \
-} while (0)
-
-/***************************************************************************
-Function: sEnRxProcessor
-Purpose:  Enable the receive processor
-Call:     sEnRxProcessor(ChP)
-          CHANNEL_T *ChP; Ptr to channel structure
-Comments: This function is used to start the receive processor.  When
-          the channel is in the reset state the receive processor is not
-          running.  This is done to prevent the receive processor from
-          executing invalid microcode instructions prior to the
-          downloading of the microcode.
-
-Warnings: This function must be called after valid microcode has been
-          downloaded to the AIOP, and it must not be called before the
-          microcode has been downloaded.
-*/
-#define sEnRxProcessor(ChP) \
-do { \
-   (ChP)->RxControl[2] |= RXPROC_EN; \
-   out32((ChP)->IndexAddr,(ChP)->RxControl); \
-} while (0)
-
-/***************************************************************************
-Function: sEnRxStatusMode
-Purpose:  Enable the Rx status mode
-Call:     sEnRxStatusMode(ChP)
-          CHANNEL_T *ChP; Ptr to channel structure
-Comments: This places the channel in the receive status mode.  All subsequent
-          reads of receive data using sReadRxWord() will return a data byte
-          in the low word and a status byte in the high word.
-
-*/
-#define sEnRxStatusMode(ChP) sOutW((ChP)->ChanStat,STATMODE)
-
-/***************************************************************************
-Function: sEnTransmit
-Purpose:  Enable transmit
-Call:     sEnTransmit(ChP)
-          CHANNEL_T *ChP; Ptr to channel structure
-*/
-#define sEnTransmit(ChP) \
-do { \
-   (ChP)->TxControl[3] |= TX_ENABLE; \
-   out32((ChP)->IndexAddr,(ChP)->TxControl); \
-} while (0)
-
-/***************************************************************************
-Function: sEnTxSoftFlowCtl
-Purpose:  Enable Tx Software Flow Control
-Call:     sEnTxSoftFlowCtl(ChP)
-          CHANNEL_T *ChP; Ptr to channel structure
-*/
-#define sEnTxSoftFlowCtl(ChP) \
-do { \
-   (ChP)->R[0x06] = 0xc5; \
-   out32((ChP)->IndexAddr,&(ChP)->R[0x04]); \
-} while (0)
-
-/***************************************************************************
-Function: sGetAiopIntStatus
-Purpose:  Get the AIOP interrupt status
-Call:     sGetAiopIntStatus(CtlP,AiopNum)
-          CONTROLLER_T *CtlP; Ptr to controller structure
-          int AiopNum; AIOP number
-Return:   Byte_t: The AIOP interrupt status.  Bits 0 through 7
-                         represent channels 0 through 7 respectively.  If a
-                         bit is set that channel is interrupting.
-*/
-#define sGetAiopIntStatus(CTLP,AIOPNUM) sInB((CTLP)->AiopIntChanIO[AIOPNUM])
-
-/***************************************************************************
-Function: sGetAiopNumChan
-Purpose:  Get the number of channels supported by an AIOP
-Call:     sGetAiopNumChan(CtlP,AiopNum)
-          CONTROLLER_T *CtlP; Ptr to controller structure
-          int AiopNum; AIOP number
-Return:   int: The number of channels supported by the AIOP
-*/
-#define sGetAiopNumChan(CTLP,AIOPNUM) (CTLP)->AiopNumChan[AIOPNUM]
-
-/***************************************************************************
-Function: sGetChanIntID
-Purpose:  Get a channel's interrupt identification byte
-Call:     sGetChanIntID(ChP)
-          CHANNEL_T *ChP; Ptr to channel structure
-Return:   Byte_t: The channel interrupt ID.  Can be any
-             combination of the following flags:
-                RXF_TRIG:     Rx FIFO trigger level interrupt
-                TXFIFO_MT:    Tx FIFO empty interrupt
-                SRC_INT:      Special receive condition interrupt
-                DELTA_CD:     CD change interrupt
-                DELTA_CTS:    CTS change interrupt
-                DELTA_DSR:    DSR change interrupt
-*/
-#define sGetChanIntID(ChP) (sInB((ChP)->IntID) & (RXF_TRIG | TXFIFO_MT | SRC_INT | DELTA_CD | DELTA_CTS | DELTA_DSR))
-
-/***************************************************************************
-Function: sGetChanNum
-Purpose:  Get the number of a channel within an AIOP
-Call:     sGetChanNum(ChP)
-          CHANNEL_T *ChP; Ptr to channel structure
-Return:   int: Channel number within AIOP, or NULLCHAN if channel does
-               not exist.
-*/
-#define sGetChanNum(ChP) (ChP)->ChanNum
-
-/***************************************************************************
-Function: sGetChanStatus
-Purpose:  Get the channel status
-Call:     sGetChanStatus(ChP)
-          CHANNEL_T *ChP; Ptr to channel structure
-Return:   Word_t: The channel status.  Can be any combination of
-             the following flags:
-                LOW BYTE FLAGS
-                CTS_ACT:      CTS input asserted
-                DSR_ACT:      DSR input asserted
-                CD_ACT:       CD input asserted
-                TXFIFOMT:     Tx FIFO is empty
-                TXSHRMT:      Tx shift register is empty
-                RDA:          Rx data available
-
-                HIGH BYTE FLAGS
-                STATMODE:     status mode enable bit
-                RXFOVERFL:    receive FIFO overflow
-                RX2MATCH:     receive compare byte 2 match
-                RX1MATCH:     receive compare byte 1 match
-                RXBREAK:      received BREAK
-                RXFRAME:      received framing error
-                RXPARITY:     received parity error
-Warnings: This function will clear the high byte flags in the Channel
-          Status Register.
-*/
-#define sGetChanStatus(ChP) sInW((ChP)->ChanStat)
-
-/***************************************************************************
-Function: sGetChanStatusLo
-Purpose:  Get the low byte only of the channel status
-Call:     sGetChanStatusLo(ChP)
-          CHANNEL_T *ChP; Ptr to channel structure
-Return:   Byte_t: The channel status low byte.  Can be any combination
-             of the following flags:
-                CTS_ACT:      CTS input asserted
-                DSR_ACT:      DSR input asserted
-                CD_ACT:       CD input asserted
-                TXFIFOMT:     Tx FIFO is empty
-                TXSHRMT:      Tx shift register is empty
-                RDA:          Rx data available
-*/
-#define sGetChanStatusLo(ChP) sInB((ByteIO_t)(ChP)->ChanStat)
-
-/**********************************************************************
- * Get RI status of channel
- * Defined as a function in rocket.c   -aes
- */
-#if 0
-#define sGetChanRI(ChP) ((ChP)->CtlP->AltChanRingIndicator ? \
-                          (sInB((ByteIO_t)((ChP)->ChanStat+8)) & DSR_ACT) : \
-                            (((ChP)->CtlP->boardType == ROCKET_TYPE_PC104) ? \
-                               (!(sInB((ChP)->CtlP->AiopIO[3]) & sBitMapSetTbl[(ChP)->ChanNum])) : \
-                             0))
-#endif
-
-/***************************************************************************
-Function: sGetControllerIntStatus
-Purpose:  Get the controller interrupt status
-Call:     sGetControllerIntStatus(CtlP)
-          CONTROLLER_T *CtlP; Ptr to controller structure
-Return:   Byte_t: The controller interrupt status in the lower 4
-                         bits.  Bits 0 through 3 represent AIOP's 0
-                         through 3 respectively.  If a bit is set that
-                         AIOP is interrupting.  Bits 4 through 7 will
-                         always be cleared.
-*/
-#define sGetControllerIntStatus(CTLP) (sInB((CTLP)->MReg1IO) & 0x0f)
-
-/***************************************************************************
-Function: sPCIGetControllerIntStatus
-Purpose:  Get the controller interrupt status
-Call:     sPCIGetControllerIntStatus(CtlP)
-          CONTROLLER_T *CtlP; Ptr to controller structure
-Return:   unsigned char: The controller interrupt status in the lower 4
-                         bits and bit 4.  Bits 0 through 3 represent AIOP's 0
-                         through 3 respectively. Bit 4 is set if the int 
-			 was generated from periodic. If a bit is set the
-			 AIOP is interrupting.
-*/
-#define sPCIGetControllerIntStatus(CTLP) \
-	((CTLP)->isUPCI ? \
-	  (sInW((CTLP)->PCIIO2) & UPCI_AIOP_INTR_BITS) : \
-	  ((sInW((CTLP)->PCIIO) >> 8) & AIOP_INTR_BITS))
-
-/***************************************************************************
-
-Function: sGetRxCnt
-Purpose:  Get the number of data bytes in the Rx FIFO
-Call:     sGetRxCnt(ChP)
-          CHANNEL_T *ChP; Ptr to channel structure
-Return:   int: The number of data bytes in the Rx FIFO.
-Comments: Byte read of count register is required to obtain Rx count.
-
-*/
-#define sGetRxCnt(ChP) sInW((ChP)->TxRxCount)
-
-/***************************************************************************
-Function: sGetTxCnt
-Purpose:  Get the number of data bytes in the Tx FIFO
-Call:     sGetTxCnt(ChP)
-          CHANNEL_T *ChP; Ptr to channel structure
-Return:   Byte_t: The number of data bytes in the Tx FIFO.
-Comments: Byte read of count register is required to obtain Tx count.
-
-*/
-#define sGetTxCnt(ChP) sInB((ByteIO_t)(ChP)->TxRxCount)
-
-/*****************************************************************************
-Function: sGetTxRxDataIO
-Purpose:  Get the I/O address of a channel's TxRx Data register
-Call:     sGetTxRxDataIO(ChP)
-          CHANNEL_T *ChP; Ptr to channel structure
-Return:   WordIO_t: I/O address of a channel's TxRx Data register
-*/
-#define sGetTxRxDataIO(ChP) (ChP)->TxRxData
-
-/***************************************************************************
-Function: sInitChanDefaults
-Purpose:  Initialize a channel structure to it's default state.
-Call:     sInitChanDefaults(ChP)
-          CHANNEL_T *ChP; Ptr to the channel structure
-Comments: This function must be called once for every channel structure
-          that exists before any other SSCI calls can be made.
-
-*/
-#define sInitChanDefaults(ChP) \
-do { \
-   (ChP)->CtlP = NULLCTLPTR; \
-   (ChP)->AiopNum = NULLAIOP; \
-   (ChP)->ChanID = AIOPID_NULL; \
-   (ChP)->ChanNum = NULLCHAN; \
-} while (0)
-
-/***************************************************************************
-Function: sResetAiopByNum
-Purpose:  Reset the AIOP by number
-Call:     sResetAiopByNum(CTLP,AIOPNUM)
-	CONTROLLER_T CTLP; Ptr to controller structure
-	AIOPNUM; AIOP index 
-*/
-#define sResetAiopByNum(CTLP,AIOPNUM) \
-do { \
-   sOutB((CTLP)->AiopIO[(AIOPNUM)]+_CMD_REG,RESET_ALL); \
-   sOutB((CTLP)->AiopIO[(AIOPNUM)]+_CMD_REG,0x0); \
-} while (0)
-
-/***************************************************************************
-Function: sSendBreak
-Purpose:  Send a transmit BREAK signal
-Call:     sSendBreak(ChP)
-          CHANNEL_T *ChP; Ptr to channel structure
-*/
-#define sSendBreak(ChP) \
-do { \
-   (ChP)->TxControl[3] |= SETBREAK; \
-   out32((ChP)->IndexAddr,(ChP)->TxControl); \
-} while (0)
-
-/***************************************************************************
-Function: sSetBaud
-Purpose:  Set baud rate
-Call:     sSetBaud(ChP,Divisor)
-          CHANNEL_T *ChP; Ptr to channel structure
-          Word_t Divisor; 16 bit baud rate divisor for channel
-*/
-#define sSetBaud(ChP,DIVISOR) \
-do { \
-   (ChP)->BaudDiv[2] = (Byte_t)(DIVISOR); \
-   (ChP)->BaudDiv[3] = (Byte_t)((DIVISOR) >> 8); \
-   out32((ChP)->IndexAddr,(ChP)->BaudDiv); \
-} while (0)
-
-/***************************************************************************
-Function: sSetData7
-Purpose:  Set data bits to 7
-Call:     sSetData7(ChP)
-          CHANNEL_T *ChP; Ptr to channel structure
-*/
-#define sSetData7(ChP) \
-do { \
-   (ChP)->TxControl[2] &= ~DATA8BIT; \
-   out32((ChP)->IndexAddr,(ChP)->TxControl); \
-} while (0)
-
-/***************************************************************************
-Function: sSetData8
-Purpose:  Set data bits to 8
-Call:     sSetData8(ChP)
-          CHANNEL_T *ChP; Ptr to channel structure
-*/
-#define sSetData8(ChP) \
-do { \
-   (ChP)->TxControl[2] |= DATA8BIT; \
-   out32((ChP)->IndexAddr,(ChP)->TxControl); \
-} while (0)
-
-/***************************************************************************
-Function: sSetDTR
-Purpose:  Set the DTR output
-Call:     sSetDTR(ChP)
-          CHANNEL_T *ChP; Ptr to channel structure
-*/
-#define sSetDTR(ChP) \
-do { \
-   (ChP)->TxControl[3] |= SET_DTR; \
-   out32((ChP)->IndexAddr,(ChP)->TxControl); \
-} while (0)
-
-/***************************************************************************
-Function: sSetEvenParity
-Purpose:  Set even parity
-Call:     sSetEvenParity(ChP)
-          CHANNEL_T *ChP; Ptr to channel structure
-Comments: Function sSetParity() can be used in place of functions sEnParity(),
-          sDisParity(), sSetOddParity(), and sSetEvenParity().
-
-Warnings: This function has no effect unless parity is enabled with function
-          sEnParity().
-*/
-#define sSetEvenParity(ChP) \
-do { \
-   (ChP)->TxControl[2] |= EVEN_PAR; \
-   out32((ChP)->IndexAddr,(ChP)->TxControl); \
-} while (0)
-
-/***************************************************************************
-Function: sSetOddParity
-Purpose:  Set odd parity
-Call:     sSetOddParity(ChP)
-          CHANNEL_T *ChP; Ptr to channel structure
-Comments: Function sSetParity() can be used in place of functions sEnParity(),
-          sDisParity(), sSetOddParity(), and sSetEvenParity().
-
-Warnings: This function has no effect unless parity is enabled with function
-          sEnParity().
-*/
-#define sSetOddParity(ChP) \
-do { \
-   (ChP)->TxControl[2] &= ~EVEN_PAR; \
-   out32((ChP)->IndexAddr,(ChP)->TxControl); \
-} while (0)
-
-/***************************************************************************
-Function: sSetRTS
-Purpose:  Set the RTS output
-Call:     sSetRTS(ChP)
-          CHANNEL_T *ChP; Ptr to channel structure
-*/
-#define sSetRTS(ChP) \
-do { \
-   if ((ChP)->rtsToggle) break; \
-   (ChP)->TxControl[3] |= SET_RTS; \
-   out32((ChP)->IndexAddr,(ChP)->TxControl); \
-} while (0)
-
-/***************************************************************************
-Function: sSetRxTrigger
-Purpose:  Set the Rx FIFO trigger level
-Call:     sSetRxProcessor(ChP,Level)
-          CHANNEL_T *ChP; Ptr to channel structure
-          Byte_t Level; Number of characters in Rx FIFO at which the
-             interrupt will be generated.  Can be any of the following flags:
-
-             TRIG_NO:   no trigger
-             TRIG_1:    1 character in FIFO
-             TRIG_1_2:  FIFO 1/2 full
-             TRIG_7_8:  FIFO 7/8 full
-Comments: An interrupt will be generated when the trigger level is reached
-          only if function sEnInterrupt() has been called with flag
-          RXINT_EN set.  The RXF_TRIG flag in the Interrupt Idenfification
-          register will be set whenever the trigger level is reached
-          regardless of the setting of RXINT_EN.
-
-*/
-#define sSetRxTrigger(ChP,LEVEL) \
-do { \
-   (ChP)->RxControl[2] &= ~TRIG_MASK; \
-   (ChP)->RxControl[2] |= LEVEL; \
-   out32((ChP)->IndexAddr,(ChP)->RxControl); \
-} while (0)
-
-/***************************************************************************
-Function: sSetStop1
-Purpose:  Set stop bits to 1
-Call:     sSetStop1(ChP)
-          CHANNEL_T *ChP; Ptr to channel structure
-*/
-#define sSetStop1(ChP) \
-do { \
-   (ChP)->TxControl[2] &= ~STOP2; \
-   out32((ChP)->IndexAddr,(ChP)->TxControl); \
-} while (0)
-
-/***************************************************************************
-Function: sSetStop2
-Purpose:  Set stop bits to 2
-Call:     sSetStop2(ChP)
-          CHANNEL_T *ChP; Ptr to channel structure
-*/
-#define sSetStop2(ChP) \
-do { \
-   (ChP)->TxControl[2] |= STOP2; \
-   out32((ChP)->IndexAddr,(ChP)->TxControl); \
-} while (0)
-
-/***************************************************************************
-Function: sSetTxXOFFChar
-Purpose:  Set the Tx XOFF flow control character
-Call:     sSetTxXOFFChar(ChP,Ch)
-          CHANNEL_T *ChP; Ptr to channel structure
-          Byte_t Ch; The value to set the Tx XOFF character to
-*/
-#define sSetTxXOFFChar(ChP,CH) \
-do { \
-   (ChP)->R[0x07] = (CH); \
-   out32((ChP)->IndexAddr,&(ChP)->R[0x04]); \
-} while (0)
-
-/***************************************************************************
-Function: sSetTxXONChar
-Purpose:  Set the Tx XON flow control character
-Call:     sSetTxXONChar(ChP,Ch)
-          CHANNEL_T *ChP; Ptr to channel structure
-          Byte_t Ch; The value to set the Tx XON character to
-*/
-#define sSetTxXONChar(ChP,CH) \
-do { \
-   (ChP)->R[0x0b] = (CH); \
-   out32((ChP)->IndexAddr,&(ChP)->R[0x08]); \
-} while (0)
-
-/***************************************************************************
-Function: sStartRxProcessor
-Purpose:  Start a channel's receive processor
-Call:     sStartRxProcessor(ChP)
-          CHANNEL_T *ChP; Ptr to channel structure
-Comments: This function is used to start a Rx processor after it was
-          stopped with sStopRxProcessor() or sStopSWInFlowCtl().  It
-          will restart both the Rx processor and software input flow control.
-
-*/
-#define sStartRxProcessor(ChP) out32((ChP)->IndexAddr,&(ChP)->R[0])
-
-/***************************************************************************
-Function: sWriteTxByte
-Purpose:  Write a transmit data byte to a channel.
-          ByteIO_t io: Channel transmit register I/O address.  This can
-                           be obtained with sGetTxRxDataIO().
-          Byte_t Data; The transmit data byte.
-Warnings: This function writes the data byte without checking to see if
-          sMaxTxSize is exceeded in the Tx FIFO.
-*/
-#define sWriteTxByte(IO,DATA) sOutB(IO,DATA)
-
-/*
- * Begin Linux specific definitions for the Rocketport driver
- *
- * This code is Copyright Theodore Ts'o, 1995-1997
- */
-
-struct r_port {
-	int magic;
-	struct tty_port port;
-	int line;
-	int flags;		/* Don't yet match the ASY_ flags!! */
-	unsigned int board:3;
-	unsigned int aiop:2;
-	unsigned int chan:3;
-	CONTROLLER_t *ctlp;
-	CHANNEL_t channel;
-	int intmask;
-	int xmit_fifo_room;	/* room in xmit fifo */
-	unsigned char *xmit_buf;
-	int xmit_head;
-	int xmit_tail;
-	int xmit_cnt;
-	int cd_status;
-	int ignore_status_mask;
-	int read_status_mask;
-	int cps;
-
-	spinlock_t slock;
-	struct mutex write_mtx;
-};
-
-#define RPORT_MAGIC 0x525001
-
-#define NUM_BOARDS 8
-#define MAX_RP_PORTS (32*NUM_BOARDS)
-
-/*
- * The size of the xmit buffer is 1 page, or 4096 bytes
- */
-#define XMIT_BUF_SIZE 4096
-
-/* number of characters left in xmit buffer before we ask for more */
-#define WAKEUP_CHARS 256
-
-/*
- * Assigned major numbers for the Comtrol Rocketport
- */
-#define TTY_ROCKET_MAJOR	46
-#define CUA_ROCKET_MAJOR	47
-
-#ifdef PCI_VENDOR_ID_RP
-#undef PCI_VENDOR_ID_RP
-#undef PCI_DEVICE_ID_RP8OCTA
-#undef PCI_DEVICE_ID_RP8INTF
-#undef PCI_DEVICE_ID_RP16INTF
-#undef PCI_DEVICE_ID_RP32INTF
-#undef PCI_DEVICE_ID_URP8OCTA
-#undef PCI_DEVICE_ID_URP8INTF
-#undef PCI_DEVICE_ID_URP16INTF
-#undef PCI_DEVICE_ID_CRP16INTF
-#undef PCI_DEVICE_ID_URP32INTF
-#endif
-
-/*  Comtrol PCI Vendor ID */
-#define PCI_VENDOR_ID_RP		0x11fe
-
-/*  Comtrol Device ID's */
-#define PCI_DEVICE_ID_RP32INTF		0x0001	/* Rocketport 32 port w/external I/F     */
-#define PCI_DEVICE_ID_RP8INTF		0x0002	/* Rocketport 8 port w/external I/F      */
-#define PCI_DEVICE_ID_RP16INTF		0x0003	/* Rocketport 16 port w/external I/F     */
-#define PCI_DEVICE_ID_RP4QUAD		0x0004	/* Rocketport 4 port w/quad cable        */
-#define PCI_DEVICE_ID_RP8OCTA		0x0005	/* Rocketport 8 port w/octa cable        */
-#define PCI_DEVICE_ID_RP8J		0x0006	/* Rocketport 8 port w/RJ11 connectors   */
-#define PCI_DEVICE_ID_RP4J		0x0007	/* Rocketport 4 port w/RJ11 connectors   */
-#define PCI_DEVICE_ID_RP8SNI		0x0008	/* Rocketport 8 port w/ DB78 SNI (Siemens) connector */
-#define PCI_DEVICE_ID_RP16SNI		0x0009	/* Rocketport 16 port w/ DB78 SNI (Siemens) connector   */
-#define PCI_DEVICE_ID_RPP4		0x000A	/* Rocketport Plus 4 port                */
-#define PCI_DEVICE_ID_RPP8		0x000B	/* Rocketport Plus 8 port                */
-#define PCI_DEVICE_ID_RP6M		0x000C	/* RocketModem 6 port                    */
-#define PCI_DEVICE_ID_RP4M		0x000D	/* RocketModem 4 port                    */
-#define PCI_DEVICE_ID_RP2_232           0x000E	/* Rocketport Plus 2 port RS232          */
-#define PCI_DEVICE_ID_RP2_422           0x000F	/* Rocketport Plus 2 port RS422          */ 
-
-/* Universal PCI boards  */
-#define PCI_DEVICE_ID_URP32INTF		0x0801	/* Rocketport UPCI 32 port w/external I/F */ 
-#define PCI_DEVICE_ID_URP8INTF		0x0802	/* Rocketport UPCI 8 port w/external I/F  */
-#define PCI_DEVICE_ID_URP16INTF		0x0803	/* Rocketport UPCI 16 port w/external I/F */
-#define PCI_DEVICE_ID_URP8OCTA		0x0805	/* Rocketport UPCI 8 port w/octa cable    */
-#define PCI_DEVICE_ID_UPCI_RM3_8PORT    0x080C	/* Rocketmodem III 8 port                 */
-#define PCI_DEVICE_ID_UPCI_RM3_4PORT    0x080D	/* Rocketmodem III 4 port                 */
-
-/* Compact PCI device */ 
-#define PCI_DEVICE_ID_CRP16INTF		0x0903	/* Rocketport Compact PCI 16 port w/external I/F */
-
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 056d2074f07a..4c3fa5293d76 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1688,27 +1688,6 @@
 #define PCI_VENDOR_ID_MICROSEMI		0x11f8
 
 #define PCI_VENDOR_ID_RP		0x11fe
-#define PCI_DEVICE_ID_RP32INTF		0x0001
-#define PCI_DEVICE_ID_RP8INTF		0x0002
-#define PCI_DEVICE_ID_RP16INTF		0x0003
-#define PCI_DEVICE_ID_RP4QUAD		0x0004
-#define PCI_DEVICE_ID_RP8OCTA		0x0005
-#define PCI_DEVICE_ID_RP8J		0x0006
-#define PCI_DEVICE_ID_RP4J		0x0007
-#define PCI_DEVICE_ID_RP8SNI		0x0008
-#define PCI_DEVICE_ID_RP16SNI		0x0009
-#define PCI_DEVICE_ID_RPP4		0x000A
-#define PCI_DEVICE_ID_RPP8		0x000B
-#define PCI_DEVICE_ID_RP4M		0x000D
-#define PCI_DEVICE_ID_RP2_232		0x000E
-#define PCI_DEVICE_ID_RP2_422		0x000F
-#define PCI_DEVICE_ID_URP32INTF		0x0801
-#define PCI_DEVICE_ID_URP8INTF		0x0802
-#define PCI_DEVICE_ID_URP16INTF		0x0803
-#define PCI_DEVICE_ID_URP8OCTA		0x0805
-#define PCI_DEVICE_ID_UPCI_RM3_8PORT	0x080C
-#define PCI_DEVICE_ID_UPCI_RM3_4PORT	0x080D
-#define PCI_DEVICE_ID_CRP16INTF		0x0903
 
 #define PCI_VENDOR_ID_CYCLADES		0x120e
 #define PCI_DEVICE_ID_PC300_RX_2	0x0300
-- 
cgit v1.2.3


From 981b22b8777df7de070be1803f6d7ed4f634a43c Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Tue, 2 Mar 2021 07:21:38 +0100
Subject: tty: remove TTY_LDISC_MAGIC

First, it is never checked. Second, use of it as a debugging aid is
at least questionable. With the current tools, I don't think anyone has
used this kind of thing for debugging purposes for years.

On top of that, e.g. serdev does not set this field of tty_ldisc_ops
at all.

So get rid of this legacy.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20210302062214.29627-8-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/process/magic-number.rst                    | 1 -
 Documentation/translations/it_IT/process/magic-number.rst | 1 -
 Documentation/translations/zh_CN/process/magic-number.rst | 1 -
 drivers/accessibility/speakup/spk_ttyio.c                 | 1 -
 drivers/bluetooth/hci_ldisc.c                             | 1 -
 drivers/misc/ti-st/st_core.c                              | 1 -
 drivers/net/caif/caif_serial.c                            | 1 -
 drivers/net/can/slcan.c                                   | 1 -
 drivers/net/hamradio/6pack.c                              | 1 -
 drivers/net/hamradio/mkiss.c                              | 1 -
 drivers/net/ppp/ppp_async.c                               | 1 -
 drivers/net/ppp/ppp_synctty.c                             | 1 -
 drivers/net/slip/slip.c                                   | 1 -
 drivers/pps/clients/pps-ldisc.c                           | 3 ---
 drivers/tty/n_gsm.c                                       | 1 -
 drivers/tty/n_hdlc.c                                      | 1 -
 drivers/tty/n_null.c                                      | 1 -
 drivers/tty/n_r3964.c                                     | 1 -
 drivers/tty/n_tty.c                                       | 1 -
 include/linux/tty_ldisc.h                                 | 3 ---
 net/nfc/nci/uart.c                                        | 1 -
 sound/soc/codecs/cx20442.c                                | 1 -
 sound/soc/ti/ams-delta.c                                  | 1 -
 23 files changed, 27 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/process/magic-number.rst b/Documentation/process/magic-number.rst
index 89992fe4863f..f5ba36e96461 100644
--- a/Documentation/process/magic-number.rst
+++ b/Documentation/process/magic-number.rst
@@ -88,7 +88,6 @@ TTY_MAGIC             0x5401           tty_struct               ``include/linux/
 MGSL_MAGIC            0x5401           mgsl_info                ``drivers/char/synclink.c``
 TTY_DRIVER_MAGIC      0x5402           tty_driver               ``include/linux/tty_driver.h``
 MGSLPC_MAGIC          0x5402           mgslpc_info              ``drivers/char/pcmcia/synclink_cs.c``
-TTY_LDISC_MAGIC       0x5403           tty_ldisc                ``include/linux/tty_ldisc.h``
 USB_SERIAL_MAGIC      0x6702           usb_serial               ``drivers/usb/serial/usb-serial.h``
 FULL_DUPLEX_MAGIC     0x6969                                    ``drivers/net/ethernet/dec/tulip/de2104x.c``
 USB_BLUETOOTH_MAGIC   0x6d02           usb_bluetooth            ``drivers/usb/class/bluetty.c``
diff --git a/Documentation/translations/it_IT/process/magic-number.rst b/Documentation/translations/it_IT/process/magic-number.rst
index 9be170ec0d02..e8c782d155a3 100644
--- a/Documentation/translations/it_IT/process/magic-number.rst
+++ b/Documentation/translations/it_IT/process/magic-number.rst
@@ -94,7 +94,6 @@ TTY_MAGIC             0x5401           tty_struct               ``include/linux/
 MGSL_MAGIC            0x5401           mgsl_info                ``drivers/char/synclink.c``
 TTY_DRIVER_MAGIC      0x5402           tty_driver               ``include/linux/tty_driver.h``
 MGSLPC_MAGIC          0x5402           mgslpc_info              ``drivers/char/pcmcia/synclink_cs.c``
-TTY_LDISC_MAGIC       0x5403           tty_ldisc                ``include/linux/tty_ldisc.h``
 USB_SERIAL_MAGIC      0x6702           usb_serial               ``drivers/usb/serial/usb-serial.h``
 FULL_DUPLEX_MAGIC     0x6969                                    ``drivers/net/ethernet/dec/tulip/de2104x.c``
 USB_BLUETOOTH_MAGIC   0x6d02           usb_bluetooth            ``drivers/usb/class/bluetty.c``
diff --git a/Documentation/translations/zh_CN/process/magic-number.rst b/Documentation/translations/zh_CN/process/magic-number.rst
index 191d705349ef..42f0635ca70a 100644
--- a/Documentation/translations/zh_CN/process/magic-number.rst
+++ b/Documentation/translations/zh_CN/process/magic-number.rst
@@ -77,7 +77,6 @@ TTY_MAGIC             0x5401           tty_struct               ``include/linux/
 MGSL_MAGIC            0x5401           mgsl_info                ``drivers/char/synclink.c``
 TTY_DRIVER_MAGIC      0x5402           tty_driver               ``include/linux/tty_driver.h``
 MGSLPC_MAGIC          0x5402           mgslpc_info              ``drivers/char/pcmcia/synclink_cs.c``
-TTY_LDISC_MAGIC       0x5403           tty_ldisc                ``include/linux/tty_ldisc.h``
 USB_SERIAL_MAGIC      0x6702           usb_serial               ``drivers/usb/serial/usb-serial.h``
 FULL_DUPLEX_MAGIC     0x6969                                    ``drivers/net/ethernet/dec/tulip/de2104x.c``
 USB_BLUETOOTH_MAGIC   0x6d02           usb_bluetooth            ``drivers/usb/class/bluetty.c``
diff --git a/drivers/accessibility/speakup/spk_ttyio.c b/drivers/accessibility/speakup/spk_ttyio.c
index 9af1d4c124d3..2e39fcf492d8 100644
--- a/drivers/accessibility/speakup/spk_ttyio.c
+++ b/drivers/accessibility/speakup/spk_ttyio.c
@@ -104,7 +104,6 @@ static int spk_ttyio_receive_buf2(struct tty_struct *tty,
 
 static struct tty_ldisc_ops spk_ttyio_ldisc_ops = {
 	.owner          = THIS_MODULE,
-	.magic          = TTY_LDISC_MAGIC,
 	.name           = "speakup_ldisc",
 	.open           = spk_ttyio_ldisc_open,
 	.close          = spk_ttyio_ldisc_close,
diff --git a/drivers/bluetooth/hci_ldisc.c b/drivers/bluetooth/hci_ldisc.c
index 637c5b8c2aa1..71a4ca505e09 100644
--- a/drivers/bluetooth/hci_ldisc.c
+++ b/drivers/bluetooth/hci_ldisc.c
@@ -821,7 +821,6 @@ static __poll_t hci_uart_tty_poll(struct tty_struct *tty,
 
 static struct tty_ldisc_ops hci_uart_ldisc = {
 	.owner		= THIS_MODULE,
-	.magic		= TTY_LDISC_MAGIC,
 	.name		= "n_hci",
 	.open		= hci_uart_tty_open,
 	.close		= hci_uart_tty_close,
diff --git a/drivers/misc/ti-st/st_core.c b/drivers/misc/ti-st/st_core.c
index 5a0a5fc3d3ab..071844b58073 100644
--- a/drivers/misc/ti-st/st_core.c
+++ b/drivers/misc/ti-st/st_core.c
@@ -845,7 +845,6 @@ static void st_tty_flush_buffer(struct tty_struct *tty)
 }
 
 static struct tty_ldisc_ops st_ldisc_ops = {
-	.magic = TTY_LDISC_MAGIC,
 	.name = "n_st",
 	.open = st_tty_open,
 	.close = st_tty_close,
diff --git a/drivers/net/caif/caif_serial.c b/drivers/net/caif/caif_serial.c
index 8215cd77301f..675c374b32ee 100644
--- a/drivers/net/caif/caif_serial.c
+++ b/drivers/net/caif/caif_serial.c
@@ -382,7 +382,6 @@ static void ldisc_close(struct tty_struct *tty)
 /* The line discipline structure. */
 static struct tty_ldisc_ops caif_ldisc = {
 	.owner =	THIS_MODULE,
-	.magic =	TTY_LDISC_MAGIC,
 	.name =		"n_caif",
 	.open =		ldisc_open,
 	.close =	ldisc_close,
diff --git a/drivers/net/can/slcan.c b/drivers/net/can/slcan.c
index 30c8d53c9745..31ba6664503d 100644
--- a/drivers/net/can/slcan.c
+++ b/drivers/net/can/slcan.c
@@ -697,7 +697,6 @@ static int slcan_ioctl(struct tty_struct *tty, struct file *file,
 
 static struct tty_ldisc_ops slc_ldisc = {
 	.owner		= THIS_MODULE,
-	.magic		= TTY_LDISC_MAGIC,
 	.name		= "slcan",
 	.open		= slcan_open,
 	.close		= slcan_close,
diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c
index 71d6629e65c9..6f71eff9c52e 100644
--- a/drivers/net/hamradio/6pack.c
+++ b/drivers/net/hamradio/6pack.c
@@ -749,7 +749,6 @@ static int sixpack_ioctl(struct tty_struct *tty, struct file *file,
 
 static struct tty_ldisc_ops sp_ldisc = {
 	.owner		= THIS_MODULE,
-	.magic		= TTY_LDISC_MAGIC,
 	.name		= "6pack",
 	.open		= sixpack_open,
 	.close		= sixpack_close,
diff --git a/drivers/net/hamradio/mkiss.c b/drivers/net/hamradio/mkiss.c
index 17be2bb2985c..65154224d5b8 100644
--- a/drivers/net/hamradio/mkiss.c
+++ b/drivers/net/hamradio/mkiss.c
@@ -933,7 +933,6 @@ out:
 
 static struct tty_ldisc_ops ax_ldisc = {
 	.owner		= THIS_MODULE,
-	.magic		= TTY_LDISC_MAGIC,
 	.name		= "mkiss",
 	.open		= mkiss_open,
 	.close		= mkiss_close,
diff --git a/drivers/net/ppp/ppp_async.c b/drivers/net/ppp/ppp_async.c
index 2a91caa4f37b..8b41aa3fb64e 100644
--- a/drivers/net/ppp/ppp_async.c
+++ b/drivers/net/ppp/ppp_async.c
@@ -372,7 +372,6 @@ ppp_asynctty_wakeup(struct tty_struct *tty)
 
 static struct tty_ldisc_ops ppp_ldisc = {
 	.owner  = THIS_MODULE,
-	.magic	= TTY_LDISC_MAGIC,
 	.name	= "ppp",
 	.open	= ppp_asynctty_open,
 	.close	= ppp_asynctty_close,
diff --git a/drivers/net/ppp/ppp_synctty.c b/drivers/net/ppp/ppp_synctty.c
index d8890923a9e3..576b6a93bf23 100644
--- a/drivers/net/ppp/ppp_synctty.c
+++ b/drivers/net/ppp/ppp_synctty.c
@@ -365,7 +365,6 @@ ppp_sync_wakeup(struct tty_struct *tty)
 
 static struct tty_ldisc_ops ppp_sync_ldisc = {
 	.owner	= THIS_MODULE,
-	.magic	= TTY_LDISC_MAGIC,
 	.name	= "pppsync",
 	.open	= ppp_sync_open,
 	.close	= ppp_sync_close,
diff --git a/drivers/net/slip/slip.c b/drivers/net/slip/slip.c
index f81fb0b13a94..1ab124eba8eb 100644
--- a/drivers/net/slip/slip.c
+++ b/drivers/net/slip/slip.c
@@ -1263,7 +1263,6 @@ static int sl_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 
 static struct tty_ldisc_ops sl_ldisc = {
 	.owner 		= THIS_MODULE,
-	.magic 		= TTY_LDISC_MAGIC,
 	.name 		= "slip",
 	.open 		= slip_open,
 	.close	 	= slip_close,
diff --git a/drivers/pps/clients/pps-ldisc.c b/drivers/pps/clients/pps-ldisc.c
index 4fd0cbf7f931..bf26cc56b863 100644
--- a/drivers/pps/clients/pps-ldisc.c
+++ b/drivers/pps/clients/pps-ldisc.c
@@ -13,8 +13,6 @@
 #include <linux/pps_kernel.h>
 #include <linux/bug.h>
 
-#define PPS_TTY_MAGIC		0x0001
-
 static void pps_tty_dcd_change(struct tty_struct *tty, unsigned int status)
 {
 	struct pps_device *pps;
@@ -114,7 +112,6 @@ static int __init pps_tty_init(void)
 
 	/* Init PPS_TTY data */
 	pps_ldisc_ops.owner = THIS_MODULE;
-	pps_ldisc_ops.magic = PPS_TTY_MAGIC;
 	pps_ldisc_ops.name = "pps_tty";
 	pps_ldisc_ops.dcd_change = pps_tty_dcd_change;
 	pps_ldisc_ops.open = pps_tty_open;
diff --git a/drivers/tty/n_gsm.c b/drivers/tty/n_gsm.c
index 51dafc06f541..b063bc608a8c 100644
--- a/drivers/tty/n_gsm.c
+++ b/drivers/tty/n_gsm.c
@@ -2849,7 +2849,6 @@ static int gsm_create_network(struct gsm_dlci *dlci, struct gsm_netconfig *nc)
 /* Line discipline for real tty */
 static struct tty_ldisc_ops tty_ldisc_packet = {
 	.owner		 = THIS_MODULE,
-	.magic           = TTY_LDISC_MAGIC,
 	.name            = "n_gsm",
 	.open            = gsmld_open,
 	.close           = gsmld_close,
diff --git a/drivers/tty/n_hdlc.c b/drivers/tty/n_hdlc.c
index 1363e659dc1d..290c757db1b9 100644
--- a/drivers/tty/n_hdlc.c
+++ b/drivers/tty/n_hdlc.c
@@ -787,7 +787,6 @@ static struct n_hdlc_buf *n_hdlc_buf_get(struct n_hdlc_buf_list *buf_list)
 
 static struct tty_ldisc_ops n_hdlc_ldisc = {
 	.owner		= THIS_MODULE,
-	.magic		= TTY_LDISC_MAGIC,
 	.name		= "hdlc",
 	.open		= n_hdlc_tty_open,
 	.close		= n_hdlc_tty_close,
diff --git a/drivers/tty/n_null.c b/drivers/tty/n_null.c
index ce03ae78f5c6..b8f67b5f1ef8 100644
--- a/drivers/tty/n_null.c
+++ b/drivers/tty/n_null.c
@@ -40,7 +40,6 @@ static void n_null_receivebuf(struct tty_struct *tty,
 
 static struct tty_ldisc_ops null_ldisc = {
 	.owner		=	THIS_MODULE,
-	.magic		=	TTY_LDISC_MAGIC,
 	.name		=	"n_null",
 	.open		=	n_null_open,
 	.close		=	n_null_close,
diff --git a/drivers/tty/n_r3964.c b/drivers/tty/n_r3964.c
index 3161f0a535e3..2eb76ea1d88d 100644
--- a/drivers/tty/n_r3964.c
+++ b/drivers/tty/n_r3964.c
@@ -146,7 +146,6 @@ static void r3964_receive_buf(struct tty_struct *tty, const unsigned char *cp,
 
 static struct tty_ldisc_ops tty_ldisc_N_R3964 = {
 	.owner = THIS_MODULE,
-	.magic = TTY_LDISC_MAGIC,
 	.name = "R3964",
 	.open = r3964_open,
 	.close = r3964_close,
diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c
index 87ec15dbe10d..7c53185bce57 100644
--- a/drivers/tty/n_tty.c
+++ b/drivers/tty/n_tty.c
@@ -2488,7 +2488,6 @@ static int n_tty_ioctl(struct tty_struct *tty, struct file *file,
 }
 
 static struct tty_ldisc_ops n_tty_ops = {
-	.magic           = TTY_LDISC_MAGIC,
 	.name            = "n_tty",
 	.open            = n_tty_open,
 	.close           = n_tty_close,
diff --git a/include/linux/tty_ldisc.h b/include/linux/tty_ldisc.h
index 572a07976116..31284b55bd4f 100644
--- a/include/linux/tty_ldisc.h
+++ b/include/linux/tty_ldisc.h
@@ -173,7 +173,6 @@ extern int ldsem_down_write_nested(struct ld_semaphore *sem, int subclass,
 
 
 struct tty_ldisc_ops {
-	int	magic;
 	char	*name;
 	int	num;
 	int	flags;
@@ -218,8 +217,6 @@ struct tty_ldisc {
 	struct tty_struct *tty;
 };
 
-#define TTY_LDISC_MAGIC	0x5403
-
 #define LDISC_FLAG_DEFINED	0x00000001
 
 #define MODULE_ALIAS_LDISC(ldisc) \
diff --git a/net/nfc/nci/uart.c b/net/nfc/nci/uart.c
index 1204c438e87d..16d009c9b6a0 100644
--- a/net/nfc/nci/uart.c
+++ b/net/nfc/nci/uart.c
@@ -453,7 +453,6 @@ void nci_uart_set_config(struct nci_uart *nu, int baudrate, int flow_ctrl)
 EXPORT_SYMBOL_GPL(nci_uart_set_config);
 
 static struct tty_ldisc_ops nci_uart_ldisc = {
-	.magic		= TTY_LDISC_MAGIC,
 	.owner		= THIS_MODULE,
 	.name		= "n_nci",
 	.open		= nci_uart_tty_open,
diff --git a/sound/soc/codecs/cx20442.c b/sound/soc/codecs/cx20442.c
index 161be8b7d131..61dfa86d444d 100644
--- a/sound/soc/codecs/cx20442.c
+++ b/sound/soc/codecs/cx20442.c
@@ -285,7 +285,6 @@ static void v253_wakeup(struct tty_struct *tty)
 }
 
 struct tty_ldisc_ops v253_ops = {
-	.magic = TTY_LDISC_MAGIC,
 	.name = "cx20442",
 	.owner = THIS_MODULE,
 	.open = v253_open,
diff --git a/sound/soc/ti/ams-delta.c b/sound/soc/ti/ams-delta.c
index 57feb473a579..98198c7cc872 100644
--- a/sound/soc/ti/ams-delta.c
+++ b/sound/soc/ti/ams-delta.c
@@ -395,7 +395,6 @@ static void cx81801_wakeup(struct tty_struct *tty)
 }
 
 static struct tty_ldisc_ops cx81801_ops = {
-	.magic = TTY_LDISC_MAGIC,
 	.name = "cx81801",
 	.owner = THIS_MODULE,
 	.open = cx81801_open,
-- 
cgit v1.2.3


From a872ab4d6d191cca1ce84b945e394bd6a8d84dd9 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Tue, 2 Mar 2021 07:22:05 +0100
Subject: tty: let tty_unregister_driver return void

Now that no one checks the return value, switch the return type of
tty_unregister_driver to void. We can do that as we always return zero.

Generally, drivers are not allowed to call tty_unregister_driver while
there are open devices.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Link: https://lore.kernel.org/r/20210302062214.29627-35-jslaby@suse.cz
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/tty_io.c | 9 +--------
 include/linux/tty.h  | 2 +-
 2 files changed, 2 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index 74733ec8f565..c8cbf08b6dcf 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -3525,21 +3525,14 @@ EXPORT_SYMBOL(tty_register_driver);
 /*
  * Called by a tty driver to unregister itself.
  */
-int tty_unregister_driver(struct tty_driver *driver)
+void tty_unregister_driver(struct tty_driver *driver)
 {
-#if 0
-	/* FIXME */
-	if (driver->refcount)
-		return -EBUSY;
-#endif
 	unregister_chrdev_region(MKDEV(driver->major, driver->minor_start),
 				driver->num);
 	mutex_lock(&tty_mutex);
 	list_del(&driver->tty_drivers);
 	mutex_unlock(&tty_mutex);
-	return 0;
 }
-
 EXPORT_SYMBOL(tty_unregister_driver);
 
 dev_t tty_devnum(struct tty_struct *tty)
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 95fc2f100f12..51f56e5ec955 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -482,7 +482,7 @@ extern void stop_tty(struct tty_struct *tty);
 extern void __start_tty(struct tty_struct *tty);
 extern void start_tty(struct tty_struct *tty);
 extern int tty_register_driver(struct tty_driver *driver);
-extern int tty_unregister_driver(struct tty_driver *driver);
+extern void tty_unregister_driver(struct tty_driver *driver);
 extern struct device *tty_register_device(struct tty_driver *driver,
 					  unsigned index, struct device *dev);
 extern struct device *tty_register_device_attr(struct tty_driver *driver,
-- 
cgit v1.2.3


From fcbba344907afe26da487f1ed0b0e285c06a547b Mon Sep 17 00:00:00 2001
From: Hector Martin <marcan@marcan.st>
Date: Fri, 5 Mar 2021 06:38:59 +0900
Subject: tty: serial: samsung_tty: Add support for Apple UARTs

Apple SoCs are a distant descendant of Samsung designs and use yet
another variant of their UART style, with different interrupt handling.

In particular, this variant has the following differences with existing
ones:

* It includes a built-in interrupt controller with different registers,
  using only a single platform IRQ

* Internal interrupt sources are treated as edge-triggered, even though
  the IRQ output is level-triggered. This chiefly affects the TX IRQ
  path: the driver can no longer rely on the TX buffer empty IRQ
  immediately firing after TX is enabled, but instead must prime the
  FIFO with data directly.

Signed-off-by: Hector Martin <marcan@marcan.st>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Tested-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Link: https://lore.kernel.org/r/20210304213902.83903-25-marcan@marcan.st
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/Kconfig       |   2 +-
 drivers/tty/serial/samsung_tty.c | 238 +++++++++++++++++++++++++++++++++++++--
 include/linux/serial_s3c.h       |  16 +++
 3 files changed, 247 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig
index fb9731d05f9e..e6f55c28cc2e 100644
--- a/drivers/tty/serial/Kconfig
+++ b/drivers/tty/serial/Kconfig
@@ -236,7 +236,7 @@ config SERIAL_CLPS711X_CONSOLE
 
 config SERIAL_SAMSUNG
 	tristate "Samsung SoC serial support"
-	depends on PLAT_SAMSUNG || ARCH_S5PV210 || ARCH_EXYNOS || COMPILE_TEST
+	depends on PLAT_SAMSUNG || ARCH_S5PV210 || ARCH_EXYNOS || ARCH_APPLE || COMPILE_TEST
 	select SERIAL_CORE
 	help
 	  Support for the on-chip UARTs on the Samsung S3C24XX series CPUs,
diff --git a/drivers/tty/serial/samsung_tty.c b/drivers/tty/serial/samsung_tty.c
index 26cb05992e9f..5ef37c4538ce 100644
--- a/drivers/tty/serial/samsung_tty.c
+++ b/drivers/tty/serial/samsung_tty.c
@@ -59,6 +59,7 @@
 enum s3c24xx_port_type {
 	TYPE_S3C24XX,
 	TYPE_S3C6400,
+	TYPE_APPLE_S5L,
 };
 
 struct s3c24xx_uart_info {
@@ -151,6 +152,8 @@ struct s3c24xx_uart_port {
 #endif
 };
 
+static void s3c24xx_serial_tx_chars(struct s3c24xx_uart_port *ourport);
+
 /* conversion functions */
 
 #define s3c24xx_dev_to_port(__dev) dev_get_drvdata(__dev)
@@ -290,6 +293,9 @@ static void s3c24xx_serial_stop_tx(struct uart_port *port)
 	case TYPE_S3C6400:
 		s3c24xx_set_bit(port, S3C64XX_UINTM_TXD, S3C64XX_UINTM);
 		break;
+	case TYPE_APPLE_S5L:
+		s3c24xx_clear_bit(port, APPLE_S5L_UCON_TXTHRESH_ENA, S3C2410_UCON);
+		break;
 	default:
 		disable_irq_nosync(ourport->tx_irq);
 		break;
@@ -358,6 +364,9 @@ static void enable_tx_dma(struct s3c24xx_uart_port *ourport)
 	case TYPE_S3C6400:
 		s3c24xx_set_bit(port, S3C64XX_UINTM_TXD, S3C64XX_UINTM);
 		break;
+	case TYPE_APPLE_S5L:
+		WARN_ON(1); // No DMA
+		break;
 	default:
 		disable_irq_nosync(ourport->tx_irq);
 		break;
@@ -396,12 +405,23 @@ static void enable_tx_pio(struct s3c24xx_uart_port *ourport)
 		s3c24xx_clear_bit(port, S3C64XX_UINTM_TXD,
 				  S3C64XX_UINTM);
 		break;
+	case TYPE_APPLE_S5L:
+		ucon |= APPLE_S5L_UCON_TXTHRESH_ENA_MSK;
+		wr_regl(port, S3C2410_UCON, ucon);
+		break;
 	default:
 		enable_irq(ourport->tx_irq);
 		break;
 	}
 
 	ourport->tx_mode = S3C24XX_TX_PIO;
+
+	/*
+	 * The Apple version only has edge triggered TX IRQs, so we need
+	 * to kick off the process by sending some characters here.
+	 */
+	if (ourport->info->type == TYPE_APPLE_S5L)
+		s3c24xx_serial_tx_chars(ourport);
 }
 
 static void s3c24xx_serial_start_tx_pio(struct s3c24xx_uart_port *ourport)
@@ -527,6 +547,10 @@ static void s3c24xx_serial_stop_rx(struct uart_port *port)
 			s3c24xx_set_bit(port, S3C64XX_UINTM_RXD,
 					S3C64XX_UINTM);
 			break;
+		case TYPE_APPLE_S5L:
+			s3c24xx_clear_bit(port, APPLE_S5L_UCON_RXTHRESH_ENA, S3C2410_UCON);
+			s3c24xx_clear_bit(port, APPLE_S5L_UCON_RXTO_ENA, S3C2410_UCON);
+			break;
 		default:
 			disable_irq_nosync(ourport->rx_irq);
 			break;
@@ -664,14 +688,18 @@ static void enable_rx_pio(struct s3c24xx_uart_port *ourport)
 
 	/* set Rx mode to DMA mode */
 	ucon = rd_regl(port, S3C2410_UCON);
-	ucon &= ~(S3C64XX_UCON_TIMEOUT_MASK |
-			S3C64XX_UCON_EMPTYINT_EN |
-			S3C64XX_UCON_DMASUS_EN |
-			S3C64XX_UCON_TIMEOUT_EN |
-			S3C64XX_UCON_RXMODE_MASK);
-	ucon |= 0xf << S3C64XX_UCON_TIMEOUT_SHIFT |
-			S3C64XX_UCON_TIMEOUT_EN |
-			S3C64XX_UCON_RXMODE_CPU;
+	ucon &= ~S3C64XX_UCON_RXMODE_MASK;
+	ucon |= S3C64XX_UCON_RXMODE_CPU;
+
+	/* Apple types use these bits for IRQ masks */
+	if (ourport->info->type != TYPE_APPLE_S5L) {
+		ucon &= ~(S3C64XX_UCON_TIMEOUT_MASK |
+				S3C64XX_UCON_EMPTYINT_EN |
+				S3C64XX_UCON_DMASUS_EN |
+				S3C64XX_UCON_TIMEOUT_EN);
+		ucon |= 0xf << S3C64XX_UCON_TIMEOUT_SHIFT |
+				S3C64XX_UCON_TIMEOUT_EN;
+	}
 	wr_regl(port, S3C2410_UCON, ucon);
 
 	ourport->rx_mode = S3C24XX_RX_PIO;
@@ -935,6 +963,27 @@ static irqreturn_t s3c64xx_serial_handle_irq(int irq, void *id)
 	return ret;
 }
 
+/* interrupt handler for Apple SoC's.*/
+static irqreturn_t apple_serial_handle_irq(int irq, void *id)
+{
+	struct s3c24xx_uart_port *ourport = id;
+	struct uart_port *port = &ourport->port;
+	unsigned int pend = rd_regl(port, S3C2410_UTRSTAT);
+	irqreturn_t ret = IRQ_NONE;
+
+	if (pend & (APPLE_S5L_UTRSTAT_RXTHRESH | APPLE_S5L_UTRSTAT_RXTO)) {
+		wr_regl(port, S3C2410_UTRSTAT,
+			APPLE_S5L_UTRSTAT_RXTHRESH | APPLE_S5L_UTRSTAT_RXTO);
+		ret = s3c24xx_serial_rx_irq(irq, id);
+	}
+	if (pend & APPLE_S5L_UTRSTAT_TXTHRESH) {
+		wr_regl(port, S3C2410_UTRSTAT, APPLE_S5L_UTRSTAT_TXTHRESH);
+		ret = s3c24xx_serial_tx_irq(irq, id);
+	}
+
+	return ret;
+}
+
 static unsigned int s3c24xx_serial_tx_empty(struct uart_port *port)
 {
 	struct s3c24xx_uart_info *info = s3c24xx_port_to_info(port);
@@ -1154,6 +1203,32 @@ static void s3c64xx_serial_shutdown(struct uart_port *port)
 	ourport->tx_in_progress = 0;
 }
 
+static void apple_s5l_serial_shutdown(struct uart_port *port)
+{
+	struct s3c24xx_uart_port *ourport = to_ourport(port);
+
+	unsigned int ucon;
+
+	ucon = rd_regl(port, S3C2410_UCON);
+	ucon &= ~(APPLE_S5L_UCON_TXTHRESH_ENA_MSK |
+		  APPLE_S5L_UCON_RXTHRESH_ENA_MSK |
+		  APPLE_S5L_UCON_RXTO_ENA_MSK);
+	wr_regl(port, S3C2410_UCON, ucon);
+
+	wr_regl(port, S3C2410_UTRSTAT, APPLE_S5L_UTRSTAT_ALL_FLAGS);
+
+	free_irq(port->irq, ourport);
+
+	ourport->tx_enabled = 0;
+	ourport->tx_mode = 0;
+	ourport->rx_enabled = 0;
+
+	if (ourport->dma)
+		s3c24xx_serial_release_dma(ourport);
+
+	ourport->tx_in_progress = 0;
+}
+
 static int s3c24xx_serial_startup(struct uart_port *port)
 {
 	struct s3c24xx_uart_port *ourport = to_ourport(port);
@@ -1241,6 +1316,45 @@ static int s3c64xx_serial_startup(struct uart_port *port)
 	return ret;
 }
 
+static int apple_s5l_serial_startup(struct uart_port *port)
+{
+	struct s3c24xx_uart_port *ourport = to_ourport(port);
+	unsigned long flags;
+	unsigned int ufcon;
+	int ret;
+
+	wr_regl(port, S3C2410_UTRSTAT, APPLE_S5L_UTRSTAT_ALL_FLAGS);
+
+	ret = request_irq(port->irq, apple_serial_handle_irq, 0,
+			  s3c24xx_serial_portname(port), ourport);
+	if (ret) {
+		dev_err(port->dev, "cannot get irq %d\n", port->irq);
+		return ret;
+	}
+
+	/* For compatibility with s3c24xx Soc's */
+	ourport->rx_enabled = 1;
+	ourport->tx_enabled = 0;
+
+	spin_lock_irqsave(&port->lock, flags);
+
+	ufcon = rd_regl(port, S3C2410_UFCON);
+	ufcon |= S3C2410_UFCON_RESETRX | S5PV210_UFCON_RXTRIG8;
+	if (!uart_console(port))
+		ufcon |= S3C2410_UFCON_RESETTX;
+	wr_regl(port, S3C2410_UFCON, ufcon);
+
+	enable_rx_pio(ourport);
+
+	spin_unlock_irqrestore(&port->lock, flags);
+
+	/* Enable Rx Interrupt */
+	s3c24xx_set_bit(port, APPLE_S5L_UCON_RXTHRESH_ENA, S3C2410_UCON);
+	s3c24xx_set_bit(port, APPLE_S5L_UCON_RXTO_ENA, S3C2410_UCON);
+
+	return ret;
+}
+
 /* power power management control */
 
 static void s3c24xx_serial_pm(struct uart_port *port, unsigned int level,
@@ -1568,6 +1682,8 @@ static const char *s3c24xx_serial_type(struct uart_port *port)
 		return "S3C24XX";
 	case TYPE_S3C6400:
 		return "S3C6400/10";
+	case TYPE_APPLE_S5L:
+		return "APPLE S5L";
 	default:
 		return NULL;
 	}
@@ -1659,6 +1775,27 @@ static const struct uart_ops s3c64xx_serial_ops = {
 #endif
 };
 
+static const struct uart_ops apple_s5l_serial_ops = {
+	.pm		= s3c24xx_serial_pm,
+	.tx_empty	= s3c24xx_serial_tx_empty,
+	.get_mctrl	= s3c24xx_serial_get_mctrl,
+	.set_mctrl	= s3c24xx_serial_set_mctrl,
+	.stop_tx	= s3c24xx_serial_stop_tx,
+	.start_tx	= s3c24xx_serial_start_tx,
+	.stop_rx	= s3c24xx_serial_stop_rx,
+	.break_ctl	= s3c24xx_serial_break_ctl,
+	.startup	= apple_s5l_serial_startup,
+	.shutdown	= apple_s5l_serial_shutdown,
+	.set_termios	= s3c24xx_serial_set_termios,
+	.type		= s3c24xx_serial_type,
+	.config_port	= s3c24xx_serial_config_port,
+	.verify_port	= s3c24xx_serial_verify_port,
+#if defined(CONFIG_SERIAL_SAMSUNG_CONSOLE) && defined(CONFIG_CONSOLE_POLL)
+	.poll_get_char = s3c24xx_serial_get_poll_char,
+	.poll_put_char = s3c24xx_serial_put_poll_char,
+#endif
+};
+
 static struct uart_driver s3c24xx_uart_drv = {
 	.owner		= THIS_MODULE,
 	.driver_name	= "s3c2410_serial",
@@ -1975,6 +2112,18 @@ static int s3c24xx_serial_init_port(struct s3c24xx_uart_port *ourport,
 		wr_regl(port, S3C64XX_UINTP, 0xf);
 		wr_regl(port, S3C64XX_UINTSP, 0xf);
 		break;
+	case TYPE_APPLE_S5L: {
+		unsigned int ucon;
+
+		ucon = rd_regl(port, S3C2410_UCON);
+		ucon &= ~(APPLE_S5L_UCON_TXTHRESH_ENA_MSK |
+			APPLE_S5L_UCON_RXTHRESH_ENA_MSK |
+			APPLE_S5L_UCON_RXTO_ENA_MSK);
+		wr_regl(port, S3C2410_UCON, ucon);
+
+		wr_regl(port, S3C2410_UTRSTAT, APPLE_S5L_UTRSTAT_ALL_FLAGS);
+		break;
+	}
 	default:
 		break;
 	}
@@ -2054,6 +2203,9 @@ static int s3c24xx_serial_probe(struct platform_device *pdev)
 	case TYPE_S3C6400:
 		ourport->port.ops = &s3c64xx_serial_ops;
 		break;
+	case TYPE_APPLE_S5L:
+		ourport->port.ops = &apple_s5l_serial_ops;
+		break;
 	}
 
 	if (np) {
@@ -2196,6 +2348,43 @@ static int s3c24xx_serial_resume_noirq(struct device *dev)
 			clk_disable_unprepare(ourport->clk);
 			break;
 		}
+		case TYPE_APPLE_S5L: {
+			unsigned int ucon;
+			int ret;
+
+			ret = clk_prepare_enable(ourport->clk);
+			if (ret) {
+				dev_err(dev, "clk_enable clk failed: %d\n", ret);
+				return ret;
+			}
+			if (!IS_ERR(ourport->baudclk)) {
+				ret = clk_prepare_enable(ourport->baudclk);
+				if (ret) {
+					dev_err(dev, "clk_enable baudclk failed: %d\n", ret);
+					clk_disable_unprepare(ourport->clk);
+					return ret;
+				}
+			}
+
+			ucon = rd_regl(port, S3C2410_UCON);
+
+			ucon &= ~(APPLE_S5L_UCON_TXTHRESH_ENA_MSK |
+				  APPLE_S5L_UCON_RXTHRESH_ENA_MSK |
+				  APPLE_S5L_UCON_RXTO_ENA_MSK);
+
+			if (ourport->tx_enabled)
+				ucon |= APPLE_S5L_UCON_TXTHRESH_ENA_MSK;
+			if (ourport->rx_enabled)
+				ucon |= APPLE_S5L_UCON_RXTHRESH_ENA_MSK |
+					APPLE_S5L_UCON_RXTO_ENA_MSK;
+
+			wr_regl(port, S3C2410_UCON, ucon);
+
+			if (!IS_ERR(ourport->baudclk))
+				clk_disable_unprepare(ourport->baudclk);
+			clk_disable_unprepare(ourport->clk);
+			break;
+		}
 		default:
 			break;
 		}
@@ -2605,6 +2794,34 @@ static struct s3c24xx_serial_drv_data exynos5433_serial_drv_data = {
 #define EXYNOS5433_SERIAL_DRV_DATA (kernel_ulong_t)NULL
 #endif
 
+#ifdef CONFIG_ARCH_APPLE
+static struct s3c24xx_serial_drv_data s5l_serial_drv_data = {
+	.info = &(struct s3c24xx_uart_info) {
+		.name		= "Apple S5L UART",
+		.type		= TYPE_APPLE_S5L,
+		.port_type	= PORT_8250,
+		.fifosize	= 16,
+		.rx_fifomask	= S3C2410_UFSTAT_RXMASK,
+		.rx_fifoshift	= S3C2410_UFSTAT_RXSHIFT,
+		.rx_fifofull	= S3C2410_UFSTAT_RXFULL,
+		.tx_fifofull	= S3C2410_UFSTAT_TXFULL,
+		.tx_fifomask	= S3C2410_UFSTAT_TXMASK,
+		.tx_fifoshift	= S3C2410_UFSTAT_TXSHIFT,
+		.def_clk_sel	= S3C2410_UCON_CLKSEL0,
+		.num_clks	= 1,
+		.clksel_mask	= 0,
+		.clksel_shift	= 0,
+	},
+	.def_cfg = &(struct s3c2410_uartcfg) {
+		.ucon		= APPLE_S5L_UCON_DEFAULT,
+		.ufcon		= S3C2410_UFCON_DEFAULT,
+	},
+};
+#define S5L_SERIAL_DRV_DATA ((kernel_ulong_t)&s5l_serial_drv_data)
+#else
+#define S5L_SERIAL_DRV_DATA ((kernel_ulong_t)NULL)
+#endif
+
 static const struct platform_device_id s3c24xx_serial_driver_ids[] = {
 	{
 		.name		= "s3c2410-uart",
@@ -2627,6 +2844,9 @@ static const struct platform_device_id s3c24xx_serial_driver_ids[] = {
 	}, {
 		.name		= "exynos5433-uart",
 		.driver_data	= EXYNOS5433_SERIAL_DRV_DATA,
+	}, {
+		.name		= "s5l-uart",
+		.driver_data	= S5L_SERIAL_DRV_DATA,
 	},
 	{ },
 };
@@ -2648,6 +2868,8 @@ static const struct of_device_id s3c24xx_uart_dt_match[] = {
 		.data = (void *)EXYNOS4210_SERIAL_DRV_DATA },
 	{ .compatible = "samsung,exynos5433-uart",
 		.data = (void *)EXYNOS5433_SERIAL_DRV_DATA },
+	{ .compatible = "apple,s5l-uart",
+		.data = (void *)S5L_SERIAL_DRV_DATA },
 	{},
 };
 MODULE_DEVICE_TABLE(of, s3c24xx_uart_dt_match);
diff --git a/include/linux/serial_s3c.h b/include/linux/serial_s3c.h
index ca2c5393dc6b..f6c3323fc4c5 100644
--- a/include/linux/serial_s3c.h
+++ b/include/linux/serial_s3c.h
@@ -246,6 +246,22 @@
 				 S5PV210_UFCON_TXTRIG4 |	\
 				 S5PV210_UFCON_RXTRIG4)
 
+#define APPLE_S5L_UCON_RXTO_ENA		9
+#define APPLE_S5L_UCON_RXTHRESH_ENA	12
+#define APPLE_S5L_UCON_TXTHRESH_ENA	13
+#define APPLE_S5L_UCON_RXTO_ENA_MSK	(1 << APPLE_S5L_UCON_RXTO_ENA)
+#define APPLE_S5L_UCON_RXTHRESH_ENA_MSK	(1 << APPLE_S5L_UCON_RXTHRESH_ENA)
+#define APPLE_S5L_UCON_TXTHRESH_ENA_MSK	(1 << APPLE_S5L_UCON_TXTHRESH_ENA)
+
+#define APPLE_S5L_UCON_DEFAULT		(S3C2410_UCON_TXIRQMODE | \
+					 S3C2410_UCON_RXIRQMODE | \
+					 S3C2410_UCON_RXFIFO_TOI)
+
+#define APPLE_S5L_UTRSTAT_RXTHRESH	(1<<4)
+#define APPLE_S5L_UTRSTAT_TXTHRESH	(1<<5)
+#define APPLE_S5L_UTRSTAT_RXTO		(1<<9)
+#define APPLE_S5L_UTRSTAT_ALL_FLAGS	(0x3f0)
+
 #ifndef __ASSEMBLY__
 
 #include <linux/serial_core.h>
-- 
cgit v1.2.3


From 92d1e87e627a6e5c9c5111adb95b6eae8df124c7 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Fri, 26 Feb 2021 11:23:56 +0100
Subject: USB: remove usb_bus_type from usb.h

We have 2 forward declarations of usb_bus_type, one in the system-wide
usb.h and the other in the "USB core only header file".  This variable
is not exported from the USB core, so remove the declaration from usb.h
as it does not need to be there.

Cc: Thomas Zimmermann <tzimmermann@suse.de>
Cc: linux-usb@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Link: https://lore.kernel.org/r/20210226102356.716746-1-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/usb.h b/include/linux/usb.h
index 7d72c4e0713c..f15fd0fa95bd 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -1257,8 +1257,6 @@ struct usb_device_driver {
 #define	to_usb_device_driver(d) container_of(d, struct usb_device_driver, \
 		drvwrap.driver)
 
-extern struct bus_type usb_bus_type;
-
 /**
  * struct usb_class_driver - identifies a USB driver that wants to use the USB major number
  * @name: the usb class device name for this driver.  Will show up in sysfs.
-- 
cgit v1.2.3


From fb95c7cf5600b7b74412f27dfb39a1e13fd8a90d Mon Sep 17 00:00:00 2001
From: Chunfeng Yun <chunfeng.yun@mediatek.com>
Date: Mon, 8 Mar 2021 10:52:05 +0800
Subject: usb: common: add function to get interval expressed in us unit

Add a new function to convert bInterval into the time expressed
in 1us unit.

Signed-off-by: Chunfeng Yun <chunfeng.yun@mediatek.com>
Link: https://lore.kernel.org/r/25c8a09b055f716c1e5bf11fea72c3418f844482.1615170625.git.chunfeng.yun@mediatek.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/common/common.c | 41 +++++++++++++++++++++++++++++++++++++++++
 drivers/usb/core/devices.c  | 21 ++++-----------------
 drivers/usb/core/endpoint.c | 35 ++++-------------------------------
 include/linux/usb/ch9.h     |  3 +++
 4 files changed, 52 insertions(+), 48 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/common/common.c b/drivers/usb/common/common.c
index fc21cf2d36f6..675e8a4e683a 100644
--- a/drivers/usb/common/common.c
+++ b/drivers/usb/common/common.c
@@ -165,6 +165,47 @@ enum usb_dr_mode usb_get_dr_mode(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(usb_get_dr_mode);
 
+/**
+ * usb_decode_interval - Decode bInterval into the time expressed in 1us unit
+ * @epd: The descriptor of the endpoint
+ * @speed: The speed that the endpoint works as
+ *
+ * Function returns the interval expressed in 1us unit for servicing
+ * endpoint for data transfers.
+ */
+unsigned int usb_decode_interval(const struct usb_endpoint_descriptor *epd,
+				 enum usb_device_speed speed)
+{
+	unsigned int interval = 0;
+
+	switch (usb_endpoint_type(epd)) {
+	case USB_ENDPOINT_XFER_CONTROL:
+		/* uframes per NAK */
+		if (speed == USB_SPEED_HIGH)
+			interval = epd->bInterval;
+		break;
+	case USB_ENDPOINT_XFER_ISOC:
+		interval = 1 << (epd->bInterval - 1);
+		break;
+	case USB_ENDPOINT_XFER_BULK:
+		/* uframes per NAK */
+		if (speed == USB_SPEED_HIGH && usb_endpoint_dir_out(epd))
+			interval = epd->bInterval;
+		break;
+	case USB_ENDPOINT_XFER_INT:
+		if (speed >= USB_SPEED_HIGH)
+			interval = 1 << (epd->bInterval - 1);
+		else
+			interval = epd->bInterval;
+		break;
+	}
+
+	interval *= (speed >= USB_SPEED_HIGH) ? 125 : 1000;
+
+	return interval;
+}
+EXPORT_SYMBOL_GPL(usb_decode_interval);
+
 #ifdef CONFIG_OF
 /**
  * of_usb_get_dr_mode_by_phy - Get dual role mode for the controller device
diff --git a/drivers/usb/core/devices.c b/drivers/usb/core/devices.c
index 1ef2de6e375a..d8b0041de612 100644
--- a/drivers/usb/core/devices.c
+++ b/drivers/usb/core/devices.c
@@ -157,38 +157,25 @@ static char *usb_dump_endpoint_descriptor(int speed, char *start, char *end,
 	switch (usb_endpoint_type(desc)) {
 	case USB_ENDPOINT_XFER_CONTROL:
 		type = "Ctrl";
-		if (speed == USB_SPEED_HIGH)	/* uframes per NAK */
-			interval = desc->bInterval;
-		else
-			interval = 0;
 		dir = 'B';			/* ctrl is bidirectional */
 		break;
 	case USB_ENDPOINT_XFER_ISOC:
 		type = "Isoc";
-		interval = 1 << (desc->bInterval - 1);
 		break;
 	case USB_ENDPOINT_XFER_BULK:
 		type = "Bulk";
-		if (speed == USB_SPEED_HIGH && dir == 'O') /* uframes per NAK */
-			interval = desc->bInterval;
-		else
-			interval = 0;
 		break;
 	case USB_ENDPOINT_XFER_INT:
 		type = "Int.";
-		if (speed == USB_SPEED_HIGH || speed >= USB_SPEED_SUPER)
-			interval = 1 << (desc->bInterval - 1);
-		else
-			interval = desc->bInterval;
 		break;
 	default:	/* "can't happen" */
 		return start;
 	}
-	interval *= (speed == USB_SPEED_HIGH ||
-		     speed >= USB_SPEED_SUPER) ? 125 : 1000;
-	if (interval % 1000)
+
+	interval = usb_decode_interval(desc, speed);
+	if (interval % 1000) {
 		unit = 'u';
-	else {
+	} else {
 		unit = 'm';
 		interval /= 1000;
 	}
diff --git a/drivers/usb/core/endpoint.c b/drivers/usb/core/endpoint.c
index 903426b6d305..a2530811cf7d 100644
--- a/drivers/usb/core/endpoint.c
+++ b/drivers/usb/core/endpoint.c
@@ -84,40 +84,13 @@ static ssize_t interval_show(struct device *dev, struct device_attribute *attr,
 			     char *buf)
 {
 	struct ep_device *ep = to_ep_device(dev);
+	unsigned int interval;
 	char unit;
-	unsigned interval = 0;
-	unsigned in;
 
-	in = (ep->desc->bEndpointAddress & USB_DIR_IN);
-
-	switch (usb_endpoint_type(ep->desc)) {
-	case USB_ENDPOINT_XFER_CONTROL:
-		if (ep->udev->speed == USB_SPEED_HIGH)
-			/* uframes per NAK */
-			interval = ep->desc->bInterval;
-		break;
-
-	case USB_ENDPOINT_XFER_ISOC:
-		interval = 1 << (ep->desc->bInterval - 1);
-		break;
-
-	case USB_ENDPOINT_XFER_BULK:
-		if (ep->udev->speed == USB_SPEED_HIGH && !in)
-			/* uframes per NAK */
-			interval = ep->desc->bInterval;
-		break;
-
-	case USB_ENDPOINT_XFER_INT:
-		if (ep->udev->speed == USB_SPEED_HIGH)
-			interval = 1 << (ep->desc->bInterval - 1);
-		else
-			interval = ep->desc->bInterval;
-		break;
-	}
-	interval *= (ep->udev->speed == USB_SPEED_HIGH) ? 125 : 1000;
-	if (interval % 1000)
+	interval = usb_decode_interval(ep->desc, ep->udev->speed);
+	if (interval % 1000) {
 		unit = 'u';
-	else {
+	} else {
 		unit = 'm';
 		interval /= 1000;
 	}
diff --git a/include/linux/usb/ch9.h b/include/linux/usb/ch9.h
index abdd310c77f0..74debc824645 100644
--- a/include/linux/usb/ch9.h
+++ b/include/linux/usb/ch9.h
@@ -90,6 +90,9 @@ extern enum usb_ssp_rate usb_get_maximum_ssp_rate(struct device *dev);
  */
 extern const char *usb_state_string(enum usb_device_state state);
 
+unsigned int usb_decode_interval(const struct usb_endpoint_descriptor *epd,
+				 enum usb_device_speed speed);
+
 #ifdef CONFIG_TRACING
 /**
  * usb_decode_ctrl - Returns human readable representation of control request.
-- 
cgit v1.2.3


From 365038f24b3e9d2b7c9e499f03f432040e28a35c Mon Sep 17 00:00:00 2001
From: Chunfeng Yun <chunfeng.yun@mediatek.com>
Date: Mon, 8 Mar 2021 10:52:07 +0800
Subject: usb: common: move function's kerneldoc next to its definition

Following a general rule, add the kerneldoc for a function next
to its definition, but not next to its declaration in a header
file.

Suggested-by: Alan Stern <stern@rowland.harvard.edu>
Suggested-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Chunfeng Yun <chunfeng.yun@mediatek.com>
Link: https://lore.kernel.org/r/c4d2e010ae2bf67cdfa0b55e6d1deb9339d9d3dc.1615170625.git.chunfeng.yun@mediatek.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/common/common.c | 35 ++++++++++++++++++++++++++
 drivers/usb/common/debug.c  | 22 ++++++++++++++--
 include/linux/usb/ch9.h     | 61 ---------------------------------------------
 3 files changed, 55 insertions(+), 63 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/common/common.c b/drivers/usb/common/common.c
index 675e8a4e683a..347fb3d3894a 100644
--- a/drivers/usb/common/common.c
+++ b/drivers/usb/common/common.c
@@ -25,6 +25,12 @@ static const char *const ep_type_names[] = {
 	[USB_ENDPOINT_XFER_INT] = "intr",
 };
 
+/**
+ * usb_ep_type_string() - Returns human readable-name of the endpoint type.
+ * @ep_type: The endpoint type to return human-readable name for.  If it's not
+ *   any of the types: USB_ENDPOINT_XFER_{CONTROL, ISOC, BULK, INT},
+ *   usually got by usb_endpoint_type(), the string 'unknown' will be returned.
+ */
 const char *usb_ep_type_string(int ep_type)
 {
 	if (ep_type < 0 || ep_type >= ARRAY_SIZE(ep_type_names))
@@ -76,6 +82,12 @@ static const char *const ssp_rate[] = {
 	[USB_SSP_GEN_2x2] = "super-speed-plus-gen2x2",
 };
 
+/**
+ * usb_speed_string() - Returns human readable-name of the speed.
+ * @speed: The speed to return human-readable name for.  If it's not
+ *   any of the speeds defined in usb_device_speed enum, string for
+ *   USB_SPEED_UNKNOWN will be returned.
+ */
 const char *usb_speed_string(enum usb_device_speed speed)
 {
 	if (speed < 0 || speed >= ARRAY_SIZE(speed_names))
@@ -84,6 +96,14 @@ const char *usb_speed_string(enum usb_device_speed speed)
 }
 EXPORT_SYMBOL_GPL(usb_speed_string);
 
+/**
+ * usb_get_maximum_speed - Get maximum requested speed for a given USB
+ * controller.
+ * @dev: Pointer to the given USB controller device
+ *
+ * The function gets the maximum speed string from property "maximum-speed",
+ * and returns the corresponding enum usb_device_speed.
+ */
 enum usb_device_speed usb_get_maximum_speed(struct device *dev)
 {
 	const char *maximum_speed;
@@ -102,6 +122,15 @@ enum usb_device_speed usb_get_maximum_speed(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(usb_get_maximum_speed);
 
+/**
+ * usb_get_maximum_ssp_rate - Get the signaling rate generation and lane count
+ *	of a SuperSpeed Plus capable device.
+ * @dev: Pointer to the given USB controller device
+ *
+ * If the string from "maximum-speed" property is super-speed-plus-genXxY where
+ * 'X' is the generation number and 'Y' is the number of lanes, then this
+ * function returns the corresponding enum usb_ssp_rate.
+ */
 enum usb_ssp_rate usb_get_maximum_ssp_rate(struct device *dev)
 {
 	const char *maximum_speed;
@@ -116,6 +145,12 @@ enum usb_ssp_rate usb_get_maximum_ssp_rate(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(usb_get_maximum_ssp_rate);
 
+/**
+ * usb_state_string - Returns human readable name for the state.
+ * @state: The state to return a human-readable name for. If it's not
+ *	any of the states devices in usb_device_state_string enum,
+ *	the string UNKNOWN will be returned.
+ */
 const char *usb_state_string(enum usb_device_state state)
 {
 	static const char *const names[] = {
diff --git a/drivers/usb/common/debug.c b/drivers/usb/common/debug.c
index ba849c7bc5c7..a76a086b9c54 100644
--- a/drivers/usb/common/debug.c
+++ b/drivers/usb/common/debug.c
@@ -207,8 +207,26 @@ static void usb_decode_set_isoch_delay(__u8 wValue, char *str, size_t size)
 	snprintf(str, size, "Set Isochronous Delay(Delay = %d ns)", wValue);
 }
 
-/*
- * usb_decode_ctrl - returns a string representation of ctrl request
+/**
+ * usb_decode_ctrl - Returns human readable representation of control request.
+ * @str: buffer to return a human-readable representation of control request.
+ *       This buffer should have about 200 bytes.
+ * @size: size of str buffer.
+ * @bRequestType: matches the USB bmRequestType field
+ * @bRequest: matches the USB bRequest field
+ * @wValue: matches the USB wValue field (CPU byte order)
+ * @wIndex: matches the USB wIndex field (CPU byte order)
+ * @wLength: matches the USB wLength field (CPU byte order)
+ *
+ * Function returns decoded, formatted and human-readable description of
+ * control request packet.
+ *
+ * The usage scenario for this is for tracepoints, so function as a return
+ * use the same value as in parameters. This approach allows to use this
+ * function in TP_printk
+ *
+ * Important: wValue, wIndex, wLength parameters before invoking this function
+ * should be processed by le16_to_cpu macro.
  */
 const char *usb_decode_ctrl(char *str, size_t size, __u8 bRequestType,
 			    __u8 bRequest, __u16 wValue, __u16 wIndex,
diff --git a/include/linux/usb/ch9.h b/include/linux/usb/ch9.h
index 74debc824645..1cffa34740b0 100644
--- a/include/linux/usb/ch9.h
+++ b/include/linux/usb/ch9.h
@@ -45,76 +45,15 @@ enum usb_ssp_rate {
 	USB_SSP_GEN_2x2,
 };
 
-/**
- * usb_ep_type_string() - Returns human readable-name of the endpoint type.
- * @ep_type: The endpoint type to return human-readable name for.  If it's not
- *   any of the types: USB_ENDPOINT_XFER_{CONTROL, ISOC, BULK, INT},
- *   usually got by usb_endpoint_type(), the string 'unknown' will be returned.
- */
 extern const char *usb_ep_type_string(int ep_type);
-
-/**
- * usb_speed_string() - Returns human readable-name of the speed.
- * @speed: The speed to return human-readable name for.  If it's not
- *   any of the speeds defined in usb_device_speed enum, string for
- *   USB_SPEED_UNKNOWN will be returned.
- */
 extern const char *usb_speed_string(enum usb_device_speed speed);
-
-/**
- * usb_get_maximum_speed - Get maximum requested speed for a given USB
- * controller.
- * @dev: Pointer to the given USB controller device
- *
- * The function gets the maximum speed string from property "maximum-speed",
- * and returns the corresponding enum usb_device_speed.
- */
 extern enum usb_device_speed usb_get_maximum_speed(struct device *dev);
-
-/**
- * usb_get_maximum_ssp_rate - Get the signaling rate generation and lane count
- *	of a SuperSpeed Plus capable device.
- * @dev: Pointer to the given USB controller device
- *
- * If the string from "maximum-speed" property is super-speed-plus-genXxY where
- * 'X' is the generation number and 'Y' is the number of lanes, then this
- * function returns the corresponding enum usb_ssp_rate.
- */
 extern enum usb_ssp_rate usb_get_maximum_ssp_rate(struct device *dev);
-
-/**
- * usb_state_string - Returns human readable name for the state.
- * @state: The state to return a human-readable name for. If it's not
- *	any of the states devices in usb_device_state_string enum,
- *	the string UNKNOWN will be returned.
- */
 extern const char *usb_state_string(enum usb_device_state state);
-
 unsigned int usb_decode_interval(const struct usb_endpoint_descriptor *epd,
 				 enum usb_device_speed speed);
 
 #ifdef CONFIG_TRACING
-/**
- * usb_decode_ctrl - Returns human readable representation of control request.
- * @str: buffer to return a human-readable representation of control request.
- *       This buffer should have about 200 bytes.
- * @size: size of str buffer.
- * @bRequestType: matches the USB bmRequestType field
- * @bRequest: matches the USB bRequest field
- * @wValue: matches the USB wValue field (CPU byte order)
- * @wIndex: matches the USB wIndex field (CPU byte order)
- * @wLength: matches the USB wLength field (CPU byte order)
- *
- * Function returns decoded, formatted and human-readable description of
- * control request packet.
- *
- * The usage scenario for this is for tracepoints, so function as a return
- * use the same value as in parameters. This approach allows to use this
- * function in TP_printk
- *
- * Important: wValue, wIndex, wLength parameters before invoking this function
- * should be processed by le16_to_cpu macro.
- */
 extern const char *usb_decode_ctrl(char *str, size_t size, __u8 bRequestType,
 				   __u8 bRequest, __u16 wValue, __u16 wIndex,
 				   __u16 wLength);
-- 
cgit v1.2.3


From 60a35ba9141f06b67150ce3544bc595d049b0d83 Mon Sep 17 00:00:00 2001
From: Fabio Estevam <festevam@gmail.com>
Date: Wed, 3 Mar 2021 23:32:20 -0300
Subject: usb: usb-mx2: Remove unused file

i.MX21 support has been dropped, so remove this unused file.

Signed-off-by: Fabio Estevam <festevam@gmail.com>
Link: https://lore.kernel.org/r/20210304023220.2362407-1-festevam@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/platform_data/usb-mx2.h | 29 -----------------------------
 1 file changed, 29 deletions(-)
 delete mode 100644 include/linux/platform_data/usb-mx2.h

(limited to 'include/linux')

diff --git a/include/linux/platform_data/usb-mx2.h b/include/linux/platform_data/usb-mx2.h
deleted file mode 100644
index 97a670f3d8fb..000000000000
--- a/include/linux/platform_data/usb-mx2.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- *	Copyright (C) 2009 Martin Fuzzey <mfuzzey@gmail.com>
- */
-
-#ifndef __ASM_ARCH_MX21_USBH
-#define __ASM_ARCH_MX21_USBH
-
-enum mx21_usbh_xcvr {
-	/* Values below as used by hardware (HWMODE register) */
-	MX21_USBXCVR_TXDIF_RXDIF = 0,
-	MX21_USBXCVR_TXDIF_RXSE = 1,
-	MX21_USBXCVR_TXSE_RXDIF = 2,
-	MX21_USBXCVR_TXSE_RXSE = 3,
-};
-
-struct mx21_usbh_platform_data {
-	enum mx21_usbh_xcvr host_xcvr; /* tranceiver mode host 1,2 ports */
-	enum mx21_usbh_xcvr otg_xcvr; /* tranceiver mode otg (as host) port */
-	u16 	enable_host1:1,
-		enable_host2:1,
-		enable_otg_host:1, /* enable "OTG" port (as host) */
-		host1_xcverless:1, /* traceiverless host1 port */
-		host1_txenoe:1, /* output enable host1 transmit enable */
-		otg_ext_xcvr:1, /* external tranceiver for OTG port */
-		unused:10;
-};
-
-#endif /* __ASM_ARCH_MX21_USBH */
-- 
cgit v1.2.3


From d5b0e0677bfd5efd17c5bbb00156931f0d41cb85 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 8 Mar 2021 09:38:12 +0100
Subject: u64_stats,lockdep: Fix u64_stats_init() vs lockdep

Jakub reported that:

    static struct net_device *rtl8139_init_board(struct pci_dev *pdev)
    {
	    ...
	    u64_stats_init(&tp->rx_stats.syncp);
	    u64_stats_init(&tp->tx_stats.syncp);
	    ...
    }

results in lockdep getting confused between the RX and TX stats lock.
This is because u64_stats_init() is an inline calling seqcount_init(),
which is a macro using a static variable to generate a lockdep class.

By wrapping that in an inline, we negate the effect of the macro and
fold the static key variable, hence the confusion.

Fix by also making u64_stats_init() a macro for the case where it
matters, leaving the other case an inline for argument validation
etc.

Reported-by: Jakub Kicinski <kuba@kernel.org>
Debugged-by: "Ahmed S. Darwish" <a.darwish@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: "Erhard F." <erhard_f@mailbox.org>
Link: https://lkml.kernel.org/r/YEXicy6+9MksdLZh@hirez.programming.kicks-ass.net
---
 include/linux/u64_stats_sync.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h
index c6abb79501b3..e81856c0ba13 100644
--- a/include/linux/u64_stats_sync.h
+++ b/include/linux/u64_stats_sync.h
@@ -115,12 +115,13 @@ static inline void u64_stats_inc(u64_stats_t *p)
 }
 #endif
 
+#if BITS_PER_LONG == 32 && defined(CONFIG_SMP)
+#define u64_stats_init(syncp)	seqcount_init(&(syncp)->seq)
+#else
 static inline void u64_stats_init(struct u64_stats_sync *syncp)
 {
-#if BITS_PER_LONG == 32 && defined(CONFIG_SMP)
-	seqcount_init(&syncp->seq);
-#endif
 }
+#endif
 
 static inline void u64_stats_update_begin(struct u64_stats_sync *syncp)
 {
-- 
cgit v1.2.3


From 4817a52b306136c8b2b2271d8770401441e4cf79 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 9 Mar 2021 15:21:18 +0100
Subject: seqlock,lockdep: Fix seqcount_latch_init()

seqcount_init() must be a macro in order to preserve the static
variable that is used for the lockdep key. Don't then wrap it in an
inline function, which destroys that.

Luckily there aren't many users of this function, but fix it before it
becomes a problem.

Fixes: 80793c3471d9 ("seqlock: Introduce seqcount_latch_t")
Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/YEeFEbNUVkZaXDp4@hirez.programming.kicks-ass.net
---
 include/linux/seqlock.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 2f7bb92b4c9e..f61e34fbaaea 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -664,10 +664,7 @@ typedef struct {
  * seqcount_latch_init() - runtime initializer for seqcount_latch_t
  * @s: Pointer to the seqcount_latch_t instance
  */
-static inline void seqcount_latch_init(seqcount_latch_t *s)
-{
-	seqcount_init(&s->seqcount);
-}
+#define seqcount_latch_init(s) seqcount_init(&(s)->seqcount)
 
 /**
  * raw_read_seqcount_latch() - pick even/odd latch data copy
-- 
cgit v1.2.3


From d15dfd31384ba3cb93150e5f87661a76fa419f74 Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Tue, 9 Mar 2021 12:26:01 +0000
Subject: arm64: mte: Map hotplugged memory as Normal Tagged

In a system supporting MTE, the linear map must allow reading/writing
allocation tags by setting the memory type as Normal Tagged. Currently,
this is only handled for memory present at boot. Hotplugged memory uses
Normal non-Tagged memory.

Introduce pgprot_mhp() for hotplugged memory and use it in
add_memory_resource(). The arm64 code maps pgprot_mhp() to
pgprot_tagged().

Note that ZONE_DEVICE memory should not be mapped as Tagged and
therefore setting the memory type in arch_add_memory() is not feasible.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Fixes: 0178dc761368 ("arm64: mte: Use Normal Tagged attributes for the linear map")
Reported-by: Patrick Daly <pdaly@codeaurora.org>
Tested-by: Patrick Daly <pdaly@codeaurora.org>
Link: https://lore.kernel.org/r/1614745263-27827-1-git-send-email-pdaly@codeaurora.org
Cc: <stable@vger.kernel.org> # 5.10.x
Cc: Will Deacon <will@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Link: https://lore.kernel.org/r/20210309122601.5543-1-catalin.marinas@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/include/asm/pgtable-prot.h | 1 -
 arch/arm64/include/asm/pgtable.h      | 3 +++
 arch/arm64/mm/mmu.c                   | 3 ++-
 include/linux/pgtable.h               | 4 ++++
 mm/memory_hotplug.c                   | 2 +-
 5 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h
index 046be789fbb4..9a65fb528110 100644
--- a/arch/arm64/include/asm/pgtable-prot.h
+++ b/arch/arm64/include/asm/pgtable-prot.h
@@ -66,7 +66,6 @@ extern bool arm64_use_ng_mappings;
 #define _PAGE_DEFAULT		(_PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL))
 
 #define PAGE_KERNEL		__pgprot(PROT_NORMAL)
-#define PAGE_KERNEL_TAGGED	__pgprot(PROT_NORMAL_TAGGED)
 #define PAGE_KERNEL_RO		__pgprot((PROT_NORMAL & ~PTE_WRITE) | PTE_RDONLY)
 #define PAGE_KERNEL_ROX		__pgprot((PROT_NORMAL & ~(PTE_WRITE | PTE_PXN)) | PTE_RDONLY)
 #define PAGE_KERNEL_EXEC	__pgprot(PROT_NORMAL & ~PTE_PXN)
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index e17b96d0e4b5..47027796c2f9 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -486,6 +486,9 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd)
 	__pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_NORMAL_NC) | PTE_PXN | PTE_UXN)
 #define pgprot_device(prot) \
 	__pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_DEVICE_nGnRE) | PTE_PXN | PTE_UXN)
+#define pgprot_tagged(prot) \
+	__pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_NORMAL_TAGGED))
+#define pgprot_mhp	pgprot_tagged
 /*
  * DMA allocations for non-coherent devices use what the Arm architecture calls
  * "Normal non-cacheable" memory, which permits speculation, unaligned accesses
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 3802cfbdd20d..9c8aa1b44cd5 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -512,7 +512,8 @@ static void __init map_mem(pgd_t *pgdp)
 		 * if MTE is present. Otherwise, it has the same attributes as
 		 * PAGE_KERNEL.
 		 */
-		__map_memblock(pgdp, start, end, PAGE_KERNEL_TAGGED, flags);
+		__map_memblock(pgdp, start, end, pgprot_tagged(PAGE_KERNEL),
+			       flags);
 	}
 
 	/*
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index cdfc4e9f253e..5e772392a379 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -904,6 +904,10 @@ static inline void ptep_modify_prot_commit(struct vm_area_struct *vma,
 #define pgprot_device pgprot_noncached
 #endif
 
+#ifndef pgprot_mhp
+#define pgprot_mhp(prot)	(prot)
+#endif
+
 #ifdef CONFIG_MMU
 #ifndef pgprot_modify
 #define pgprot_modify pgprot_modify
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 5ba51a8bdaeb..0cdbbfbc5757 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1072,7 +1072,7 @@ static int online_memory_block(struct memory_block *mem, void *arg)
  */
 int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 {
-	struct mhp_params params = { .pgprot = PAGE_KERNEL };
+	struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
 	u64 start, size;
 	bool new_node = false;
 	int ret;
-- 
cgit v1.2.3


From b1b3ced389795d2671e88dd3e9e07a48dc9632fc Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Date: Wed, 10 Mar 2021 10:06:05 +0200
Subject: mfd: Support ROHM BD9576MUF and BD9573MUF

Add core support for ROHM BD9576MUF and BD9573MUF PMICs which are
mainly used to power the R-Car series processors.

Signed-off-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/Kconfig              |  11 ++++
 drivers/mfd/Makefile             |   1 +
 drivers/mfd/rohm-bd9576.c        | 109 +++++++++++++++++++++++++++++++++++++++
 include/linux/mfd/rohm-bd957x.h  |  59 +++++++++++++++++++++
 include/linux/mfd/rohm-generic.h |   2 +
 5 files changed, 182 insertions(+)
 create mode 100644 drivers/mfd/rohm-bd9576.c
 create mode 100644 include/linux/mfd/rohm-bd957x.h

(limited to 'include/linux')

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index b74efa469e90..f0c9529e7bfd 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -1989,6 +1989,17 @@ config MFD_ROHM_BD71828
 	  Also included is a Coulomb counter, a real-time clock (RTC), and
 	  a 32.768 kHz clock gate.
 
+config MFD_ROHM_BD957XMUF
+	tristate "ROHM BD9576MUF and BD9573MUF Power Management ICs"
+	depends on I2C=y
+	depends on OF
+	select REGMAP_I2C
+	select MFD_CORE
+	help
+	  Select this option to get support for the ROHM BD9576MUF and
+	  BD9573MUF Power Management ICs. BD9576 and BD9573 are primarily
+	  designed to be used to power R-Car series processors.
+
 config MFD_STM32_LPTIMER
 	tristate "Support for STM32 Low-Power Timer"
 	depends on (ARCH_STM32 && OF) || COMPILE_TEST
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index 834f5463af28..d93f6f361fd3 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -261,6 +261,7 @@ obj-$(CONFIG_RAVE_SP_CORE)	+= rave-sp.o
 obj-$(CONFIG_MFD_ROHM_BD70528)	+= rohm-bd70528.o
 obj-$(CONFIG_MFD_ROHM_BD71828)	+= rohm-bd71828.o
 obj-$(CONFIG_MFD_ROHM_BD718XX)	+= rohm-bd718x7.o
+obj-$(CONFIG_MFD_ROHM_BD957XMUF)	+= rohm-bd9576.o
 obj-$(CONFIG_MFD_STMFX) 	+= stmfx.o
 obj-$(CONFIG_MFD_KHADAS_MCU) 	+= khadas-mcu.o
 obj-$(CONFIG_MFD_ACER_A500_EC)	+= acer-ec-a500.o
diff --git a/drivers/mfd/rohm-bd9576.c b/drivers/mfd/rohm-bd9576.c
new file mode 100644
index 000000000000..2dbda1f401e2
--- /dev/null
+++ b/drivers/mfd/rohm-bd9576.c
@@ -0,0 +1,109 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2021 ROHM Semiconductors
+ *
+ * ROHM BD9576MUF and BD9573MUF PMIC driver
+ */
+
+#include <linux/i2c.h>
+#include <linux/interrupt.h>
+#include <linux/ioport.h>
+#include <linux/irq.h>
+#include <linux/mfd/core.h>
+#include <linux/mfd/rohm-bd957x.h>
+#include <linux/mfd/rohm-generic.h>
+#include <linux/module.h>
+#include <linux/of_device.h>
+#include <linux/regmap.h>
+#include <linux/types.h>
+
+static struct mfd_cell bd9573_mfd_cells[] = {
+	{ .name = "bd9573-regulator", },
+	{ .name = "bd9576-wdt", },
+};
+
+static struct mfd_cell bd9576_mfd_cells[] = {
+	{ .name = "bd9576-regulator", },
+	{ .name = "bd9576-wdt", },
+};
+
+static const struct regmap_range volatile_ranges[] = {
+	regmap_reg_range(BD957X_REG_SMRB_ASSERT, BD957X_REG_SMRB_ASSERT),
+	regmap_reg_range(BD957X_REG_PMIC_INTERNAL_STAT,
+			 BD957X_REG_PMIC_INTERNAL_STAT),
+	regmap_reg_range(BD957X_REG_INT_THERM_STAT, BD957X_REG_INT_THERM_STAT),
+	regmap_reg_range(BD957X_REG_INT_OVP_STAT, BD957X_REG_INT_SYS_STAT),
+	regmap_reg_range(BD957X_REG_INT_MAIN_STAT, BD957X_REG_INT_MAIN_STAT),
+};
+
+static const struct regmap_access_table volatile_regs = {
+	.yes_ranges = &volatile_ranges[0],
+	.n_yes_ranges = ARRAY_SIZE(volatile_ranges),
+};
+
+static struct regmap_config bd957x_regmap = {
+	.reg_bits = 8,
+	.val_bits = 8,
+	.volatile_table = &volatile_regs,
+	.max_register = BD957X_MAX_REGISTER,
+	.cache_type = REGCACHE_RBTREE,
+};
+
+static int bd957x_i2c_probe(struct i2c_client *i2c,
+			     const struct i2c_device_id *id)
+{
+	int ret;
+	struct regmap *regmap;
+	struct mfd_cell *cells;
+	int num_cells;
+	unsigned long chip_type;
+
+	chip_type = (unsigned long)of_device_get_match_data(&i2c->dev);
+
+	switch (chip_type) {
+	case ROHM_CHIP_TYPE_BD9576:
+		cells = bd9576_mfd_cells;
+		num_cells = ARRAY_SIZE(bd9576_mfd_cells);
+		break;
+	case ROHM_CHIP_TYPE_BD9573:
+		cells = bd9573_mfd_cells;
+		num_cells = ARRAY_SIZE(bd9573_mfd_cells);
+		break;
+	default:
+		dev_err(&i2c->dev, "Unknown device type");
+		return -EINVAL;
+	}
+
+	regmap = devm_regmap_init_i2c(i2c, &bd957x_regmap);
+	if (IS_ERR(regmap)) {
+		dev_err(&i2c->dev, "Failed to initialize Regmap\n");
+		return PTR_ERR(regmap);
+	}
+
+	ret = devm_mfd_add_devices(&i2c->dev, PLATFORM_DEVID_AUTO, cells,
+				   num_cells, NULL, 0, NULL);
+	if (ret)
+		dev_err(&i2c->dev, "Failed to create subdevices\n");
+
+	return ret;
+}
+
+static const struct of_device_id bd957x_of_match[] = {
+	{ .compatible = "rohm,bd9576", .data = (void *)ROHM_CHIP_TYPE_BD9576, },
+	{ .compatible = "rohm,bd9573", .data = (void *)ROHM_CHIP_TYPE_BD9573, },
+	{ },
+};
+MODULE_DEVICE_TABLE(of, bd957x_of_match);
+
+static struct i2c_driver bd957x_drv = {
+	.driver = {
+		.name = "rohm-bd957x",
+		.of_match_table = bd957x_of_match,
+	},
+	.probe = &bd957x_i2c_probe,
+};
+module_i2c_driver(bd957x_drv);
+
+MODULE_AUTHOR("Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>");
+MODULE_DESCRIPTION("ROHM BD9576MUF and BD9573MUF Power Management IC driver");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/mfd/rohm-bd957x.h b/include/linux/mfd/rohm-bd957x.h
new file mode 100644
index 000000000000..a631abb2c101
--- /dev/null
+++ b/include/linux/mfd/rohm-bd957x.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Copyright (C) 2021 ROHM Semiconductors */
+
+#ifndef __LINUX_MFD_BD957X_H__
+#define __LINUX_MFD_BD957X_H__
+
+enum {
+	BD957X_VD50,
+	BD957X_VD18,
+	BD957X_VDDDR,
+	BD957X_VD10,
+	BD957X_VOUTL1,
+	BD957X_VOUTS1,
+};
+
+#define BD957X_REG_SMRB_ASSERT		0x15
+#define BD957X_REG_PMIC_INTERNAL_STAT	0x20
+#define BD957X_REG_INT_THERM_STAT	0x23
+#define BD957X_REG_INT_THERM_MASK	0x24
+#define BD957X_REG_INT_OVP_STAT		0x25
+#define BD957X_REG_INT_SCP_STAT		0x26
+#define BD957X_REG_INT_OCP_STAT		0x27
+#define BD957X_REG_INT_OVD_STAT		0x28
+#define BD957X_REG_INT_UVD_STAT		0x29
+#define BD957X_REG_INT_UVP_STAT		0x2a
+#define BD957X_REG_INT_SYS_STAT		0x2b
+#define BD957X_REG_INT_SYS_MASK		0x2c
+#define BD957X_REG_INT_MAIN_STAT	0x30
+#define BD957X_REG_INT_MAIN_MASK	0x31
+
+#define BD957X_REG_WDT_CONF		0x16
+
+#define BD957X_REG_POW_TRIGGER1		0x41
+#define BD957X_REG_POW_TRIGGER2		0x42
+#define BD957X_REG_POW_TRIGGER3		0x43
+#define BD957X_REG_POW_TRIGGER4		0x44
+#define BD957X_REG_POW_TRIGGERL1	0x45
+#define BD957X_REG_POW_TRIGGERS1	0x46
+
+#define BD957X_REGULATOR_EN_MASK	0xff
+#define BD957X_REGULATOR_DIS_VAL	0xff
+
+#define BD957X_VSEL_REG_MASK		0xff
+
+#define BD957X_MASK_VOUT1_TUNE		0x87
+#define BD957X_MASK_VOUT2_TUNE		0x87
+#define BD957X_MASK_VOUT3_TUNE		0x1f
+#define BD957X_MASK_VOUT4_TUNE		0x1f
+#define BD957X_MASK_VOUTL1_TUNE		0x87
+
+#define BD957X_REG_VOUT1_TUNE		0x50
+#define BD957X_REG_VOUT2_TUNE		0x53
+#define BD957X_REG_VOUT3_TUNE		0x56
+#define BD957X_REG_VOUT4_TUNE		0x59
+#define BD957X_REG_VOUTL1_TUNE		0x5c
+
+#define BD957X_MAX_REGISTER		0x61
+
+#endif
diff --git a/include/linux/mfd/rohm-generic.h b/include/linux/mfd/rohm-generic.h
index 66f673c35303..ac6787464004 100644
--- a/include/linux/mfd/rohm-generic.h
+++ b/include/linux/mfd/rohm-generic.h
@@ -14,6 +14,8 @@ enum rohm_chip_type {
 	ROHM_CHIP_TYPE_BD71828,
 	ROHM_CHIP_TYPE_BD9571,
 	ROHM_CHIP_TYPE_BD9574,
+	ROHM_CHIP_TYPE_BD9576,
+	ROHM_CHIP_TYPE_BD9573,
 	ROHM_CHIP_TYPE_AMOUNT
 };
 
-- 
cgit v1.2.3


From 0e9692607f94ecc59aedc0ecfd2348124c743412 Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Date: Wed, 10 Mar 2021 10:08:02 +0200
Subject: mfd: bd9576: Add IRQ support

BD9573 and BD9576 support a set of "protection" interrupts for "fatal"
issues. Those lead to SOC reset as PMIC shuts the power outputs. Thus
there is no relevant IRQ handling for them.

Few "detection" interrupts were added to the BD9576 with the idea that
SOC could take some recovery-action before error gets unrecoverable.

Unfortunately the BD9576 interrupt logic was not re-evaluated. IRQs
are not designed to be properly acknowledged - and IRQ line is kept
active for whole duration of error condition (in comparison to
informing only about state change).

For above reason, do not consider missing IRQ as error.

Signed-off-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/rohm-bd9576.c       | 90 ++++++++++++++++++++++++++++++++++++++---
 include/linux/mfd/rohm-bd957x.h | 62 ++++++++++++++++++++++++++++
 2 files changed, 147 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/rohm-bd9576.c b/drivers/mfd/rohm-bd9576.c
index 2dbda1f401e2..6661a27d69a8 100644
--- a/drivers/mfd/rohm-bd9576.c
+++ b/drivers/mfd/rohm-bd9576.c
@@ -17,14 +17,30 @@
 #include <linux/regmap.h>
 #include <linux/types.h>
 
+enum {
+	BD957X_REGULATOR_CELL,
+	BD957X_WDT_CELL,
+};
+
+/*
+ * Due to the BD9576MUF nasty IRQ behaviour we don't always populate IRQs.
+ * These will be added to regulator resources only if IRQ information for the
+ * PMIC is populated in device-tree.
+ */
+static const struct resource bd9576_regulator_irqs[] = {
+	DEFINE_RES_IRQ_NAMED(BD9576_INT_THERM, "bd9576-temp"),
+	DEFINE_RES_IRQ_NAMED(BD9576_INT_OVD, "bd9576-ovd"),
+	DEFINE_RES_IRQ_NAMED(BD9576_INT_UVD, "bd9576-uvd"),
+};
+
 static struct mfd_cell bd9573_mfd_cells[] = {
-	{ .name = "bd9573-regulator", },
-	{ .name = "bd9576-wdt", },
+	[BD957X_REGULATOR_CELL]	= { .name = "bd9573-regulator", },
+	[BD957X_WDT_CELL]	= { .name = "bd9576-wdt", },
 };
 
 static struct mfd_cell bd9576_mfd_cells[] = {
-	{ .name = "bd9576-regulator", },
-	{ .name = "bd9576-wdt", },
+	[BD957X_REGULATOR_CELL]	= { .name = "bd9576-regulator", },
+	[BD957X_WDT_CELL]	= { .name = "bd9576-wdt", },
 };
 
 static const struct regmap_range volatile_ranges[] = {
@@ -49,6 +65,29 @@ static struct regmap_config bd957x_regmap = {
 	.cache_type = REGCACHE_RBTREE,
 };
 
+static struct regmap_irq bd9576_irqs[] = {
+	REGMAP_IRQ_REG(BD9576_INT_THERM, 0, BD957X_MASK_INT_MAIN_THERM),
+	REGMAP_IRQ_REG(BD9576_INT_OVP, 0, BD957X_MASK_INT_MAIN_OVP),
+	REGMAP_IRQ_REG(BD9576_INT_SCP, 0, BD957X_MASK_INT_MAIN_SCP),
+	REGMAP_IRQ_REG(BD9576_INT_OCP, 0, BD957X_MASK_INT_MAIN_OCP),
+	REGMAP_IRQ_REG(BD9576_INT_OVD, 0, BD957X_MASK_INT_MAIN_OVD),
+	REGMAP_IRQ_REG(BD9576_INT_UVD, 0, BD957X_MASK_INT_MAIN_UVD),
+	REGMAP_IRQ_REG(BD9576_INT_UVP, 0, BD957X_MASK_INT_MAIN_UVP),
+	REGMAP_IRQ_REG(BD9576_INT_SYS, 0, BD957X_MASK_INT_MAIN_SYS),
+};
+
+static struct regmap_irq_chip bd9576_irq_chip = {
+	.name = "bd9576_irq",
+	.irqs = &bd9576_irqs[0],
+	.num_irqs = ARRAY_SIZE(bd9576_irqs),
+	.status_base = BD957X_REG_INT_MAIN_STAT,
+	.mask_base = BD957X_REG_INT_MAIN_MASK,
+	.ack_base = BD957X_REG_INT_MAIN_STAT,
+	.init_ack_masked = true,
+	.num_regs = 1,
+	.irq_reg_stride = 1,
+};
+
 static int bd957x_i2c_probe(struct i2c_client *i2c,
 			     const struct i2c_device_id *id)
 {
@@ -57,6 +96,8 @@ static int bd957x_i2c_probe(struct i2c_client *i2c,
 	struct mfd_cell *cells;
 	int num_cells;
 	unsigned long chip_type;
+	struct irq_domain *domain;
+	bool usable_irqs;
 
 	chip_type = (unsigned long)of_device_get_match_data(&i2c->dev);
 
@@ -64,10 +105,16 @@ static int bd957x_i2c_probe(struct i2c_client *i2c,
 	case ROHM_CHIP_TYPE_BD9576:
 		cells = bd9576_mfd_cells;
 		num_cells = ARRAY_SIZE(bd9576_mfd_cells);
+		usable_irqs = !!i2c->irq;
 		break;
 	case ROHM_CHIP_TYPE_BD9573:
 		cells = bd9573_mfd_cells;
 		num_cells = ARRAY_SIZE(bd9573_mfd_cells);
+		/*
+		 * BD9573 only supports fatal IRQs which we can not handle
+		 * because SoC is going to lose the power.
+		 */
+		usable_irqs = false;
 		break;
 	default:
 		dev_err(&i2c->dev, "Unknown device type");
@@ -80,8 +127,41 @@ static int bd957x_i2c_probe(struct i2c_client *i2c,
 		return PTR_ERR(regmap);
 	}
 
+	/*
+	 * BD9576 behaves badly. It keeps IRQ line asserted for the whole
+	 * duration of detected HW condition (like over temperature). So we
+	 * don't require IRQ to be populated.
+	 * If IRQ information is not given, then we mask all IRQs and do not
+	 * provide IRQ resources to regulator driver - which then just omits
+	 * the notifiers.
+	 */
+	if (usable_irqs) {
+		struct regmap_irq_chip_data *irq_data;
+		struct mfd_cell *regulators;
+
+		regulators = &bd9576_mfd_cells[BD957X_REGULATOR_CELL];
+		regulators->resources = bd9576_regulator_irqs;
+		regulators->num_resources = ARRAY_SIZE(bd9576_regulator_irqs);
+
+		ret = devm_regmap_add_irq_chip(&i2c->dev, regmap, i2c->irq,
+					       IRQF_ONESHOT, 0,
+					       &bd9576_irq_chip, &irq_data);
+		if (ret) {
+			dev_err(&i2c->dev, "Failed to add IRQ chip\n");
+			return ret;
+		}
+		domain = regmap_irq_get_domain(irq_data);
+	} else {
+		ret = regmap_update_bits(regmap, BD957X_REG_INT_MAIN_MASK,
+					 BD957X_MASK_INT_ALL,
+					 BD957X_MASK_INT_ALL);
+		if (ret)
+			return ret;
+		domain = NULL;
+	}
+
 	ret = devm_mfd_add_devices(&i2c->dev, PLATFORM_DEVID_AUTO, cells,
-				   num_cells, NULL, 0, NULL);
+				   num_cells, NULL, 0, domain);
 	if (ret)
 		dev_err(&i2c->dev, "Failed to create subdevices\n");
 
diff --git a/include/linux/mfd/rohm-bd957x.h b/include/linux/mfd/rohm-bd957x.h
index a631abb2c101..ddb396ff2da5 100644
--- a/include/linux/mfd/rohm-bd957x.h
+++ b/include/linux/mfd/rohm-bd957x.h
@@ -13,6 +13,55 @@ enum {
 	BD957X_VOUTS1,
 };
 
+/*
+ * The BD9576 has own IRQ 'blocks' for:
+ *  - I2C/thermal,
+ *  - Over voltage protection
+ *  - Short-circuit protection
+ *  - Over current protection
+ *  - Over voltage detection
+ *  - Under voltage detection
+ *  - Under voltage protection
+ *  - 'system interrupt'.
+ *
+ * Each of the blocks have a status register giving more accurate IRQ source
+ * information - for example which of the regulators have over-voltage.
+ *
+ * On top of this, there is "main IRQ" status register where each bit indicates
+ * which of sub-blocks have active IRQs. Fine. That would fit regmap-irq main
+ * status handling. Except that:
+ *  - Only some sub-IRQs can be masked.
+ *  - The IRQ informs us about fault-condition, not when fault state changes.
+ *    The IRQ line is kept asserted until the detected condition is acked
+ *    AND cleared in HW. This is annoying for IRQs like the one informing high
+ *    temperature because if IRQ is not disabled it keeps the CPU in IRQ
+ *    handling loop.
+ *
+ * For now we do just use the main-IRQ register as source for our IRQ
+ * information and bind the regmap-irq to this. We leave fine-grained sub-IRQ
+ * register handling to handlers in sub-devices. The regulator driver shall
+ * read which regulators are source for problem - or if the detected error is
+ * regulator temperature error. The sub-drivers do also handle masking of "sub-
+ * IRQs" if this is supported/needed.
+ *
+ * To overcome the problem with HW keeping IRQ asserted we do call
+ * disable_irq_nosync() from sub-device handler and add a delayed work to
+ * re-enable IRQ roughly 1 second later. This should keep our CPU out of
+ * busy-loop.
+ */
+#define IRQS_SILENT_MS			1000
+
+enum {
+	BD9576_INT_THERM,
+	BD9576_INT_OVP,
+	BD9576_INT_SCP,
+	BD9576_INT_OCP,
+	BD9576_INT_OVD,
+	BD9576_INT_UVD,
+	BD9576_INT_UVP,
+	BD9576_INT_SYS,
+};
+
 #define BD957X_REG_SMRB_ASSERT		0x15
 #define BD957X_REG_PMIC_INTERNAL_STAT	0x20
 #define BD957X_REG_INT_THERM_STAT	0x23
@@ -28,6 +77,19 @@ enum {
 #define BD957X_REG_INT_MAIN_STAT	0x30
 #define BD957X_REG_INT_MAIN_MASK	0x31
 
+#define UVD_IRQ_VALID_MASK		0x6F
+#define OVD_IRQ_VALID_MASK		0x2F
+
+#define BD957X_MASK_INT_MAIN_THERM	BIT(0)
+#define BD957X_MASK_INT_MAIN_OVP	BIT(1)
+#define BD957X_MASK_INT_MAIN_SCP	BIT(2)
+#define BD957X_MASK_INT_MAIN_OCP	BIT(3)
+#define BD957X_MASK_INT_MAIN_OVD	BIT(4)
+#define BD957X_MASK_INT_MAIN_UVD	BIT(5)
+#define BD957X_MASK_INT_MAIN_UVP	BIT(6)
+#define BD957X_MASK_INT_MAIN_SYS	BIT(7)
+#define BD957X_MASK_INT_ALL		0xff
+
 #define BD957X_REG_WDT_CONF		0x16
 
 #define BD957X_REG_POW_TRIGGER1		0x41
-- 
cgit v1.2.3


From 42fc191d60e6d5fd8c52e7afa8bccdc912947ce4 Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Date: Wed, 10 Mar 2021 10:11:44 +0200
Subject: mfd: bd9576: Add safety limit/monitoring registers

ROHM BD9576 contains safety features like over/under voltage detection,
over current detection and over temperature detection. Add the
configuration register information.

Signed-off-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/rohm-bd957x.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/rohm-bd957x.h b/include/linux/mfd/rohm-bd957x.h
index ddb396ff2da5..acc920b64f75 100644
--- a/include/linux/mfd/rohm-bd957x.h
+++ b/include/linux/mfd/rohm-bd957x.h
@@ -116,6 +116,25 @@ enum {
 #define BD957X_REG_VOUT4_TUNE		0x59
 #define BD957X_REG_VOUTL1_TUNE		0x5c
 
+#define BD9576_REG_VOUT1_OVD		0x51
+#define BD9576_REG_VOUT1_UVD		0x52
+#define BD9576_REG_VOUT2_OVD		0x54
+#define BD9576_REG_VOUT2_UVD		0x55
+#define BD9576_REG_VOUT3_OVD		0x57
+#define BD9576_REG_VOUT3_UVD		0x58
+#define BD9576_REG_VOUT4_OVD		0x5a
+#define BD9576_REG_VOUT4_UVD		0x5b
+#define BD9576_REG_VOUTL1_OVD		0x5d
+#define BD9576_REG_VOUTL1_UVD		0x5e
+
+#define BD9576_MASK_XVD			0x7f
+
+#define BD9576_REG_VOUT1S_OCW		0x5f
+#define BD9576_REG_VOUT1S_OCP		0x60
+
+#define BD9576_MASK_VOUT1S_OCW		0x3f
+#define BD9576_MASK_VOUT1S_OCP		0x3f
+
 #define BD957X_MAX_REGISTER		0x61
 
 #endif
-- 
cgit v1.2.3


From eceae583930666a69ab805eee8e81f9699bf6930 Mon Sep 17 00:00:00 2001
From: Jonathan Neuschäfer <j.neuschaefer@gmx.net>
Date: Sun, 24 Jan 2021 22:41:23 +0100
Subject: mfd: Add base driver for Netronix embedded controller
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Netronix embedded controller is a microcontroller found in some
e-book readers designed by the original design manufacturer Netronix,
Inc. It contains RTC, battery monitoring, system power management, and
PWM functionality.

This driver implements register access and version detection.

Third-party hardware documentation is available at:

  https://github.com/neuschaefer/linux/wiki/Netronix-MSP430-embedded-controller

The EC supports interrupts, but the driver doesn't make use of them so
far.

Signed-off-by: Jonathan Neuschäfer <j.neuschaefer@gmx.net>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/Kconfig       |  11 +++
 drivers/mfd/Makefile      |   1 +
 drivers/mfd/ntxec.c       | 221 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mfd/ntxec.h |  37 ++++++++
 4 files changed, 270 insertions(+)
 create mode 100644 drivers/mfd/ntxec.c
 create mode 100644 include/linux/mfd/ntxec.h

(limited to 'include/linux')

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index b74efa469e90..a03de3f7a8ed 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -967,6 +967,17 @@ config MFD_VIPERBOARD
 	  You need to select the mfd cell drivers separately.
 	  The drivers do not support all features the board exposes.
 
+config MFD_NTXEC
+	tristate "Netronix embedded controller (EC)"
+	depends on OF || COMPILE_TEST
+	depends on I2C
+	select REGMAP_I2C
+	select MFD_CORE
+	help
+	  Say yes here if you want to support the embedded controller found in
+	  certain e-book readers designed by the original design manufacturer
+	  Netronix.
+
 config MFD_RETU
 	tristate "Nokia Retu and Tahvo multi-function device"
 	select MFD_CORE
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index 834f5463af28..bb5e8f2a8e3a 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -218,6 +218,7 @@ obj-$(CONFIG_MFD_INTEL_PMC_BXT)	+= intel_pmc_bxt.o
 obj-$(CONFIG_MFD_INTEL_PMT)	+= intel_pmt.o
 obj-$(CONFIG_MFD_PALMAS)	+= palmas.o
 obj-$(CONFIG_MFD_VIPERBOARD)    += viperboard.o
+obj-$(CONFIG_MFD_NTXEC)		+= ntxec.o
 obj-$(CONFIG_MFD_RC5T583)	+= rc5t583.o rc5t583-irq.o
 obj-$(CONFIG_MFD_RK808)		+= rk808.o
 obj-$(CONFIG_MFD_RN5T618)	+= rn5t618.o
diff --git a/drivers/mfd/ntxec.c b/drivers/mfd/ntxec.c
new file mode 100644
index 000000000000..957de2b03529
--- /dev/null
+++ b/drivers/mfd/ntxec.c
@@ -0,0 +1,221 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * The Netronix embedded controller is a microcontroller found in some
+ * e-book readers designed by the original design manufacturer Netronix, Inc.
+ * It contains RTC, battery monitoring, system power management, and PWM
+ * functionality.
+ *
+ * This driver implements register access, version detection, and system
+ * power-off/reset.
+ *
+ * Copyright 2020 Jonathan Neuschäfer <j.neuschaefer@gmx.net>
+ */
+
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/i2c.h>
+#include <linux/mfd/core.h>
+#include <linux/mfd/ntxec.h>
+#include <linux/module.h>
+#include <linux/pm.h>
+#include <linux/reboot.h>
+#include <linux/regmap.h>
+#include <linux/types.h>
+#include <asm/unaligned.h>
+
+#define NTXEC_REG_VERSION	0x00
+#define NTXEC_REG_POWEROFF	0x50
+#define NTXEC_REG_POWERKEEP	0x70
+#define NTXEC_REG_RESET		0x90
+
+#define NTXEC_POWEROFF_VALUE	0x0100
+#define NTXEC_POWERKEEP_VALUE	0x0800
+#define NTXEC_RESET_VALUE	0xff00
+
+static struct i2c_client *poweroff_restart_client;	/* set by ntxec_probe(), cleared by ntxec_remove() */
+
+/* pm_power_off hook: send the power-off command to the EC over raw I2C. */
+static void ntxec_poweroff(void)
+{
+	struct i2c_client *ec = poweroff_restart_client;
+	/* Bytes on the wire: register address, then the big-endian value. */
+	u8 payload[3] = { NTXEC_REG_POWEROFF };
+	struct i2c_msg xfer = {
+		.addr = ec->addr,
+		.flags = 0,
+		.len = sizeof(payload),
+		.buf = payload,
+	};
+	int err;
+
+	put_unaligned_be16(NTXEC_POWEROFF_VALUE, &payload[1]);
+
+	err = i2c_transfer(ec->adapter, &xfer, 1);
+	if (err < 0)
+		dev_warn(&ec->dev, "Failed to power off (err = %d)\n", err);
+
+	/*
+	 * Powering down the host CPU has been seen to take roughly 2.5 to 3
+	 * seconds after the register write, so wait well beyond that instead
+	 * of returning from the poweroff handler.
+	 */
+	msleep(5000);
+}
+
+/* Restart notifier callback: send the reset command to the EC over raw I2C. */
+static int ntxec_restart(struct notifier_block *nb,
+			 unsigned long action, void *data)
+{
+	struct i2c_client *ec = poweroff_restart_client;
+	u8 payload[3] = { NTXEC_REG_RESET };
+	/*
+	 * NOTE: Only the upper byte of the reset value goes out on the bus;
+	 * transmitting the lower byte as well results in an I2C error. (The
+	 * downstream reset handler sends both bytes but ignores the result.)
+	 */
+	struct i2c_msg xfer = {
+		.addr = ec->addr,
+		.flags = 0,
+		.len = sizeof(payload) - 1,
+		.buf = payload,
+	};
+	int err;
+
+	put_unaligned_be16(NTXEC_RESET_VALUE, &payload[1]);
+
+	err = i2c_transfer(ec->adapter, &xfer, 1);
+	if (err < 0)
+		dev_warn(&ec->dev, "Failed to restart (err = %d)\n", err);
+
+	/* NOTIFY_DONE is returned whether or not the transfer succeeded. */
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block ntxec_restart_handler = {
+	.notifier_call = ntxec_restart,	/* sends the EC reset command over I2C */
+	.priority = 128,
+};
+
+static const struct regmap_config regmap_config = {
+	.name = "ntxec",
+	.reg_bits = 8,		/* 8-bit register addresses */
+	.val_bits = 16,		/* 16-bit register values ... */
+	.cache_type = REGCACHE_NONE,
+	.val_format_endian = REGMAP_ENDIAN_BIG,	/* ... formatted big-endian */
+};
+
+static const struct mfd_cell ntxec_subdevices[] = {	/* registered by ntxec_probe() */
+	{ .name = "ntxec-rtc" },
+	{ .name = "ntxec-pwm" },
+};
+
+/*
+ * I2C probe: set up the regmap, check the firmware version, optionally take
+ * over system poweroff/restart (when the node is marked as system power
+ * controller), and register the RTC and PWM sub-devices.
+ */
+static int ntxec_probe(struct i2c_client *client)
+{
+	struct device *dev = &client->dev;
+	unsigned int fw_version;
+	struct ntxec *ec;
+	int err;
+
+	ec = devm_kmalloc(dev, sizeof(*ec), GFP_KERNEL);
+	if (!ec)
+		return -ENOMEM;
+
+	ec->dev = dev;
+
+	ec->regmap = devm_regmap_init_i2c(client, &regmap_config);
+	if (IS_ERR(ec->regmap)) {
+		dev_err(dev, "Failed to set up regmap for device\n");
+		return PTR_ERR(ec->regmap);
+	}
+
+	/* Read the firmware version register */
+	err = regmap_read(ec->regmap, NTXEC_REG_VERSION, &fw_version);
+	if (err < 0) {
+		dev_err(dev, "Failed to read firmware version number\n");
+		return err;
+	}
+
+	/* Only firmware versions known to this driver are accepted */
+	if (fw_version != NTXEC_VERSION_KOBO_AURA) {
+		dev_err(dev,
+			"Netronix embedded controller version %04x is not supported.\n",
+			fw_version);
+		return -ENODEV;
+	}
+
+	dev_info(dev,
+		 "Netronix embedded controller version %04x detected.\n", fw_version);
+
+	if (of_device_is_system_power_controller(dev->of_node)) {
+		/* Some boards need the 'powerkeep' bit set to keep running. */
+		err = regmap_write(ec->regmap, NTXEC_REG_POWERKEEP,
+				   NTXEC_POWERKEEP_VALUE);
+		if (err < 0)
+			return err;
+
+		/*
+		 * Claim the poweroff/restart role unless another instance of
+		 * this driver already holds it.
+		 */
+		if (!poweroff_restart_client)
+			poweroff_restart_client = client;
+		else
+			dev_err(dev, "poweroff_restart_client already assigned\n");
+
+		/* Do not displace a poweroff handler set by another driver. */
+		if (!pm_power_off)
+			pm_power_off = ntxec_poweroff;
+		else
+			dev_err(dev, "pm_power_off already assigned\n");
+
+		err = register_restart_handler(&ntxec_restart_handler);
+		if (err)
+			dev_err(dev,
+				"Failed to register restart handler: %d\n", err);
+	}
+
+	i2c_set_clientdata(client, ec);
+
+	err = devm_mfd_add_devices(dev, PLATFORM_DEVID_NONE, ntxec_subdevices,
+				   ARRAY_SIZE(ntxec_subdevices), NULL, 0, NULL);
+	if (err)
+		dev_err(dev, "Failed to add subdevices: %d\n", err);
+
+	return err;
+}
+
+static int ntxec_remove(struct i2c_client *client) /* undo probe's PM hooks */
+{
+	if (client == poweroff_restart_client) {
+		poweroff_restart_client = NULL;
+		if (pm_power_off == ntxec_poweroff) /* keep foreign handlers */
+			pm_power_off = NULL;
+		unregister_restart_handler(&ntxec_restart_handler);
+	}
+	return 0;
+}
+
+static const struct of_device_id of_ntxec_match_table[] = {
+	{ .compatible = "netronix,ntxec", },
+	{}	/* empty terminating entry */
+};
+MODULE_DEVICE_TABLE(of, of_ntxec_match_table);
+
+static struct i2c_driver ntxec_driver = {
+	.driver = {
+		.name = "ntxec",
+		.of_match_table = of_ntxec_match_table,
+	},
+	.probe_new = ntxec_probe,
+	.remove = ntxec_remove,	/* releases the poweroff/restart hooks */
+};
+module_i2c_driver(ntxec_driver);
+
+MODULE_AUTHOR("Jonathan Neuschäfer <j.neuschaefer@gmx.net>");
+MODULE_DESCRIPTION("Core driver for Netronix EC");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/mfd/ntxec.h b/include/linux/mfd/ntxec.h
new file mode 100644
index 000000000000..361204d125f1
--- /dev/null
+++ b/include/linux/mfd/ntxec.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2020 Jonathan Neuschäfer
+ *
+ * Register access and version information for the Netronix embedded
+ * controller.
+ */
+
+#ifndef NTXEC_H
+#define NTXEC_H
+
+#include <linux/types.h>
+
+struct device;
+struct regmap;
+
+struct ntxec {
+	struct device *dev;	/* the EC's I2C client device (set in ntxec_probe()) */
+	struct regmap *regmap;	/* regmap created over that I2C client */
+};
+
+/*
+ * Some registers, such as the battery status register (0x41), are in
+ * big-endian, but others only have eight significant bits, which are in the
+ * first byte transmitted over I2C (the MSB of the big-endian value).
+ * This helper widens an 8-bit value to 16 bits for the second kind of
+ * register. The result is a CPU-endian u16; wire byte order is up to regmap.
+ */
+static inline u16 ntxec_reg8(u8 value)
+{
+	return value << 8;
+}
+
+/* Known firmware versions */
+#define NTXEC_VERSION_KOBO_AURA	0xd726	/* found in Kobo Aura */
+
+#endif
-- 
cgit v1.2.3


From f7cb7fe34db9f32e8b1c13ecc823112480b875f8 Mon Sep 17 00:00:00 2001
From: Cristian Ciocaltea <cristian.ciocaltea@gmail.com>
Date: Tue, 26 Jan 2021 11:55:59 +0200
Subject: mfd: Add MFD driver for ATC260x PMICs

Add initial support for the Actions Semi ATC260x PMICs, which integrate
Audio Codec, Power management, Clock generation and GPIO controller
blocks.

For the moment this driver only supports Regulator, Poweroff and Onkey
functionalities for the ATC2603C and ATC2609A chip variants.

Since the PMICs can be accessed using both I2C and SPI buses, the
following driver structure has been adopted:

           -----> atc260x-core.c (Implements core functionalities)
          /
ATC260x --------> atc260x-i2c.c (Implements I2C interface)
          \
           -----> atc260x-spi.c (Implements SPI interface - TODO)

Co-developed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Signed-off-by: Cristian Ciocaltea <cristian.ciocaltea@gmail.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/Kconfig                  |  18 ++
 drivers/mfd/Makefile                 |   3 +
 drivers/mfd/atc260x-core.c           | 310 +++++++++++++++++++++++++++++++++++
 drivers/mfd/atc260x-i2c.c            |  64 ++++++++
 include/linux/mfd/atc260x/atc2603c.h | 281 +++++++++++++++++++++++++++++++
 include/linux/mfd/atc260x/atc2609a.h | 308 ++++++++++++++++++++++++++++++++++
 include/linux/mfd/atc260x/core.h     |  58 +++++++
 7 files changed, 1042 insertions(+)
 create mode 100644 drivers/mfd/atc260x-core.c
 create mode 100644 drivers/mfd/atc260x-i2c.c
 create mode 100644 include/linux/mfd/atc260x/atc2603c.h
 create mode 100644 include/linux/mfd/atc260x/atc2609a.h
 create mode 100644 include/linux/mfd/atc260x/core.h

(limited to 'include/linux')

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index b74efa469e90..f9b43f15790b 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -2055,6 +2055,24 @@ config MFD_WCD934X
 	  This driver provides common support WCD934x audio codec and its
 	  associated Pin Controller, Soundwire Controller and Audio codec.
 
+config MFD_ATC260X
+	tristate
+	select MFD_CORE
+	select REGMAP
+	select REGMAP_IRQ
+
+config MFD_ATC260X_I2C
+	tristate "Actions Semi ATC260x PMICs with I2C"
+	select MFD_ATC260X
+	select REGMAP_I2C
+	depends on I2C
+	help
+	  Support for the Actions Semi ATC260x PMICs controlled via I2C.
+
+	  This driver provides common support for accessing the ATC2603C
+	  and ATC2609A chip variants. Additional drivers must be enabled
+	  in order to use the functionality of the device.
+
 config MFD_KHADAS_MCU
 	tristate "Support for Khadas System control Microcontroller"
 	depends on I2C
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index 834f5463af28..f7872d0bd570 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -268,3 +268,6 @@ obj-$(CONFIG_MFD_ACER_A500_EC)	+= acer-ec-a500.o
 obj-$(CONFIG_SGI_MFD_IOC3)	+= ioc3.o
 obj-$(CONFIG_MFD_SIMPLE_MFD_I2C)	+= simple-mfd-i2c.o
 obj-$(CONFIG_MFD_INTEL_M10_BMC)   += intel-m10-bmc.o
+
+obj-$(CONFIG_MFD_ATC260X)	+= atc260x-core.o
+obj-$(CONFIG_MFD_ATC260X_I2C)	+= atc260x-i2c.o
diff --git a/drivers/mfd/atc260x-core.c b/drivers/mfd/atc260x-core.c
new file mode 100644
index 000000000000..7148ff5b05b1
--- /dev/null
+++ b/drivers/mfd/atc260x-core.c
@@ -0,0 +1,310 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Core support for ATC260x PMICs
+ *
+ * Copyright (C) 2019 Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
+ * Copyright (C) 2020 Cristian Ciocaltea <cristian.ciocaltea@gmail.com>
+ */
+
+#include <linux/interrupt.h>
+#include <linux/mfd/atc260x/core.h>
+#include <linux/mfd/core.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/regmap.h>
+
+#define ATC260X_CHIP_REV_MAX	31
+
+struct atc260x_init_regs {
+	unsigned int cmu_devrst;	/* CMU_DEVRST register address */
+	unsigned int cmu_devrst_ints;	/* INTS reset bit within CMU_DEVRST */
+	unsigned int ints_msk;		/* INTS_MSK register address */
+	unsigned int pad_en;		/* PAD_EN register address */
+	unsigned int pad_en_extirq;	/* EXTIRQ bit within PAD_EN */
+};
+
+static void regmap_lock_mutex(void *__mutex)
+{
+	struct mutex *lock = __mutex;
+
+	/*
+	 * Regmap access from atomic context (e.g. talking to the PMIC while
+	 * the system powers down) is normally only permitted for MMIO regmaps
+	 * with REGCACHE_NONE or REGCACHE_FLAT. Slow buses such as I2C and SPI
+	 * are guarded by a mutex that is acquired non-atomically.
+	 *
+	 * Work around that with a locking scheme inspired by I2C atomic
+	 * transfers (see i2c_in_atomic_xfer_mode()): past SYSTEM_RUNNING with
+	 * IRQs off, merely try the lock (the result is not checked).
+	 * In every other case, block on the mutex as usual.
+	 */
+	if (system_state <= SYSTEM_RUNNING || !irqs_disabled())
+		mutex_lock(lock);
+	else
+		mutex_trylock(lock);
+}
+
+static void regmap_unlock_mutex(void *__mutex)
+{
+	struct mutex *lock = __mutex;	/* the regmap_config lock_arg */
+
+	mutex_unlock(lock);
+}
+
+static const struct regmap_config atc2603c_regmap_config = {
+	.reg_bits = 8,
+	.val_bits = 16,
+	.max_register = ATC2603C_SADDR,	/* 0xFF in atc2603c.h */
+	.cache_type = REGCACHE_NONE,	/* copied to the caller by atc260x_match_device() */
+};
+
+static const struct regmap_config atc2609a_regmap_config = {
+	.reg_bits = 8,
+	.val_bits = 16,
+	.max_register = ATC2609A_SADDR,
+	.cache_type = REGCACHE_NONE,	/* copied to the caller by atc260x_match_device() */
+};
+
+static const struct regmap_irq atc2603c_regmap_irqs[] = {	/* one entry per enum atc2603c_irq_def value */
+	REGMAP_IRQ_REG(ATC2603C_IRQ_AUDIO,	0, ATC2603C_INTS_MSK_AUDIO),
+	REGMAP_IRQ_REG(ATC2603C_IRQ_OV,		0, ATC2603C_INTS_MSK_OV),
+	REGMAP_IRQ_REG(ATC2603C_IRQ_OC,		0, ATC2603C_INTS_MSK_OC),
+	REGMAP_IRQ_REG(ATC2603C_IRQ_OT,		0, ATC2603C_INTS_MSK_OT),
+	REGMAP_IRQ_REG(ATC2603C_IRQ_UV,		0, ATC2603C_INTS_MSK_UV),
+	REGMAP_IRQ_REG(ATC2603C_IRQ_ALARM,	0, ATC2603C_INTS_MSK_ALARM),
+	REGMAP_IRQ_REG(ATC2603C_IRQ_ONOFF,	0, ATC2603C_INTS_MSK_ONOFF),
+	REGMAP_IRQ_REG(ATC2603C_IRQ_SGPIO,	0, ATC2603C_INTS_MSK_SGPIO),
+	REGMAP_IRQ_REG(ATC2603C_IRQ_IR,		0, ATC2603C_INTS_MSK_IR),
+	REGMAP_IRQ_REG(ATC2603C_IRQ_REMCON,	0, ATC2603C_INTS_MSK_REMCON),
+	REGMAP_IRQ_REG(ATC2603C_IRQ_POWER_IN,	0, ATC2603C_INTS_MSK_POWERIN),
+};
+
+static const struct regmap_irq atc2609a_regmap_irqs[] = {	/* one entry per enum atc2609a_irq_def value */
+	REGMAP_IRQ_REG(ATC2609A_IRQ_AUDIO,	0, ATC2609A_INTS_MSK_AUDIO),
+	REGMAP_IRQ_REG(ATC2609A_IRQ_OV,		0, ATC2609A_INTS_MSK_OV),
+	REGMAP_IRQ_REG(ATC2609A_IRQ_OC,		0, ATC2609A_INTS_MSK_OC),
+	REGMAP_IRQ_REG(ATC2609A_IRQ_OT,		0, ATC2609A_INTS_MSK_OT),
+	REGMAP_IRQ_REG(ATC2609A_IRQ_UV,		0, ATC2609A_INTS_MSK_UV),
+	REGMAP_IRQ_REG(ATC2609A_IRQ_ALARM,	0, ATC2609A_INTS_MSK_ALARM),
+	REGMAP_IRQ_REG(ATC2609A_IRQ_ONOFF,	0, ATC2609A_INTS_MSK_ONOFF),
+	REGMAP_IRQ_REG(ATC2609A_IRQ_WKUP,	0, ATC2609A_INTS_MSK_WKUP),
+	REGMAP_IRQ_REG(ATC2609A_IRQ_IR,		0, ATC2609A_INTS_MSK_IR),
+	REGMAP_IRQ_REG(ATC2609A_IRQ_REMCON,	0, ATC2609A_INTS_MSK_REMCON),
+	REGMAP_IRQ_REG(ATC2609A_IRQ_POWER_IN,	0, ATC2609A_INTS_MSK_POWERIN),
+};
+
+static const struct regmap_irq_chip atc2603c_regmap_irq_chip = {
+	.name = "atc2603c",
+	.irqs = atc2603c_regmap_irqs,
+	.num_irqs = ARRAY_SIZE(atc2603c_regmap_irqs),
+	.num_regs = 1,
+	.status_base = ATC2603C_INTS_PD,
+	.mask_base = ATC2603C_INTS_MSK,	/* atc260x_dev_init() writes 0 here to disable all sources */
+	.mask_invert = true,
+};
+
+static const struct regmap_irq_chip atc2609a_regmap_irq_chip = {
+	.name = "atc2609a",
+	.irqs = atc2609a_regmap_irqs,
+	.num_irqs = ARRAY_SIZE(atc2609a_regmap_irqs),
+	.num_regs = 1,
+	.status_base = ATC2609A_INTS_PD,
+	.mask_base = ATC2609A_INTS_MSK,	/* atc260x_dev_init() writes 0 here to disable all sources */
+	.mask_invert = true,
+};
+
+static const struct resource atc2603c_onkey_resources[] = {	/* attached to the "atc260x-onkey" cell */
+	DEFINE_RES_IRQ(ATC2603C_IRQ_ONOFF),
+};
+
+static const struct resource atc2609a_onkey_resources[] = {	/* attached to the "atc260x-onkey" cell */
+	DEFINE_RES_IRQ(ATC2609A_IRQ_ONOFF),
+};
+
+static const struct mfd_cell atc2603c_mfd_cells[] = {	/* selected in atc260x_match_device() */
+	{ .name = "atc260x-regulator" },
+	{ .name = "atc260x-pwrc" },
+	{
+		.name = "atc260x-onkey",
+		.num_resources = ARRAY_SIZE(atc2603c_onkey_resources),
+		.resources = atc2603c_onkey_resources,	/* the ONOFF interrupt */
+	},
+};
+
+static const struct mfd_cell atc2609a_mfd_cells[] = {	/* selected in atc260x_match_device() */
+	{ .name = "atc260x-regulator" },
+	{ .name = "atc260x-pwrc" },
+	{
+		.name = "atc260x-onkey",
+		.num_resources = ARRAY_SIZE(atc2609a_onkey_resources),
+		.resources = atc2609a_onkey_resources,	/* the ONOFF interrupt */
+	},
+};
+
+static const struct atc260x_init_regs atc2603c_init_regs = {	/* read by atc260x_cmu_reset() and atc260x_dev_init() */
+	.cmu_devrst = ATC2603C_CMU_DEVRST,
+	.cmu_devrst_ints = ATC2603C_CMU_DEVRST_INTS,
+	.ints_msk = ATC2603C_INTS_MSK,
+	.pad_en = ATC2603C_PAD_EN,
+	.pad_en_extirq = ATC2603C_PAD_EN_EXTIRQ,
+};
+
+static const struct atc260x_init_regs atc2609a_init_regs = {	/* read by atc260x_cmu_reset() and atc260x_dev_init() */
+	.cmu_devrst = ATC2609A_CMU_DEVRST,
+	.cmu_devrst_ints = ATC2609A_CMU_DEVRST_INTS,
+	.ints_msk = ATC2609A_INTS_MSK,
+	.pad_en = ATC2609A_PAD_EN,
+	.pad_en_extirq = ATC2609A_PAD_EN_EXTIRQ,
+};
+
+/* Pulse the interrupt block reset: assert, then de-assert the INTS reset bit. */
+static void atc260x_cmu_reset(struct atc260x *atc260x)
+{
+	const unsigned int reg = atc260x->init_regs->cmu_devrst;
+	const unsigned int bit = atc260x->init_regs->cmu_devrst_ints;
+
+	/* Assert reset */
+	regmap_update_bits(atc260x->regmap, reg, bit, ~bit);
+
+	/* De-assert reset */
+	regmap_update_bits(atc260x->regmap, reg, bit, bit);
+}
+
+/* Put the interrupt logic into a known state ahead of IRQ chip registration. */
+static void atc260x_dev_init(struct atc260x *atc260x)
+{
+	const struct atc260x_init_regs *r = atc260x->init_regs;
+	struct regmap *map = atc260x->regmap;
+
+	/* Initialize interrupt block */
+	atc260x_cmu_reset(atc260x);
+
+	/* Zero the interrupt mask register: all interrupt sources disabled */
+	regmap_write(map, r->ints_msk, 0);
+	/* Enable EXTIRQ pad */
+	regmap_update_bits(map, r->pad_en, r->pad_en_extirq, r->pad_en_extirq);
+}
+
+/**
+ * atc260x_match_device() - Set up ATC260x variant related fields
+ * @atc260x: ATC260x device to set up (.dev field must be set)
+ * @regmap_cfg: regmap config to be filled in for this ATC260x device
+ *
+ * This lets the ATC260x core configure the MFD cells and register maps
+ * for later use, and installs the custom regmap lock callbacks.
+ * Return: 0 on success, -ENODEV, -EINVAL or -ENOMEM on failure.
+ */
+int atc260x_match_device(struct atc260x *atc260x, struct regmap_config *regmap_cfg)
+{
+	struct device *dev = atc260x->dev;
+	const void *of_data;
+
+	of_data = of_device_get_match_data(dev);
+	if (!of_data)
+		return -ENODEV;
+
+	atc260x->ic_type = (unsigned long)of_data;
+
+	switch (atc260x->ic_type) {
+	case ATC2603C:
+		*regmap_cfg = atc2603c_regmap_config;
+		atc260x->regmap_irq_chip = &atc2603c_regmap_irq_chip;
+		atc260x->cells = atc2603c_mfd_cells;
+		atc260x->nr_cells = ARRAY_SIZE(atc2603c_mfd_cells);
+		atc260x->type_name = "atc2603c";
+		atc260x->rev_reg = ATC2603C_CHIP_VER;
+		atc260x->init_regs = &atc2603c_init_regs;
+		break;
+	case ATC2609A:
+		*regmap_cfg = atc2609a_regmap_config;
+		atc260x->regmap_irq_chip = &atc2609a_regmap_irq_chip;
+		atc260x->cells = atc2609a_mfd_cells;
+		atc260x->nr_cells = ARRAY_SIZE(atc2609a_mfd_cells);
+		atc260x->type_name = "atc2609a";
+		atc260x->rev_reg = ATC2609A_CHIP_VER;
+		atc260x->init_regs = &atc2609a_init_regs;
+		break;
+	default:
+		dev_err(dev, "Unsupported ATC260x device type: %u\n",
+			atc260x->ic_type);
+		return -EINVAL;
+	}
+
+	atc260x->regmap_mutex = devm_kzalloc(dev, sizeof(*atc260x->regmap_mutex),
+					     GFP_KERNEL);
+	if (!atc260x->regmap_mutex)
+		return -ENOMEM;
+
+	mutex_init(atc260x->regmap_mutex);
+
+	regmap_cfg->lock = regmap_lock_mutex;	/* see regmap_lock_mutex() for why */
+	regmap_cfg->unlock = regmap_unlock_mutex;
+	regmap_cfg->lock_arg = atc260x->regmap_mutex;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(atc260x_match_device);
+
+/**
+ * atc260x_device_probe() - Probe a configured ATC260x device
+ * @atc260x: ATC260x device to probe (must be configured)
+ *
+ * This function lets the ATC260x core register the ATC260x MFD devices
+ * and IRQCHIP. The ATC260x device passed in must be fully configured
+ * with atc260x_match_device, its IRQ set, and regmap created.
+ *
+ * Return: 0 on success, -EINVAL or the failing call's error code otherwise.
+ */
+int atc260x_device_probe(struct atc260x *atc260x)
+{
+	struct device *dev = atc260x->dev;
+	unsigned int chip_rev;
+	int ret;
+
+	if (!atc260x->irq) {
+		dev_err(dev, "No interrupt support\n");
+		return -EINVAL;
+	}
+
+	/* Initialize the hardware */
+	atc260x_dev_init(atc260x);
+
+	ret = regmap_read(atc260x->regmap, atc260x->rev_reg, &chip_rev);
+	if (ret) {
+		dev_err(dev, "Failed to get chip revision\n");
+		return ret;
+	}
+
+	if (chip_rev > ATC260X_CHIP_REV_MAX) {
+		dev_err(dev, "Unknown chip revision: %u\n", chip_rev);
+		return -EINVAL;
+	}
+
+	atc260x->ic_ver = __ffs(chip_rev + 1U);
+
+	dev_info(dev, "Detected chip type %s rev.%c\n",
+		 atc260x->type_name, 'A' + atc260x->ic_ver);
+
+	ret = devm_regmap_add_irq_chip(dev, atc260x->regmap, atc260x->irq, IRQF_ONESHOT,
+				       -1, atc260x->regmap_irq_chip, &atc260x->irq_data);
+	if (ret) {
+		dev_err(dev, "Failed to add IRQ chip: %d\n", ret);
+		return ret;
+	}
+
+	ret = devm_mfd_add_devices(dev, PLATFORM_DEVID_NONE,
+				   atc260x->cells, atc260x->nr_cells, NULL, 0,
+				   regmap_irq_get_domain(atc260x->irq_data));
+	/* The IRQ chip is devm-managed, so it must not be deleted by hand here. */
+	if (ret)
+		dev_err(dev, "Failed to add child devices: %d\n", ret);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(atc260x_device_probe);
+
+MODULE_DESCRIPTION("ATC260x PMICs Core support");
+MODULE_AUTHOR("Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>");
+MODULE_AUTHOR("Cristian Ciocaltea <cristian.ciocaltea@gmail.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/mfd/atc260x-i2c.c b/drivers/mfd/atc260x-i2c.c
new file mode 100644
index 000000000000..362005703367
--- /dev/null
+++ b/drivers/mfd/atc260x-i2c.c
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * I2C bus interface for ATC260x PMICs
+ *
+ * Copyright (C) 2019 Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
+ * Copyright (C) 2020 Cristian Ciocaltea <cristian.ciocaltea@gmail.com>
+ */
+
+#include <linux/i2c.h>
+#include <linux/mfd/atc260x/core.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/regmap.h>
+
+/* Bind an ATC260x on I2C: allocate state, build the regmap, run core probe. */
+static int atc260x_i2c_probe(struct i2c_client *client,
+			     const struct i2c_device_id *id)
+{
+	struct device *dev = &client->dev;
+	struct regmap_config cfg;	/* filled in by atc260x_match_device() */
+	struct atc260x *pmic;
+	int err;
+
+	pmic = devm_kzalloc(dev, sizeof(*pmic), GFP_KERNEL);
+	if (!pmic)
+		return -ENOMEM;
+
+	pmic->dev = dev;
+	pmic->irq = client->irq;
+
+	err = atc260x_match_device(pmic, &cfg);
+	if (err)
+		return err;
+
+	i2c_set_clientdata(client, pmic);
+	pmic->regmap = devm_regmap_init_i2c(client, &cfg);
+	if (!IS_ERR(pmic->regmap))
+		return atc260x_device_probe(pmic);
+
+	err = PTR_ERR(pmic->regmap);
+	dev_err(dev, "failed to init regmap: %d\n", err);
+	return err;
+}
+
+static const struct of_device_id atc260x_i2c_of_match[] = { /* .data = chip type */
+	{ .compatible = "actions,atc2603c", .data = (void *)ATC2603C },
+	{ .compatible = "actions,atc2609a", .data = (void *)ATC2609A },
+	{ }
+};
+MODULE_DEVICE_TABLE(of, atc260x_i2c_of_match);
+
+static struct i2c_driver atc260x_i2c_driver = {
+	.driver = {
+		.name = "atc260x",
+		.of_match_table	= atc260x_i2c_of_match, /* probe requires OF match data */
+	},
+	.probe = atc260x_i2c_probe,
+};
+module_i2c_driver(atc260x_i2c_driver);
+
+MODULE_DESCRIPTION("ATC260x PMICs I2C bus interface");
+MODULE_AUTHOR("Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>");
+MODULE_AUTHOR("Cristian Ciocaltea <cristian.ciocaltea@gmail.com>");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/mfd/atc260x/atc2603c.h b/include/linux/mfd/atc260x/atc2603c.h
new file mode 100644
index 000000000000..07ac640ef3e1
--- /dev/null
+++ b/include/linux/mfd/atc260x/atc2603c.h
@@ -0,0 +1,281 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * ATC2603C PMIC register definitions
+ *
+ * Copyright (C) 2020 Cristian Ciocaltea <cristian.ciocaltea@gmail.com>
+ */
+
+#ifndef __LINUX_MFD_ATC260X_ATC2603C_H
+#define __LINUX_MFD_ATC260X_ATC2603C_H
+
+enum atc2603c_irq_def {	/* order matches the ATC2603C_INTS_MSK_* bit positions */
+	ATC2603C_IRQ_AUDIO = 0,
+	ATC2603C_IRQ_OV,
+	ATC2603C_IRQ_OC,
+	ATC2603C_IRQ_OT,
+	ATC2603C_IRQ_UV,
+	ATC2603C_IRQ_ALARM,
+	ATC2603C_IRQ_ONOFF,
+	ATC2603C_IRQ_SGPIO,
+	ATC2603C_IRQ_IR,
+	ATC2603C_IRQ_REMCON,
+	ATC2603C_IRQ_POWER_IN,
+};
+
+/* PMU Registers */
+#define ATC2603C_PMU_SYS_CTL0			0x00
+#define ATC2603C_PMU_SYS_CTL1			0x01
+#define ATC2603C_PMU_SYS_CTL2			0x02
+#define ATC2603C_PMU_SYS_CTL3			0x03
+#define ATC2603C_PMU_SYS_CTL4			0x04
+#define ATC2603C_PMU_SYS_CTL5			0x05
+#define ATC2603C_PMU_SYS_CTL6			0x06
+#define ATC2603C_PMU_SYS_CTL7			0x07
+#define ATC2603C_PMU_SYS_CTL8			0x08
+#define ATC2603C_PMU_SYS_CTL9			0x09
+#define ATC2603C_PMU_BAT_CTL0			0x0A
+#define ATC2603C_PMU_BAT_CTL1			0x0B
+#define ATC2603C_PMU_VBUS_CTL0			0x0C
+#define ATC2603C_PMU_VBUS_CTL1			0x0D
+#define ATC2603C_PMU_WALL_CTL0			0x0E
+#define ATC2603C_PMU_WALL_CTL1			0x0F
+#define ATC2603C_PMU_SYS_PENDING		0x10
+#define ATC2603C_PMU_DC1_CTL0			0x11
+#define ATC2603C_PMU_DC1_CTL1			0x12 // Undocumented
+#define ATC2603C_PMU_DC1_CTL2			0x13 // Undocumented
+#define ATC2603C_PMU_DC2_CTL0			0x14
+#define ATC2603C_PMU_DC2_CTL1			0x15 // Undocumented
+#define ATC2603C_PMU_DC2_CTL2			0x16 // Undocumented
+#define ATC2603C_PMU_DC3_CTL0			0x17
+#define ATC2603C_PMU_DC3_CTL1			0x18 // Undocumented
+#define ATC2603C_PMU_DC3_CTL2			0x19 // Undocumented
+#define ATC2603C_PMU_DC4_CTL0			0x1A // Undocumented
+#define ATC2603C_PMU_DC4_CTL1			0x1B // Undocumented
+#define ATC2603C_PMU_DC5_CTL0			0x1C // Undocumented
+#define ATC2603C_PMU_DC5_CTL1			0x1D // Undocumented
+#define ATC2603C_PMU_LDO1_CTL			0x1E
+#define ATC2603C_PMU_LDO2_CTL			0x1F
+#define ATC2603C_PMU_LDO3_CTL			0x20
+#define ATC2603C_PMU_LDO4_CTL			0x21 // Undocumented
+#define ATC2603C_PMU_LDO5_CTL			0x22
+#define ATC2603C_PMU_LDO6_CTL			0x23
+#define ATC2603C_PMU_LDO7_CTL			0x24
+#define ATC2603C_PMU_LDO8_CTL			0x25 // Undocumented
+#define ATC2603C_PMU_LDO9_CTL			0x26 // Undocumented
+#define ATC2603C_PMU_LDO10_CTL			0x27 // Undocumented
+#define ATC2603C_PMU_LDO11_CTL			0x28
+#define ATC2603C_PMU_SWITCH_CTL			0x29
+#define ATC2603C_PMU_OV_CTL0			0x2A
+#define ATC2603C_PMU_OV_CTL1			0x2B
+#define ATC2603C_PMU_OV_STATUS			0x2C
+#define ATC2603C_PMU_OV_EN			0x2D
+#define ATC2603C_PMU_OV_INT_EN			0x2E
+#define ATC2603C_PMU_OC_CTL			0x2F
+#define ATC2603C_PMU_OC_STATUS			0x30
+#define ATC2603C_PMU_OC_EN			0x31
+#define ATC2603C_PMU_OC_INT_EN			0x32
+#define ATC2603C_PMU_UV_CTL0			0x33
+#define ATC2603C_PMU_UV_CTL1			0x34
+#define ATC2603C_PMU_UV_STATUS			0x35
+#define ATC2603C_PMU_UV_EN			0x36
+#define ATC2603C_PMU_UV_INT_EN			0x37
+#define ATC2603C_PMU_OT_CTL			0x38
+#define ATC2603C_PMU_CHARGER_CTL0		0x39
+#define ATC2603C_PMU_CHARGER_CTL1		0x3A
+#define ATC2603C_PMU_CHARGER_CTL2		0x3B
+#define ATC2603C_PMU_BAKCHARGER_CTL		0x3C // Undocumented
+#define ATC2603C_PMU_APDS_CTL			0x3D
+#define ATC2603C_PMU_AUXADC_CTL0		0x3E
+#define ATC2603C_PMU_AUXADC_CTL1		0x3F
+#define ATC2603C_PMU_BATVADC			0x40
+#define ATC2603C_PMU_BATIADC			0x41
+#define ATC2603C_PMU_WALLVADC			0x42
+#define ATC2603C_PMU_WALLIADC			0x43
+#define ATC2603C_PMU_VBUSVADC			0x44
+#define ATC2603C_PMU_VBUSIADC			0x45
+#define ATC2603C_PMU_SYSPWRADC			0x46
+#define ATC2603C_PMU_REMCONADC			0x47
+#define ATC2603C_PMU_SVCCADC			0x48
+#define ATC2603C_PMU_CHGIADC			0x49
+#define ATC2603C_PMU_IREFADC			0x4A
+#define ATC2603C_PMU_BAKBATADC			0x4B
+#define ATC2603C_PMU_ICTEMPADC			0x4C
+#define ATC2603C_PMU_AUXADC0			0x4D
+#define ATC2603C_PMU_AUXADC1			0x4E
+#define ATC2603C_PMU_AUXADC2			0x4F
+#define	ATC2603C_PMU_ICMADC			0x50
+#define ATC2603C_PMU_BDG_CTL			0x51 // Undocumented
+#define ATC2603C_RTC_CTL			0x52
+#define ATC2603C_RTC_MSALM			0x53
+#define ATC2603C_RTC_HALM			0x54
+#define ATC2603C_RTC_YMDALM			0x55
+#define ATC2603C_RTC_MS				0x56
+#define ATC2603C_RTC_H				0x57
+#define ATC2603C_RTC_DC				0x58
+#define ATC2603C_RTC_YMD			0x59
+#define ATC2603C_EFUSE_DAT			0x5A // Undocumented
+#define ATC2603C_EFUSECRTL1			0x5B // Undocumented
+#define ATC2603C_EFUSECRTL2			0x5C // Undocumented
+#define ATC2603C_PMU_FW_USE0			0x5D // Undocumented
+#define ATC2603C_PMU_FW_USE1			0x5E // Undocumented
+#define ATC2603C_PMU_FW_USE2			0x5F // Undocumented
+#define ATC2603C_PMU_FW_USE3			0x60 // Undocumented
+#define ATC2603C_PMU_FW_USE4			0x61 // Undocumented
+#define ATC2603C_PMU_ABNORMAL_STATUS		0x62
+#define ATC2603C_PMU_WALL_APDS_CTL		0x63
+#define ATC2603C_PMU_REMCON_CTL0		0x64
+#define ATC2603C_PMU_REMCON_CTL1		0x65
+#define ATC2603C_PMU_MUX_CTL0			0x66
+#define ATC2603C_PMU_SGPIO_CTL0			0x67
+#define ATC2603C_PMU_SGPIO_CTL1			0x68
+#define ATC2603C_PMU_SGPIO_CTL2			0x69
+#define ATC2603C_PMU_SGPIO_CTL3			0x6A
+#define ATC2603C_PMU_SGPIO_CTL4			0x6B
+#define ATC2603C_PWMCLK_CTL			0x6C
+#define ATC2603C_PWM0_CTL			0x6D
+#define ATC2603C_PWM1_CTL			0x6E
+#define ATC2603C_PMU_ADC_DBG0			0x70
+#define ATC2603C_PMU_ADC_DBG1			0x71
+#define ATC2603C_PMU_ADC_DBG2			0x72
+#define ATC2603C_PMU_ADC_DBG3			0x73
+#define ATC2603C_PMU_ADC_DBG4			0x74
+#define ATC2603C_IRC_CTL			0x80
+#define ATC2603C_IRC_STAT			0x81
+#define ATC2603C_IRC_CC				0x82
+#define ATC2603C_IRC_KDC			0x83
+#define ATC2603C_IRC_WK				0x84
+#define ATC2603C_IRC_RCC			0x85
+#define ATC2603C_IRC_FILTER			0x86
+
+/* AUDIO_OUT Registers */
+#define ATC2603C_AUDIOINOUT_CTL			0xA0
+#define ATC2603C_AUDIO_DEBUGOUTCTL		0xA1
+#define ATC2603C_DAC_DIGITALCTL			0xA2
+#define ATC2603C_DAC_VOLUMECTL0			0xA3
+#define ATC2603C_DAC_ANALOG0			0xA4
+#define ATC2603C_DAC_ANALOG1			0xA5
+#define ATC2603C_DAC_ANALOG2			0xA6
+#define ATC2603C_DAC_ANALOG3			0xA7
+
+/* AUDIO_IN Registers */
+#define ATC2603C_ADC_DIGITALCTL			0xA8
+#define ATC2603C_ADC_HPFCTL			0xA9
+#define ATC2603C_ADC_CTL			0xAA
+#define ATC2603C_AGC_CTL0			0xAB
+#define ATC2603C_AGC_CTL1			0xAC // Undocumented
+#define ATC2603C_AGC_CTL2			0xAD
+#define ATC2603C_ADC_ANALOG0			0xAE
+#define ATC2603C_ADC_ANALOG1			0xAF
+
+/* PCM_IF Registers */
+#define ATC2603C_PCM0_CTL			0xB0 // Undocumented
+#define ATC2603C_PCM1_CTL			0xB1 // Undocumented
+#define ATC2603C_PCM2_CTL			0xB2 // Undocumented
+#define ATC2603C_PCMIF_CTL			0xB3 // Undocumented
+
+/* CMU_CONTROL Registers */
+#define ATC2603C_CMU_DEVRST			0xC1 // Undocumented
+
+/* INTS Registers */
+#define ATC2603C_INTS_PD			0xC8
+#define ATC2603C_INTS_MSK			0xC9
+
+/* MFP Registers */
+#define ATC2603C_MFP_CTL			0xD0
+#define ATC2603C_PAD_VSEL			0xD1 // Undocumented
+#define ATC2603C_GPIO_OUTEN			0xD2
+#define ATC2603C_GPIO_INEN			0xD3
+#define ATC2603C_GPIO_DAT			0xD4
+#define ATC2603C_PAD_DRV			0xD5
+#define ATC2603C_PAD_EN				0xD6
+#define ATC2603C_DEBUG_SEL			0xD7 // Undocumented
+#define ATC2603C_DEBUG_IE			0xD8 // Undocumented
+#define ATC2603C_DEBUG_OE			0xD9 // Undocumented
+#define ATC2603C_BIST_START			0x0A // Undocumented; NOTE(review): outside the 0xDx run, same value as PMU_BAT_CTL0 - confirm
+#define ATC2603C_BIST_RESULT			0x0B // Undocumented; NOTE(review): outside the 0xDx run, same value as PMU_BAT_CTL1 - confirm
+#define ATC2603C_CHIP_VER			0xDC
+
+/* TWSI Registers */
+#define ATC2603C_SADDR				0xFF
+
+/* PMU_SYS_CTL0 Register Mask Bits */
+#define ATC2603C_PMU_SYS_CTL0_IR_WK_EN			BIT(5)
+#define ATC2603C_PMU_SYS_CTL0_RESET_WK_EN		BIT(6)
+#define ATC2603C_PMU_SYS_CTL0_HDSW_WK_EN		BIT(7)
+#define ATC2603C_PMU_SYS_CTL0_ALARM_WK_EN		BIT(8)
+#define ATC2603C_PMU_SYS_CTL0_REM_CON_WK_EN		BIT(9)
+#define ATC2603C_PMU_SYS_CTL0_RESTART_EN		BIT(10)
+#define ATC2603C_PMU_SYS_CTL0_SGPIOIRQ_WK_EN		BIT(11)
+#define ATC2603C_PMU_SYS_CTL0_ONOFF_SHORT_WK_EN		BIT(12)
+#define ATC2603C_PMU_SYS_CTL0_ONOFF_LONG_WK_EN		BIT(13)
+#define ATC2603C_PMU_SYS_CTL0_WALL_WK_EN		BIT(14)
+#define ATC2603C_PMU_SYS_CTL0_USB_WK_EN			BIT(15)
+#define ATC2603C_PMU_SYS_CTL0_WK_ALL			(GENMASK(15, 5) & (~BIT(10)))
+
+/* PMU_SYS_CTL1 Register Mask Bits */
+#define ATC2603C_PMU_SYS_CTL1_EN_S1			BIT(0)
+#define ATC2603C_PMU_SYS_CTL1_LB_S4_EN			BIT(2)
+#define ATC2603C_PMU_SYS_CTL1_LB_S4			GENMASK(4, 3)
+#define ATC2603C_PMU_SYS_CTL1_LB_S4_3_1V		BIT(4)
+#define ATC2603C_PMU_SYS_CTL1_IR_WK_FLAG		BIT(5)
+#define ATC2603C_PMU_SYS_CTL1_RESET_WK_FLAG		BIT(6)
+#define ATC2603C_PMU_SYS_CTL1_HDSW_WK_FLAG		BIT(7)
+#define ATC2603C_PMU_SYS_CTL1_ALARM_WK_FLAG		BIT(8)
+#define ATC2603C_PMU_SYS_CTL1_REM_CON_WK_FLAG		BIT(9)
+#define ATC2603C_PMU_SYS_CTL1_ONOFF_PRESS_RESET_IRQ_PD	BIT(10)
+#define ATC2603C_PMU_SYS_CTL1_SGPIOIRQ_WK_FLAG		BIT(11)
+#define ATC2603C_PMU_SYS_CTL1_ONOFF_SHORT_WK_FLAG	BIT(12)
+#define ATC2603C_PMU_SYS_CTL1_ONOFF_LONG_WK_FLAG	BIT(13)
+#define ATC2603C_PMU_SYS_CTL1_WALL_WK_FLAG		BIT(14)
+#define ATC2603C_PMU_SYS_CTL1_USB_WK_FLAG		BIT(15)
+
+/* PMU_SYS_CTL2 Register Mask Bits */
+#define ATC2603C_PMU_SYS_CTL2_PMU_A_EN			BIT(0)
+#define ATC2603C_PMU_SYS_CTL2_ONOFF_PRESS_INT_EN	BIT(1)
+#define ATC2603C_PMU_SYS_CTL2_ONOFF_PRESS_PD		BIT(2)
+#define ATC2603C_PMU_SYS_CTL2_S2TIMER			GENMASK(5, 3)
+#define ATC2603C_PMU_SYS_CTL2_S2_TIMER_EN		BIT(6)
+#define ATC2603C_PMU_SYS_CTL2_ONOFF_RESET_TIME_SEL	GENMASK(8, 7)
+#define ATC2603C_PMU_SYS_CTL2_ONOFF_PRESS_RESET_EN	BIT(9)
+#define ATC2603C_PMU_SYS_CTL2_ONOFF_PRESS_TIME		GENMASK(11, 10)
+#define ATC2603C_PMU_SYS_CTL2_ONOFF_INT_EN		BIT(12)
+#define ATC2603C_PMU_SYS_CTL2_ONOFF_LONG_PRESS		BIT(13)
+#define ATC2603C_PMU_SYS_CTL2_ONOFF_SHORT_PRESS		BIT(14)
+#define ATC2603C_PMU_SYS_CTL2_ONOFF_PRESS		BIT(15)
+
+/* PMU_SYS_CTL3 Register Mask Bits */
+#define ATC2603C_PMU_SYS_CTL3_S2S3TOS1_TIMER		GENMASK(8, 7)
+#define ATC2603C_PMU_SYS_CTL3_S2S3TOS1_TIMER_EN		BIT(9)
+#define ATC2603C_PMU_SYS_CTL3_S3_TIMER			GENMASK(12, 10)
+#define ATC2603C_PMU_SYS_CTL3_S3_TIMER_EN		BIT(13)
+#define ATC2603C_PMU_SYS_CTL3_EN_S3			BIT(14)
+#define ATC2603C_PMU_SYS_CTL3_EN_S2			BIT(15)
+
+/* PMU_SYS_CTL5 Register Mask Bits */
+#define ATC2603C_PMU_SYS_CTL5_WALLWKDTEN		BIT(7)
+#define ATC2603C_PMU_SYS_CTL5_VBUSWKDTEN		BIT(8)
+#define ATC2603C_PMU_SYS_CTL5_REMCON_DECT_EN		BIT(9)
+#define ATC2603C_PMU_SYS_CTL5_ONOFF_8S_SEL		BIT(10)
+
+/* INTS_MSK Register Mask Bits */
+#define ATC2603C_INTS_MSK_AUDIO				BIT(0)
+#define ATC2603C_INTS_MSK_OV				BIT(1)
+#define ATC2603C_INTS_MSK_OC				BIT(2)
+#define ATC2603C_INTS_MSK_OT				BIT(3)
+#define ATC2603C_INTS_MSK_UV				BIT(4)
+#define ATC2603C_INTS_MSK_ALARM				BIT(5)
+#define ATC2603C_INTS_MSK_ONOFF				BIT(6)
+#define ATC2603C_INTS_MSK_SGPIO				BIT(7)
+#define ATC2603C_INTS_MSK_IR				BIT(8)
+#define ATC2603C_INTS_MSK_REMCON			BIT(9)
+#define ATC2603C_INTS_MSK_POWERIN			BIT(10)
+
+/* CMU_DEVRST Register Mask Bits */
+#define ATC2603C_CMU_DEVRST_MFP				BIT(1)
+#define ATC2603C_CMU_DEVRST_INTS			BIT(2)
+#define ATC2603C_CMU_DEVRST_AUDIO			BIT(4)
+
+/* PAD_EN Register Mask Bits */
+#define ATC2603C_PAD_EN_EXTIRQ				BIT(0)
+
+#endif /* __LINUX_MFD_ATC260X_ATC2603C_H */
diff --git a/include/linux/mfd/atc260x/atc2609a.h b/include/linux/mfd/atc260x/atc2609a.h
new file mode 100644
index 000000000000..b957d7bd73e9
--- /dev/null
+++ b/include/linux/mfd/atc260x/atc2609a.h
@@ -0,0 +1,308 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * ATC2609A PMIC register definitions
+ *
+ * Copyright (C) 2019 Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
+ */
+
+#ifndef __LINUX_MFD_ATC260X_ATC2609A_H
+#define __LINUX_MFD_ATC260X_ATC2609A_H
+
+enum atc2609a_irq_def {
+	ATC2609A_IRQ_AUDIO = 0,
+	ATC2609A_IRQ_OV,
+	ATC2609A_IRQ_OC,
+	ATC2609A_IRQ_OT,
+	ATC2609A_IRQ_UV,
+	ATC2609A_IRQ_ALARM,
+	ATC2609A_IRQ_ONOFF,
+	ATC2609A_IRQ_WKUP,
+	ATC2609A_IRQ_IR,
+	ATC2609A_IRQ_REMCON,
+	ATC2609A_IRQ_POWER_IN,
+};
+
+/* PMU Registers */
+#define ATC2609A_PMU_SYS_CTL0			0x00
+#define ATC2609A_PMU_SYS_CTL1			0x01
+#define ATC2609A_PMU_SYS_CTL2			0x02
+#define ATC2609A_PMU_SYS_CTL3			0x03
+#define ATC2609A_PMU_SYS_CTL4			0x04
+#define ATC2609A_PMU_SYS_CTL5			0x05
+#define ATC2609A_PMU_SYS_CTL6			0x06
+#define ATC2609A_PMU_SYS_CTL7			0x07
+#define ATC2609A_PMU_SYS_CTL8			0x08
+#define ATC2609A_PMU_SYS_CTL9			0x09
+#define ATC2609A_PMU_BAT_CTL0			0x0A
+#define ATC2609A_PMU_BAT_CTL1			0x0B
+#define ATC2609A_PMU_VBUS_CTL0			0x0C
+#define ATC2609A_PMU_VBUS_CTL1			0x0D
+#define ATC2609A_PMU_WALL_CTL0			0x0E
+#define ATC2609A_PMU_WALL_CTL1			0x0F
+#define ATC2609A_PMU_SYS_PENDING		0x10
+#define ATC2609A_PMU_APDS_CTL0			0x11
+#define ATC2609A_PMU_APDS_CTL1			0x12
+#define ATC2609A_PMU_APDS_CTL2			0x13
+#define ATC2609A_PMU_CHARGER_CTL		0x14
+#define ATC2609A_PMU_BAKCHARGER_CTL		0x15
+#define ATC2609A_PMU_SWCHG_CTL0			0x16
+#define ATC2609A_PMU_SWCHG_CTL1			0x17
+#define ATC2609A_PMU_SWCHG_CTL2			0x18
+#define ATC2609A_PMU_SWCHG_CTL3			0x19
+#define ATC2609A_PMU_SWCHG_CTL4			0x1A
+#define ATC2609A_PMU_DC_OSC			0x1B
+#define ATC2609A_PMU_DC0_CTL0			0x1C
+#define ATC2609A_PMU_DC0_CTL1			0x1D
+#define ATC2609A_PMU_DC0_CTL2			0x1E
+#define ATC2609A_PMU_DC0_CTL3			0x1F
+#define ATC2609A_PMU_DC0_CTL4			0x20
+#define ATC2609A_PMU_DC0_CTL5			0x21
+#define ATC2609A_PMU_DC0_CTL6			0x22
+#define ATC2609A_PMU_DC1_CTL0			0x23
+#define ATC2609A_PMU_DC1_CTL1			0x24
+#define ATC2609A_PMU_DC1_CTL2			0x25
+#define ATC2609A_PMU_DC1_CTL3			0x26
+#define ATC2609A_PMU_DC1_CTL4			0x27
+#define ATC2609A_PMU_DC1_CTL5			0x28
+#define ATC2609A_PMU_DC1_CTL6			0x29
+#define ATC2609A_PMU_DC2_CTL0			0x2A
+#define ATC2609A_PMU_DC2_CTL1			0x2B
+#define ATC2609A_PMU_DC2_CTL2			0x2C
+#define ATC2609A_PMU_DC2_CTL3			0x2D
+#define ATC2609A_PMU_DC2_CTL4			0x2E
+#define ATC2609A_PMU_DC2_CTL5			0x2F
+#define ATC2609A_PMU_DC2_CTL6			0x30
+#define ATC2609A_PMU_DC3_CTL0			0x31
+#define ATC2609A_PMU_DC3_CTL1			0x32
+#define ATC2609A_PMU_DC3_CTL2			0x33
+#define ATC2609A_PMU_DC3_CTL3			0x34
+#define ATC2609A_PMU_DC3_CTL4			0x35
+#define ATC2609A_PMU_DC3_CTL5			0x36
+#define ATC2609A_PMU_DC3_CTL6			0x37
+#define ATC2609A_PMU_DC_ZR			0x38
+#define ATC2609A_PMU_LDO0_CTL0			0x39
+#define ATC2609A_PMU_LDO0_CTL1			0x3A
+#define ATC2609A_PMU_LDO1_CTL0			0x3B
+#define ATC2609A_PMU_LDO1_CTL1			0x3C
+#define ATC2609A_PMU_LDO2_CTL0			0x3D
+#define ATC2609A_PMU_LDO2_CTL1			0x3E
+#define ATC2609A_PMU_LDO3_CTL0			0x3F
+#define ATC2609A_PMU_LDO3_CTL1			0x40
+#define ATC2609A_PMU_LDO4_CTL0			0x41
+#define ATC2609A_PMU_LDO4_CTL1			0x42
+#define ATC2609A_PMU_LDO5_CTL0			0x43
+#define ATC2609A_PMU_LDO5_CTL1			0x44
+#define ATC2609A_PMU_LDO6_CTL0			0x45
+#define ATC2609A_PMU_LDO6_CTL1			0x46
+#define ATC2609A_PMU_LDO7_CTL0			0x47
+#define ATC2609A_PMU_LDO7_CTL1			0x48
+#define ATC2609A_PMU_LDO8_CTL0			0x49
+#define ATC2609A_PMU_LDO8_CTL1			0x4A
+#define ATC2609A_PMU_LDO9_CTL			0x4B
+#define ATC2609A_PMU_OV_INT_EN			0x4C
+#define ATC2609A_PMU_OV_STATUS			0x4D
+#define ATC2609A_PMU_UV_INT_EN			0x4E
+#define ATC2609A_PMU_UV_STATUS			0x4F
+#define ATC2609A_PMU_OC_INT_EN			0x50
+#define ATC2609A_PMU_OC_STATUS			0x51
+#define ATC2609A_PMU_OT_CTL			0x52
+#define ATC2609A_PMU_CM_CTL0			0x53
+#define ATC2609A_PMU_FW_USE0			0x54
+#define ATC2609A_PMU_FW_USE1			0x55
+#define ATC2609A_PMU_ADC12B_I			0x56
+#define ATC2609A_PMU_ADC12B_V			0x57
+#define ATC2609A_PMU_ADC12B_DUMMY		0x58
+#define ATC2609A_PMU_AUXADC_CTL0		0x59
+#define ATC2609A_PMU_AUXADC_CTL1		0x5A
+#define ATC2609A_PMU_BATVADC			0x5B
+#define ATC2609A_PMU_BATIADC			0x5C
+#define ATC2609A_PMU_WALLVADC			0x5D
+#define ATC2609A_PMU_WALLIADC			0x5E
+#define ATC2609A_PMU_VBUSVADC			0x5F
+#define ATC2609A_PMU_VBUSIADC			0x60
+#define ATC2609A_PMU_SYSPWRADC			0x61
+#define ATC2609A_PMU_REMCONADC			0x62
+#define ATC2609A_PMU_SVCCADC			0x63
+#define ATC2609A_PMU_CHGIADC			0x64
+#define ATC2609A_PMU_IREFADC			0x65
+#define ATC2609A_PMU_BAKBATADC			0x66
+#define ATC2609A_PMU_ICTEMPADC			0x67
+#define ATC2609A_PMU_AUXADC0			0x68
+#define ATC2609A_PMU_AUXADC1			0x69
+#define ATC2609A_PMU_AUXADC2			0x6A
+#define ATC2609A_PMU_AUXADC3			0x6B
+#define ATC2609A_PMU_ICTEMPADC_ADJ		0x6C
+#define ATC2609A_PMU_BDG_CTL			0x6D
+#define ATC2609A_RTC_CTL			0x6E
+#define ATC2609A_RTC_MSALM			0x6F
+#define ATC2609A_RTC_HALM			0x70
+#define ATC2609A_RTC_YMDALM			0x71
+#define ATC2609A_RTC_MS				0x72
+#define ATC2609A_RTC_H				0x73
+#define ATC2609A_RTC_DC				0x74
+#define ATC2609A_RTC_YMD			0x75
+#define ATC2609A_EFUSE_DAT			0x76
+#define ATC2609A_EFUSECRTL1			0x77
+#define ATC2609A_EFUSECRTL2			0x78
+#define ATC2609A_PMU_DC4_CTL0			0x79
+#define ATC2609A_PMU_DC4_CTL1			0x7A
+#define ATC2609A_PMU_DC4_CTL2			0x7B
+#define ATC2609A_PMU_DC4_CTL3			0x7C
+#define ATC2609A_PMU_DC4_CTL4			0x7D
+#define ATC2609A_PMU_DC4_CTL5			0x7E
+#define ATC2609A_PMU_DC4_CTL6			0x7F
+#define ATC2609A_PMU_PWR_STATUS			0x80
+#define ATC2609A_PMU_S2_PWR			0x81
+#define ATC2609A_CLMT_CTL0			0x82
+#define ATC2609A_CLMT_DATA0			0x83
+#define ATC2609A_CLMT_DATA1			0x84
+#define ATC2609A_CLMT_DATA2			0x85
+#define ATC2609A_CLMT_DATA3			0x86
+#define ATC2609A_CLMT_ADD0			0x87
+#define ATC2609A_CLMT_ADD1			0x88
+#define ATC2609A_CLMT_OCV_TABLE			0x89
+#define ATC2609A_CLMT_R_TABLE			0x8A
+#define ATC2609A_PMU_PWRON_CTL0			0x8D
+#define ATC2609A_PMU_PWRON_CTL1			0x8E
+#define ATC2609A_PMU_PWRON_CTL2			0x8F
+#define ATC2609A_IRC_CTL			0x90
+#define ATC2609A_IRC_STAT			0x91
+#define ATC2609A_IRC_CC				0x92
+#define ATC2609A_IRC_KDC			0x93
+#define ATC2609A_IRC_WK				0x94
+#define ATC2609A_IRC_RCC			0x95
+
+/* AUDIO_OUT Registers */
+#define ATC2609A_AUDIOINOUT_CTL			0xA0
+#define ATC2609A_AUDIO_DEBUGOUTCTL		0xA1
+#define ATC2609A_DAC_DIGITALCTL			0xA2
+#define ATC2609A_DAC_VOLUMECTL0			0xA3
+#define ATC2609A_DAC_ANALOG0			0xA4
+#define ATC2609A_DAC_ANALOG1			0xA5
+#define ATC2609A_DAC_ANALOG2			0xA6
+#define ATC2609A_DAC_ANALOG3			0xA7
+
+/* AUDIO_IN Registers */
+#define ATC2609A_ADC_DIGITALCTL			0xA8
+#define ATC2609A_ADC_HPFCTL			0xA9
+#define ATC2609A_ADC_CTL			0xAA
+#define ATC2609A_AGC_CTL0			0xAB
+#define ATC2609A_AGC_CTL1			0xAC
+#define ATC2609A_AGC_CTL2			0xAD
+#define ATC2609A_ADC_ANALOG0			0xAE
+#define ATC2609A_ADC_ANALOG1			0xAF
+
+/* PCM_IF Registers */
+#define ATC2609A_PCM0_CTL			0xB0
+#define ATC2609A_PCM1_CTL			0xB1
+#define ATC2609A_PCM2_CTL			0xB2
+#define ATC2609A_PCMIF_CTL			0xB3
+
+/* CMU_CONTROL Registers */
+#define ATC2609A_CMU_DEVRST			0xC1
+
+/* INTS Registers */
+#define ATC2609A_INTS_PD			0xC8
+#define ATC2609A_INTS_MSK			0xC9
+
+/* MFP Registers */
+#define ATC2609A_MFP_CTL			0xD0
+#define ATC2609A_PAD_VSEL			0xD1
+#define ATC2609A_GPIO_OUTEN			0xD2
+#define ATC2609A_GPIO_INEN			0xD3
+#define ATC2609A_GPIO_DAT			0xD4
+#define ATC2609A_PAD_DRV			0xD5
+#define ATC2609A_PAD_EN				0xD6
+#define ATC2609A_DEBUG_SEL			0xD7
+#define ATC2609A_DEBUG_IE			0xD8
+#define ATC2609A_DEBUG_OE			0xD9
+#define ATC2609A_CHIP_VER			0xDC
+
+/* PWSI Registers */
+#define ATC2609A_PWSI_CTL			0xF0
+#define ATC2609A_PWSI_STATUS			0xF1
+
+/* TWSI Registers */
+#define ATC2609A_SADDR				0xFF
+
+/* PMU_SYS_CTL0 Register Mask Bits */
+#define ATC2609A_PMU_SYS_CTL0_IR_WK_EN			BIT(5)
+#define ATC2609A_PMU_SYS_CTL0_RESET_WK_EN		BIT(6)
+#define ATC2609A_PMU_SYS_CTL0_HDSW_WK_EN		BIT(7)
+#define ATC2609A_PMU_SYS_CTL0_ALARM_WK_EN		BIT(8)
+#define ATC2609A_PMU_SYS_CTL0_REM_CON_WK_EN		BIT(9)
+#define ATC2609A_PMU_SYS_CTL0_RESTART_EN		BIT(10)
+#define ATC2609A_PMU_SYS_CTL0_WKIRQ_WK_EN		BIT(11)
+#define ATC2609A_PMU_SYS_CTL0_ONOFF_SHORT_WK_EN		BIT(12)
+#define ATC2609A_PMU_SYS_CTL0_ONOFF_LONG_WK_EN		BIT(13)
+#define ATC2609A_PMU_SYS_CTL0_WALL_WK_EN		BIT(14)
+#define ATC2609A_PMU_SYS_CTL0_USB_WK_EN			BIT(15)
+#define ATC2609A_PMU_SYS_CTL0_WK_ALL			(GENMASK(15, 5) & (~BIT(10)))
+
+/* PMU_SYS_CTL1 Register Mask Bits */
+#define ATC2609A_PMU_SYS_CTL1_EN_S1			BIT(0)
+#define ATC2609A_PMU_SYS_CTL1_LB_S4_EN			BIT(2)
+#define ATC2609A_PMU_SYS_CTL1_LB_S4			GENMASK(4, 3)
+#define ATC2609A_PMU_SYS_CTL1_LB_S4_3_1V		BIT(4)
+#define ATC2609A_PMU_SYS_CTL1_IR_WK_FLAG		BIT(5)
+#define ATC2609A_PMU_SYS_CTL1_RESET_WK_FLAG		BIT(6)
+#define ATC2609A_PMU_SYS_CTL1_HDSW_WK_FLAG		BIT(7)
+#define ATC2609A_PMU_SYS_CTL1_ALARM_WK_FLAG		BIT(8)
+#define ATC2609A_PMU_SYS_CTL1_REM_CON_WK_FLAG		BIT(9)
+#define ATC2609A_PMU_SYS_CTL1_RESTART_WK_FLAG		BIT(10)
+#define ATC2609A_PMU_SYS_CTL1_WKIRQ_WK_FLAG		BIT(11)
+#define ATC2609A_PMU_SYS_CTL1_ONOFF_SHORT_WK_FLAG	BIT(12)
+#define ATC2609A_PMU_SYS_CTL1_ONOFF_LONG_WK_FLAG	BIT(13)
+#define ATC2609A_PMU_SYS_CTL1_WALL_WK_FLAG		BIT(14)
+#define ATC2609A_PMU_SYS_CTL1_USB_WK_FLAG		BIT(15)
+
+/* PMU_SYS_CTL2 Register Mask Bits */
+#define ATC2609A_PMU_SYS_CTL2_PMU_A_EN			BIT(0)
+#define ATC2609A_PMU_SYS_CTL2_ONOFF_PRESS_INT_EN	BIT(1)
+#define ATC2609A_PMU_SYS_CTL2_ONOFF_PRESS_PD		BIT(2)
+#define ATC2609A_PMU_SYS_CTL2_S2TIMER			GENMASK(5, 3)
+#define ATC2609A_PMU_SYS_CTL2_S2_TIMER_EN		BIT(6)
+#define ATC2609A_PMU_SYS_CTL2_ONOFF_RESET_TIME_SEL	GENMASK(8, 7)
+#define ATC2609A_PMU_SYS_CTL2_ONOFF_RESET_EN		BIT(9)
+#define ATC2609A_PMU_SYS_CTL2_ONOFF_PRESS_TIME		GENMASK(11, 10)
+#define ATC2609A_PMU_SYS_CTL2_ONOFF_LSP_INT_EN		BIT(12)
+#define ATC2609A_PMU_SYS_CTL2_ONOFF_LONG_PRESS		BIT(13)
+#define ATC2609A_PMU_SYS_CTL2_ONOFF_SHORT_PRESS		BIT(14)
+#define ATC2609A_PMU_SYS_CTL2_ONOFF_PRESS		BIT(15)
+
+/* PMU_SYS_CTL3 Register Mask Bits */
+#define ATC2609A_PMU_SYS_CTL3_S2S3TOS1_TIMER		GENMASK(8, 7)
+#define ATC2609A_PMU_SYS_CTL3_S2S3TOS1_TIMER_EN		BIT(9)
+#define ATC2609A_PMU_SYS_CTL3_S3_TIMER			GENMASK(12, 10)
+#define ATC2609A_PMU_SYS_CTL3_S3_TIMER_EN		BIT(13)
+#define ATC2609A_PMU_SYS_CTL3_EN_S3			BIT(14)
+#define ATC2609A_PMU_SYS_CTL3_EN_S2			BIT(15)
+
+/* PMU_SYS_CTL5 Register Mask Bits */
+#define ATC2609A_PMU_SYS_CTL5_WALLWKDTEN		BIT(7)
+#define ATC2609A_PMU_SYS_CTL5_VBUSWKDTEN		BIT(8)
+#define ATC2609A_PMU_SYS_CTL5_REMCON_DECT_EN		BIT(9)
+#define ATC2609A_PMU_SYS_CTL5_ONOFF_8S_SEL		BIT(10)
+
+/* INTS_MSK Register Mask Bits */
+#define ATC2609A_INTS_MSK_AUDIO				BIT(0)
+#define ATC2609A_INTS_MSK_OV				BIT(1)
+#define ATC2609A_INTS_MSK_OC				BIT(2)
+#define ATC2609A_INTS_MSK_OT				BIT(3)
+#define ATC2609A_INTS_MSK_UV				BIT(4)
+#define ATC2609A_INTS_MSK_ALARM				BIT(5)
+#define ATC2609A_INTS_MSK_ONOFF				BIT(6)
+#define ATC2609A_INTS_MSK_WKUP				BIT(7)
+#define ATC2609A_INTS_MSK_IR				BIT(8)
+#define ATC2609A_INTS_MSK_REMCON			BIT(9)
+#define ATC2609A_INTS_MSK_POWERIN			BIT(10)
+
+/* CMU_DEVRST Register Mask Bits */
+#define ATC2609A_CMU_DEVRST_AUDIO			BIT(0)
+#define ATC2609A_CMU_DEVRST_MFP				BIT(1)
+#define ATC2609A_CMU_DEVRST_INTS			BIT(2)
+
+/* PAD_EN Register Mask Bits */
+#define ATC2609A_PAD_EN_EXTIRQ				BIT(0)
+
+#endif /* __LINUX_MFD_ATC260X_ATC2609A_H */
diff --git a/include/linux/mfd/atc260x/core.h b/include/linux/mfd/atc260x/core.h
new file mode 100644
index 000000000000..777b6c345d44
--- /dev/null
+++ b/include/linux/mfd/atc260x/core.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Core MFD defines for ATC260x PMICs
+ *
+ * Copyright (C) 2019 Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
+ * Copyright (C) 2020 Cristian Ciocaltea <cristian.ciocaltea@gmail.com>
+ */
+
+#ifndef __LINUX_MFD_ATC260X_CORE_H
+#define __LINUX_MFD_ATC260X_CORE_H
+
+#include <linux/mfd/atc260x/atc2603c.h>
+#include <linux/mfd/atc260x/atc2609a.h>
+
+enum atc260x_type {
+	ATC2603A = 0,
+	ATC2603C,
+	ATC2609A,
+};
+
+enum atc260x_ver {
+	ATC260X_A = 0,
+	ATC260X_B,
+	ATC260X_C,
+	ATC260X_D,
+	ATC260X_E,
+	ATC260X_F,
+	ATC260X_G,
+	ATC260X_H,
+};
+
+struct atc260x {
+	struct device *dev;
+
+	struct regmap *regmap;
+	const struct regmap_irq_chip *regmap_irq_chip;
+	struct regmap_irq_chip_data *irq_data;
+
+	struct mutex *regmap_mutex;	/* mutex for custom regmap locking */
+
+	const struct mfd_cell *cells;
+	int nr_cells;
+	int irq;
+
+	enum atc260x_type ic_type;
+	enum atc260x_ver ic_ver;
+	const char *type_name;
+	unsigned int rev_reg;
+
+	const struct atc260x_init_regs *init_regs; /* regs for device init */
+};
+
+struct regmap_config;
+
+int atc260x_match_device(struct atc260x *atc260x, struct regmap_config *regmap_cfg);
+int atc260x_device_probe(struct atc260x *atc260x);
+
+#endif /* __LINUX_MFD_ATC260X_CORE_H */
-- 
cgit v1.2.3


From 98b94b6e38ca0c4eeb29949c656f6a315000c23e Mon Sep 17 00:00:00 2001
From: Frieder Schrempf <frieder.schrempf@kontron.de>
Date: Mon, 22 Feb 2021 12:52:20 +0100
Subject: regulator: pca9450: Clear PRESET_EN bit to fix BUCK1/2/3 voltage
 setting

The driver uses the DVS registers PCA9450_REG_BUCKxOUT_DVS0 to set the
voltage for the buck regulators 1, 2 and 3. This has no effect as the
PRESET_EN bit is set by default and therefore the preset values are used
instead, which are set to 850 mV.

To fix this we clear the PRESET_EN bit at time of initialization.

Fixes: 0935ff5f1f0a ("regulator: pca9450: add pca9450 pmic driver")
Cc: <stable@vger.kernel.org>
Signed-off-by: Frieder Schrempf <frieder.schrempf@kontron.de>
Link: https://lore.kernel.org/r/20210222115229.166620-1-frieder.schrempf@kontron.de
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/pca9450-regulator.c | 8 ++++++++
 include/linux/regulator/pca9450.h     | 3 +++
 2 files changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/regulator/pca9450-regulator.c b/drivers/regulator/pca9450-regulator.c
index 89b806be399f..2f7ee212cb8c 100644
--- a/drivers/regulator/pca9450-regulator.c
+++ b/drivers/regulator/pca9450-regulator.c
@@ -797,6 +797,14 @@ static int pca9450_i2c_probe(struct i2c_client *i2c,
 		return ret;
 	}
 
+	/* Clear PRESET_EN bit in BUCK123_DVS to use DVS registers */
+	ret = regmap_clear_bits(pca9450->regmap, PCA9450_REG_BUCK123_DVS,
+				BUCK123_PRESET_EN);
+	if (ret) {
+		dev_err(&i2c->dev, "Failed to clear PRESET_EN bit: %d\n", ret);
+		return ret;
+	}
+
 	/* Set reset behavior on assertion of WDOG_B signal */
 	ret = regmap_update_bits(pca9450->regmap, PCA9450_REG_RESET_CTRL,
 				WDOG_B_CFG_MASK, WDOG_B_CFG_COLD_LDO12);
diff --git a/include/linux/regulator/pca9450.h b/include/linux/regulator/pca9450.h
index ccdb5320a240..71902f41c919 100644
--- a/include/linux/regulator/pca9450.h
+++ b/include/linux/regulator/pca9450.h
@@ -147,6 +147,9 @@ enum {
 #define BUCK6_FPWM			0x04
 #define BUCK6_ENMODE_MASK		0x03
 
+/* PCA9450_REG_BUCK123_PRESET_EN bit */
+#define BUCK123_PRESET_EN		0x80
+
 /* PCA9450_BUCK1OUT_DVS0 bits */
 #define BUCK1OUT_DVS0_MASK		0x7F
 #define BUCK1OUT_DVS0_DEFAULT		0x14
-- 
cgit v1.2.3


From 2a92c90f2ecca4475d6050f2f938a1755a8954cc Mon Sep 17 00:00:00 2001
From: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Date: Mon, 1 Mar 2021 17:30:12 +0300
Subject: software node: Fix device_add_software_node()

The function device_add_software_node() was meant to
register the node supplied to it, but only if that node
wasn't already registered. Right now the function attempts
to always register the node. That will cause a failure with
nodes that are already registered.

Fix that by incrementing the reference count of the nodes
that have already been registered, and only registering the
new nodes. Also, clarify the behaviour in the function
documentation.

Fixes: e68d0119e328 ("software node: Introduce device_add_software_node()")
Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Tested-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/swnode.c    | 26 +++++++++++++++++---------
 include/linux/property.h |  2 +-
 2 files changed, 18 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/swnode.c b/drivers/base/swnode.c
index 74db8c971db7..fa3719ef80e4 100644
--- a/drivers/base/swnode.c
+++ b/drivers/base/swnode.c
@@ -1005,25 +1005,33 @@ EXPORT_SYMBOL_GPL(fwnode_remove_software_node);
 /**
  * device_add_software_node - Assign software node to a device
  * @dev: The device the software node is meant for.
- * @swnode: The software node.
+ * @node: The software node.
  *
- * This function will register @swnode and make it the secondary firmware node
- * pointer of @dev. If @dev has no primary node, then @swnode will become the primary
- * node.
+ * This function will make @node the secondary firmware node pointer of @dev. If
+ * @dev has no primary node, then @node will become the primary node. The
+ * function will register @node automatically if it wasn't already registered.
  */
-int device_add_software_node(struct device *dev, const struct software_node *swnode)
+int device_add_software_node(struct device *dev, const struct software_node *node)
 {
+	struct swnode *swnode;
 	int ret;
 
 	/* Only one software node per device. */
 	if (dev_to_swnode(dev))
 		return -EBUSY;
 
-	ret = software_node_register(swnode);
-	if (ret)
-		return ret;
+	swnode = software_node_to_swnode(node);
+	if (swnode) {
+		kobject_get(&swnode->kobj);
+	} else {
+		ret = software_node_register(node);
+		if (ret)
+			return ret;
+
+		swnode = software_node_to_swnode(node);
+	}
 
-	set_secondary_fwnode(dev, software_node_fwnode(swnode));
+	set_secondary_fwnode(dev, &swnode->fwnode);
 
 	return 0;
 }
diff --git a/include/linux/property.h b/include/linux/property.h
index dafccfce0262..dd4687b56239 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -488,7 +488,7 @@ fwnode_create_software_node(const struct property_entry *properties,
 			    const struct fwnode_handle *parent);
 void fwnode_remove_software_node(struct fwnode_handle *fwnode);
 
-int device_add_software_node(struct device *dev, const struct software_node *swnode);
+int device_add_software_node(struct device *dev, const struct software_node *node);
 void device_remove_software_node(struct device *dev);
 
 int device_create_managed_software_node(struct device *dev,
-- 
cgit v1.2.3


From 4d5f52838d11981c4d76cdc4e73230120de1ac85 Mon Sep 17 00:00:00 2001
From: Bhaumik Bhatt <bbhatt@codeaurora.org>
Date: Tue, 9 Mar 2021 10:46:35 -0800
Subject: bus: mhi: Make firmware image optional for controller

Some controllers can opt not to have MHI download a firmware
image to boot the device, and can instead find the device in a
pass-through execution environment, ready to go. Thus, MHI
controllers for those devices do not need fw_image defined.
Make it optional to accommodate different boot-up modes.

Suggested-by: Loic Poulain <loic.poulain@linaro.org>
Signed-off-by: Bhaumik Bhatt <bbhatt@codeaurora.org>
Reviewed-by: Jeffrey Hugo <jhugo@codeaurora.org>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Link: https://lore.kernel.org/r/1615315595-37750-1-git-send-email-bbhatt@codeaurora.org
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
---
 include/linux/mhi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mhi.h b/include/linux/mhi.h
index d26acc8b21cd..8f5bf409f663 100644
--- a/include/linux/mhi.h
+++ b/include/linux/mhi.h
@@ -296,7 +296,7 @@ struct mhi_controller_config {
  * @wake_db: MHI WAKE doorbell register address
  * @iova_start: IOMMU starting address for data (required)
  * @iova_stop: IOMMU stop address for data (required)
- * @fw_image: Firmware image name for normal booting (required)
+ * @fw_image: Firmware image name for normal booting (optional)
  * @edl_image: Firmware image name for emergency download mode (optional)
  * @rddm_size: RAM dump size that host should allocate for debugging purpose
  * @sbl_size: SBL image size downloaded through BHIe (optional)
-- 
cgit v1.2.3


From b18adee4ce4443399963826b5d28d9e63d40740c Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 9 Mar 2021 19:41:25 +0000
Subject: stacktrace: Move documentation for arch_stack_walk_reliable() to
 header

Currently arch_stack_walk_reliable() is documented with an identical
comment in both x86 and S/390 implementations which is a bit redundant.
Move this to the header and convert to kerneldoc while we're at it.

Signed-off-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Miroslav Benes <mbenes@suse.cz>
Acked-by: Vasily Gorbik <gor@linux.ibm.com>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://lkml.kernel.org/r/20210309194125.652-1-broonie@kernel.org
---
 arch/s390/kernel/stacktrace.c |  6 ------
 arch/x86/kernel/stacktrace.c  |  6 ------
 include/linux/stacktrace.h    | 19 +++++++++++++++++++
 3 files changed, 19 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c
index 7f1266c24f6b..101477b3e263 100644
--- a/arch/s390/kernel/stacktrace.c
+++ b/arch/s390/kernel/stacktrace.c
@@ -24,12 +24,6 @@ void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie,
 	}
 }
 
-/*
- * This function returns an error if it detects any unreliable features of the
- * stack.  Otherwise it guarantees that the stack trace is reliable.
- *
- * If the task is not 'current', the caller *must* ensure the task is inactive.
- */
 int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry,
 			     void *cookie, struct task_struct *task)
 {
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 8627fda8d993..15b058eefc4e 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -29,12 +29,6 @@ void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie,
 	}
 }
 
-/*
- * This function returns an error if it detects any unreliable features of the
- * stack.  Otherwise it guarantees that the stack trace is reliable.
- *
- * If the task is not 'current', the caller *must* ensure the task is inactive.
- */
 int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry,
 			     void *cookie, struct task_struct *task)
 {
diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h
index 50e2df30b0aa..9edecb494e9e 100644
--- a/include/linux/stacktrace.h
+++ b/include/linux/stacktrace.h
@@ -52,8 +52,27 @@ typedef bool (*stack_trace_consume_fn)(void *cookie, unsigned long addr);
  */
 void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie,
 		     struct task_struct *task, struct pt_regs *regs);
+
+/**
+ * arch_stack_walk_reliable - Architecture specific function to walk the
+ *			      stack reliably
+ *
+ * @consume_entry:	Callback which is invoked by the architecture code for
+ *			each entry.
+ * @cookie:		Caller supplied pointer which is handed back to
+ *			@consume_entry
+ * @task:		Pointer to a task struct, can be NULL
+ *
+ * This function returns an error if it detects any unreliable
+ * features of the stack. Otherwise it guarantees that the stack
+ * trace is reliable.
+ *
+ * If the task is not 'current', the caller *must* ensure the task is
+ * inactive and its stack is pinned.
+ */
 int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, void *cookie,
 			     struct task_struct *task);
+
 void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie,
 			  const struct pt_regs *regs);
 
-- 
cgit v1.2.3


From 4806f1e2fee84c053cb68cd5be5817170bf0aab6 Mon Sep 17 00:00:00 2001
From: Maor Gottlieb <maorg@nvidia.com>
Date: Wed, 3 Mar 2021 14:36:16 +0200
Subject: net/mlx5: Set QP timestamp mode to default

QPs which don't care about the timestamp mode should set the ts_format
to default; otherwise, QP creation could fail if the timestamp
mode is not supported.

Fixes: 2fe8d4b87802 ("RDMA/mlx5: Fail QP creation if the device can not support the CQE TS")
Signed-off-by: Maor Gottlieb <maorg@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c        | 1 +
 drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c      | 1 +
 drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c | 1 +
 include/linux/mlx5/qp.h                                    | 7 +++++++
 4 files changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
index 80da50e12915..bd66ab2af5b5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
@@ -575,6 +575,7 @@ static int mlx5_fpga_conn_create_qp(struct mlx5_fpga_conn *conn,
 	MLX5_SET(qpc, qpc, log_sq_size, ilog2(conn->qp.sq.size));
 	MLX5_SET(qpc, qpc, cqn_snd, conn->cq.mcq.cqn);
 	MLX5_SET(qpc, qpc, cqn_rcv, conn->cq.mcq.cqn);
+	MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev));
 	MLX5_SET64(qpc, qpc, dbr_addr, conn->qp.wq_ctrl.db.dma);
 	if (MLX5_CAP_GEN(mdev, cqe_version) == 1)
 		MLX5_SET(qpc, qpc, user_index, 0xFFFFFF);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
index 756fa0401ab7..6f7cef47e04c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
@@ -233,6 +233,7 @@ int mlx5i_create_underlay_qp(struct mlx5e_priv *priv)
 	}
 
 	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
+	MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(priv->mdev));
 	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_UD);
 	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
 	MLX5_SET(qpc, qpc, ulp_stateless_offload_mode,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c
index 83c4c877d558..8a6a56f9dc4e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c
@@ -169,6 +169,7 @@ static struct mlx5dr_qp *dr_create_rc_qp(struct mlx5_core_dev *mdev,
 	MLX5_SET(qpc, qpc, log_rq_size, ilog2(dr_qp->rq.wqe_cnt));
 	MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
 	MLX5_SET(qpc, qpc, log_sq_size, ilog2(dr_qp->sq.wqe_cnt));
+	MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev));
 	MLX5_SET64(qpc, qpc, dbr_addr, dr_qp->wq_ctrl.db.dma);
 	if (MLX5_CAP_GEN(mdev, cqe_version) == 1)
 		MLX5_SET(qpc, qpc, user_index, 0xFFFFFF);
diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h
index d75ef8aa8fac..b7deb790f257 100644
--- a/include/linux/mlx5/qp.h
+++ b/include/linux/mlx5/qp.h
@@ -547,4 +547,11 @@ static inline const char *mlx5_qp_state_str(int state)
 	}
 }
 
+static inline int mlx5_get_qp_default_ts(struct mlx5_core_dev *dev)
+{
+	return !MLX5_CAP_ROCE(dev, qp_ts_format) ?
+		       MLX5_QPC_TIMESTAMP_FORMAT_FREE_RUNNING :
+		       MLX5_QPC_TIMESTAMP_FORMAT_DEFAULT;
+}
+
 #endif /* MLX5_QP_H */
-- 
cgit v1.2.3


From 0bb3262c0248d44aea3be31076f44beb82a7b120 Mon Sep 17 00:00:00 2001
From: Menglong Dong <dong.menglong@zte.com.cn>
Date: Tue, 9 Mar 2021 17:51:35 -0800
Subject: net: socket: use BIT() for MSG_*

The bit masks for MSG_* seem a little confusing here. Replace them
with BIT() to make them easier to understand.

Signed-off-by: Menglong Dong <dong.menglong@zte.com.cn>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/socket.h | 71 ++++++++++++++++++++++++++------------------------
 1 file changed, 37 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 385894b4a8bb..e88859f38cd0 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -283,42 +283,45 @@ struct ucred {
    Added those for 1003.1g not all are supported yet
  */
 
-#define MSG_OOB		1
-#define MSG_PEEK	2
-#define MSG_DONTROUTE	4
-#define MSG_TRYHARD     4       /* Synonym for MSG_DONTROUTE for DECnet */
-#define MSG_CTRUNC	8
-#define MSG_PROBE	0x10	/* Do not send. Only probe path f.e. for MTU */
-#define MSG_TRUNC	0x20
-#define MSG_DONTWAIT	0x40	/* Nonblocking io		 */
-#define MSG_EOR         0x80	/* End of record */
-#define MSG_WAITALL	0x100	/* Wait for a full request */
-#define MSG_FIN         0x200
-#define MSG_SYN		0x400
-#define MSG_CONFIRM	0x800	/* Confirm path validity */
-#define MSG_RST		0x1000
-#define MSG_ERRQUEUE	0x2000	/* Fetch message from error queue */
-#define MSG_NOSIGNAL	0x4000	/* Do not generate SIGPIPE */
-#define MSG_MORE	0x8000	/* Sender will send more */
-#define MSG_WAITFORONE	0x10000	/* recvmmsg(): block until 1+ packets avail */
-#define MSG_SENDPAGE_NOPOLICY 0x10000 /* sendpage() internal : do no apply policy */
-#define MSG_SENDPAGE_NOTLAST 0x20000 /* sendpage() internal : not the last page */
-#define MSG_BATCH	0x40000 /* sendmmsg(): more messages coming */
-#define MSG_EOF         MSG_FIN
-#define MSG_NO_SHARED_FRAGS 0x80000 /* sendpage() internal : page frags are not shared */
-#define MSG_SENDPAGE_DECRYPTED	0x100000 /* sendpage() internal : page may carry
-					  * plain text and require encryption
-					  */
-
-#define MSG_ZEROCOPY	0x4000000	/* Use user data in kernel path */
-#define MSG_FASTOPEN	0x20000000	/* Send data in TCP SYN */
-#define MSG_CMSG_CLOEXEC 0x40000000	/* Set close_on_exec for file
-					   descriptor received through
-					   SCM_RIGHTS */
+#define MSG_OOB		BIT(0)
+#define MSG_PEEK	BIT(1)
+#define MSG_DONTROUTE	BIT(2)
+#define MSG_TRYHARD	BIT(2)	/* Synonym for MSG_DONTROUTE for DECnet		*/
+#define MSG_CTRUNC	BIT(3)
+#define MSG_PROBE	BIT(4)	/* Do not send. Only probe path f.e. for MTU	*/
+#define MSG_TRUNC	BIT(5)
+#define MSG_DONTWAIT	BIT(6)	/* Nonblocking io		*/
+#define MSG_EOR		BIT(7)	/* End of record		*/
+#define MSG_WAITALL	BIT(8)	/* Wait for a full request	*/
+#define MSG_FIN		BIT(9)
+#define MSG_SYN		BIT(10)
+#define MSG_CONFIRM	BIT(11)	/* Confirm path validity	*/
+#define MSG_RST		BIT(12)
+#define MSG_ERRQUEUE	BIT(13)	/* Fetch message from error queue */
+#define MSG_NOSIGNAL	BIT(14)	/* Do not generate SIGPIPE	*/
+#define MSG_MORE	BIT(15)	/* Sender will send more	*/
+#define MSG_WAITFORONE	BIT(16)	/* recvmmsg(): block until 1+ packets avail */
+#define MSG_SENDPAGE_NOPOLICY	BIT(16)	/* sendpage() internal : do no apply policy */
+#define MSG_SENDPAGE_NOTLAST	BIT(17)	/* sendpage() internal : not the last page  */
+#define MSG_BATCH	BIT(18)		/* sendmmsg(): more messages coming */
+#define MSG_EOF		MSG_FIN
+#define MSG_NO_SHARED_FRAGS	BIT(19)	/* sendpage() internal : page frags
+					 * are not shared
+					 */
+#define MSG_SENDPAGE_DECRYPTED	BIT(20)	/* sendpage() internal : page may carry
+					 * plain text and require encryption
+					 */
+
+#define MSG_ZEROCOPY	BIT(26)		/* Use user data in kernel path */
+#define MSG_FASTOPEN	BIT(29)		/* Send data in TCP SYN */
+#define MSG_CMSG_CLOEXEC	BIT(30)	/* Set close_on_exec for file
+					 * descriptor received through
+					 * SCM_RIGHTS
+					 */
 #if defined(CONFIG_COMPAT)
-#define MSG_CMSG_COMPAT	0x80000000	/* This message needs 32 bit fixups */
+#define MSG_CMSG_COMPAT	BIT(31)	/* This message needs 32 bit fixups */
 #else
-#define MSG_CMSG_COMPAT	0		/* We never have 32 bit fixups */
+#define MSG_CMSG_COMPAT	0	/* We never have 32 bit fixups */
 #endif
 
 
-- 
cgit v1.2.3


From dd4fa1dae9f4847cc1fd78ca468ad69e16e5db3e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 10 Mar 2021 01:56:36 -0800
Subject: macvlan: macvlan_count_rx() needs to be aware of preemption

macvlan_count_rx() can be called from process context, so it is
necessary to disable preemption before calling u64_stats_update_begin().

syzbot was able to spot this on 32bit arch:

WARNING: CPU: 1 PID: 4632 at include/linux/seqlock.h:271 __seqprop_assert include/linux/seqlock.h:271 [inline]
WARNING: CPU: 1 PID: 4632 at include/linux/seqlock.h:271 __seqprop_assert.constprop.0+0xf0/0x11c include/linux/seqlock.h:269
Modules linked in:
Kernel panic - not syncing: panic_on_warn set ...
CPU: 1 PID: 4632 Comm: kworker/1:3 Not tainted 5.12.0-rc2-syzkaller #0
Hardware name: ARM-Versatile Express
Workqueue: events macvlan_process_broadcast
Backtrace:
[<82740468>] (dump_backtrace) from [<827406dc>] (show_stack+0x18/0x1c arch/arm/kernel/traps.c:252)
 r7:00000080 r6:60000093 r5:00000000 r4:8422a3c4
[<827406c4>] (show_stack) from [<82751b58>] (__dump_stack lib/dump_stack.c:79 [inline])
[<827406c4>] (show_stack) from [<82751b58>] (dump_stack+0xb8/0xe8 lib/dump_stack.c:120)
[<82751aa0>] (dump_stack) from [<82741270>] (panic+0x130/0x378 kernel/panic.c:231)
 r7:830209b4 r6:84069ea4 r5:00000000 r4:844350d0
[<82741140>] (panic) from [<80244924>] (__warn+0xb0/0x164 kernel/panic.c:605)
 r3:8404ec8c r2:00000000 r1:00000000 r0:830209b4
 r7:0000010f
[<80244874>] (__warn) from [<82741520>] (warn_slowpath_fmt+0x68/0xd4 kernel/panic.c:628)
 r7:81363f70 r6:0000010f r5:83018e50 r4:00000000
[<827414bc>] (warn_slowpath_fmt) from [<81363f70>] (__seqprop_assert include/linux/seqlock.h:271 [inline])
[<827414bc>] (warn_slowpath_fmt) from [<81363f70>] (__seqprop_assert.constprop.0+0xf0/0x11c include/linux/seqlock.h:269)
 r8:5a109000 r7:0000000f r6:a568dac0 r5:89802300 r4:00000001
[<81363e80>] (__seqprop_assert.constprop.0) from [<81364af0>] (u64_stats_update_begin include/linux/u64_stats_sync.h:128 [inline])
[<81363e80>] (__seqprop_assert.constprop.0) from [<81364af0>] (macvlan_count_rx include/linux/if_macvlan.h:47 [inline])
[<81363e80>] (__seqprop_assert.constprop.0) from [<81364af0>] (macvlan_broadcast+0x154/0x26c drivers/net/macvlan.c:291)
 r5:89802300 r4:8a927740
[<8136499c>] (macvlan_broadcast) from [<81365020>] (macvlan_process_broadcast+0x258/0x2d0 drivers/net/macvlan.c:317)
 r10:81364f78 r9:8a86d000 r8:8a9c7e7c r7:8413aa5c r6:00000000 r5:00000000
 r4:89802840
[<81364dc8>] (macvlan_process_broadcast) from [<802696a4>] (process_one_work+0x2d4/0x998 kernel/workqueue.c:2275)
 r10:00000008 r9:8404ec98 r8:84367a02 r7:ddfe6400 r6:ddfe2d40 r5:898dac80
 r4:8a86d43c
[<802693d0>] (process_one_work) from [<80269dcc>] (worker_thread+0x64/0x54c kernel/workqueue.c:2421)
 r10:00000008 r9:8a9c6000 r8:84006d00 r7:ddfe2d78 r6:898dac94 r5:ddfe2d40
 r4:898dac80
[<80269d68>] (worker_thread) from [<80271f40>] (kthread+0x184/0x1a4 kernel/kthread.c:292)
 r10:85247e64 r9:898dac80 r8:80269d68 r7:00000000 r6:8a9c6000 r5:89a2ee40
 r4:8a97bd00
[<80271dbc>] (kthread) from [<80200114>] (ret_from_fork+0x14/0x20 arch/arm/kernel/entry-common.S:158)
Exception stack(0x8a9c7fb0 to 0x8a9c7ff8)

Fixes: 412ca1550cbe ("macvlan: Move broadcasts into a work queue")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Reported-by: syzbot <syzkaller@googlegroups.com>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_macvlan.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
index 96556c64c95d..10c94a3936ca 100644
--- a/include/linux/if_macvlan.h
+++ b/include/linux/if_macvlan.h
@@ -43,13 +43,14 @@ static inline void macvlan_count_rx(const struct macvlan_dev *vlan,
 	if (likely(success)) {
 		struct vlan_pcpu_stats *pcpu_stats;
 
-		pcpu_stats = this_cpu_ptr(vlan->pcpu_stats);
+		pcpu_stats = get_cpu_ptr(vlan->pcpu_stats);
 		u64_stats_update_begin(&pcpu_stats->syncp);
 		pcpu_stats->rx_packets++;
 		pcpu_stats->rx_bytes += len;
 		if (multicast)
 			pcpu_stats->rx_multicast++;
 		u64_stats_update_end(&pcpu_stats->syncp);
+		put_cpu_ptr(vlan->pcpu_stats);
 	} else {
 		this_cpu_inc(vlan->pcpu_stats->rx_errors);
 	}
-- 
cgit v1.2.3


From ce6ed1c4c9876c2880f52f18c41ef2a30d070bc5 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Thu, 4 Mar 2021 20:37:08 +0900
Subject: kbuild: rebuild GCC plugins when the compiler is upgraded

Linus reported a build error due to the GCC plugin incompatibility
when the compiler is upgraded. [1]

GCC plugins are tied to a particular GCC version. So, they must be
rebuilt when the compiler is upgraded.

This seems to be a long-standing flaw since the initial support of
GCC plugins.

Extend commit 8b59cd81dc5e ("kbuild: ensure full rebuild when the
compiler is updated"), so that GCC plugins are covered by the
compiler upgrade detection.

[1]: https://lore.kernel.org/lkml/CAHk-=wieoN5ttOy7SnsGwZv+Fni3R6m-Ut=oxih6bbZ28G+4dw@mail.gmail.com/

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
---
 Makefile                         |  1 +
 include/linux/compiler-version.h | 14 ++++++++++++++
 include/linux/kconfig.h          |  2 --
 init/Kconfig                     |  8 ++++----
 scripts/gcc-plugins/Makefile     |  1 +
 5 files changed, 20 insertions(+), 6 deletions(-)
 create mode 100644 include/linux/compiler-version.h

(limited to 'include/linux')

diff --git a/Makefile b/Makefile
index 3f9f44eac27f..10bb0a62cc7d 100644
--- a/Makefile
+++ b/Makefile
@@ -479,6 +479,7 @@ USERINCLUDE    := \
 		-I$(objtree)/arch/$(SRCARCH)/include/generated/uapi \
 		-I$(srctree)/include/uapi \
 		-I$(objtree)/include/generated/uapi \
+                -include $(srctree)/include/linux/compiler-version.h \
                 -include $(srctree)/include/linux/kconfig.h
 
 # Use LINUXINCLUDE when you must reference the include/ directory.
diff --git a/include/linux/compiler-version.h b/include/linux/compiler-version.h
new file mode 100644
index 000000000000..2b2972c77c62
--- /dev/null
+++ b/include/linux/compiler-version.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifdef  __LINUX_COMPILER_VERSION_H
+#error "Please do not include <linux/compiler-version.h>. This is done by the build system."
+#endif
+#define __LINUX_COMPILER_VERSION_H
+
+/*
+ * This header exists to force full rebuild when the compiler is upgraded.
+ *
+ * When fixdep scans this, it will find this string "CONFIG_CC_VERSION_TEXT"
+ * and add dependency on include/config/cc/version/text.h, which is touched
+ * by Kconfig when the version string from the compiler changes.
+ */
diff --git a/include/linux/kconfig.h b/include/linux/kconfig.h
index e78e17a76dc9..24a59cb06963 100644
--- a/include/linux/kconfig.h
+++ b/include/linux/kconfig.h
@@ -2,8 +2,6 @@
 #ifndef __LINUX_KCONFIG_H
 #define __LINUX_KCONFIG_H
 
-/* CONFIG_CC_VERSION_TEXT (Do not delete this comment. See help in Kconfig) */
-
 #include <generated/autoconf.h>
 
 #ifdef CONFIG_CPU_BIG_ENDIAN
diff --git a/init/Kconfig b/init/Kconfig
index 22946fe5ded9..30c849094c28 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -20,10 +20,10 @@ config CC_VERSION_TEXT
 	    When the compiler is updated, Kconfig will be invoked.
 
 	  - Ensure full rebuild when the compiler is updated
-	    include/linux/kconfig.h contains this option in the comment line so
-	    fixdep adds include/config/cc/version/text.h into the auto-generated
-	    dependency. When the compiler is updated, syncconfig will touch it
-	    and then every file will be rebuilt.
+	    include/linux/compiler-version.h contains this option in the comment
+	    line so fixdep adds include/config/cc/version/text.h into the
+	    auto-generated dependency. When the compiler is updated, syncconfig
+	    will touch it and then every file will be rebuilt.
 
 config CC_IS_GCC
 	def_bool $(success,test "$(cc-name)" = GCC)
diff --git a/scripts/gcc-plugins/Makefile b/scripts/gcc-plugins/Makefile
index b5487cce69e8..1952d3bb80c6 100644
--- a/scripts/gcc-plugins/Makefile
+++ b/scripts/gcc-plugins/Makefile
@@ -22,6 +22,7 @@ always-y += $(GCC_PLUGIN)
 GCC_PLUGINS_DIR = $(shell $(CC) -print-file-name=plugin)
 
 plugin_cxxflags	= -Wp,-MMD,$(depfile) $(KBUILD_HOSTCXXFLAGS) -fPIC \
+		  -include $(srctree)/include/linux/compiler-version.h \
 		   -I $(GCC_PLUGINS_DIR)/include -I $(obj) -std=gnu++11 \
 		   -fno-rtti -fno-exceptions -fasynchronous-unwind-tables \
 		   -ggdb -Wno-narrowing -Wno-unused-variable \
-- 
cgit v1.2.3


From 2bc611844b5d2c43b63bdf71ae6395fa7a6566cc Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Wed, 27 Jan 2021 21:30:12 +0100
Subject: mtd: nand: Let ECC engines advertize the exact number of steps

This information might be useful for specific uses, so export it.
Doing so might avoid having to guess the number of steps when
necessary.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Tested-by: Adam Ford <aford173@gmail.com> #logicpd Torpedo
Link: https://lore.kernel.org/linux-mtd/20210127203020.9574-2-miquel.raynal@bootlin.com
---
 include/linux/mtd/nand.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index 414f8a4d2853..632becb13b46 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -231,12 +231,14 @@ struct nand_ops {
 /**
  * struct nand_ecc_context - Context for the ECC engine
  * @conf: basic ECC engine parameters
+ * @nsteps: number of ECC steps
  * @total: total number of bytes used for storing ECC codes, this is used by
  *         generic OOB layouts
  * @priv: ECC engine driver private data
  */
 struct nand_ecc_context {
 	struct nand_ecc_props conf;
+	unsigned int nsteps;
 	unsigned int total;
 	void *priv;
 };
-- 
cgit v1.2.3


From e3554b10babd8ee1cf43bfc840ef4657eb1d12aa Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Wed, 27 Jan 2021 21:30:15 +0100
Subject: mtd: nand: Add a helper to retrieve the number of ECC steps

This operation is very common and deserves a helper. It of course only
works after the ECC engine initialization.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Tested-by: Adam Ford <aford173@gmail.com> #logicpd Torpedo
Link: https://lore.kernel.org/linux-mtd/20210127203020.9574-5-miquel.raynal@bootlin.com
---
 include/linux/mtd/nand.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index 632becb13b46..8a0116396689 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -587,6 +587,16 @@ nanddev_get_ecc_conf(struct nand_device *nand)
 	return &nand->ecc.ctx.conf;
 }
 
+/**
+ * nanddev_get_ecc_nsteps() - Extract the number of ECC steps
+ * @nand: NAND device
+ */
+static inline unsigned int
+nanddev_get_ecc_nsteps(struct nand_device *nand)
+{
+	return nand->ecc.ctx.nsteps;
+}
+
 /**
  * nanddev_get_ecc_requirements() - Extract the ECC requirements from a NAND
  *                                  device
-- 
cgit v1.2.3


From ba4a40a483da86d76bd69957c21fcb975b8405ae Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Wed, 27 Jan 2021 21:30:16 +0100
Subject: mtd: nand: Add a helper to retrieve the number of ECC bytes per step

This operation is very common and deserves a helper. It of course only
works after the ECC engine initialization.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Tested-by: Adam Ford <aford173@gmail.com> #logicpd Torpedo
Link: https://lore.kernel.org/linux-mtd/20210127203020.9574-6-miquel.raynal@bootlin.com
---
 include/linux/mtd/nand.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index 8a0116396689..32fc7edf65b3 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -597,6 +597,16 @@ nanddev_get_ecc_nsteps(struct nand_device *nand)
 	return nand->ecc.ctx.nsteps;
 }
 
+/**
+ * nanddev_get_ecc_bytes_per_step() - Extract the number of ECC bytes per step
+ * @nand: NAND device
+ */
+static inline unsigned int
+nanddev_get_ecc_bytes_per_step(struct nand_device *nand)
+{
+	return nand->ecc.ctx.total / nand->ecc.ctx.nsteps;
+}
+
 /**
  * nanddev_get_ecc_requirements() - Extract the ECC requirements from a NAND
  *                                  device
-- 
cgit v1.2.3


From 3e66843c74289b294b91547edd364c5a6fdef45b Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Wed, 27 Jan 2021 21:30:19 +0100
Subject: mtd: nand: ecc-bch: Use the public nsteps field

The software BCH ECC engine stores the nsteps variable in its own
private structure while it is also exported as a public ECC field.

Let's get rid of the redundant private one and let's use the
nand_ecc_context structure when possible.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Tested-by: Adam Ford <aford173@gmail.com> #logicpd Torpedo
Link: https://lore.kernel.org/linux-mtd/20210127203020.9574-9-miquel.raynal@bootlin.com
---
 drivers/mtd/nand/ecc-sw-bch.c       | 9 ++++-----
 include/linux/mtd/nand-ecc-sw-bch.h | 2 --
 2 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/ecc-sw-bch.c b/drivers/mtd/nand/ecc-sw-bch.c
index 5018bc0db626..405552d014a8 100644
--- a/drivers/mtd/nand/ecc-sw-bch.c
+++ b/drivers/mtd/nand/ecc-sw-bch.c
@@ -236,7 +236,6 @@ int nand_ecc_sw_bch_init_ctx(struct nand_device *nand)
 		goto free_engine_conf;
 
 	engine_conf->code_size = code_size;
-	engine_conf->nsteps = nsteps;
 	engine_conf->calc_buf = kzalloc(mtd->oobsize, GFP_KERNEL);
 	engine_conf->code_buf = kzalloc(mtd->oobsize, GFP_KERNEL);
 	if (!engine_conf->calc_buf || !engine_conf->code_buf) {
@@ -254,7 +253,7 @@ int nand_ecc_sw_bch_init_ctx(struct nand_device *nand)
 
 	/* Verify the layout validity */
 	if (mtd_ooblayout_count_eccbytes(mtd) !=
-	    engine_conf->nsteps * engine_conf->code_size) {
+	    nand->ecc.ctx.nsteps * engine_conf->code_size) {
 		pr_err("Invalid ECC layout\n");
 		ret = -EINVAL;
 		goto cleanup_bch_ctx;
@@ -296,7 +295,7 @@ static int nand_ecc_sw_bch_prepare_io_req(struct nand_device *nand,
 	struct mtd_info *mtd = nanddev_to_mtd(nand);
 	int eccsize = nand->ecc.ctx.conf.step_size;
 	int eccbytes = engine_conf->code_size;
-	int eccsteps = engine_conf->nsteps;
+	int eccsteps = nand->ecc.ctx.nsteps;
 	int total = nand->ecc.ctx.total;
 	u8 *ecccalc = engine_conf->calc_buf;
 	const u8 *data;
@@ -334,7 +333,7 @@ static int nand_ecc_sw_bch_finish_io_req(struct nand_device *nand,
 	int eccsize = nand->ecc.ctx.conf.step_size;
 	int total = nand->ecc.ctx.total;
 	int eccbytes = engine_conf->code_size;
-	int eccsteps = engine_conf->nsteps;
+	int eccsteps = nand->ecc.ctx.nsteps;
 	u8 *ecccalc = engine_conf->calc_buf;
 	u8 *ecccode = engine_conf->code_buf;
 	unsigned int max_bitflips = 0;
@@ -366,7 +365,7 @@ static int nand_ecc_sw_bch_finish_io_req(struct nand_device *nand,
 		nand_ecc_sw_bch_calculate(nand, data, &ecccalc[i]);
 
 	/* Finish a page read: compare and correct */
-	for (eccsteps = engine_conf->nsteps, i = 0, data = req->databuf.in;
+	for (eccsteps = nand->ecc.ctx.nsteps, i = 0, data = req->databuf.in;
 	     eccsteps;
 	     eccsteps--, i += eccbytes, data += eccsize) {
 		int stat =  nand_ecc_sw_bch_correct(nand, data,
diff --git a/include/linux/mtd/nand-ecc-sw-bch.h b/include/linux/mtd/nand-ecc-sw-bch.h
index 22c92073b3dd..9da9969505a8 100644
--- a/include/linux/mtd/nand-ecc-sw-bch.h
+++ b/include/linux/mtd/nand-ecc-sw-bch.h
@@ -16,7 +16,6 @@
  * @req_ctx: Save request context and tweak the original request to fit the
  *           engine needs
  * @code_size: Number of bytes needed to store a code (one code per step)
- * @nsteps: Number of steps
  * @calc_buf: Buffer to use when calculating ECC bytes
  * @code_buf: Buffer to use when reading (raw) ECC bytes from the chip
  * @bch: BCH control structure
@@ -26,7 +25,6 @@
 struct nand_ecc_sw_bch_conf {
 	struct nand_ecc_req_tweak_ctx req_ctx;
 	unsigned int code_size;
-	unsigned int nsteps;
 	u8 *calc_buf;
 	u8 *code_buf;
 	struct bch_control *bch;
-- 
cgit v1.2.3


From bf3816d28f0778de0d3d00a2a65525e19e5dbad2 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Wed, 27 Jan 2021 21:30:20 +0100
Subject: mtd: nand: ecc-hamming: Use the public nsteps field

The software Hamming ECC engine stores the nsteps variable in its own
private structure while it is also exported as a public ECC field.

Let's get rid of the redundant private one and let's use the
nand_ecc_context structure when possible.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Tested-by: Adam Ford <aford173@gmail.com> #logicpd Torpedo
Link: https://lore.kernel.org/linux-mtd/20210127203020.9574-10-miquel.raynal@bootlin.com
---
 drivers/mtd/nand/ecc-sw-hamming.c       | 9 ++++-----
 include/linux/mtd/nand-ecc-sw-hamming.h | 2 --
 2 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/ecc-sw-hamming.c b/drivers/mtd/nand/ecc-sw-hamming.c
index 5144775e5a59..a7655b668f32 100644
--- a/drivers/mtd/nand/ecc-sw-hamming.c
+++ b/drivers/mtd/nand/ecc-sw-hamming.c
@@ -504,7 +504,6 @@ int nand_ecc_sw_hamming_init_ctx(struct nand_device *nand)
 		goto free_engine_conf;
 
 	engine_conf->code_size = 3;
-	engine_conf->nsteps = mtd->writesize / conf->step_size;
 	engine_conf->calc_buf = kzalloc(mtd->oobsize, GFP_KERNEL);
 	engine_conf->code_buf = kzalloc(mtd->oobsize, GFP_KERNEL);
 	if (!engine_conf->calc_buf || !engine_conf->code_buf) {
@@ -514,7 +513,7 @@ int nand_ecc_sw_hamming_init_ctx(struct nand_device *nand)
 
 	nand->ecc.ctx.priv = engine_conf;
 	nand->ecc.ctx.nsteps = mtd->writesize / conf->step_size;
-	nand->ecc.ctx.total = engine_conf->nsteps * engine_conf->code_size;
+	nand->ecc.ctx.total = nand->ecc.ctx.nsteps * engine_conf->code_size;
 
 	return 0;
 
@@ -549,7 +548,7 @@ static int nand_ecc_sw_hamming_prepare_io_req(struct nand_device *nand,
 	struct mtd_info *mtd = nanddev_to_mtd(nand);
 	int eccsize = nand->ecc.ctx.conf.step_size;
 	int eccbytes = engine_conf->code_size;
-	int eccsteps = engine_conf->nsteps;
+	int eccsteps = nand->ecc.ctx.nsteps;
 	int total = nand->ecc.ctx.total;
 	u8 *ecccalc = engine_conf->calc_buf;
 	const u8 *data;
@@ -587,7 +586,7 @@ static int nand_ecc_sw_hamming_finish_io_req(struct nand_device *nand,
 	int eccsize = nand->ecc.ctx.conf.step_size;
 	int total = nand->ecc.ctx.total;
 	int eccbytes = engine_conf->code_size;
-	int eccsteps = engine_conf->nsteps;
+	int eccsteps = nand->ecc.ctx.nsteps;
 	u8 *ecccalc = engine_conf->calc_buf;
 	u8 *ecccode = engine_conf->code_buf;
 	unsigned int max_bitflips = 0;
@@ -619,7 +618,7 @@ static int nand_ecc_sw_hamming_finish_io_req(struct nand_device *nand,
 		nand_ecc_sw_hamming_calculate(nand, data, &ecccalc[i]);
 
 	/* Finish a page read: compare and correct */
-	for (eccsteps = engine_conf->nsteps, i = 0, data = req->databuf.in;
+	for (eccsteps = nand->ecc.ctx.nsteps, i = 0, data = req->databuf.in;
 	     eccsteps;
 	     eccsteps--, i += eccbytes, data += eccsize) {
 		int stat =  nand_ecc_sw_hamming_correct(nand, data,
diff --git a/include/linux/mtd/nand-ecc-sw-hamming.h b/include/linux/mtd/nand-ecc-sw-hamming.h
index 9f9073d86ff3..c6c71894c575 100644
--- a/include/linux/mtd/nand-ecc-sw-hamming.h
+++ b/include/linux/mtd/nand-ecc-sw-hamming.h
@@ -17,7 +17,6 @@
  * @req_ctx: Save request context and tweak the original request to fit the
  *           engine needs
  * @code_size: Number of bytes needed to store a code (one code per step)
- * @nsteps: Number of steps
  * @calc_buf: Buffer to use when calculating ECC bytes
  * @code_buf: Buffer to use when reading (raw) ECC bytes from the chip
  * @sm_order: Smart Media special ordering
@@ -25,7 +24,6 @@
 struct nand_ecc_sw_hamming_conf {
 	struct nand_ecc_req_tweak_ctx req_ctx;
 	unsigned int code_size;
-	unsigned int nsteps;
 	u8 *calc_buf;
 	u8 *code_buf;
 	unsigned int sm_order;
-- 
cgit v1.2.3


From 6cfeb41a825913f3dcb131d6556cc9d1c4072015 Mon Sep 17 00:00:00 2001
From: Dejin Zheng <zhengdejin5@gmail.com>
Date: Sun, 14 Feb 2021 00:45:53 +0800
Subject: mtd: Add helper macro for register_mtd_blktrans boilerplate

This patch introduces the module_mtd_blktrans macro which is a convenience
macro for mtd blktrans modules similar to module_platform_driver.
It is intended to be used by drivers whose init/exit sections do nothing
but register/unregister the mtd blktrans driver. By using this macro it is
possible to eliminate a few lines of boilerplate code per mtd blktrans
driver.

Signed-off-by: Dejin Zheng <zhengdejin5@gmail.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20210213164600.409061-2-zhengdejin5@gmail.com
---
 include/linux/mtd/blktrans.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mtd/blktrans.h b/include/linux/mtd/blktrans.h
index 3c668cb1e344..15cc9b95e32b 100644
--- a/include/linux/mtd/blktrans.h
+++ b/include/linux/mtd/blktrans.h
@@ -77,5 +77,16 @@ extern int add_mtd_blktrans_dev(struct mtd_blktrans_dev *dev);
 extern int del_mtd_blktrans_dev(struct mtd_blktrans_dev *dev);
 extern int mtd_blktrans_cease_background(struct mtd_blktrans_dev *dev);
 
+/**
+ * module_mtd_blktrans() - Helper macro for registering a mtd blktrans driver
+ * @__mtd_blktrans: mtd_blktrans_ops struct
+ *
+ * Helper macro for mtd blktrans drivers which do not do anything special in
+ * module init/exit. This eliminates a lot of boilerplate. Each module may only
+ * use this macro once, and calling it replaces module_init() and module_exit()
+ */
+#define module_mtd_blktrans(__mtd_blktrans) \
+	module_driver(__mtd_blktrans, register_mtd_blktrans, \
+					deregister_mtd_blktrans)
 
 #endif /* __MTD_TRANS_H__ */
-- 
cgit v1.2.3


From 1ad55288829c78e85bfe7d0c86d75415adf5f305 Mon Sep 17 00:00:00 2001
From: Alexander Sverdlin <alexander.sverdlin@nokia.com>
Date: Wed, 17 Feb 2021 22:18:45 +0100
Subject: mtd: char: Get rid of Big MTD Lock

Get rid of the central chrdev MTD lock, which prevents simultaneous
operations on completely independent physical MTD chips. Replace it with
a newly introduced per-master mutex.

Signed-off-by: Alexander Sverdlin <alexander.sverdlin@nokia.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20210217211845.43364-2-alexander.sverdlin@nokia.com
---
 drivers/mtd/mtdchar.c   | 14 ++++++++------
 drivers/mtd/mtdcore.c   |  1 +
 include/linux/mtd/mtd.h |  1 +
 3 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c
index f31390d186ca..57c4a2f0b703 100644
--- a/drivers/mtd/mtdchar.c
+++ b/drivers/mtd/mtdchar.c
@@ -27,8 +27,6 @@
 
 #include "mtdcore.h"
 
-static DEFINE_MUTEX(mtd_mutex);
-
 /*
  * Data structure to hold the pointer to the mtd device as well
  * as mode information of various use cases.
@@ -1020,11 +1018,14 @@ static int mtdchar_ioctl(struct file *file, u_int cmd, u_long arg)
 
 static long mtdchar_unlocked_ioctl(struct file *file, u_int cmd, u_long arg)
 {
+	struct mtd_file_info *mfi = file->private_data;
+	struct mtd_info *mtd = mfi->mtd;
+	struct mtd_info *master = mtd_get_master(mtd);
 	int ret;
 
-	mutex_lock(&mtd_mutex);
+	mutex_lock(&master->master.chrdev_lock);
 	ret = mtdchar_ioctl(file, cmd, arg);
-	mutex_unlock(&mtd_mutex);
+	mutex_unlock(&master->master.chrdev_lock);
 
 	return ret;
 }
@@ -1045,10 +1046,11 @@ static long mtdchar_compat_ioctl(struct file *file, unsigned int cmd,
 {
 	struct mtd_file_info *mfi = file->private_data;
 	struct mtd_info *mtd = mfi->mtd;
+	struct mtd_info *master = mtd_get_master(mtd);
 	void __user *argp = compat_ptr(arg);
 	int ret = 0;
 
-	mutex_lock(&mtd_mutex);
+	mutex_lock(&master->master.chrdev_lock);
 
 	switch (cmd) {
 	case MEMWRITEOOB32:
@@ -1111,7 +1113,7 @@ static long mtdchar_compat_ioctl(struct file *file, unsigned int cmd,
 		ret = mtdchar_ioctl(file, cmd, (unsigned long)argp);
 	}
 
-	mutex_unlock(&mtd_mutex);
+	mutex_unlock(&master->master.chrdev_lock);
 
 	return ret;
 }
diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c
index 2d6423d89a17..0b095975895e 100644
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -773,6 +773,7 @@ static void mtd_set_dev_defaults(struct mtd_info *mtd)
 
 	INIT_LIST_HEAD(&mtd->partitions);
 	mutex_init(&mtd->master.partitions_lock);
+	mutex_init(&mtd->master.chrdev_lock);
 }
 
 /**
diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index 157357ec1441..ceabc2cae8a4 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -229,6 +229,7 @@ struct mtd_part {
  */
 struct mtd_master {
 	struct mutex partitions_lock;
+	struct mutex chrdev_lock;
 	unsigned int suspended : 1;
 };
 
-- 
cgit v1.2.3


From 659ab7a49cbebe0deffcbe1f9560e82006b21817 Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Wed, 3 Mar 2021 14:32:29 +0100
Subject: drm: Use USB controller's DMA mask when importing dmabufs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

USB devices cannot perform DMA and hence have no dma_mask set in their
device structure. Therefore importing dmabuf into a USB-based driver
fails, which breaks joining and mirroring of display in X11.

For USB devices, pick the associated USB controller as attachment device.
This allows the DRM import helpers to perform the DMA setup. If the USB
controller does not support DMA transfers, we're out of luck and cannot
import. Our current USB-based DRM drivers don't use DMA, so the actual
DMA device is not important.

Tested by joining/mirroring displays of udl and radeon under Gnome/X11.

v8:
	* release dmadev if device initialization fails (Noralf)
	* fix commit description (Noralf)
v7:
	* fix use-before-init bug in gm12u320 (Dan)
v6:
	* implement workaround in DRM drivers and hold reference to
	  DMA device while USB device is in use
	* remove dev_is_usb() (Greg)
	* collapse USB helper into usb_intf_get_dma_device() (Alan)
	* integrate Daniel's TODO statement (Daniel)
	* fix typos (Greg)
v5:
	* provide a helper for USB interfaces (Alan)
	* add FIXME item to documentation and TODO list (Daniel)
v4:
	* implement workaround with USB helper functions (Greg)
	* use struct usb_device->bus->sysdev as DMA device (Takashi)
v3:
	* drop gem_create_object
	* use DMA mask of USB controller, if any (Daniel, Christian, Noralf)
v2:
	* move fix to importer side (Christian, Daniel)
	* update SHMEM and CMA helpers for new PRIME callbacks

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Fixes: 6eb0233ec2d0 ("usb: don't inherity DMA properties for USB devices")
Tested-by: Pavel Machek <pavel@ucw.cz>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Christian König <christian.koenig@amd.com>
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Acked-by: Noralf Trønnes <noralf@tronnes.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: <stable@vger.kernel.org> # v5.10+
Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Link: https://patchwork.freedesktop.org/patch/msgid/20210303133229.3288-1-tzimmermann@suse.de
Signed-off-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
---
 Documentation/gpu/todo.rst      | 21 ++++++++++++++++++++
 drivers/gpu/drm/tiny/gm12u320.c | 44 +++++++++++++++++++++++++++++++++--------
 drivers/gpu/drm/udl/udl_drv.c   | 17 ++++++++++++++++
 drivers/gpu/drm/udl/udl_drv.h   |  1 +
 drivers/gpu/drm/udl/udl_main.c  | 10 ++++++++++
 drivers/usb/core/usb.c          | 32 ++++++++++++++++++++++++++++++
 include/linux/usb.h             |  2 ++
 7 files changed, 119 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/gpu/todo.rst b/Documentation/gpu/todo.rst
index 40ccac61137e..22ce801e3a8d 100644
--- a/Documentation/gpu/todo.rst
+++ b/Documentation/gpu/todo.rst
@@ -613,6 +613,27 @@ Some of these date from the very introduction of KMS in 2008 ...
 
 Level: Intermediate
 
+Remove automatic page mapping from dma-buf importing
+----------------------------------------------------
+
+When importing dma-bufs, the dma-buf and PRIME frameworks automatically map
+imported pages into the importer's DMA area. drm_gem_prime_fd_to_handle() and
+drm_gem_prime_handle_to_fd() require that importers call dma_buf_attach()
+even if they never do actual device DMA, but only CPU access through
+dma_buf_vmap(). This is a problem for USB devices, which do not support DMA
+operations.
+
+To fix the issue, automatic page mappings should be removed from the
+buffer-sharing code. Fixing this is a bit more involved, since the import/export
+cache is also tied to &drm_gem_object.import_attach. Meanwhile we paper over
+this problem for USB devices by fishing out the USB host controller device, as
+long as that supports DMA. Otherwise importing can still needlessly fail.
+
+Contact: Thomas Zimmermann <tzimmermann@suse.de>, Daniel Vetter
+
+Level: Advanced
+
+
 Better Testing
 ==============
 
diff --git a/drivers/gpu/drm/tiny/gm12u320.c b/drivers/gpu/drm/tiny/gm12u320.c
index 33f65f4626e5..23866a54e3f9 100644
--- a/drivers/gpu/drm/tiny/gm12u320.c
+++ b/drivers/gpu/drm/tiny/gm12u320.c
@@ -83,6 +83,7 @@ MODULE_PARM_DESC(eco_mode, "Turn on Eco mode (less bright, more silent)");
 
 struct gm12u320_device {
 	struct drm_device	         dev;
+	struct device                   *dmadev;
 	struct drm_simple_display_pipe   pipe;
 	struct drm_connector	         conn;
 	unsigned char                   *cmd_buf;
@@ -601,6 +602,22 @@ static const uint64_t gm12u320_pipe_modifiers[] = {
 	DRM_FORMAT_MOD_INVALID
 };
 
+/*
+ * FIXME: Dma-buf sharing requires DMA support by the importing device.
+ *        This function is a workaround to make USB devices work as well.
+ *        See todo.rst for how to fix the issue in the dma-buf framework.
+ */
+static struct drm_gem_object *gm12u320_gem_prime_import(struct drm_device *dev,
+							struct dma_buf *dma_buf)
+{
+	struct gm12u320_device *gm12u320 = to_gm12u320(dev);
+
+	if (!gm12u320->dmadev)
+		return ERR_PTR(-ENODEV);
+
+	return drm_gem_prime_import_dev(dev, dma_buf, gm12u320->dmadev);
+}
+
 DEFINE_DRM_GEM_FOPS(gm12u320_fops);
 
 static const struct drm_driver gm12u320_drm_driver = {
@@ -614,6 +631,7 @@ static const struct drm_driver gm12u320_drm_driver = {
 
 	.fops		 = &gm12u320_fops,
 	DRM_GEM_SHMEM_DRIVER_OPS,
+	.gem_prime_import = gm12u320_gem_prime_import,
 };
 
 static const struct drm_mode_config_funcs gm12u320_mode_config_funcs = {
@@ -640,15 +658,18 @@ static int gm12u320_usb_probe(struct usb_interface *interface,
 				      struct gm12u320_device, dev);
 	if (IS_ERR(gm12u320))
 		return PTR_ERR(gm12u320);
+	dev = &gm12u320->dev;
+
+	gm12u320->dmadev = usb_intf_get_dma_device(to_usb_interface(dev->dev));
+	if (!gm12u320->dmadev)
+		drm_warn(dev, "buffer sharing not supported"); /* not an error */
 
 	INIT_DELAYED_WORK(&gm12u320->fb_update.work, gm12u320_fb_update_work);
 	mutex_init(&gm12u320->fb_update.lock);
 
-	dev = &gm12u320->dev;
-
 	ret = drmm_mode_config_init(dev);
 	if (ret)
-		return ret;
+		goto err_put_device;
 
 	dev->mode_config.min_width = GM12U320_USER_WIDTH;
 	dev->mode_config.max_width = GM12U320_USER_WIDTH;
@@ -658,15 +679,15 @@ static int gm12u320_usb_probe(struct usb_interface *interface,
 
 	ret = gm12u320_usb_alloc(gm12u320);
 	if (ret)
-		return ret;
+		goto err_put_device;
 
 	ret = gm12u320_set_ecomode(gm12u320);
 	if (ret)
-		return ret;
+		goto err_put_device;
 
 	ret = gm12u320_conn_init(gm12u320);
 	if (ret)
-		return ret;
+		goto err_put_device;
 
 	ret = drm_simple_display_pipe_init(&gm12u320->dev,
 					   &gm12u320->pipe,
@@ -676,24 +697,31 @@ static int gm12u320_usb_probe(struct usb_interface *interface,
 					   gm12u320_pipe_modifiers,
 					   &gm12u320->conn);
 	if (ret)
-		return ret;
+		goto err_put_device;
 
 	drm_mode_config_reset(dev);
 
 	usb_set_intfdata(interface, dev);
 	ret = drm_dev_register(dev, 0);
 	if (ret)
-		return ret;
+		goto err_put_device;
 
 	drm_fbdev_generic_setup(dev, 0);
 
 	return 0;
+
+err_put_device:
+	put_device(gm12u320->dmadev);
+	return ret;
 }
 
 static void gm12u320_usb_disconnect(struct usb_interface *interface)
 {
 	struct drm_device *dev = usb_get_intfdata(interface);
+	struct gm12u320_device *gm12u320 = to_gm12u320(dev);
 
+	put_device(gm12u320->dmadev);
+	gm12u320->dmadev = NULL;
 	drm_dev_unplug(dev);
 	drm_atomic_helper_shutdown(dev);
 }
diff --git a/drivers/gpu/drm/udl/udl_drv.c b/drivers/gpu/drm/udl/udl_drv.c
index 9269092697d8..5703277c6f52 100644
--- a/drivers/gpu/drm/udl/udl_drv.c
+++ b/drivers/gpu/drm/udl/udl_drv.c
@@ -32,6 +32,22 @@ static int udl_usb_resume(struct usb_interface *interface)
 	return drm_mode_config_helper_resume(dev);
 }
 
+/*
+ * FIXME: Dma-buf sharing requires DMA support by the importing device.
+ *        This function is a workaround to make USB devices work as well.
+ *        See todo.rst for how to fix the issue in the dma-buf framework.
+ */
+static struct drm_gem_object *udl_driver_gem_prime_import(struct drm_device *dev,
+							  struct dma_buf *dma_buf)
+{
+	struct udl_device *udl = to_udl(dev);
+
+	if (!udl->dmadev)
+		return ERR_PTR(-ENODEV);
+
+	return drm_gem_prime_import_dev(dev, dma_buf, udl->dmadev);
+}
+
 DEFINE_DRM_GEM_FOPS(udl_driver_fops);
 
 static const struct drm_driver driver = {
@@ -40,6 +56,7 @@ static const struct drm_driver driver = {
 	/* GEM hooks */
 	.fops = &udl_driver_fops,
 	DRM_GEM_SHMEM_DRIVER_OPS,
+	.gem_prime_import = udl_driver_gem_prime_import,
 
 	.name = DRIVER_NAME,
 	.desc = DRIVER_DESC,
diff --git a/drivers/gpu/drm/udl/udl_drv.h b/drivers/gpu/drm/udl/udl_drv.h
index 875e73551ae9..cc16a13316e4 100644
--- a/drivers/gpu/drm/udl/udl_drv.h
+++ b/drivers/gpu/drm/udl/udl_drv.h
@@ -50,6 +50,7 @@ struct urb_list {
 struct udl_device {
 	struct drm_device drm;
 	struct device *dev;
+	struct device *dmadev;
 
 	struct drm_simple_display_pipe display_pipe;
 
diff --git a/drivers/gpu/drm/udl/udl_main.c b/drivers/gpu/drm/udl/udl_main.c
index 0e2a376cb075..853f147036f6 100644
--- a/drivers/gpu/drm/udl/udl_main.c
+++ b/drivers/gpu/drm/udl/udl_main.c
@@ -315,6 +315,10 @@ int udl_init(struct udl_device *udl)
 
 	DRM_DEBUG("\n");
 
+	udl->dmadev = usb_intf_get_dma_device(to_usb_interface(dev->dev));
+	if (!udl->dmadev)
+		drm_warn(dev, "buffer sharing not supported"); /* not an error */
+
 	mutex_init(&udl->gem_lock);
 
 	if (!udl_parse_vendor_descriptor(udl)) {
@@ -343,12 +347,18 @@ int udl_init(struct udl_device *udl)
 err:
 	if (udl->urbs.count)
 		udl_free_urb_list(dev);
+	put_device(udl->dmadev);
 	DRM_ERROR("%d\n", ret);
 	return ret;
 }
 
 int udl_drop_usb(struct drm_device *dev)
 {
+	struct udl_device *udl = to_udl(dev);
+
 	udl_free_urb_list(dev);
+	put_device(udl->dmadev);
+	udl->dmadev = NULL;
+
 	return 0;
 }
diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c
index 8f07b0516100..a566bb494e24 100644
--- a/drivers/usb/core/usb.c
+++ b/drivers/usb/core/usb.c
@@ -748,6 +748,38 @@ void usb_put_intf(struct usb_interface *intf)
 }
 EXPORT_SYMBOL_GPL(usb_put_intf);
 
+/**
+ * usb_intf_get_dma_device - acquire a reference on the usb interface's DMA endpoint
+ * @intf: the usb interface
+ *
+ * While a USB device cannot perform DMA operations by itself, many USB
+ * controllers can. A call to usb_intf_get_dma_device() returns the DMA endpoint
+ * for the given USB interface, if any. The returned device structure must be
+ * released with put_device().
+ *
+ * See also usb_get_dma_device().
+ *
+ * Returns: A reference to the usb interface's DMA endpoint; or NULL if none
+ *          exists.
+ */
+struct device *usb_intf_get_dma_device(struct usb_interface *intf)
+{
+	struct usb_device *udev = interface_to_usbdev(intf);
+	struct device *dmadev;
+
+	if (!udev->bus)
+		return NULL;
+
+	dmadev = get_device(udev->bus->sysdev);
+	if (!dmadev || !dmadev->dma_mask) {
+		put_device(dmadev);
+		return NULL;
+	}
+
+	return dmadev;
+}
+EXPORT_SYMBOL_GPL(usb_intf_get_dma_device);
+
 /*			USB device locking
  *
  * USB devices and interfaces are locked using the semaphore in their
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 7d72c4e0713c..d6a41841b93e 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -746,6 +746,8 @@ extern int usb_lock_device_for_reset(struct usb_device *udev,
 extern int usb_reset_device(struct usb_device *dev);
 extern void usb_queue_reset_device(struct usb_interface *dev);
 
+extern struct device *usb_intf_get_dma_device(struct usb_interface *intf);
+
 #ifdef CONFIG_ACPI
 extern int usb_acpi_set_power_state(struct usb_device *hdev, int index,
 	bool enable);
-- 
cgit v1.2.3


From a8affc03a9b375e19bc81573de0c9108317d78c7 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 11 Mar 2021 12:01:37 +0100
Subject: block: rename BIO_MAX_PAGES to BIO_MAX_VECS

Ever since the addition of multipage bio_vecs, BIO_MAX_PAGES has been
horribly and confusingly misnamed.  Rename it to BIO_MAX_VECS to stop
confusing users of the bio API.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Link: https://lore.kernel.org/r/20210311110137.1132391-2-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c                    | 14 +++++++-------
 block/blk-crypto-fallback.c    |  2 +-
 block/blk-lib.c                |  2 +-
 block/blk-map.c                |  2 +-
 block/bounce.c                 |  6 +++---
 drivers/block/drbd/drbd_int.h  |  2 +-
 drivers/md/bcache/super.c      |  2 +-
 drivers/md/dm-crypt.c          |  8 ++++----
 drivers/md/dm-writecache.c     |  4 ++--
 drivers/md/raid5-cache.c       |  4 ++--
 drivers/md/raid5-ppl.c         |  2 +-
 drivers/nvme/target/passthru.c |  6 +++---
 fs/block_dev.c                 |  6 +++---
 fs/btrfs/extent_io.c           |  2 +-
 fs/btrfs/scrub.c               |  2 +-
 fs/crypto/bio.c                |  6 +++---
 fs/erofs/zdata.c               |  2 +-
 fs/ext4/page-io.c              |  2 +-
 fs/f2fs/checkpoint.c           |  2 +-
 fs/f2fs/data.c                 |  4 ++--
 fs/f2fs/segment.c              |  2 +-
 fs/f2fs/segment.h              |  4 ++--
 fs/f2fs/super.c                |  4 ++--
 fs/gfs2/lops.c                 |  2 +-
 fs/iomap/buffered-io.c         |  4 ++--
 fs/iomap/direct-io.c           |  4 ++--
 fs/mpage.c                     |  2 +-
 fs/nilfs2/segbuf.c             |  2 +-
 fs/squashfs/block.c            |  2 +-
 fs/zonefs/super.c              |  2 +-
 include/linux/bio.h            |  4 ++--
 31 files changed, 56 insertions(+), 56 deletions(-)

(limited to 'include/linux')

diff --git a/block/bio.c b/block/bio.c
index a1c4d2900c7a..26b7f721cda8 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -33,7 +33,7 @@ static struct biovec_slab {
 	{ .nr_vecs = 16, .name = "biovec-16" },
 	{ .nr_vecs = 64, .name = "biovec-64" },
 	{ .nr_vecs = 128, .name = "biovec-128" },
-	{ .nr_vecs = BIO_MAX_PAGES, .name = "biovec-max" },
+	{ .nr_vecs = BIO_MAX_VECS, .name = "biovec-max" },
 };
 
 static struct biovec_slab *biovec_slab(unsigned short nr_vecs)
@@ -46,7 +46,7 @@ static struct biovec_slab *biovec_slab(unsigned short nr_vecs)
 		return &bvec_slabs[1];
 	case 65 ... 128:
 		return &bvec_slabs[2];
-	case 129 ... BIO_MAX_PAGES:
+	case 129 ... BIO_MAX_VECS:
 		return &bvec_slabs[3];
 	default:
 		BUG();
@@ -151,9 +151,9 @@ out:
 
 void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs)
 {
-	BIO_BUG_ON(nr_vecs > BIO_MAX_PAGES);
+	BIO_BUG_ON(nr_vecs > BIO_MAX_VECS);
 
-	if (nr_vecs == BIO_MAX_PAGES)
+	if (nr_vecs == BIO_MAX_VECS)
 		mempool_free(bv, pool);
 	else if (nr_vecs > BIO_INLINE_VECS)
 		kmem_cache_free(biovec_slab(nr_vecs)->slab, bv);
@@ -186,15 +186,15 @@ struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
 	/*
 	 * Try a slab allocation first for all smaller allocations.  If that
 	 * fails and __GFP_DIRECT_RECLAIM is set retry with the mempool.
-	 * The mempool is sized to handle up to BIO_MAX_PAGES entries.
+	 * The mempool is sized to handle up to BIO_MAX_VECS entries.
 	 */
-	if (*nr_vecs < BIO_MAX_PAGES) {
+	if (*nr_vecs < BIO_MAX_VECS) {
 		struct bio_vec *bvl;
 
 		bvl = kmem_cache_alloc(bvs->slab, bvec_alloc_gfp(gfp_mask));
 		if (likely(bvl) || !(gfp_mask & __GFP_DIRECT_RECLAIM))
 			return bvl;
-		*nr_vecs = BIO_MAX_PAGES;
+		*nr_vecs = BIO_MAX_VECS;
 	}
 
 	return mempool_alloc(pool, gfp_mask);
diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c
index c176b7af56a7..c322176a1e09 100644
--- a/block/blk-crypto-fallback.c
+++ b/block/blk-crypto-fallback.c
@@ -219,7 +219,7 @@ static bool blk_crypto_split_bio_if_needed(struct bio **bio_ptr)
 
 	bio_for_each_segment(bv, bio, iter) {
 		num_sectors += bv.bv_len >> SECTOR_SHIFT;
-		if (++i == BIO_MAX_PAGES)
+		if (++i == BIO_MAX_VECS)
 			break;
 	}
 	if (num_sectors < bio_sectors(bio)) {
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 752f9c722062..7b256131b20b 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -296,7 +296,7 @@ static unsigned int __blkdev_sectors_to_bio_pages(sector_t nr_sects)
 {
 	sector_t pages = DIV_ROUND_UP_SECTOR_T(nr_sects, PAGE_SIZE / 512);
 
-	return min(pages, (sector_t)BIO_MAX_PAGES);
+	return min(pages, (sector_t)BIO_MAX_VECS);
 }
 
 static int __blkdev_issue_zero_pages(struct block_device *bdev,
diff --git a/block/blk-map.c b/block/blk-map.c
index 369e204d14d0..1ffef782fcf2 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -249,7 +249,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
 	if (!iov_iter_count(iter))
 		return -EINVAL;
 
-	bio = bio_kmalloc(gfp_mask, iov_iter_npages(iter, BIO_MAX_PAGES));
+	bio = bio_kmalloc(gfp_mask, iov_iter_npages(iter, BIO_MAX_VECS));
 	if (!bio)
 		return -ENOMEM;
 	bio->bi_opf |= req_op(rq);
diff --git a/block/bounce.c b/block/bounce.c
index 87983a35079c..6c441f4f1cd4 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -229,10 +229,10 @@ static struct bio *bounce_clone_bio(struct bio *bio_src)
 	 *  - The point of cloning the biovec is to produce a bio with a biovec
 	 *    the caller can modify: bi_idx and bi_bvec_done should be 0.
 	 *
-	 *  - The original bio could've had more than BIO_MAX_PAGES biovecs; if
+	 *  - The original bio could've had more than BIO_MAX_VECS biovecs; if
 	 *    we tried to clone the whole thing bio_alloc_bioset() would fail.
 	 *    But the clone should succeed as long as the number of biovecs we
-	 *    actually need to allocate is fewer than BIO_MAX_PAGES.
+	 *    actually need to allocate is fewer than BIO_MAX_VECS.
 	 *
 	 *  - Lastly, bi_vcnt should not be looked at or relied upon by code
 	 *    that does not own the bio - reason being drivers don't use it for
@@ -299,7 +299,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
 	int sectors = 0;
 
 	bio_for_each_segment(from, *bio_orig, iter) {
-		if (i++ < BIO_MAX_PAGES)
+		if (i++ < BIO_MAX_VECS)
 			sectors += from.bv_len >> 9;
 		if (page_to_pfn(from.bv_page) > q->limits.bounce_pfn)
 			bounce = true;
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 7d9cc433b758..5d9181382ce1 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1324,7 +1324,7 @@ struct bm_extent {
  * A followup commit may allow even bigger BIO sizes,
  * once we thought that through. */
 #define DRBD_MAX_BIO_SIZE (1U << 20)
-#if DRBD_MAX_BIO_SIZE > (BIO_MAX_PAGES << PAGE_SHIFT)
+#if DRBD_MAX_BIO_SIZE > (BIO_MAX_VECS << PAGE_SHIFT)
 #error Architecture not supported: DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE
 #endif
 #define DRBD_MAX_BIO_SIZE_SAFE (1U << 12)       /* Works always = 4k */
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 71691f32959b..03e1fe4de53d 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -965,7 +965,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
 	q->limits.max_hw_sectors	= UINT_MAX;
 	q->limits.max_sectors		= UINT_MAX;
 	q->limits.max_segment_size	= UINT_MAX;
-	q->limits.max_segments		= BIO_MAX_PAGES;
+	q->limits.max_segments		= BIO_MAX_VECS;
 	blk_queue_max_discard_sectors(q, UINT_MAX);
 	q->limits.discard_granularity	= 512;
 	q->limits.io_min		= block_size;
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 11c105ecd165..b0ab080f2567 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -229,7 +229,7 @@ static DEFINE_SPINLOCK(dm_crypt_clients_lock);
 static unsigned dm_crypt_clients_n = 0;
 static volatile unsigned long dm_crypt_pages_per_client;
 #define DM_CRYPT_MEMORY_PERCENT			2
-#define DM_CRYPT_MIN_PAGES_PER_CLIENT		(BIO_MAX_PAGES * 16)
+#define DM_CRYPT_MIN_PAGES_PER_CLIENT		(BIO_MAX_VECS * 16)
 
 static void clone_init(struct dm_crypt_io *, struct bio *);
 static void kcryptd_queue_crypt(struct dm_crypt_io *io);
@@ -3246,7 +3246,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		ALIGN(sizeof(struct dm_crypt_io) + cc->dmreq_start + additional_req_size,
 		      ARCH_KMALLOC_MINALIGN);
 
-	ret = mempool_init(&cc->page_pool, BIO_MAX_PAGES, crypt_page_alloc, crypt_page_free, cc);
+	ret = mempool_init(&cc->page_pool, BIO_MAX_VECS, crypt_page_alloc, crypt_page_free, cc);
 	if (ret) {
 		ti->error = "Cannot allocate page mempool";
 		goto bad;
@@ -3373,9 +3373,9 @@ static int crypt_map(struct dm_target *ti, struct bio *bio)
 	/*
 	 * Check if bio is too large, split as needed.
 	 */
-	if (unlikely(bio->bi_iter.bi_size > (BIO_MAX_PAGES << PAGE_SHIFT)) &&
+	if (unlikely(bio->bi_iter.bi_size > (BIO_MAX_VECS << PAGE_SHIFT)) &&
 	    (bio_data_dir(bio) == WRITE || cc->on_disk_tag_size))
-		dm_accept_partial_bio(bio, ((BIO_MAX_PAGES << PAGE_SHIFT) >> SECTOR_SHIFT));
+		dm_accept_partial_bio(bio, ((BIO_MAX_VECS << PAGE_SHIFT) >> SECTOR_SHIFT));
 
 	/*
 	 * Ensure that bio is a multiple of internal sector encryption size
diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
index 844c4be11768..4f72b6f66c3a 100644
--- a/drivers/md/dm-writecache.c
+++ b/drivers/md/dm-writecache.c
@@ -1892,10 +1892,10 @@ restart:
 			list_add(&g->lru, &wbl.list);
 			wbl.size++;
 			g->write_in_progress = true;
-			g->wc_list_contiguous = BIO_MAX_PAGES;
+			g->wc_list_contiguous = BIO_MAX_VECS;
 			f = g;
 			e->wc_list_contiguous++;
-			if (unlikely(e->wc_list_contiguous == BIO_MAX_PAGES)) {
+			if (unlikely(e->wc_list_contiguous == BIO_MAX_VECS)) {
 				if (unlikely(wc->writeback_all)) {
 					next_node = rb_next(&f->rb_node);
 					if (likely(next_node))
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 4337ae0e6af2..0b5dcaabbc15 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -735,7 +735,7 @@ static void r5l_submit_current_io(struct r5l_log *log)
 
 static struct bio *r5l_bio_alloc(struct r5l_log *log)
 {
-	struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, &log->bs);
+	struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_VECS, &log->bs);
 
 	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
 	bio_set_dev(bio, log->rdev->bdev);
@@ -1634,7 +1634,7 @@ static int r5l_recovery_allocate_ra_pool(struct r5l_log *log,
 {
 	struct page *page;
 
-	ctx->ra_bio = bio_alloc_bioset(GFP_KERNEL, BIO_MAX_PAGES, &log->bs);
+	ctx->ra_bio = bio_alloc_bioset(GFP_KERNEL, BIO_MAX_VECS, &log->bs);
 	if (!ctx->ra_bio)
 		return -ENOMEM;
 
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index e8c118e05dfd..3ddc2aa0b530 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -496,7 +496,7 @@ static void ppl_submit_iounit(struct ppl_io_unit *io)
 		if (!bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0)) {
 			struct bio *prev = bio;
 
-			bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES,
+			bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_VECS,
 					       &ppl_conf->bs);
 			bio->bi_opf = prev->bi_opf;
 			bio->bi_write_hint = prev->bi_write_hint;
diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c
index 26c587ccd152..2798944899b7 100644
--- a/drivers/nvme/target/passthru.c
+++ b/drivers/nvme/target/passthru.c
@@ -50,9 +50,9 @@ static u16 nvmet_passthru_override_id_ctrl(struct nvmet_req *req)
 
 	/*
 	 * nvmet_passthru_map_sg is limitted to using a single bio so limit
-	 * the mdts based on BIO_MAX_PAGES as well
+	 * the mdts based on BIO_MAX_VECS as well
 	 */
-	max_hw_sectors = min_not_zero(BIO_MAX_PAGES << (PAGE_SHIFT - 9),
+	max_hw_sectors = min_not_zero(BIO_MAX_VECS << (PAGE_SHIFT - 9),
 				      max_hw_sectors);
 
 	page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12;
@@ -191,7 +191,7 @@ static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq)
 	struct bio *bio;
 	int i;
 
-	if (req->sg_cnt > BIO_MAX_PAGES)
+	if (req->sg_cnt > BIO_MAX_VECS)
 		return -EINVAL;
 
 	if (req->transfer_len <= NVMET_MAX_INLINE_DATA_LEN) {
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 03166b3dea4d..92ed7d5df677 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -432,7 +432,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 		dio->size += bio->bi_iter.bi_size;
 		pos += bio->bi_iter.bi_size;
 
-		nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_PAGES);
+		nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS);
 		if (!nr_pages) {
 			bool polled = false;
 
@@ -500,8 +500,8 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	if (!iov_iter_count(iter))
 		return 0;
 
-	nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_PAGES + 1);
-	if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES)
+	nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
+	if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_VECS)
 		return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
 
 	return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4dfb3ead1175..db8cb98c020c 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3048,7 +3048,7 @@ struct bio *btrfs_bio_alloc(u64 first_byte)
 {
 	struct bio *bio;
 
-	bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &btrfs_bioset);
+	bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_VECS, &btrfs_bioset);
 	bio->bi_iter.bi_sector = first_byte >> 9;
 	btrfs_io_bio_init(btrfs_io_bio(bio));
 	return bio;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 582df11d298a..6daa4309c974 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1428,7 +1428,7 @@ static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
 	if (!first_page->dev->bdev)
 		goto out;
 
-	bio = btrfs_io_bio_alloc(BIO_MAX_PAGES);
+	bio = btrfs_io_bio_alloc(BIO_MAX_VECS);
 	bio_set_dev(bio, first_page->dev->bdev);
 
 	for (page_num = 0; page_num < sblock->page_count; page_num++) {
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index b048a0e38516..68a2de6b5a9b 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -52,7 +52,7 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode,
 	int num_pages = 0;
 
 	/* This always succeeds since __GFP_DIRECT_RECLAIM is set. */
-	bio = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
+	bio = bio_alloc(GFP_NOFS, BIO_MAX_VECS);
 
 	while (len) {
 		unsigned int blocks_this_page = min(len, blocks_per_page);
@@ -74,7 +74,7 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode,
 		len -= blocks_this_page;
 		lblk += blocks_this_page;
 		pblk += blocks_this_page;
-		if (num_pages == BIO_MAX_PAGES || !len ||
+		if (num_pages == BIO_MAX_VECS || !len ||
 		    !fscrypt_mergeable_bio(bio, inode, lblk)) {
 			err = submit_bio_wait(bio);
 			if (err)
@@ -126,7 +126,7 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
 		return fscrypt_zeroout_range_inline_crypt(inode, lblk, pblk,
 							  len);
 
-	BUILD_BUG_ON(ARRAY_SIZE(pages) > BIO_MAX_PAGES);
+	BUILD_BUG_ON(ARRAY_SIZE(pages) > BIO_MAX_VECS);
 	nr_pages = min_t(unsigned int, ARRAY_SIZE(pages),
 			 (len + blocks_per_page - 1) >> blocks_per_page_bits);
 
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 6cb356c4217b..3851e1a64f73 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -1235,7 +1235,7 @@ submit_bio_retry:
 			}
 
 			if (!bio) {
-				bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
+				bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS);
 
 				bio->bi_end_io = z_erofs_decompressqueue_endio;
 				bio_set_dev(bio, sb->s_bdev);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 03a44a0de86a..f038d578d8d8 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -398,7 +398,7 @@ static void io_submit_init_bio(struct ext4_io_submit *io,
 	 * bio_alloc will _always_ be able to allocate a bio if
 	 * __GFP_DIRECT_RECLAIM is set, see comments for bio_alloc_bioset().
 	 */
-	bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
+	bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS);
 	fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
 	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
 	bio_set_dev(bio, bh->b_bdev);
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 174a0819ad96..be5415a0dbbc 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -292,7 +292,7 @@ void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index)
 	f2fs_put_page(page, 0);
 
 	if (readahead)
-		f2fs_ra_meta_pages(sbi, index, BIO_MAX_PAGES, META_POR, true);
+		f2fs_ra_meta_pages(sbi, index, BIO_MAX_VECS, META_POR, true);
 }
 
 static int __f2fs_write_meta_page(struct page *page,
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 7c95818639a6..4e5257c763d0 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -857,7 +857,7 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio)
 		f2fs_submit_merged_ipu_write(fio->sbi, &bio, NULL);
 alloc_new:
 	if (!bio) {
-		bio = __bio_alloc(fio, BIO_MAX_PAGES);
+		bio = __bio_alloc(fio, BIO_MAX_VECS);
 		__attach_io_flag(fio);
 		f2fs_set_bio_crypt_ctx(bio, fio->page->mapping->host,
 				       fio->page->index, fio, GFP_NOIO);
@@ -932,7 +932,7 @@ alloc_new:
 			fio->retry = true;
 			goto skip;
 		}
-		io->bio = __bio_alloc(fio, BIO_MAX_PAGES);
+		io->bio = __bio_alloc(fio, BIO_MAX_VECS);
 		f2fs_set_bio_crypt_ctx(io->bio, fio->page->mapping->host,
 				       bio_page->index, fio, GFP_NOIO);
 		io->fio = *fio;
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 993004f06a77..c2866561263e 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -4381,7 +4381,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
 	block_t total_node_blocks = 0;
 
 	do {
-		readed = f2fs_ra_meta_pages(sbi, start_blk, BIO_MAX_PAGES,
+		readed = f2fs_ra_meta_pages(sbi, start_blk, BIO_MAX_VECS,
 							META_SIT, true);
 
 		start = start_blk * sit_i->sents_per_block;
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 229814b4f4a6..e9a7a637d688 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -851,7 +851,7 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type)
 	else if (type == NODE)
 		return 8 * sbi->blocks_per_seg;
 	else if (type == META)
-		return 8 * BIO_MAX_PAGES;
+		return 8 * BIO_MAX_VECS;
 	else
 		return 0;
 }
@@ -868,7 +868,7 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type,
 		return 0;
 
 	nr_to_write = wbc->nr_to_write;
-	desired = BIO_MAX_PAGES;
+	desired = BIO_MAX_VECS;
 	if (type == NODE)
 		desired <<= 1;
 
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 7069793752f1..82592b19b4e0 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -753,9 +753,9 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
 		case Opt_io_size_bits:
 			if (args->from && match_int(args, &arg))
 				return -EINVAL;
-			if (arg <= 0 || arg > __ilog2_u32(BIO_MAX_PAGES)) {
+			if (arg <= 0 || arg > __ilog2_u32(BIO_MAX_VECS)) {
 				f2fs_warn(sbi, "Not support %d, larger than %d",
-					  1 << arg, BIO_MAX_PAGES);
+					  1 << arg, BIO_MAX_VECS);
 				return -EINVAL;
 			}
 			F2FS_OPTION(sbi).write_io_size_bits = arg;
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index dc1b93a877c6..a82f4747aa8d 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -267,7 +267,7 @@ static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno,
 				      bio_end_io_t *end_io)
 {
 	struct super_block *sb = sdp->sd_vfs;
-	struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
+	struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS);
 
 	bio->bi_iter.bi_sector = blkno << sdp->sd_fsb2bb_shift;
 	bio_set_dev(bio, sb->s_bdev);
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 7ffcd7ef33d4..414769a6ad11 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1221,7 +1221,7 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
 	struct iomap_ioend *ioend;
 	struct bio *bio;
 
-	bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &iomap_ioend_bioset);
+	bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_VECS, &iomap_ioend_bioset);
 	bio_set_dev(bio, wpc->iomap.bdev);
 	bio->bi_iter.bi_sector = sector;
 	bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
@@ -1252,7 +1252,7 @@ iomap_chain_bio(struct bio *prev)
 {
 	struct bio *new;
 
-	new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
+	new = bio_alloc(GFP_NOFS, BIO_MAX_VECS);
 	bio_copy_dev(new, prev);/* also copies over blkcg information */
 	new->bi_iter.bi_sector = bio_end_sector(prev);
 	new->bi_opf = prev->bi_opf;
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index e2c4991833b8..bdd0d89bbf0a 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -296,7 +296,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
 	 */
 	bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua);
 
-	nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_PAGES);
+	nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
 	do {
 		size_t n;
 		if (dio->error) {
@@ -338,7 +338,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
 		copied += n;
 
 		nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter,
-						 BIO_MAX_PAGES);
+						 BIO_MAX_VECS);
 		iomap_dio_submit_bio(dio, iomap, bio, pos);
 		pos += n;
 	} while (nr_pages);
diff --git a/fs/mpage.c b/fs/mpage.c
index 961234d68779..334e7d09aa65 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -616,7 +616,7 @@ alloc_new:
 				goto out;
 		}
 		bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
-				BIO_MAX_PAGES, GFP_NOFS|__GFP_HIGH);
+				BIO_MAX_VECS, GFP_NOFS|__GFP_HIGH);
 		if (bio == NULL)
 			goto confused;
 
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 1e75417bfe6e..56872e93823d 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -399,7 +399,7 @@ static void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf,
 {
 	wi->bio = NULL;
 	wi->rest_blocks = segbuf->sb_sum.nblocks;
-	wi->max_pages = BIO_MAX_PAGES;
+	wi->max_pages = BIO_MAX_VECS;
 	wi->nr_vecs = min(wi->max_pages, wi->rest_blocks);
 	wi->start = wi->end = 0;
 	wi->blocknr = segbuf->sb_pseg_start;
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 45f44425d856..b9e87ebb1060 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -87,7 +87,7 @@ static int squashfs_bio_read(struct super_block *sb, u64 index, int length,
 	int error, i;
 	struct bio *bio;
 
-	if (page_count <= BIO_MAX_PAGES)
+	if (page_count <= BIO_MAX_VECS)
 		bio = bio_alloc(GFP_NOIO, page_count);
 	else
 		bio = bio_kmalloc(GFP_NOIO, page_count);
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index b6ff4a21abac..0fe76f376dee 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -684,7 +684,7 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
 	max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize);
 	iov_iter_truncate(from, max);
 
-	nr_pages = iov_iter_npages(from, BIO_MAX_PAGES);
+	nr_pages = iov_iter_npages(from, BIO_MAX_VECS);
 	if (!nr_pages)
 		return 0;
 
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 983ed2fe7c85..d0246c92a6e8 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -20,11 +20,11 @@
 #define BIO_BUG_ON
 #endif
 
-#define BIO_MAX_PAGES		256U
+#define BIO_MAX_VECS		256U
 
 static inline unsigned int bio_max_segs(unsigned int nr_segs)
 {
-	return min(nr_segs, BIO_MAX_PAGES);
+	return min(nr_segs, BIO_MAX_VECS);
 }
 
 #define bio_prio(bio)			(bio)->bi_ioprio
-- 
cgit v1.2.3


From b046664872dd78a8bebe3d5f3bb9da9baa93f5ca Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Thu, 11 Mar 2021 15:23:07 +0100
Subject: static_call: Move struct static_call_key definition to
 static_call_types.h

Having the definition of static_call() in static_call_types.h makes
no sense as long as struct static_call_key isn't defined there, as the
generic implementation of static_call() is referencing this structure.

So move the definition of struct static_call_key to static_call_types.h.

Signed-off-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20210311142319.4723-3-jgross@suse.com
---
 include/linux/static_call.h             | 18 ------------------
 include/linux/static_call_types.h       | 18 ++++++++++++++++++
 tools/include/linux/static_call_types.h | 18 ++++++++++++++++++
 3 files changed, 36 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/static_call.h b/include/linux/static_call.h
index 85ecc789f4ff..76b881259144 100644
--- a/include/linux/static_call.h
+++ b/include/linux/static_call.h
@@ -128,16 +128,6 @@ struct static_call_mod {
 	struct static_call_site *sites;
 };
 
-struct static_call_key {
-	void *func;
-	union {
-		/* bit 0: 0 = mods, 1 = sites */
-		unsigned long type;
-		struct static_call_mod *mods;
-		struct static_call_site *sites;
-	};
-};
-
 /* For finding the key associated with a trampoline */
 struct static_call_tramp_key {
 	s32 tramp;
@@ -187,10 +177,6 @@ extern long __static_call_return0(void);
 
 static inline int static_call_init(void) { return 0; }
 
-struct static_call_key {
-	void *func;
-};
-
 #define __DEFINE_STATIC_CALL(name, _func, _func_init)			\
 	DECLARE_STATIC_CALL(name, _func);				\
 	struct static_call_key STATIC_CALL_KEY(name) = {		\
@@ -243,10 +229,6 @@ static inline long __static_call_return0(void)
 
 static inline int static_call_init(void) { return 0; }
 
-struct static_call_key {
-	void *func;
-};
-
 static inline long __static_call_return0(void)
 {
 	return 0;
diff --git a/include/linux/static_call_types.h b/include/linux/static_call_types.h
index ae5662d368b9..5a00b8b2cf9f 100644
--- a/include/linux/static_call_types.h
+++ b/include/linux/static_call_types.h
@@ -58,11 +58,25 @@ struct static_call_site {
 	__raw_static_call(name);					\
 })
 
+struct static_call_key {
+	void *func;
+	union {
+		/* bit 0: 0 = mods, 1 = sites */
+		unsigned long type;
+		struct static_call_mod *mods;
+		struct static_call_site *sites;
+	};
+};
+
 #else /* !CONFIG_HAVE_STATIC_CALL_INLINE */
 
 #define __STATIC_CALL_ADDRESSABLE(name)
 #define __static_call(name)	__raw_static_call(name)
 
+struct static_call_key {
+	void *func;
+};
+
 #endif /* CONFIG_HAVE_STATIC_CALL_INLINE */
 
 #ifdef MODULE
@@ -77,6 +91,10 @@ struct static_call_site {
 
 #else
 
+struct static_call_key {
+	void *func;
+};
+
 #define static_call(name)						\
 	((typeof(STATIC_CALL_TRAMP(name))*)(STATIC_CALL_KEY(name).func))
 
diff --git a/tools/include/linux/static_call_types.h b/tools/include/linux/static_call_types.h
index ae5662d368b9..5a00b8b2cf9f 100644
--- a/tools/include/linux/static_call_types.h
+++ b/tools/include/linux/static_call_types.h
@@ -58,11 +58,25 @@ struct static_call_site {
 	__raw_static_call(name);					\
 })
 
+struct static_call_key {
+	void *func;
+	union {
+		/* bit 0: 0 = mods, 1 = sites */
+		unsigned long type;
+		struct static_call_mod *mods;
+		struct static_call_site *sites;
+	};
+};
+
 #else /* !CONFIG_HAVE_STATIC_CALL_INLINE */
 
 #define __STATIC_CALL_ADDRESSABLE(name)
 #define __static_call(name)	__raw_static_call(name)
 
+struct static_call_key {
+	void *func;
+};
+
 #endif /* CONFIG_HAVE_STATIC_CALL_INLINE */
 
 #ifdef MODULE
@@ -77,6 +91,10 @@ struct static_call_site {
 
 #else
 
+struct static_call_key {
+	void *func;
+};
+
 #define static_call(name)						\
 	((typeof(STATIC_CALL_TRAMP(name))*)(STATIC_CALL_KEY(name).func))
 
-- 
cgit v1.2.3


From 6ea312d95e0226b306bb4b8ee3a0727d880378cb Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Thu, 11 Mar 2021 15:23:08 +0100
Subject: static_call: Add function to query current function

Some users of paravirtualized functions need to query which function
has been specified in a pv_ops vector element. In order to be able to
switch such paravirtualized functions to static_calls instead, there
needs to be a function to query the function which will be called via
static_call().

Signed-off-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20210311142319.4723-4-jgross@suse.com
---
 include/linux/static_call.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/static_call.h b/include/linux/static_call.h
index 76b881259144..e01b61ab86b1 100644
--- a/include/linux/static_call.h
+++ b/include/linux/static_call.h
@@ -20,6 +20,7 @@
  *   static_call(name)(args...);
  *   static_call_cond(name)(args...);
  *   static_call_update(name, func);
+ *   static_call_query(name);
  *
  * Usage example:
  *
@@ -91,6 +92,10 @@
  *
  *   which will include the required value tests to avoid NULL-pointer
  *   dereferences.
+ *
+ *   To query which function is currently set to be called, use:
+ *
+ *   func = static_call_query(name);
  */
 
 #include <linux/types.h>
@@ -118,6 +123,8 @@ extern void arch_static_call_transform(void *site, void *tramp, void *func, bool
 			     STATIC_CALL_TRAMP_ADDR(name), func);	\
 })
 
+#define static_call_query(name) (READ_ONCE(STATIC_CALL_KEY(name).func))
+
 #ifdef CONFIG_HAVE_STATIC_CALL_INLINE
 
 extern int __init static_call_init(void);
@@ -191,6 +198,7 @@ static inline int static_call_init(void) { return 0; }
 	};								\
 	ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name)
 
+
 #define static_call_cond(name)	(void)__static_call(name)
 
 static inline
-- 
cgit v1.2.3


From 2cfc056ef2c28b4961bff5e2f6deed94afb14024 Mon Sep 17 00:00:00 2001
From: Peng Fan <peng.fan@nxp.com>
Date: Sat, 6 Mar 2021 19:24:18 +0800
Subject: remoteproc: introduce is_iomem to rproc_mem_entry

Introduce is_iomem to indicate whether this piece of memory is iomem or not.

Reviewed-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Reviewed-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Signed-off-by: Peng Fan <peng.fan@nxp.com>
Link: https://lore.kernel.org/r/1615029865-23312-4-git-send-email-peng.fan@oss.nxp.com
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
---
 include/linux/remoteproc.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
index f28ee75d1005..a5f6d2d9cde2 100644
--- a/include/linux/remoteproc.h
+++ b/include/linux/remoteproc.h
@@ -315,6 +315,7 @@ struct rproc;
 /**
  * struct rproc_mem_entry - memory entry descriptor
  * @va:	virtual address
+ * @is_iomem: io memory
  * @dma: dma address
  * @len: length, in bytes
  * @da: device address
@@ -329,6 +330,7 @@ struct rproc;
  */
 struct rproc_mem_entry {
 	void *va;
+	bool is_iomem;
 	dma_addr_t dma;
 	size_t len;
 	u32 da;
-- 
cgit v1.2.3


From 40df0a91b2a5228ded8e5f75b80d28c96c6831cd Mon Sep 17 00:00:00 2001
From: Peng Fan <peng.fan@nxp.com>
Date: Sat, 6 Mar 2021 19:24:19 +0800
Subject: remoteproc: add is_iomem to da_to_va

Introduce an extra parameter, is_iomem, to da_to_va so that the caller
can treat the memory as either normal memory or I/O-mapped memory.

Reviewed-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Reviewed-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Peng Fan <peng.fan@nxp.com>
Link: https://lore.kernel.org/r/1615029865-23312-5-git-send-email-peng.fan@oss.nxp.com
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
---
 drivers/remoteproc/imx_rproc.c             |  2 +-
 drivers/remoteproc/ingenic_rproc.c         |  2 +-
 drivers/remoteproc/keystone_remoteproc.c   |  2 +-
 drivers/remoteproc/mtk_scp.c               |  6 +++---
 drivers/remoteproc/omap_remoteproc.c       |  2 +-
 drivers/remoteproc/pru_rproc.c             |  2 +-
 drivers/remoteproc/qcom_q6v5_adsp.c        |  2 +-
 drivers/remoteproc/qcom_q6v5_pas.c         |  2 +-
 drivers/remoteproc/qcom_q6v5_wcss.c        |  2 +-
 drivers/remoteproc/qcom_wcnss.c            |  2 +-
 drivers/remoteproc/remoteproc_core.c       |  7 +++++--
 drivers/remoteproc/remoteproc_coredump.c   |  8 ++++++--
 drivers/remoteproc/remoteproc_debugfs.c    |  2 +-
 drivers/remoteproc/remoteproc_elf_loader.c | 21 +++++++++++++++------
 drivers/remoteproc/remoteproc_internal.h   |  2 +-
 drivers/remoteproc/st_slim_rproc.c         |  2 +-
 drivers/remoteproc/ti_k3_dsp_remoteproc.c  |  2 +-
 drivers/remoteproc/ti_k3_r5_remoteproc.c   |  2 +-
 drivers/remoteproc/wkup_m3_rproc.c         |  2 +-
 include/linux/remoteproc.h                 |  2 +-
 20 files changed, 45 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/remoteproc/imx_rproc.c b/drivers/remoteproc/imx_rproc.c
index 8957ed271d20..6603e00bb6f4 100644
--- a/drivers/remoteproc/imx_rproc.c
+++ b/drivers/remoteproc/imx_rproc.c
@@ -208,7 +208,7 @@ static int imx_rproc_da_to_sys(struct imx_rproc *priv, u64 da,
 	return -ENOENT;
 }
 
-static void *imx_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len)
+static void *imx_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem)
 {
 	struct imx_rproc *priv = rproc->priv;
 	void *va = NULL;
diff --git a/drivers/remoteproc/ingenic_rproc.c b/drivers/remoteproc/ingenic_rproc.c
index e2618c36eaab..a356738160a4 100644
--- a/drivers/remoteproc/ingenic_rproc.c
+++ b/drivers/remoteproc/ingenic_rproc.c
@@ -121,7 +121,7 @@ static void ingenic_rproc_kick(struct rproc *rproc, int vqid)
 	writel(vqid, vpu->aux_base + REG_CORE_MSG);
 }
 
-static void *ingenic_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len)
+static void *ingenic_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem)
 {
 	struct vpu *vpu = rproc->priv;
 	void __iomem *va = NULL;
diff --git a/drivers/remoteproc/keystone_remoteproc.c b/drivers/remoteproc/keystone_remoteproc.c
index cd266163a65f..54781f553f4e 100644
--- a/drivers/remoteproc/keystone_remoteproc.c
+++ b/drivers/remoteproc/keystone_remoteproc.c
@@ -246,7 +246,7 @@ static void keystone_rproc_kick(struct rproc *rproc, int vqid)
  * can be used either by the remoteproc core for loading (when using kernel
  * remoteproc loader), or by any rpmsg bus drivers.
  */
-static void *keystone_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len)
+static void *keystone_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem)
 {
 	struct keystone_rproc *ksproc = rproc->priv;
 	void __iomem *va = NULL;
diff --git a/drivers/remoteproc/mtk_scp.c b/drivers/remoteproc/mtk_scp.c
index ce727598c41c..9679cc26895e 100644
--- a/drivers/remoteproc/mtk_scp.c
+++ b/drivers/remoteproc/mtk_scp.c
@@ -272,7 +272,7 @@ static int scp_elf_load_segments(struct rproc *rproc, const struct firmware *fw)
 		}
 
 		/* grab the kernel address for this device address */
-		ptr = (void __iomem *)rproc_da_to_va(rproc, da, memsz);
+		ptr = (void __iomem *)rproc_da_to_va(rproc, da, memsz, NULL);
 		if (!ptr) {
 			dev_err(dev, "bad phdr da 0x%x mem 0x%x\n", da, memsz);
 			ret = -EINVAL;
@@ -509,7 +509,7 @@ static void *mt8192_scp_da_to_va(struct mtk_scp *scp, u64 da, size_t len)
 	return NULL;
 }
 
-static void *scp_da_to_va(struct rproc *rproc, u64 da, size_t len)
+static void *scp_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem)
 {
 	struct mtk_scp *scp = (struct mtk_scp *)rproc->priv;
 
@@ -627,7 +627,7 @@ void *scp_mapping_dm_addr(struct mtk_scp *scp, u32 mem_addr)
 {
 	void *ptr;
 
-	ptr = scp_da_to_va(scp->rproc, mem_addr, 0);
+	ptr = scp_da_to_va(scp->rproc, mem_addr, 0, NULL);
 	if (!ptr)
 		return ERR_PTR(-EINVAL);
 
diff --git a/drivers/remoteproc/omap_remoteproc.c b/drivers/remoteproc/omap_remoteproc.c
index d94b7391bf9d..43531caa1959 100644
--- a/drivers/remoteproc/omap_remoteproc.c
+++ b/drivers/remoteproc/omap_remoteproc.c
@@ -728,7 +728,7 @@ out:
  * Return: translated virtual address in kernel memory space on success,
  *         or NULL on failure.
  */
-static void *omap_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len)
+static void *omap_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem)
 {
 	struct omap_rproc *oproc = rproc->priv;
 	int i;
diff --git a/drivers/remoteproc/pru_rproc.c b/drivers/remoteproc/pru_rproc.c
index 16b204ed657b..d6086f90e809 100644
--- a/drivers/remoteproc/pru_rproc.c
+++ b/drivers/remoteproc/pru_rproc.c
@@ -465,7 +465,7 @@ static void *pru_i_da_to_va(struct pru_rproc *pru, u32 da, size_t len)
  * core for any PRU client drivers. The PRU Instruction RAM access is restricted
  * only to the PRU loader code.
  */
-static void *pru_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len)
+static void *pru_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem)
 {
 	struct pru_rproc *pru = rproc->priv;
 
diff --git a/drivers/remoteproc/qcom_q6v5_adsp.c b/drivers/remoteproc/qcom_q6v5_adsp.c
index e02450225e4a..8b0d8bbacd2e 100644
--- a/drivers/remoteproc/qcom_q6v5_adsp.c
+++ b/drivers/remoteproc/qcom_q6v5_adsp.c
@@ -281,7 +281,7 @@ static int adsp_stop(struct rproc *rproc)
 	return ret;
 }
 
-static void *adsp_da_to_va(struct rproc *rproc, u64 da, size_t len)
+static void *adsp_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem)
 {
 	struct qcom_adsp *adsp = (struct qcom_adsp *)rproc->priv;
 	int offset;
diff --git a/drivers/remoteproc/qcom_q6v5_pas.c b/drivers/remoteproc/qcom_q6v5_pas.c
index e635454d6170..ef85b5511dc9 100644
--- a/drivers/remoteproc/qcom_q6v5_pas.c
+++ b/drivers/remoteproc/qcom_q6v5_pas.c
@@ -242,7 +242,7 @@ static int adsp_stop(struct rproc *rproc)
 	return ret;
 }
 
-static void *adsp_da_to_va(struct rproc *rproc, u64 da, size_t len)
+static void *adsp_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem)
 {
 	struct qcom_adsp *adsp = (struct qcom_adsp *)rproc->priv;
 	int offset;
diff --git a/drivers/remoteproc/qcom_q6v5_wcss.c b/drivers/remoteproc/qcom_q6v5_wcss.c
index 78ebe1168b33..704cd63c9af4 100644
--- a/drivers/remoteproc/qcom_q6v5_wcss.c
+++ b/drivers/remoteproc/qcom_q6v5_wcss.c
@@ -410,7 +410,7 @@ static int q6v5_wcss_stop(struct rproc *rproc)
 	return 0;
 }
 
-static void *q6v5_wcss_da_to_va(struct rproc *rproc, u64 da, size_t len)
+static void *q6v5_wcss_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem)
 {
 	struct q6v5_wcss *wcss = rproc->priv;
 	int offset;
diff --git a/drivers/remoteproc/qcom_wcnss.c b/drivers/remoteproc/qcom_wcnss.c
index 2a6a23cb14ca..3a131163064c 100644
--- a/drivers/remoteproc/qcom_wcnss.c
+++ b/drivers/remoteproc/qcom_wcnss.c
@@ -320,7 +320,7 @@ static int wcnss_stop(struct rproc *rproc)
 	return ret;
 }
 
-static void *wcnss_da_to_va(struct rproc *rproc, u64 da, size_t len)
+static void *wcnss_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem)
 {
 	struct qcom_wcnss *wcnss = (struct qcom_wcnss *)rproc->priv;
 	int offset;
diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c
index 553e42a4d2a0..5071cdbfc926 100644
--- a/drivers/remoteproc/remoteproc_core.c
+++ b/drivers/remoteproc/remoteproc_core.c
@@ -189,13 +189,13 @@ EXPORT_SYMBOL(rproc_va_to_pa);
  * here the output of the DMA API for the carveouts, which should be more
  * correct.
  */
-void *rproc_da_to_va(struct rproc *rproc, u64 da, size_t len)
+void *rproc_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem)
 {
 	struct rproc_mem_entry *carveout;
 	void *ptr = NULL;
 
 	if (rproc->ops->da_to_va) {
-		ptr = rproc->ops->da_to_va(rproc, da, len);
+		ptr = rproc->ops->da_to_va(rproc, da, len, is_iomem);
 		if (ptr)
 			goto out;
 	}
@@ -217,6 +217,9 @@ void *rproc_da_to_va(struct rproc *rproc, u64 da, size_t len)
 
 		ptr = carveout->va + offset;
 
+		if (is_iomem)
+			*is_iomem = carveout->is_iomem;
+
 		break;
 	}
 
diff --git a/drivers/remoteproc/remoteproc_coredump.c b/drivers/remoteproc/remoteproc_coredump.c
index 81ec154a6a5e..aee657cc08c6 100644
--- a/drivers/remoteproc/remoteproc_coredump.c
+++ b/drivers/remoteproc/remoteproc_coredump.c
@@ -153,18 +153,22 @@ static void rproc_copy_segment(struct rproc *rproc, void *dest,
 			       size_t offset, size_t size)
 {
 	void *ptr;
+	bool is_iomem;
 
 	if (segment->dump) {
 		segment->dump(rproc, segment, dest, offset, size);
 	} else {
-		ptr = rproc_da_to_va(rproc, segment->da + offset, size);
+		ptr = rproc_da_to_va(rproc, segment->da + offset, size, &is_iomem);
 		if (!ptr) {
 			dev_err(&rproc->dev,
 				"invalid copy request for segment %pad with offset %zu and size %zu)\n",
 				&segment->da, offset, size);
 			memset(dest, 0xff, size);
 		} else {
-			memcpy(dest, ptr, size);
+			if (is_iomem)
+				memcpy_fromio(dest, ptr, size);
+			else
+				memcpy(dest, ptr, size);
 		}
 	}
 }
diff --git a/drivers/remoteproc/remoteproc_debugfs.c b/drivers/remoteproc/remoteproc_debugfs.c
index 7e5845376e9f..b5a1e3b697d9 100644
--- a/drivers/remoteproc/remoteproc_debugfs.c
+++ b/drivers/remoteproc/remoteproc_debugfs.c
@@ -132,7 +132,7 @@ static ssize_t rproc_trace_read(struct file *filp, char __user *userbuf,
 	char buf[100];
 	int len;
 
-	va = rproc_da_to_va(data->rproc, trace->da, trace->len);
+	va = rproc_da_to_va(data->rproc, trace->da, trace->len, NULL);
 
 	if (!va) {
 		len = scnprintf(buf, sizeof(buf), "Trace %s not available\n",
diff --git a/drivers/remoteproc/remoteproc_elf_loader.c b/drivers/remoteproc/remoteproc_elf_loader.c
index df68d87752e4..11423588965a 100644
--- a/drivers/remoteproc/remoteproc_elf_loader.c
+++ b/drivers/remoteproc/remoteproc_elf_loader.c
@@ -175,6 +175,7 @@ int rproc_elf_load_segments(struct rproc *rproc, const struct firmware *fw)
 		u64 offset = elf_phdr_get_p_offset(class, phdr);
 		u32 type = elf_phdr_get_p_type(class, phdr);
 		void *ptr;
+		bool is_iomem;
 
 		if (type != PT_LOAD)
 			continue;
@@ -204,7 +205,7 @@ int rproc_elf_load_segments(struct rproc *rproc, const struct firmware *fw)
 		}
 
 		/* grab the kernel address for this device address */
-		ptr = rproc_da_to_va(rproc, da, memsz);
+		ptr = rproc_da_to_va(rproc, da, memsz, &is_iomem);
 		if (!ptr) {
 			dev_err(dev, "bad phdr da 0x%llx mem 0x%llx\n", da,
 				memsz);
@@ -213,8 +214,12 @@ int rproc_elf_load_segments(struct rproc *rproc, const struct firmware *fw)
 		}
 
 		/* put the segment where the remote processor expects it */
-		if (filesz)
-			memcpy(ptr, elf_data + offset, filesz);
+		if (filesz) {
+			if (is_iomem)
+				memcpy_fromio(ptr, (void __iomem *)(elf_data + offset), filesz);
+			else
+				memcpy(ptr, elf_data + offset, filesz);
+		}
 
 		/*
 		 * Zero out remaining memory for this segment.
@@ -223,8 +228,12 @@ int rproc_elf_load_segments(struct rproc *rproc, const struct firmware *fw)
 		 * did this for us. albeit harmless, we may consider removing
 		 * this.
 		 */
-		if (memsz > filesz)
-			memset(ptr + filesz, 0, memsz - filesz);
+		if (memsz > filesz) {
+			if (is_iomem)
+				memset_io((void __iomem *)(ptr + filesz), 0, memsz - filesz);
+			else
+				memset(ptr + filesz, 0, memsz - filesz);
+		}
 	}
 
 	return ret;
@@ -377,6 +386,6 @@ struct resource_table *rproc_elf_find_loaded_rsc_table(struct rproc *rproc,
 		return NULL;
 	}
 
-	return rproc_da_to_va(rproc, sh_addr, sh_size);
+	return rproc_da_to_va(rproc, sh_addr, sh_size, NULL);
 }
 EXPORT_SYMBOL(rproc_elf_find_loaded_rsc_table);
diff --git a/drivers/remoteproc/remoteproc_internal.h b/drivers/remoteproc/remoteproc_internal.h
index c34002888d2c..9ea37aa687d2 100644
--- a/drivers/remoteproc/remoteproc_internal.h
+++ b/drivers/remoteproc/remoteproc_internal.h
@@ -84,7 +84,7 @@ static inline void  rproc_char_device_remove(struct rproc *rproc)
 void rproc_free_vring(struct rproc_vring *rvring);
 int rproc_alloc_vring(struct rproc_vdev *rvdev, int i);
 
-void *rproc_da_to_va(struct rproc *rproc, u64 da, size_t len);
+void *rproc_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem);
 phys_addr_t rproc_va_to_pa(void *cpu_addr);
 int rproc_trigger_recovery(struct rproc *rproc);
 
diff --git a/drivers/remoteproc/st_slim_rproc.c b/drivers/remoteproc/st_slim_rproc.c
index 09bcb4d8b9e0..22096adc1ad3 100644
--- a/drivers/remoteproc/st_slim_rproc.c
+++ b/drivers/remoteproc/st_slim_rproc.c
@@ -174,7 +174,7 @@ static int slim_rproc_stop(struct rproc *rproc)
 	return 0;
 }
 
-static void *slim_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len)
+static void *slim_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem)
 {
 	struct st_slim_rproc *slim_rproc = rproc->priv;
 	void *va = NULL;
diff --git a/drivers/remoteproc/ti_k3_dsp_remoteproc.c b/drivers/remoteproc/ti_k3_dsp_remoteproc.c
index 863c0214e0a8..fd4eb67a6681 100644
--- a/drivers/remoteproc/ti_k3_dsp_remoteproc.c
+++ b/drivers/remoteproc/ti_k3_dsp_remoteproc.c
@@ -354,7 +354,7 @@ static int k3_dsp_rproc_stop(struct rproc *rproc)
  * can be used either by the remoteproc core for loading (when using kernel
  * remoteproc loader), or by any rpmsg bus drivers.
  */
-static void *k3_dsp_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len)
+static void *k3_dsp_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem)
 {
 	struct k3_dsp_rproc *kproc = rproc->priv;
 	void __iomem *va = NULL;
diff --git a/drivers/remoteproc/ti_k3_r5_remoteproc.c b/drivers/remoteproc/ti_k3_r5_remoteproc.c
index 62b5a4c29456..5cf8d030a1f0 100644
--- a/drivers/remoteproc/ti_k3_r5_remoteproc.c
+++ b/drivers/remoteproc/ti_k3_r5_remoteproc.c
@@ -590,7 +590,7 @@ out:
  * present in a DSP or IPU device). The translated addresses can be used
  * either by the remoteproc core for loading, or by any rpmsg bus drivers.
  */
-static void *k3_r5_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len)
+static void *k3_r5_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem)
 {
 	struct k3_r5_rproc *kproc = rproc->priv;
 	struct k3_r5_core *core = kproc->core;
diff --git a/drivers/remoteproc/wkup_m3_rproc.c b/drivers/remoteproc/wkup_m3_rproc.c
index 92d387dfc03b..484f7605823e 100644
--- a/drivers/remoteproc/wkup_m3_rproc.c
+++ b/drivers/remoteproc/wkup_m3_rproc.c
@@ -89,7 +89,7 @@ static int wkup_m3_rproc_stop(struct rproc *rproc)
 	return error;
 }
 
-static void *wkup_m3_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len)
+static void *wkup_m3_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem)
 {
 	struct wkup_m3_rproc *wkupm3 = rproc->priv;
 	void *va = NULL;
diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
index a5f6d2d9cde2..1b7d56c7a453 100644
--- a/include/linux/remoteproc.h
+++ b/include/linux/remoteproc.h
@@ -386,7 +386,7 @@ struct rproc_ops {
 	int (*stop)(struct rproc *rproc);
 	int (*attach)(struct rproc *rproc);
 	void (*kick)(struct rproc *rproc, int vqid);
-	void * (*da_to_va)(struct rproc *rproc, u64 da, size_t len);
+	void * (*da_to_va)(struct rproc *rproc, u64 da, size_t len, bool *is_iomem);
 	int (*parse_fw)(struct rproc *rproc, const struct firmware *fw);
 	int (*handle_rsc)(struct rproc *rproc, u32 rsc_type, void *rsc,
 			  int offset, int avail);
-- 
cgit v1.2.3


From bade4be69a6ea6f38c5894468ede10ee60b6f7a0 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 11 Mar 2021 13:25:01 -0500
Subject: svcrdma: Revert "svcrdma: Reduce Receive doorbell rate"

I tested commit 43042b90cae1 ("svcrdma: Reduce Receive doorbell
rate") with mlx4 (IB) and software iWARP and didn't find any
issues. However, I recently got my hardware iWARP setup back on
line (FastLinQ) and it's crashing hard on this commit (confirmed
via bisect).

The failure mode is complex.
 - After a connection is established, the first Receive completes
   normally.
 - But the second and third Receives have garbage in their Receive
   buffers. The server responds with ERR_VERS as a result.
 - When the client tears down the connection to retry, a couple
   of posted Receives flush twice, and that corrupts the recv_ctxt
   free list.
 - __svc_rdma_free then faults or loops infinitely while destroying
   the xprt's recv_ctxts.

Since 43042b90cae1 ("svcrdma: Reduce Receive doorbell rate") does
not fix a bug but is a scalability enhancement, it's safe and
appropriate to revert it while working on a replacement.

Fixes: 43042b90cae1 ("svcrdma: Reduce Receive doorbell rate")
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h         |  1 -
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 82 ++++++++++++++++-----------------
 2 files changed, 39 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 7c693b31965e..1e76ed688044 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -104,7 +104,6 @@ struct svcxprt_rdma {
 
 	wait_queue_head_t    sc_send_wait;	/* SQ exhaustion waitlist */
 	unsigned long	     sc_flags;
-	u32		     sc_pending_recvs;
 	struct list_head     sc_read_complete_q;
 	struct work_struct   sc_work;
 
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 6d28f23ceb35..7d34290e2ff8 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -266,46 +266,33 @@ void svc_rdma_release_rqst(struct svc_rqst *rqstp)
 		svc_rdma_recv_ctxt_put(rdma, ctxt);
 }
 
-static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma,
-				   unsigned int wanted, bool temp)
+static int __svc_rdma_post_recv(struct svcxprt_rdma *rdma,
+				struct svc_rdma_recv_ctxt *ctxt)
 {
-	const struct ib_recv_wr *bad_wr = NULL;
-	struct svc_rdma_recv_ctxt *ctxt;
-	struct ib_recv_wr *recv_chain;
 	int ret;
 
-	recv_chain = NULL;
-	while (wanted--) {
-		ctxt = svc_rdma_recv_ctxt_get(rdma);
-		if (!ctxt)
-			break;
-
-		trace_svcrdma_post_recv(ctxt);
-		ctxt->rc_temp = temp;
-		ctxt->rc_recv_wr.next = recv_chain;
-		recv_chain = &ctxt->rc_recv_wr;
-		rdma->sc_pending_recvs++;
-	}
-	if (!recv_chain)
-		return false;
-
-	ret = ib_post_recv(rdma->sc_qp, recv_chain, &bad_wr);
+	trace_svcrdma_post_recv(ctxt);
+	ret = ib_post_recv(rdma->sc_qp, &ctxt->rc_recv_wr, NULL);
 	if (ret)
 		goto err_post;
-	return true;
+	return 0;
 
 err_post:
-	while (bad_wr) {
-		ctxt = container_of(bad_wr, struct svc_rdma_recv_ctxt,
-				    rc_recv_wr);
-		bad_wr = bad_wr->next;
-		svc_rdma_recv_ctxt_put(rdma, ctxt);
-	}
-
 	trace_svcrdma_rq_post_err(rdma, ret);
-	/* Since we're destroying the xprt, no need to reset
-	 * sc_pending_recvs. */
-	return false;
+	svc_rdma_recv_ctxt_put(rdma, ctxt);
+	return ret;
+}
+
+static int svc_rdma_post_recv(struct svcxprt_rdma *rdma)
+{
+	struct svc_rdma_recv_ctxt *ctxt;
+
+	if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags))
+		return 0;
+	ctxt = svc_rdma_recv_ctxt_get(rdma);
+	if (!ctxt)
+		return -ENOMEM;
+	return __svc_rdma_post_recv(rdma, ctxt);
 }
 
 /**
@@ -316,7 +303,20 @@ err_post:
  */
 bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma)
 {
-	return svc_rdma_refresh_recvs(rdma, rdma->sc_max_requests, true);
+	struct svc_rdma_recv_ctxt *ctxt;
+	unsigned int i;
+	int ret;
+
+	for (i = 0; i < rdma->sc_max_requests; i++) {
+		ctxt = svc_rdma_recv_ctxt_get(rdma);
+		if (!ctxt)
+			return false;
+		ctxt->rc_temp = true;
+		ret = __svc_rdma_post_recv(rdma, ctxt);
+		if (ret)
+			return false;
+	}
+	return true;
 }
 
 /**
@@ -324,6 +324,8 @@ bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma)
  * @cq: Completion Queue context
  * @wc: Work Completion object
  *
+ * NB: The svc_xprt/svcxprt_rdma is pinned whenever it's possible that
+ * the Receive completion handler could be running.
  */
 static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
 {
@@ -331,8 +333,6 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
 	struct ib_cqe *cqe = wc->wr_cqe;
 	struct svc_rdma_recv_ctxt *ctxt;
 
-	rdma->sc_pending_recvs--;
-
 	/* WARNING: Only wc->wr_cqe and wc->status are reliable */
 	ctxt = container_of(cqe, struct svc_rdma_recv_ctxt, rc_cqe);
 
@@ -340,6 +340,9 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
 	if (wc->status != IB_WC_SUCCESS)
 		goto flushed;
 
+	if (svc_rdma_post_recv(rdma))
+		goto post_err;
+
 	/* All wc fields are now known to be valid */
 	ctxt->rc_byte_len = wc->byte_len;
 
@@ -350,18 +353,11 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
 	spin_unlock(&rdma->sc_rq_dto_lock);
 	if (!test_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags))
 		svc_xprt_enqueue(&rdma->sc_xprt);
-
-	if (!test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags) &&
-	    rdma->sc_pending_recvs < rdma->sc_max_requests)
-		if (!svc_rdma_refresh_recvs(rdma, RPCRDMA_MAX_RECV_BATCH,
-					    false))
-			goto post_err;
-
 	return;
 
 flushed:
-	svc_rdma_recv_ctxt_put(rdma, ctxt);
 post_err:
+	svc_rdma_recv_ctxt_put(rdma, ctxt);
 	set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
 	svc_xprt_enqueue(&rdma->sc_xprt);
 }
-- 
cgit v1.2.3


From e36db6a06937c6fce3291f0c362d4f757b8ec703 Mon Sep 17 00:00:00 2001
From: Alexandru Ardelean <alexandru.ardelean@analog.com>
Date: Mon, 15 Feb 2021 12:40:21 +0200
Subject: iio: kfifo: add devm_iio_kfifo_buffer_setup() helper

This change adds the devm_iio_kfifo_buffer_setup() helper/short-hand,
which groups the simple routine of allocating a kfifo buffer via
devm_iio_kfifo_allocate() and calling iio_device_attach_buffer().

The mode_flags parameter is required, as the IIO kfifo supports 2 modes:
INDIO_BUFFER_SOFTWARE & INDIO_BUFFER_TRIGGERED.
The setup_ops parameter is optional.

This function will be a bit more useful when needing to define multiple
buffers per IIO device.

The naming for this function has been inspired by
iio_triggered_buffer_setup() since that one does a kfifo alloc + a pollfunc
alloc. So, the name should have a familiar ring to it.

Signed-off-by: Alexandru Ardelean <alexandru.ardelean@analog.com>
Link: https://lore.kernel.org/r/20210215104043.91251-3-alexandru.ardelean@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 Documentation/driver-api/driver-model/devres.rst |  1 +
 drivers/iio/buffer/kfifo_buf.c                   | 39 ++++++++++++++++++++++++
 include/linux/iio/kfifo_buf.h                    |  7 +++++
 3 files changed, 47 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst
index cd8b6e657b94..562f5722281e 100644
--- a/Documentation/driver-api/driver-model/devres.rst
+++ b/Documentation/driver-api/driver-model/devres.rst
@@ -286,6 +286,7 @@ IIO
   devm_iio_device_alloc()
   devm_iio_device_register()
   devm_iio_kfifo_allocate()
+  devm_iio_kfifo_buffer_setup()
   devm_iio_triggered_buffer_setup()
   devm_iio_trigger_alloc()
   devm_iio_trigger_register()
diff --git a/drivers/iio/buffer/kfifo_buf.c b/drivers/iio/buffer/kfifo_buf.c
index 1359abed3b31..6472c9fa1937 100644
--- a/drivers/iio/buffer/kfifo_buf.c
+++ b/drivers/iio/buffer/kfifo_buf.c
@@ -206,4 +206,43 @@ struct iio_buffer *devm_iio_kfifo_allocate(struct device *dev)
 }
 EXPORT_SYMBOL(devm_iio_kfifo_allocate);
 
+/**
+ * devm_iio_kfifo_buffer_setup - Allocate a kfifo buffer & attach it to an IIO device
+ * @dev: Device object to which to attach the life-time of this kfifo buffer
+ * @indio_dev: The device the buffer should be attached to
+ * @mode_flags: The mode flags for this buffer (INDIO_BUFFER_SOFTWARE and/or
+ *		INDIO_BUFFER_TRIGGERED).
+ * @setup_ops: The setup_ops required to configure the HW part of the buffer (optional)
+ *
+ * This function allocates a kfifo buffer via devm_iio_kfifo_allocate() and
+ * attaches it to the IIO device via iio_device_attach_buffer().
+ * This is meant to be a bit of a short-hand/helper function as there are a few
+ * drivers that seem to do this.
+ */
+int devm_iio_kfifo_buffer_setup(struct device *dev,
+				struct iio_dev *indio_dev,
+				int mode_flags,
+				const struct iio_buffer_setup_ops *setup_ops)
+{
+	struct iio_buffer *buffer;
+
+	if (mode_flags)
+		mode_flags &= kfifo_access_funcs.modes;
+
+	if (!mode_flags)
+		return -EINVAL;
+
+	buffer = devm_iio_kfifo_allocate(dev);
+	if (!buffer)
+		return -ENOMEM;
+
+	iio_device_attach_buffer(indio_dev, buffer);
+
+	indio_dev->modes |= mode_flags;
+	indio_dev->setup_ops = setup_ops;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devm_iio_kfifo_buffer_setup);
+
 MODULE_LICENSE("GPL");
diff --git a/include/linux/iio/kfifo_buf.h b/include/linux/iio/kfifo_buf.h
index 1fc1efa7799d..92c411b9ac26 100644
--- a/include/linux/iio/kfifo_buf.h
+++ b/include/linux/iio/kfifo_buf.h
@@ -3,6 +3,8 @@
 #define __LINUX_IIO_KFIFO_BUF_H__
 
 struct iio_buffer;
+struct iio_buffer_setup_ops;
+struct iio_dev;
 struct device;
 
 struct iio_buffer *iio_kfifo_allocate(void);
@@ -10,4 +12,9 @@ void iio_kfifo_free(struct iio_buffer *r);
 
 struct iio_buffer *devm_iio_kfifo_allocate(struct device *dev);
 
+int devm_iio_kfifo_buffer_setup(struct device *dev,
+				struct iio_dev *indio_dev,
+				int mode_flags,
+				const struct iio_buffer_setup_ops *setup_ops);
+
 #endif
-- 
cgit v1.2.3


From 99f6e8215b627730e2e6d371430a1f81e8ed6c27 Mon Sep 17 00:00:00 2001
From: Alexandru Ardelean <alexandru.ardelean@analog.com>
Date: Mon, 15 Feb 2021 12:40:24 +0200
Subject: iio: kfifo: un-export devm_iio_kfifo_allocate() function

At this point all drivers should use devm_iio_kfifo_buffer_setup() instead
of manually allocating via devm_iio_kfifo_allocate() and assigning ops and
modes.

With this change, devm_iio_kfifo_allocate() is made private to the
IIO core, since all drivers should call either
devm_iio_kfifo_buffer_setup() or devm_iio_triggered_buffer_setup() to
create a kfifo buffer.

Signed-off-by: Alexandru Ardelean <alexandru.ardelean@analog.com>
Link: https://lore.kernel.org/r/20210215104043.91251-6-alexandru.ardelean@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 Documentation/driver-api/driver-model/devres.rst | 1 -
 drivers/iio/buffer/kfifo_buf.c                   | 3 +--
 include/linux/iio/kfifo_buf.h                    | 2 --
 3 files changed, 1 insertion(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst
index 562f5722281e..4b15b3e9358b 100644
--- a/Documentation/driver-api/driver-model/devres.rst
+++ b/Documentation/driver-api/driver-model/devres.rst
@@ -285,7 +285,6 @@ I2C
 IIO
   devm_iio_device_alloc()
   devm_iio_device_register()
-  devm_iio_kfifo_allocate()
   devm_iio_kfifo_buffer_setup()
   devm_iio_triggered_buffer_setup()
   devm_iio_trigger_alloc()
diff --git a/drivers/iio/buffer/kfifo_buf.c b/drivers/iio/buffer/kfifo_buf.c
index 6472c9fa1937..c35a625280b1 100644
--- a/drivers/iio/buffer/kfifo_buf.c
+++ b/drivers/iio/buffer/kfifo_buf.c
@@ -186,7 +186,7 @@ static void devm_iio_kfifo_release(struct device *dev, void *res)
  * RETURNS:
  * Pointer to allocated iio_buffer on success, NULL on failure.
  */
-struct iio_buffer *devm_iio_kfifo_allocate(struct device *dev)
+static struct iio_buffer *devm_iio_kfifo_allocate(struct device *dev)
 {
 	struct iio_buffer **ptr, *r;
 
@@ -204,7 +204,6 @@ struct iio_buffer *devm_iio_kfifo_allocate(struct device *dev)
 
 	return r;
 }
-EXPORT_SYMBOL(devm_iio_kfifo_allocate);
 
 /**
  * devm_iio_kfifo_buffer_setup - Allocate a kfifo buffer & attach it to an IIO device
diff --git a/include/linux/iio/kfifo_buf.h b/include/linux/iio/kfifo_buf.h
index 92c411b9ac26..1522896e1daf 100644
--- a/include/linux/iio/kfifo_buf.h
+++ b/include/linux/iio/kfifo_buf.h
@@ -10,8 +10,6 @@ struct device;
 struct iio_buffer *iio_kfifo_allocate(void);
 void iio_kfifo_free(struct iio_buffer *r);
 
-struct iio_buffer *devm_iio_kfifo_allocate(struct device *dev);
-
 int devm_iio_kfifo_buffer_setup(struct device *dev,
 				struct iio_dev *indio_dev,
 				int mode_flags,
-- 
cgit v1.2.3


From a02c09e42b3ed6cefae671e302835f1f04bf474e Mon Sep 17 00:00:00 2001
From: Alexandru Ardelean <alexandru.ardelean@analog.com>
Date: Mon, 15 Feb 2021 12:40:25 +0200
Subject: iio: buffer-dma,adi-axi-adc: introduce
 devm_iio_dmaengine_buffer_setup()

This change converts devm_iio_dmaengine_buffer_alloc() to
devm_iio_dmaengine_buffer_setup(), which allocates an IIO DMA buffer and
attaches it to the IIO device, similar to devm_iio_triggered_buffer_setup()
(though the underlying code is different, the final logic is the same).

Since the only user of the devm_iio_dmaengine_buffer_alloc() was the
adi-axi-adc driver, this change does the replacement in a single go in the
driver.

Signed-off-by: Alexandru Ardelean <alexandru.ardelean@analog.com>
Link: https://lore.kernel.org/r/20210215104043.91251-7-alexandru.ardelean@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 Documentation/driver-api/driver-model/devres.rst   |  1 +
 drivers/iio/adc/adi-axi-adc.c                      | 12 ++------
 drivers/iio/buffer/industrialio-buffer-dmaengine.c | 33 ++++++++++++++++++++--
 include/linux/iio/buffer-dmaengine.h               |  7 +++--
 4 files changed, 38 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst
index 4b15b3e9358b..5f8c6c303ff2 100644
--- a/Documentation/driver-api/driver-model/devres.rst
+++ b/Documentation/driver-api/driver-model/devres.rst
@@ -285,6 +285,7 @@ I2C
 IIO
   devm_iio_device_alloc()
   devm_iio_device_register()
+  devm_iio_dmaengine_buffer_setup()
   devm_iio_kfifo_buffer_setup()
   devm_iio_triggered_buffer_setup()
   devm_iio_trigger_alloc()
diff --git a/drivers/iio/adc/adi-axi-adc.c b/drivers/iio/adc/adi-axi-adc.c
index 9109da2d2e15..2e84623f732e 100644
--- a/drivers/iio/adc/adi-axi-adc.c
+++ b/drivers/iio/adc/adi-axi-adc.c
@@ -104,7 +104,6 @@ static unsigned int adi_axi_adc_read(struct adi_axi_adc_state *st,
 static int adi_axi_adc_config_dma_buffer(struct device *dev,
 					 struct iio_dev *indio_dev)
 {
-	struct iio_buffer *buffer;
 	const char *dma_name;
 
 	if (!device_property_present(dev, "dmas"))
@@ -113,15 +112,8 @@ static int adi_axi_adc_config_dma_buffer(struct device *dev,
 	if (device_property_read_string(dev, "dma-names", &dma_name))
 		dma_name = "rx";
 
-	buffer = devm_iio_dmaengine_buffer_alloc(indio_dev->dev.parent,
-						 dma_name);
-	if (IS_ERR(buffer))
-		return PTR_ERR(buffer);
-
-	indio_dev->modes |= INDIO_BUFFER_HARDWARE;
-	iio_device_attach_buffer(indio_dev, buffer);
-
-	return 0;
+	return devm_iio_dmaengine_buffer_setup(indio_dev->dev.parent,
+					       indio_dev, dma_name);
 }
 
 static int adi_axi_adc_read_raw(struct iio_dev *indio_dev,
diff --git a/drivers/iio/buffer/industrialio-buffer-dmaengine.c b/drivers/iio/buffer/industrialio-buffer-dmaengine.c
index b0cb9a35f5cd..9981896e1495 100644
--- a/drivers/iio/buffer/industrialio-buffer-dmaengine.c
+++ b/drivers/iio/buffer/industrialio-buffer-dmaengine.c
@@ -244,7 +244,7 @@ static void __devm_iio_dmaengine_buffer_free(struct device *dev, void *res)
  *
  * The buffer will be automatically de-allocated once the device gets destroyed.
  */
-struct iio_buffer *devm_iio_dmaengine_buffer_alloc(struct device *dev,
+static struct iio_buffer *devm_iio_dmaengine_buffer_alloc(struct device *dev,
 	const char *channel)
 {
 	struct iio_buffer **bufferp, *buffer;
@@ -265,7 +265,36 @@ struct iio_buffer *devm_iio_dmaengine_buffer_alloc(struct device *dev,
 
 	return buffer;
 }
-EXPORT_SYMBOL_GPL(devm_iio_dmaengine_buffer_alloc);
+
+/**
+ * devm_iio_dmaengine_buffer_setup() - Setup a DMA buffer for an IIO device
+ * @dev: Parent device for the buffer
+ * @indio_dev: IIO device to which to attach this buffer.
+ * @channel: DMA channel name, typically "rx".
+ *
+ * This allocates a new IIO buffer with devm_iio_dmaengine_buffer_alloc()
+ * and attaches it to an IIO device with iio_device_attach_buffer().
+ * It also appends the INDIO_BUFFER_HARDWARE mode to the supported modes of the
+ * IIO device.
+ */
+int devm_iio_dmaengine_buffer_setup(struct device *dev,
+				    struct iio_dev *indio_dev,
+				    const char *channel)
+{
+	struct iio_buffer *buffer;
+
+	buffer = devm_iio_dmaengine_buffer_alloc(indio_dev->dev.parent,
+						 channel);
+	if (IS_ERR(buffer))
+		return PTR_ERR(buffer);
+
+	indio_dev->modes |= INDIO_BUFFER_HARDWARE;
+
+	iio_device_attach_buffer(indio_dev, buffer);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devm_iio_dmaengine_buffer_setup);
 
 MODULE_AUTHOR("Lars-Peter Clausen <lars@metafoo.de>");
 MODULE_DESCRIPTION("DMA buffer for the IIO framework");
diff --git a/include/linux/iio/buffer-dmaengine.h b/include/linux/iio/buffer-dmaengine.h
index 5b502291d6a4..5c355be89814 100644
--- a/include/linux/iio/buffer-dmaengine.h
+++ b/include/linux/iio/buffer-dmaengine.h
@@ -7,10 +7,11 @@
 #ifndef __IIO_DMAENGINE_H__
 #define __IIO_DMAENGINE_H__
 
-struct iio_buffer;
+struct iio_dev;
 struct device;
 
-struct iio_buffer *devm_iio_dmaengine_buffer_alloc(struct device *dev,
-						   const char *channel);
+int devm_iio_dmaengine_buffer_setup(struct device *dev,
+				    struct iio_dev *indio_dev,
+				    const char *channel);
 
 #endif
-- 
cgit v1.2.3


From 32f171724e5cbecc80594fb6eced057cfdd6eb6f Mon Sep 17 00:00:00 2001
From: Alexandru Ardelean <alexandru.ardelean@analog.com>
Date: Mon, 15 Feb 2021 12:40:29 +0200
Subject: iio: core: rework iio device group creation

Up until now, the device groups that an IIO device had were limited to 6.
Two of these groups would account for buffer attributes (the buffer/ and
scan_elements/ directories).

Since we want to add multiple buffers per IIO device, this number may not
be enough when adding a second buffer. So, this change reallocates the
groups array whenever an IIO device group is added, via an
iio_device_register_sysfs_group() helper.

This also means that the groups array should be assigned to
'indio_dev.dev.groups' really late, right before {cdev_}device_add() is
called to do the entire setup.
And we also must take care to free this array when the sysfs resources are
being cleaned up.

With this change we can also move the 'groups' & 'groupcounter' fields to
the iio_dev_opaque object. Up until now, this didn't make a whole lot of
sense (especially since we weren't sure what multibuffer support would look
like in the end).
But doing it now kills two birds with one stone.

An alternative would be to add a configurable Kconfig symbol
CONFIG_IIO_MAX_BUFFERS_PER_DEVICE (or something like that) and compute a
static maximum of the groups we can support per IIO device. But that would
probably annoy a few people since that would make the system less
configurable.

Signed-off-by: Alexandru Ardelean <alexandru.ardelean@analog.com>
Link: https://lore.kernel.org/r/20210215104043.91251-11-alexandru.ardelean@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/iio_core.h             |  3 +++
 drivers/iio/industrialio-buffer.c  | 12 ++++++++++--
 drivers/iio/industrialio-core.c    | 30 +++++++++++++++++++++++++++---
 drivers/iio/industrialio-event.c   |  5 ++++-
 drivers/iio/industrialio-trigger.c |  6 ++----
 include/linux/iio/iio-opaque.h     |  4 ++++
 include/linux/iio/iio.h            |  5 -----
 7 files changed, 50 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/iio_core.h b/drivers/iio/iio_core.h
index fced02cadcc3..7d5b179c1fe7 100644
--- a/drivers/iio/iio_core.h
+++ b/drivers/iio/iio_core.h
@@ -46,6 +46,9 @@ int __iio_add_chan_devattr(const char *postfix,
 			   struct list_head *attr_list);
 void iio_free_chan_devattr_list(struct list_head *attr_list);
 
+int iio_device_register_sysfs_group(struct iio_dev *indio_dev,
+				    const struct attribute_group *group);
+
 ssize_t iio_format_value(char *buf, unsigned int type, int size, int *vals);
 
 /* Event interface flags */
diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c
index 2f7426a2f47c..cc846988fdb9 100644
--- a/drivers/iio/industrialio-buffer.c
+++ b/drivers/iio/industrialio-buffer.c
@@ -1287,7 +1287,9 @@ static int __iio_buffer_alloc_sysfs_and_mask(struct iio_buffer *buffer,
 	buffer->buffer_group.name = "buffer";
 	buffer->buffer_group.attrs = attr;
 
-	indio_dev->groups[indio_dev->groupcounter++] = &buffer->buffer_group;
+	ret = iio_device_register_sysfs_group(indio_dev, &buffer->buffer_group);
+	if (ret)
+		goto error_free_buffer_attrs;
 
 	attrcount = 0;
 	INIT_LIST_HEAD(&buffer->scan_el_dev_attr_list);
@@ -1330,14 +1332,20 @@ static int __iio_buffer_alloc_sysfs_and_mask(struct iio_buffer *buffer,
 
 	list_for_each_entry(p, &buffer->scan_el_dev_attr_list, l)
 		buffer->scan_el_group.attrs[attrn++] = &p->dev_attr.attr;
-	indio_dev->groups[indio_dev->groupcounter++] = &buffer->scan_el_group;
+
+	ret = iio_device_register_sysfs_group(indio_dev, &buffer->scan_el_group);
+	if (ret)
+		goto error_free_scan_el_attrs;
 
 	return 0;
 
+error_free_scan_el_attrs:
+	kfree(buffer->scan_el_group.attrs);
 error_free_scan_mask:
 	bitmap_free(buffer->scan_mask);
 error_cleanup_dynamic:
 	iio_free_chan_devattr_list(&buffer->scan_el_dev_attr_list);
+error_free_buffer_attrs:
 	kfree(buffer->buffer_group.attrs);
 
 	return ret;
diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index 92efbc245098..924f3a167125 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -1454,6 +1454,25 @@ static ssize_t iio_store_timestamp_clock(struct device *dev,
 	return len;
 }
 
+int iio_device_register_sysfs_group(struct iio_dev *indio_dev,
+				    const struct attribute_group *group)
+{
+	struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev);
+	const struct attribute_group **new, **old = iio_dev_opaque->groups;
+	unsigned int cnt = iio_dev_opaque->groupcounter;
+
+	new = krealloc(old, sizeof(*new) * (cnt + 2), GFP_KERNEL);
+	if (!new)
+		return -ENOMEM;
+
+	new[iio_dev_opaque->groupcounter++] = group;
+	new[iio_dev_opaque->groupcounter] = NULL;
+
+	iio_dev_opaque->groups = new;
+
+	return 0;
+}
+
 static DEVICE_ATTR(current_timestamp_clock, S_IRUGO | S_IWUSR,
 		   iio_show_timestamp_clock, iio_store_timestamp_clock);
 
@@ -1527,8 +1546,10 @@ static int iio_device_register_sysfs(struct iio_dev *indio_dev)
 	if (clk)
 		iio_dev_opaque->chan_attr_group.attrs[attrn++] = clk;
 
-	indio_dev->groups[indio_dev->groupcounter++] =
-		&iio_dev_opaque->chan_attr_group;
+	ret = iio_device_register_sysfs_group(indio_dev,
+					      &iio_dev_opaque->chan_attr_group);
+	if (ret)
+		goto error_clear_attrs;
 
 	return 0;
 
@@ -1545,6 +1566,7 @@ static void iio_device_unregister_sysfs(struct iio_dev *indio_dev)
 	iio_free_chan_devattr_list(&iio_dev_opaque->channel_attr_list);
 	kfree(iio_dev_opaque->chan_attr_group.attrs);
 	iio_dev_opaque->chan_attr_group.attrs = NULL;
+	kfree(iio_dev_opaque->groups);
 }
 
 static void iio_dev_release(struct device *device)
@@ -1594,7 +1616,6 @@ struct iio_dev *iio_device_alloc(struct device *parent, int sizeof_priv)
 		ALIGN(sizeof(struct iio_dev_opaque), IIO_ALIGN);
 
 	dev->dev.parent = parent;
-	dev->dev.groups = dev->groups;
 	dev->dev.type = &iio_device_type;
 	dev->dev.bus = &iio_bus_type;
 	device_initialize(&dev->dev);
@@ -1857,6 +1878,9 @@ int __iio_device_register(struct iio_dev *indio_dev, struct module *this_mod)
 		indio_dev->chrdev.owner = this_mod;
 	}
 
+	/* assign device groups now; they should be all registered now */
+	indio_dev->dev.groups = iio_dev_opaque->groups;
+
 	ret = cdev_device_add(&indio_dev->chrdev, &indio_dev->dev);
 	if (ret < 0)
 		goto error_unreg_eventset;
diff --git a/drivers/iio/industrialio-event.c b/drivers/iio/industrialio-event.c
index 7e532117ac55..ea8947cc21e4 100644
--- a/drivers/iio/industrialio-event.c
+++ b/drivers/iio/industrialio-event.c
@@ -544,7 +544,10 @@ int iio_device_register_eventset(struct iio_dev *indio_dev)
 	/* Add all elements from the list. */
 	list_for_each_entry(p, &ev_int->dev_attr_list, l)
 		ev_int->group.attrs[attrn++] = &p->dev_attr.attr;
-	indio_dev->groups[indio_dev->groupcounter++] = &ev_int->group;
+
+	ret = iio_device_register_sysfs_group(indio_dev, &ev_int->group);
+	if (ret)
+		goto error_free_setup_event_lines;
 
 	ev_int->ioctl_handler.ioctl = iio_event_ioctl;
 	iio_device_ioctl_handler_register(&iio_dev_opaque->indio_dev,
diff --git a/drivers/iio/industrialio-trigger.c b/drivers/iio/industrialio-trigger.c
index 438d5012e8b8..a035d5c2a445 100644
--- a/drivers/iio/industrialio-trigger.c
+++ b/drivers/iio/industrialio-trigger.c
@@ -694,10 +694,8 @@ EXPORT_SYMBOL(iio_trigger_validate_own_device);
 
 int iio_device_register_trigger_consumer(struct iio_dev *indio_dev)
 {
-	indio_dev->groups[indio_dev->groupcounter++] =
-		&iio_trigger_consumer_attr_group;
-
-	return 0;
+	return iio_device_register_sysfs_group(indio_dev,
+					       &iio_trigger_consumer_attr_group);
 }
 
 void iio_device_unregister_trigger_consumer(struct iio_dev *indio_dev)
diff --git a/include/linux/iio/iio-opaque.h b/include/linux/iio/iio-opaque.h
index 07c5a8e52ca8..8ba13a5c7af6 100644
--- a/include/linux/iio/iio-opaque.h
+++ b/include/linux/iio/iio-opaque.h
@@ -12,6 +12,8 @@
  *				attributes
  * @chan_attr_group:		group for all attrs in base directory
  * @ioctl_handlers:		ioctl handlers registered with the core handler
+ * @groups:			attribute groups
+ * @groupcounter:		index of next attribute group
  * @debugfs_dentry:		device specific debugfs dentry
  * @cached_reg_addr:		cached register address for debugfs reads
  * @read_buf:			read buffer to be used for the initial reg read
@@ -24,6 +26,8 @@ struct iio_dev_opaque {
 	struct list_head		channel_attr_list;
 	struct attribute_group		chan_attr_group;
 	struct list_head		ioctl_handlers;
+	const struct attribute_group	**groups;
+	int				groupcounter;
 #if defined(CONFIG_DEBUG_FS)
 	struct dentry			*debugfs_dentry;
 	unsigned			cached_reg_addr;
diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index e4a9822e6495..f8585d01fc76 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -518,8 +518,6 @@ struct iio_buffer_setup_ops {
  * @setup_ops:		[DRIVER] callbacks to call before and after buffer
  *			enable/disable
  * @chrdev:		[INTERN] associated character device
- * @groups:		[INTERN] attribute groups
- * @groupcounter:	[INTERN] index of next attribute group
  * @flags:		[INTERN] file ops related flags including busy flag.
  * @priv:		[DRIVER] reference to driver's private information
  *			**MUST** be accessed **ONLY** via iio_priv() helper
@@ -556,9 +554,6 @@ struct iio_dev {
 	struct mutex			info_exist_lock;
 	const struct iio_buffer_setup_ops	*setup_ops;
 	struct cdev			chrdev;
-#define IIO_MAX_GROUPS 6
-	const struct attribute_group	*groups[IIO_MAX_GROUPS + 1];
-	int				groupcounter;
 
 	unsigned long			flags;
 	void				*priv;
-- 
cgit v1.2.3


From d9a625744ed0e452f5c495cd8c51eed4b6623a4c Mon Sep 17 00:00:00 2001
From: Alexandru Ardelean <alexandru.ardelean@analog.com>
Date: Mon, 15 Feb 2021 12:40:31 +0200
Subject: iio: core: merge buffer/ & scan_elements/ attributes

With this change, we create a new directory for the IIO device called
buffer0, under which both the old buffer/ and scan_elements/ are stored.

This is done to simplify the addition of multiple IIO buffers per IIO
device. Otherwise we would need to add a bufferX/ and scan_elementsX/
directory for each IIO buffer.
With the current way of storing attribute groups, we can't have directories
stored under each other (i.e. scan_elements/ under buffer/), so the best
approach moving forward is to merge their attributes.

The old/legacy buffer/ & scan_elements/ groups are not stored on the opaque
IIO device object. This way the IIO buffer can have just a single
attribute_group object, saving a bit of memory when adding multiple IIO
buffers.

Signed-off-by: Alexandru Ardelean <alexandru.ardelean@analog.com>
Link: https://lore.kernel.org/r/20210215104043.91251-13-alexandru.ardelean@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/industrialio-buffer.c | 115 +++++++++++++++++++++++++++++---------
 include/linux/iio/buffer_impl.h   |   9 +--
 include/linux/iio/iio-opaque.h    |   4 ++
 3 files changed, 95 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c
index 01ab3dd0726a..76f0f6a61ebc 100644
--- a/drivers/iio/industrialio-buffer.c
+++ b/drivers/iio/industrialio-buffer.c
@@ -1175,8 +1175,6 @@ done:
 	return (ret < 0) ? ret : len;
 }
 
-static const char * const iio_scan_elements_group_name = "scan_elements";
-
 static ssize_t iio_buffer_show_watermark(struct device *dev,
 					 struct device_attribute *attr,
 					 char *buf)
@@ -1252,8 +1250,68 @@ static struct attribute *iio_buffer_attrs[] = {
 	&dev_attr_data_available.attr,
 };
 
+static int iio_buffer_register_legacy_sysfs_groups(struct iio_dev *indio_dev,
+						   struct attribute **buffer_attrs,
+						   int buffer_attrcount,
+						   int scan_el_attrcount)
+{
+	struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev);
+	struct attribute_group *group;
+	struct attribute **attrs;
+	int ret;
+
+	attrs = kcalloc(buffer_attrcount + 1, sizeof(*attrs), GFP_KERNEL);
+	if (!attrs)
+		return -ENOMEM;
+
+	memcpy(attrs, buffer_attrs, buffer_attrcount * sizeof(*attrs));
+
+	group = &iio_dev_opaque->legacy_buffer_group;
+	group->attrs = attrs;
+	group->name = "buffer";
+
+	ret = iio_device_register_sysfs_group(indio_dev, group);
+	if (ret)
+		goto error_free_buffer_attrs;
+
+	attrs = kcalloc(scan_el_attrcount + 1, sizeof(*attrs), GFP_KERNEL);
+	if (!attrs) {
+		ret = -ENOMEM;
+		goto error_free_buffer_attrs;
+	}
+
+	memcpy(attrs, &buffer_attrs[buffer_attrcount],
+	       scan_el_attrcount * sizeof(*attrs));
+
+	group = &iio_dev_opaque->legacy_scan_el_group;
+	group->attrs = attrs;
+	group->name = "scan_elements";
+
+	ret = iio_device_register_sysfs_group(indio_dev, group);
+	if (ret)
+		goto error_free_scan_el_attrs;
+
+	return 0;
+
+error_free_buffer_attrs:
+	kfree(iio_dev_opaque->legacy_buffer_group.attrs);
+error_free_scan_el_attrs:
+	kfree(iio_dev_opaque->legacy_scan_el_group.attrs);
+
+	return ret;
+}
+
+static void iio_buffer_unregister_legacy_sysfs_groups(struct iio_dev *indio_dev)
+{
+	struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev);
+
+	kfree(iio_dev_opaque->legacy_buffer_group.attrs);
+	kfree(iio_dev_opaque->legacy_scan_el_group.attrs);
+}
+
 static int __iio_buffer_alloc_sysfs_and_mask(struct iio_buffer *buffer,
-					     struct iio_dev *indio_dev)
+					     struct iio_dev *indio_dev,
+					     int index)
 {
 	struct iio_dev_attr *p;
 	struct attribute **attr;
@@ -1294,8 +1352,8 @@ static int __iio_buffer_alloc_sysfs_and_mask(struct iio_buffer *buffer,
 		}
 	}
 
-	attr = kcalloc(buffer_attrcount + ARRAY_SIZE(iio_buffer_attrs) + 1,
-		       sizeof(*attr), GFP_KERNEL);
+	attrn = buffer_attrcount + scan_el_attrcount + ARRAY_SIZE(iio_buffer_attrs);
+	attr = kcalloc(attrn + 1, sizeof(* attr), GFP_KERNEL);
 	if (!attr) {
 		ret = -ENOMEM;
 		goto error_free_scan_mask;
@@ -1313,37 +1371,38 @@ static int __iio_buffer_alloc_sysfs_and_mask(struct iio_buffer *buffer,
 		       sizeof(struct attribute *) * buffer_attrcount);
 
 	buffer_attrcount += ARRAY_SIZE(iio_buffer_attrs);
-	attr[buffer_attrcount] = NULL;
-
-	buffer->buffer_group.name = "buffer";
-	buffer->buffer_group.attrs = attr;
 
-	ret = iio_device_register_sysfs_group(indio_dev, &buffer->buffer_group);
-	if (ret)
-		goto error_free_buffer_attrs;
+	attrn = buffer_attrcount;
 
-	buffer->scan_el_group.name = iio_scan_elements_group_name;
+	list_for_each_entry(p, &buffer->scan_el_dev_attr_list, l)
+		attr[attrn++] = &p->dev_attr.attr;
 
-	buffer->scan_el_group.attrs = kcalloc(scan_el_attrcount + 1,
-					      sizeof(buffer->scan_el_group.attrs[0]),
-					      GFP_KERNEL);
-	if (buffer->scan_el_group.attrs == NULL) {
+	buffer->buffer_group.name = kasprintf(GFP_KERNEL, "buffer%d", index);
+	if (!buffer->buffer_group.name) {
 		ret = -ENOMEM;
-		goto error_free_scan_mask;
+		goto error_free_buffer_attrs;
 	}
-	attrn = 0;
 
-	list_for_each_entry(p, &buffer->scan_el_dev_attr_list, l)
-		buffer->scan_el_group.attrs[attrn++] = &p->dev_attr.attr;
+	buffer->buffer_group.attrs = attr;
 
-	ret = iio_device_register_sysfs_group(indio_dev, &buffer->scan_el_group);
+	ret = iio_device_register_sysfs_group(indio_dev, &buffer->buffer_group);
 	if (ret)
-		goto error_free_scan_el_attrs;
+		goto error_free_buffer_attr_group_name;
+
+	/* we only need to register the legacy groups for the first buffer */
+	if (index > 0)
+		return 0;
+
+	ret = iio_buffer_register_legacy_sysfs_groups(indio_dev, attr,
+						      buffer_attrcount,
+						      scan_el_attrcount);
+	if (ret)
+		goto error_free_buffer_attr_group_name;
 
 	return 0;
 
-error_free_scan_el_attrs:
-	kfree(buffer->scan_el_group.attrs);
+error_free_buffer_attr_group_name:
+	kfree(buffer->buffer_group.name);
 error_free_buffer_attrs:
 	kfree(buffer->buffer_group.attrs);
 error_free_scan_mask:
@@ -1372,14 +1431,14 @@ int iio_buffer_alloc_sysfs_and_mask(struct iio_dev *indio_dev)
 	if (!buffer)
 		return 0;
 
-	return __iio_buffer_alloc_sysfs_and_mask(buffer, indio_dev);
+	return __iio_buffer_alloc_sysfs_and_mask(buffer, indio_dev, 0);
 }
 
 static void __iio_buffer_free_sysfs_and_mask(struct iio_buffer *buffer)
 {
 	bitmap_free(buffer->scan_mask);
+	kfree(buffer->buffer_group.name);
 	kfree(buffer->buffer_group.attrs);
-	kfree(buffer->scan_el_group.attrs);
 	iio_free_chan_devattr_list(&buffer->scan_el_dev_attr_list);
 }
 
@@ -1390,6 +1449,8 @@ void iio_buffer_free_sysfs_and_mask(struct iio_dev *indio_dev)
 	if (!buffer)
 		return;
 
+	iio_buffer_unregister_legacy_sysfs_groups(indio_dev);
+
 	__iio_buffer_free_sysfs_and_mask(buffer);
 }
 
diff --git a/include/linux/iio/buffer_impl.h b/include/linux/iio/buffer_impl.h
index a63dc07b7350..3e555e58475b 100644
--- a/include/linux/iio/buffer_impl.h
+++ b/include/linux/iio/buffer_impl.h
@@ -100,14 +100,11 @@ struct iio_buffer {
 	/* @scan_el_dev_attr_list: List of scan element related attributes. */
 	struct list_head scan_el_dev_attr_list;
 
-	/* @buffer_group: Attributes of the buffer group. */
-	struct attribute_group buffer_group;
-
 	/*
-	 * @scan_el_group: Attribute group for those attributes not
-	 * created from the iio_chan_info array.
+	 * @buffer_group: Attributes of the new buffer group.
+	 * Includes scan elements attributes.
 	 */
-	struct attribute_group scan_el_group;
+	struct attribute_group buffer_group;
 
 	/* @attrs: Standard attributes of the buffer. */
 	const struct attribute **attrs;
diff --git a/include/linux/iio/iio-opaque.h b/include/linux/iio/iio-opaque.h
index 8ba13a5c7af6..1a9310b0145f 100644
--- a/include/linux/iio/iio-opaque.h
+++ b/include/linux/iio/iio-opaque.h
@@ -14,6 +14,8 @@
  * @ioctl_handlers:		ioctl handlers registered with the core handler
  * @groups:			attribute groups
  * @groupcounter:		index of next attribute group
+ * @legacy_scan_el_group:	attribute group for the legacy scan_elements/ attributes
+ * @legacy_buffer_group:	attribute group for the legacy buffer/ attributes
  * @debugfs_dentry:		device specific debugfs dentry
  * @cached_reg_addr:		cached register address for debugfs reads
  * @read_buf:			read buffer to be used for the initial reg read
@@ -28,6 +30,8 @@ struct iio_dev_opaque {
 	struct list_head		ioctl_handlers;
 	const struct attribute_group	**groups;
 	int				groupcounter;
+	struct attribute_group		legacy_scan_el_group;
+	struct attribute_group		legacy_buffer_group;
 #if defined(CONFIG_DEBUG_FS)
 	struct dentry			*debugfs_dentry;
 	unsigned			cached_reg_addr;
-- 
cgit v1.2.3


From 3e3d11b2e43b9a967d98261250c19636b893b7ed Mon Sep 17 00:00:00 2001
From: Alexandru Ardelean <alexandru.ardelean@analog.com>
Date: Mon, 15 Feb 2021 12:40:32 +0200
Subject: iio: add reference to iio buffer on iio_dev_attr

This change adds a reference to a 'struct iio_buffer' object on the
iio_dev_attr object. This way, we can use the created iio_dev_attr objects
on a per-buffer basis (since they're allocated anyway).

A minor downside of this change is that the number of parameters on
__iio_add_chan_devattr() grows by 1. This looks like it could do with a bit
of a re-think.

Signed-off-by: Alexandru Ardelean <alexandru.ardelean@analog.com>
Link: https://lore.kernel.org/r/20210215104043.91251-14-alexandru.ardelean@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/iio_core.h            | 2 ++
 drivers/iio/industrialio-buffer.c | 4 ++++
 drivers/iio/industrialio-core.c   | 6 ++++++
 drivers/iio/industrialio-event.c  | 1 +
 include/linux/iio/sysfs.h         | 3 +++
 5 files changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/iio/iio_core.h b/drivers/iio/iio_core.h
index 7d5b179c1fe7..731f5170d5b9 100644
--- a/drivers/iio/iio_core.h
+++ b/drivers/iio/iio_core.h
@@ -12,6 +12,7 @@
 #include <linux/kernel.h>
 #include <linux/device.h>
 
+struct iio_buffer;
 struct iio_chan_spec;
 struct iio_dev;
 
@@ -43,6 +44,7 @@ int __iio_add_chan_devattr(const char *postfix,
 			   u64 mask,
 			   enum iio_shared_by shared_by,
 			   struct device *dev,
+			   struct iio_buffer *buffer,
 			   struct list_head *attr_list);
 void iio_free_chan_devattr_list(struct list_head *attr_list);
 
diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c
index 76f0f6a61ebc..e6edec3bcb73 100644
--- a/drivers/iio/industrialio-buffer.c
+++ b/drivers/iio/industrialio-buffer.c
@@ -447,6 +447,7 @@ static int iio_buffer_add_channel_sysfs(struct iio_dev *indio_dev,
 				     0,
 				     IIO_SEPARATE,
 				     &indio_dev->dev,
+				     buffer,
 				     &buffer->scan_el_dev_attr_list);
 	if (ret)
 		return ret;
@@ -458,6 +459,7 @@ static int iio_buffer_add_channel_sysfs(struct iio_dev *indio_dev,
 				     0,
 				     0,
 				     &indio_dev->dev,
+				     buffer,
 				     &buffer->scan_el_dev_attr_list);
 	if (ret)
 		return ret;
@@ -470,6 +472,7 @@ static int iio_buffer_add_channel_sysfs(struct iio_dev *indio_dev,
 					     chan->scan_index,
 					     0,
 					     &indio_dev->dev,
+					     buffer,
 					     &buffer->scan_el_dev_attr_list);
 	else
 		ret = __iio_add_chan_devattr("en",
@@ -479,6 +482,7 @@ static int iio_buffer_add_channel_sysfs(struct iio_dev *indio_dev,
 					     chan->scan_index,
 					     0,
 					     &indio_dev->dev,
+					     buffer,
 					     &buffer->scan_el_dev_attr_list);
 	if (ret)
 		return ret;
diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index 924f3a167125..f4e5a08bbcd5 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -1116,6 +1116,7 @@ int __iio_add_chan_devattr(const char *postfix,
 			   u64 mask,
 			   enum iio_shared_by shared_by,
 			   struct device *dev,
+			   struct iio_buffer *buffer,
 			   struct list_head *attr_list)
 {
 	int ret;
@@ -1131,6 +1132,7 @@ int __iio_add_chan_devattr(const char *postfix,
 		goto error_iio_dev_attr_free;
 	iio_attr->c = chan;
 	iio_attr->address = mask;
+	iio_attr->buffer = buffer;
 	list_for_each_entry(t, attr_list, l)
 		if (strcmp(t->dev_attr.attr.name,
 			   iio_attr->dev_attr.attr.name) == 0) {
@@ -1167,6 +1169,7 @@ static int iio_device_add_channel_label(struct iio_dev *indio_dev,
 				     0,
 				     IIO_SEPARATE,
 				     &indio_dev->dev,
+				     NULL,
 				     &iio_dev_opaque->channel_attr_list);
 	if (ret < 0)
 		return ret;
@@ -1192,6 +1195,7 @@ static int iio_device_add_info_mask_type(struct iio_dev *indio_dev,
 					     i,
 					     shared_by,
 					     &indio_dev->dev,
+					     NULL,
 					     &iio_dev_opaque->channel_attr_list);
 		if ((ret == -EBUSY) && (shared_by != IIO_SEPARATE))
 			continue;
@@ -1228,6 +1232,7 @@ static int iio_device_add_info_mask_type_avail(struct iio_dev *indio_dev,
 					     i,
 					     shared_by,
 					     &indio_dev->dev,
+					     NULL,
 					     &iio_dev_opaque->channel_attr_list);
 		kfree(avail_postfix);
 		if ((ret == -EBUSY) && (shared_by != IIO_SEPARATE))
@@ -1324,6 +1329,7 @@ static int iio_device_add_channel_sysfs(struct iio_dev *indio_dev,
 					i,
 					ext_info->shared,
 					&indio_dev->dev,
+					NULL,
 					&iio_dev_opaque->channel_attr_list);
 			i++;
 			if (ret == -EBUSY && ext_info->shared)
diff --git a/drivers/iio/industrialio-event.c b/drivers/iio/industrialio-event.c
index ea8947cc21e4..a30e289fc362 100644
--- a/drivers/iio/industrialio-event.c
+++ b/drivers/iio/industrialio-event.c
@@ -385,6 +385,7 @@ static int iio_device_add_event(struct iio_dev *indio_dev,
 
 		ret = __iio_add_chan_devattr(postfix, chan, show, store,
 			 (i << 16) | spec_index, shared_by, &indio_dev->dev,
+			 NULL,
 			&iio_dev_opaque->event_interface->dev_attr_list);
 		kfree(postfix);
 
diff --git a/include/linux/iio/sysfs.h b/include/linux/iio/sysfs.h
index b532c875bc24..e51fba66de4b 100644
--- a/include/linux/iio/sysfs.h
+++ b/include/linux/iio/sysfs.h
@@ -9,6 +9,7 @@
 #ifndef _INDUSTRIAL_IO_SYSFS_H_
 #define _INDUSTRIAL_IO_SYSFS_H_
 
+struct iio_buffer;
 struct iio_chan_spec;
 
 /**
@@ -17,12 +18,14 @@ struct iio_chan_spec;
  * @address:	associated register address
  * @l:		list head for maintaining list of dynamically created attrs
  * @c:		specification for the underlying channel
+ * @buffer:	the IIO buffer to which this attribute belongs (if any)
  */
 struct iio_dev_attr {
 	struct device_attribute dev_attr;
 	u64 address;
 	struct list_head l;
 	struct iio_chan_spec const *c;
+	struct iio_buffer *buffer;
 };
 
 #define to_iio_dev_attr(_dev_attr)				\
-- 
cgit v1.2.3


From 15097c7a1adc0554ce8eb6f5fd6758d063bfea44 Mon Sep 17 00:00:00 2001
From: Alexandru Ardelean <alexandru.ardelean@analog.com>
Date: Mon, 15 Feb 2021 12:40:33 +0200
Subject: iio: buffer: wrap all buffer attributes into iio_dev_attr

This change wraps all buffer attributes into iio_dev_attr objects, and
assigns a reference to the IIO buffer they belong to.

With the addition of multiple IIO buffers per one IIO device, we need a way
to know which IIO buffer is being enabled/disabled/controlled.

We know that all buffer attributes are device_attributes. So we can wrap
them in iio_dev_attr objects. In the iio_dev_attr type, we can also hold
a reference to an IIO buffer.
So, we end up being able to allocate wrapped attributes for all buffer
attributes (even the one from other drivers).

The neat part of this mechanism is that we don't need to add any extra
cleanup, because these attributes are being added to a dynamic list that
will get cleaned up via iio_free_chan_devattr_list().

With this change, the 'buffer->scan_el_dev_attr_list' list is being renamed
to 'buffer->buffer_attr_list', effectively merging (or finalizing the
merge) of the buffer/ & scan_elements/ attributes internally.

Accessing these new buffer attributes can now be done via
'to_iio_dev_attr(attr)->buffer' inside the show/store handlers.

Signed-off-by: Alexandru Ardelean <alexandru.ardelean@analog.com>
Link: https://lore.kernel.org/r/20210215104043.91251-15-alexandru.ardelean@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/industrialio-buffer.c | 77 ++++++++++++++++++++++++++-------------
 drivers/iio/industrialio-core.c   |  2 +-
 include/linux/iio/buffer_impl.h   |  4 +-
 3 files changed, 54 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c
index e6edec3bcb73..ea91f9578dee 100644
--- a/drivers/iio/industrialio-buffer.c
+++ b/drivers/iio/industrialio-buffer.c
@@ -253,8 +253,7 @@ static ssize_t iio_scan_el_show(struct device *dev,
 				char *buf)
 {
 	int ret;
-	struct iio_dev *indio_dev = dev_to_iio_dev(dev);
-	struct iio_buffer *buffer = indio_dev->buffer;
+	struct iio_buffer *buffer = to_iio_dev_attr(attr)->buffer;
 
 	/* Ensure ret is 0 or 1. */
 	ret = !!test_bit(to_iio_dev_attr(attr)->address,
@@ -367,8 +366,8 @@ static ssize_t iio_scan_el_store(struct device *dev,
 	int ret;
 	bool state;
 	struct iio_dev *indio_dev = dev_to_iio_dev(dev);
-	struct iio_buffer *buffer = indio_dev->buffer;
 	struct iio_dev_attr *this_attr = to_iio_dev_attr(attr);
+	struct iio_buffer *buffer = this_attr->buffer;
 
 	ret = strtobool(buf, &state);
 	if (ret < 0)
@@ -402,8 +401,7 @@ static ssize_t iio_scan_el_ts_show(struct device *dev,
 				   struct device_attribute *attr,
 				   char *buf)
 {
-	struct iio_dev *indio_dev = dev_to_iio_dev(dev);
-	struct iio_buffer *buffer = indio_dev->buffer;
+	struct iio_buffer *buffer = to_iio_dev_attr(attr)->buffer;
 
 	return sprintf(buf, "%d\n", buffer->scan_timestamp);
 }
@@ -415,7 +413,7 @@ static ssize_t iio_scan_el_ts_store(struct device *dev,
 {
 	int ret;
 	struct iio_dev *indio_dev = dev_to_iio_dev(dev);
-	struct iio_buffer *buffer = indio_dev->buffer;
+	struct iio_buffer *buffer = to_iio_dev_attr(attr)->buffer;
 	bool state;
 
 	ret = strtobool(buf, &state);
@@ -448,7 +446,7 @@ static int iio_buffer_add_channel_sysfs(struct iio_dev *indio_dev,
 				     IIO_SEPARATE,
 				     &indio_dev->dev,
 				     buffer,
-				     &buffer->scan_el_dev_attr_list);
+				     &buffer->buffer_attr_list);
 	if (ret)
 		return ret;
 	attrcount++;
@@ -460,7 +458,7 @@ static int iio_buffer_add_channel_sysfs(struct iio_dev *indio_dev,
 				     0,
 				     &indio_dev->dev,
 				     buffer,
-				     &buffer->scan_el_dev_attr_list);
+				     &buffer->buffer_attr_list);
 	if (ret)
 		return ret;
 	attrcount++;
@@ -473,7 +471,7 @@ static int iio_buffer_add_channel_sysfs(struct iio_dev *indio_dev,
 					     0,
 					     &indio_dev->dev,
 					     buffer,
-					     &buffer->scan_el_dev_attr_list);
+					     &buffer->buffer_attr_list);
 	else
 		ret = __iio_add_chan_devattr("en",
 					     chan,
@@ -483,7 +481,7 @@ static int iio_buffer_add_channel_sysfs(struct iio_dev *indio_dev,
 					     0,
 					     &indio_dev->dev,
 					     buffer,
-					     &buffer->scan_el_dev_attr_list);
+					     &buffer->buffer_attr_list);
 	if (ret)
 		return ret;
 	attrcount++;
@@ -495,8 +493,7 @@ static ssize_t iio_buffer_read_length(struct device *dev,
 				      struct device_attribute *attr,
 				      char *buf)
 {
-	struct iio_dev *indio_dev = dev_to_iio_dev(dev);
-	struct iio_buffer *buffer = indio_dev->buffer;
+	struct iio_buffer *buffer = to_iio_dev_attr(attr)->buffer;
 
 	return sprintf(buf, "%d\n", buffer->length);
 }
@@ -506,7 +503,7 @@ static ssize_t iio_buffer_write_length(struct device *dev,
 				       const char *buf, size_t len)
 {
 	struct iio_dev *indio_dev = dev_to_iio_dev(dev);
-	struct iio_buffer *buffer = indio_dev->buffer;
+	struct iio_buffer *buffer = to_iio_dev_attr(attr)->buffer;
 	unsigned int val;
 	int ret;
 
@@ -538,8 +535,7 @@ static ssize_t iio_buffer_show_enable(struct device *dev,
 				      struct device_attribute *attr,
 				      char *buf)
 {
-	struct iio_dev *indio_dev = dev_to_iio_dev(dev);
-	struct iio_buffer *buffer = indio_dev->buffer;
+	struct iio_buffer *buffer = to_iio_dev_attr(attr)->buffer;
 
 	return sprintf(buf, "%d\n", iio_buffer_is_active(buffer));
 }
@@ -1154,7 +1150,7 @@ static ssize_t iio_buffer_store_enable(struct device *dev,
 	int ret;
 	bool requested_state;
 	struct iio_dev *indio_dev = dev_to_iio_dev(dev);
-	struct iio_buffer *buffer = indio_dev->buffer;
+	struct iio_buffer *buffer = to_iio_dev_attr(attr)->buffer;
 	bool inlist;
 
 	ret = strtobool(buf, &requested_state);
@@ -1183,8 +1179,7 @@ static ssize_t iio_buffer_show_watermark(struct device *dev,
 					 struct device_attribute *attr,
 					 char *buf)
 {
-	struct iio_dev *indio_dev = dev_to_iio_dev(dev);
-	struct iio_buffer *buffer = indio_dev->buffer;
+	struct iio_buffer *buffer = to_iio_dev_attr(attr)->buffer;
 
 	return sprintf(buf, "%u\n", buffer->watermark);
 }
@@ -1195,7 +1190,7 @@ static ssize_t iio_buffer_store_watermark(struct device *dev,
 					  size_t len)
 {
 	struct iio_dev *indio_dev = dev_to_iio_dev(dev);
-	struct iio_buffer *buffer = indio_dev->buffer;
+	struct iio_buffer *buffer = to_iio_dev_attr(attr)->buffer;
 	unsigned int val;
 	int ret;
 
@@ -1228,8 +1223,7 @@ static ssize_t iio_dma_show_data_available(struct device *dev,
 						struct device_attribute *attr,
 						char *buf)
 {
-	struct iio_dev *indio_dev = dev_to_iio_dev(dev);
-	struct iio_buffer *buffer = indio_dev->buffer;
+	struct iio_buffer *buffer = to_iio_dev_attr(attr)->buffer;
 
 	return sprintf(buf, "%zu\n", iio_buffer_data_available(buffer));
 }
@@ -1254,6 +1248,27 @@ static struct attribute *iio_buffer_attrs[] = {
 	&dev_attr_data_available.attr,
 };
 
+#define to_dev_attr(_attr) container_of(_attr, struct device_attribute, attr)
+
+static struct attribute *iio_buffer_wrap_attr(struct iio_buffer *buffer,
+					      struct attribute *attr)
+{
+	struct device_attribute *dattr = to_dev_attr(attr);
+	struct iio_dev_attr *iio_attr;
+
+	iio_attr = kzalloc(sizeof(*iio_attr), GFP_KERNEL);
+	if (!iio_attr)
+		return NULL;
+
+	iio_attr->buffer = buffer;
+	memcpy(&iio_attr->dev_attr, dattr, sizeof(iio_attr->dev_attr));
+	iio_attr->dev_attr.attr.name = kstrdup_const(attr->name, GFP_KERNEL);
+
+	list_add(&iio_attr->l, &buffer->buffer_attr_list);
+
+	return &iio_attr->dev_attr.attr;
+}
+
 static int iio_buffer_register_legacy_sysfs_groups(struct iio_dev *indio_dev,
 						   struct attribute **buffer_attrs,
 						   int buffer_attrcount,
@@ -1329,7 +1344,7 @@ static int __iio_buffer_alloc_sysfs_and_mask(struct iio_buffer *buffer,
 	}
 
 	scan_el_attrcount = 0;
-	INIT_LIST_HEAD(&buffer->scan_el_dev_attr_list);
+	INIT_LIST_HEAD(&buffer->buffer_attr_list);
 	channels = indio_dev->channels;
 	if (channels) {
 		/* new magic */
@@ -1376,9 +1391,19 @@ static int __iio_buffer_alloc_sysfs_and_mask(struct iio_buffer *buffer,
 
 	buffer_attrcount += ARRAY_SIZE(iio_buffer_attrs);
 
-	attrn = buffer_attrcount;
+	for (i = 0; i < buffer_attrcount; i++) {
+		struct attribute *wrapped;
+
+		wrapped = iio_buffer_wrap_attr(buffer, attr[i]);
+		if (!wrapped) {
+			ret = -ENOMEM;
+			goto error_free_scan_mask;
+		}
+		attr[i] = wrapped;
+	}
 
-	list_for_each_entry(p, &buffer->scan_el_dev_attr_list, l)
+	attrn = 0;
+	list_for_each_entry(p, &buffer->buffer_attr_list, l)
 		attr[attrn++] = &p->dev_attr.attr;
 
 	buffer->buffer_group.name = kasprintf(GFP_KERNEL, "buffer%d", index);
@@ -1412,7 +1437,7 @@ error_free_buffer_attrs:
 error_free_scan_mask:
 	bitmap_free(buffer->scan_mask);
 error_cleanup_dynamic:
-	iio_free_chan_devattr_list(&buffer->scan_el_dev_attr_list);
+	iio_free_chan_devattr_list(&buffer->buffer_attr_list);
 
 	return ret;
 }
@@ -1443,7 +1468,7 @@ static void __iio_buffer_free_sysfs_and_mask(struct iio_buffer *buffer)
 	bitmap_free(buffer->scan_mask);
 	kfree(buffer->buffer_group.name);
 	kfree(buffer->buffer_group.attrs);
-	iio_free_chan_devattr_list(&buffer->scan_el_dev_attr_list);
+	iio_free_chan_devattr_list(&buffer->buffer_attr_list);
 }
 
 void iio_buffer_free_sysfs_and_mask(struct iio_dev *indio_dev)
diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index f4e5a08bbcd5..b8b149ccecef 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -1357,7 +1357,7 @@ void iio_free_chan_devattr_list(struct list_head *attr_list)
 	struct iio_dev_attr *p, *n;
 
 	list_for_each_entry_safe(p, n, attr_list, l) {
-		kfree(p->dev_attr.attr.name);
+		kfree_const(p->dev_attr.attr.name);
 		list_del(&p->l);
 		kfree(p);
 	}
diff --git a/include/linux/iio/buffer_impl.h b/include/linux/iio/buffer_impl.h
index 3e555e58475b..41044320e581 100644
--- a/include/linux/iio/buffer_impl.h
+++ b/include/linux/iio/buffer_impl.h
@@ -97,8 +97,8 @@ struct iio_buffer {
 	/* @scan_timestamp: Does the scan mode include a timestamp. */
 	bool scan_timestamp;
 
-	/* @scan_el_dev_attr_list: List of scan element related attributes. */
-	struct list_head scan_el_dev_attr_list;
+	/* @buffer_attr_list: List of buffer attributes. */
+	struct list_head buffer_attr_list;
 
 	/*
 	 * @buffer_group: Attributes of the new buffer group.
-- 
cgit v1.2.3


From ee708e6baacd3afdace9b721c25fbbe106cebb94 Mon Sep 17 00:00:00 2001
From: Alexandru Ardelean <alexandru.ardelean@analog.com>
Date: Mon, 15 Feb 2021 12:40:38 +0200
Subject: iio: buffer: introduce support for attaching more IIO buffers

With this change, calling iio_device_attach_buffer() will actually attach
more buffers.
Right now this doesn't do any validation of whether a buffer is attached
twice; maybe that can be added later (if needed). Attaching a buffer more
than once should yield noticeably bad results.

The first buffer is the legacy buffer, so a reference is kept to it.

At this point, accessing the data for the extra buffers (that are added
after the first one) isn't possible yet.

The iio_device_attach_buffer() is also changed to return an error code,
which for now is -ENOMEM if the array could not be realloc-ed for more
buffers.
To adapt to this new change, iio_device_attach_buffer() is called last in
all places where it's called. The realloc failure is a bit difficult to
handle during un-managed calls when unwinding, so it's better to have this
as the last error in the setup_buffer calls.

At this point, no driver should call iio_device_attach_buffer() directly,
it should call one of the {devm_}iio_triggered_buffer_setup() or
devm_iio_kfifo_buffer_setup() or devm_iio_dmaengine_buffer_setup()
functions. This makes iio_device_attach_buffer() a bit easier to handle.

Signed-off-by: Alexandru Ardelean <alexandru.ardelean@analog.com>
Link: https://lore.kernel.org/r/20210215104043.91251-20-alexandru.ardelean@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/buffer/industrialio-buffer-dmaengine.c |   4 +-
 drivers/iio/buffer/industrialio-triggered-buffer.c |  10 ++-
 drivers/iio/buffer/kfifo_buf.c                     |   4 +-
 drivers/iio/iio_core.h                             |  10 ++-
 drivers/iio/industrialio-buffer.c                  | 100 +++++++++++++++++----
 drivers/iio/industrialio-core.c                    |  12 +--
 include/linux/iio/buffer.h                         |   4 +-
 include/linux/iio/buffer_impl.h                    |   3 +
 include/linux/iio/iio-opaque.h                     |   4 +
 9 files changed, 111 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/buffer/industrialio-buffer-dmaengine.c b/drivers/iio/buffer/industrialio-buffer-dmaengine.c
index a64b222289be..d76179878ff9 100644
--- a/drivers/iio/buffer/industrialio-buffer-dmaengine.c
+++ b/drivers/iio/buffer/industrialio-buffer-dmaengine.c
@@ -290,9 +290,7 @@ int devm_iio_dmaengine_buffer_setup(struct device *dev,
 
 	indio_dev->modes |= INDIO_BUFFER_HARDWARE;
 
-	iio_device_attach_buffer(indio_dev, buffer);
-
-	return 0;
+	return iio_device_attach_buffer(indio_dev, buffer);
 }
 EXPORT_SYMBOL_GPL(devm_iio_dmaengine_buffer_setup);
 
diff --git a/drivers/iio/buffer/industrialio-triggered-buffer.c b/drivers/iio/buffer/industrialio-triggered-buffer.c
index 92b8aea3e063..b2b1b7d27af4 100644
--- a/drivers/iio/buffer/industrialio-triggered-buffer.c
+++ b/drivers/iio/buffer/industrialio-triggered-buffer.c
@@ -50,8 +50,6 @@ int iio_triggered_buffer_setup_ext(struct iio_dev *indio_dev,
 		goto error_ret;
 	}
 
-	iio_device_attach_buffer(indio_dev, buffer);
-
 	indio_dev->pollfunc = iio_alloc_pollfunc(h,
 						 thread,
 						 IRQF_ONESHOT,
@@ -72,10 +70,16 @@ int iio_triggered_buffer_setup_ext(struct iio_dev *indio_dev,
 
 	buffer->attrs = buffer_attrs;
 
+	ret = iio_device_attach_buffer(indio_dev, buffer);
+	if (ret < 0)
+		goto error_dealloc_pollfunc;
+
 	return 0;
 
+error_dealloc_pollfunc:
+	iio_dealloc_pollfunc(indio_dev->pollfunc);
 error_kfifo_free:
-	iio_kfifo_free(indio_dev->buffer);
+	iio_kfifo_free(buffer);
 error_ret:
 	return ret;
 }
diff --git a/drivers/iio/buffer/kfifo_buf.c b/drivers/iio/buffer/kfifo_buf.c
index c35a625280b1..34289ce12f20 100644
--- a/drivers/iio/buffer/kfifo_buf.c
+++ b/drivers/iio/buffer/kfifo_buf.c
@@ -235,12 +235,10 @@ int devm_iio_kfifo_buffer_setup(struct device *dev,
 	if (!buffer)
 		return -ENOMEM;
 
-	iio_device_attach_buffer(indio_dev, buffer);
-
 	indio_dev->modes |= mode_flags;
 	indio_dev->setup_ops = setup_ops;
 
-	return 0;
+	return iio_device_attach_buffer(indio_dev, buffer);
 }
 EXPORT_SYMBOL_GPL(devm_iio_kfifo_buffer_setup);
 
diff --git a/drivers/iio/iio_core.h b/drivers/iio/iio_core.h
index 87868fff7d37..7990c759f1f5 100644
--- a/drivers/iio/iio_core.h
+++ b/drivers/iio/iio_core.h
@@ -69,29 +69,31 @@ __poll_t iio_buffer_poll(struct file *filp,
 ssize_t iio_buffer_read_outer(struct file *filp, char __user *buf,
 			      size_t n, loff_t *f_ps);
 
-int iio_buffer_alloc_sysfs_and_mask(struct iio_dev *indio_dev);
-void iio_buffer_free_sysfs_and_mask(struct iio_dev *indio_dev);
+int iio_buffers_alloc_sysfs_and_mask(struct iio_dev *indio_dev);
+void iio_buffers_free_sysfs_and_mask(struct iio_dev *indio_dev);
 
 #define iio_buffer_poll_addr (&iio_buffer_poll)
 #define iio_buffer_read_outer_addr (&iio_buffer_read_outer)
 
 void iio_disable_all_buffers(struct iio_dev *indio_dev);
 void iio_buffer_wakeup_poll(struct iio_dev *indio_dev);
+void iio_buffers_put(struct iio_dev *indio_dev);
 
 #else
 
 #define iio_buffer_poll_addr NULL
 #define iio_buffer_read_outer_addr NULL
 
-static inline int iio_buffer_alloc_sysfs_and_mask(struct iio_dev *indio_dev)
+static inline int iio_buffers_alloc_sysfs_and_mask(struct iio_dev *indio_dev)
 {
 	return 0;
 }
 
-static inline void iio_buffer_free_sysfs_and_mask(struct iio_dev *indio_dev) {}
+static inline void iio_buffers_free_sysfs_and_mask(struct iio_dev *indio_dev) {}
 
 static inline void iio_disable_all_buffers(struct iio_dev *indio_dev) {}
 static inline void iio_buffer_wakeup_poll(struct iio_dev *indio_dev) {}
+static inline void iio_buffers_put(struct iio_dev *indio_dev) {}
 
 #endif
 
diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c
index 349c0c011745..b60c2e66bd1c 100644
--- a/drivers/iio/industrialio-buffer.c
+++ b/drivers/iio/industrialio-buffer.c
@@ -193,12 +193,14 @@ __poll_t iio_buffer_poll(struct file *filp,
  */
 void iio_buffer_wakeup_poll(struct iio_dev *indio_dev)
 {
-	struct iio_buffer *buffer = indio_dev->buffer;
-
-	if (!buffer)
-		return;
+	struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev);
+	struct iio_buffer *buffer;
+	unsigned int i;
 
-	wake_up(&buffer->pollq);
+	for (i = 0; i < iio_dev_opaque->attached_buffers_cnt; i++) {
+		buffer = iio_dev_opaque->attached_buffers[i];
+		wake_up(&buffer->pollq);
+	}
 }
 
 void iio_buffer_init(struct iio_buffer *buffer)
@@ -212,6 +214,18 @@ void iio_buffer_init(struct iio_buffer *buffer)
 }
 EXPORT_SYMBOL(iio_buffer_init);
 
+void iio_buffers_put(struct iio_dev *indio_dev)
+{
+	struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev);
+	struct iio_buffer *buffer;
+	unsigned int i;
+
+	for (i = 0; i < iio_dev_opaque->attached_buffers_cnt; i++) {
+		buffer = iio_dev_opaque->attached_buffers[i];
+		iio_buffer_put(buffer);
+	}
+}
+
 static ssize_t iio_show_scan_index(struct device *dev,
 				   struct device_attribute *attr,
 				   char *buf)
@@ -1452,11 +1466,13 @@ static void __iio_buffer_free_sysfs_and_mask(struct iio_buffer *buffer)
 	iio_free_chan_devattr_list(&buffer->buffer_attr_list);
 }
 
-int iio_buffer_alloc_sysfs_and_mask(struct iio_dev *indio_dev)
+int iio_buffers_alloc_sysfs_and_mask(struct iio_dev *indio_dev)
 {
-	struct iio_buffer *buffer = indio_dev->buffer;
+	struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev);
 	const struct iio_chan_spec *channels;
-	int i;
+	struct iio_buffer *buffer;
+	int unwind_idx;
+	int ret, i;
 
 	channels = indio_dev->channels;
 	if (channels) {
@@ -1467,22 +1483,46 @@ int iio_buffer_alloc_sysfs_and_mask(struct iio_dev *indio_dev)
 		indio_dev->masklength = ml;
 	}
 
-	if (!buffer)
+	if (!iio_dev_opaque->attached_buffers_cnt)
 		return 0;
 
-	return __iio_buffer_alloc_sysfs_and_mask(buffer, indio_dev, 0);
+	for (i = 0; i < iio_dev_opaque->attached_buffers_cnt; i++) {
+		buffer = iio_dev_opaque->attached_buffers[i];
+		ret = __iio_buffer_alloc_sysfs_and_mask(buffer, indio_dev, i);
+		if (ret) {
+			unwind_idx = i;
+			goto error_unwind_sysfs_and_mask;
+		}
+	}
+
+	return 0;
+
+error_unwind_sysfs_and_mask:
+	for (; unwind_idx >= 0; unwind_idx--) {
+		buffer = iio_dev_opaque->attached_buffers[unwind_idx];
+		__iio_buffer_free_sysfs_and_mask(buffer);
+	}
+	kfree(iio_dev_opaque->attached_buffers);
+	return ret;
 }
 
-void iio_buffer_free_sysfs_and_mask(struct iio_dev *indio_dev)
+void iio_buffers_free_sysfs_and_mask(struct iio_dev *indio_dev)
 {
-	struct iio_buffer *buffer = indio_dev->buffer;
+	struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev);
+	struct iio_buffer *buffer;
+	int i;
 
-	if (!buffer)
+	if (!iio_dev_opaque->attached_buffers_cnt)
 		return;
 
 	iio_buffer_unregister_legacy_sysfs_groups(indio_dev);
 
-	__iio_buffer_free_sysfs_and_mask(buffer);
+	for (i = iio_dev_opaque->attached_buffers_cnt - 1; i >= 0; i--) {
+		buffer = iio_dev_opaque->attached_buffers[i];
+		__iio_buffer_free_sysfs_and_mask(buffer);
+	}
+
+	kfree(iio_dev_opaque->attached_buffers);
 }
 
 /**
@@ -1600,13 +1640,35 @@ EXPORT_SYMBOL_GPL(iio_buffer_put);
  * @indio_dev: The device the buffer should be attached to
  * @buffer: The buffer to attach to the device
  *
+ * Return 0 if successful, negative if error.
+ *
  * This function attaches a buffer to a IIO device. The buffer stays attached to
- * the device until the device is freed. The function should only be called at
- * most once per device.
+ * the device until the device is freed. For legacy reasons, the first attached
+ * buffer will also be assigned to 'indio_dev->buffer'.
  */
-void iio_device_attach_buffer(struct iio_dev *indio_dev,
-			      struct iio_buffer *buffer)
+int iio_device_attach_buffer(struct iio_dev *indio_dev,
+			     struct iio_buffer *buffer)
 {
-	indio_dev->buffer = iio_buffer_get(buffer);
+	struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev);
+	struct iio_buffer **new, **old = iio_dev_opaque->attached_buffers;
+	unsigned int cnt = iio_dev_opaque->attached_buffers_cnt;
+
+	cnt++;
+
+	new = krealloc(old, sizeof(*new) * cnt, GFP_KERNEL);
+	if (!new)
+		return -ENOMEM;
+	iio_dev_opaque->attached_buffers = new;
+
+	buffer = iio_buffer_get(buffer);
+
+	/* first buffer is legacy; attach it to the IIO device directly */
+	if (!indio_dev->buffer)
+		indio_dev->buffer = buffer;
+
+	iio_dev_opaque->attached_buffers[cnt - 1] = buffer;
+	iio_dev_opaque->attached_buffers_cnt = cnt;
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(iio_device_attach_buffer);
diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index b96acfe13e92..1645c739c44f 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -1585,7 +1585,7 @@ static void iio_dev_release(struct device *device)
 	iio_device_unregister_eventset(indio_dev);
 	iio_device_unregister_sysfs(indio_dev);
 
-	iio_buffer_put(indio_dev->buffer);
+	iio_buffers_put(indio_dev);
 
 	ida_simple_remove(&iio_ida, indio_dev->id);
 	kfree(iio_dev_opaque);
@@ -1862,7 +1862,7 @@ int __iio_device_register(struct iio_dev *indio_dev, struct module *this_mod)
 
 	iio_device_register_debugfs(indio_dev);
 
-	ret = iio_buffer_alloc_sysfs_and_mask(indio_dev);
+	ret = iio_buffers_alloc_sysfs_and_mask(indio_dev);
 	if (ret) {
 		dev_err(indio_dev->dev.parent,
 			"Failed to create buffer sysfs interfaces\n");
@@ -1888,12 +1888,12 @@ int __iio_device_register(struct iio_dev *indio_dev, struct module *this_mod)
 		indio_dev->setup_ops == NULL)
 		indio_dev->setup_ops = &noop_ring_setup_ops;
 
-	if (indio_dev->buffer)
+	if (iio_dev_opaque->attached_buffers_cnt)
 		cdev_init(&indio_dev->chrdev, &iio_buffer_fileops);
 	else if (iio_dev_opaque->event_interface)
 		cdev_init(&indio_dev->chrdev, &iio_event_fileops);
 
-	if (indio_dev->buffer || iio_dev_opaque->event_interface) {
+	if (iio_dev_opaque->attached_buffers_cnt || iio_dev_opaque->event_interface) {
 		indio_dev->dev.devt = MKDEV(MAJOR(iio_devt), indio_dev->id);
 		indio_dev->chrdev.owner = this_mod;
 	}
@@ -1912,7 +1912,7 @@ error_unreg_eventset:
 error_free_sysfs:
 	iio_device_unregister_sysfs(indio_dev);
 error_buffer_free_sysfs:
-	iio_buffer_free_sysfs_and_mask(indio_dev);
+	iio_buffers_free_sysfs_and_mask(indio_dev);
 error_unreg_debugfs:
 	iio_device_unregister_debugfs(indio_dev);
 	return ret;
@@ -1946,7 +1946,7 @@ void iio_device_unregister(struct iio_dev *indio_dev)
 
 	mutex_unlock(&indio_dev->info_exist_lock);
 
-	iio_buffer_free_sysfs_and_mask(indio_dev);
+	iio_buffers_free_sysfs_and_mask(indio_dev);
 }
 EXPORT_SYMBOL(iio_device_unregister);
 
diff --git a/include/linux/iio/buffer.h b/include/linux/iio/buffer.h
index 8febc23f5f26..b6928ac5c63d 100644
--- a/include/linux/iio/buffer.h
+++ b/include/linux/iio/buffer.h
@@ -41,7 +41,7 @@ static inline int iio_push_to_buffers_with_timestamp(struct iio_dev *indio_dev,
 bool iio_validate_scan_mask_onehot(struct iio_dev *indio_dev,
 				   const unsigned long *mask);
 
-void iio_device_attach_buffer(struct iio_dev *indio_dev,
-			      struct iio_buffer *buffer);
+int iio_device_attach_buffer(struct iio_dev *indio_dev,
+			     struct iio_buffer *buffer);
 
 #endif /* _IIO_BUFFER_GENERIC_H_ */
diff --git a/include/linux/iio/buffer_impl.h b/include/linux/iio/buffer_impl.h
index 41044320e581..768b90c64412 100644
--- a/include/linux/iio/buffer_impl.h
+++ b/include/linux/iio/buffer_impl.h
@@ -112,6 +112,9 @@ struct iio_buffer {
 	/* @demux_bounce: Buffer for doing gather from incoming scan. */
 	void *demux_bounce;
 
+	/* @attached_entry: Entry in the devices list of buffers attached by the driver. */
+	struct list_head attached_entry;
+
 	/* @buffer_list: Entry in the devices list of current buffers. */
 	struct list_head buffer_list;
 
diff --git a/include/linux/iio/iio-opaque.h b/include/linux/iio/iio-opaque.h
index 1a9310b0145f..b6ebc04af3e7 100644
--- a/include/linux/iio/iio-opaque.h
+++ b/include/linux/iio/iio-opaque.h
@@ -7,6 +7,8 @@
  * struct iio_dev_opaque - industrial I/O device opaque information
  * @indio_dev:			public industrial I/O device information
  * @event_interface:		event chrdevs associated with interrupt lines
+ * @attached_buffers:		array of buffers statically attached by the driver
+ * @attached_buffers_cnt:	number of buffers in the array of statically attached buffers
  * @buffer_list:		list of all buffers currently attached
  * @channel_attr_list:		keep track of automatically created channel
  *				attributes
@@ -24,6 +26,8 @@
 struct iio_dev_opaque {
 	struct iio_dev			indio_dev;
 	struct iio_event_interface	*event_interface;
+	struct iio_buffer		**attached_buffers;
+	unsigned int			attached_buffers_cnt;
 	struct list_head		buffer_list;
 	struct list_head		channel_attr_list;
 	struct attribute_group		chan_attr_group;
-- 
cgit v1.2.3


From f73f7f4da581875f9b1f2fb8ebd1ab15ed634488 Mon Sep 17 00:00:00 2001
From: Alexandru Ardelean <alexandru.ardelean@analog.com>
Date: Mon, 15 Feb 2021 12:40:39 +0200
Subject: iio: buffer: add ioctl() to support opening extra buffers for IIO
 device

With this change, an ioctl() call is added to open a character device for a
buffer. The ioctl() number is 'i' 0x91, which follows the
IIO_GET_EVENT_FD_IOCTL ioctl.

The ioctl() will return an FD for the requested buffer index. The indexes
are the same as those in /sys/bus/iio/devices/iio:deviceX/bufferY (i.e.
the Y variable).

Since there doesn't seem to be a sane way to make the FD returned for
buffer0 be the same FD as the one for /dev/iio:deviceX, this ioctl() will
return another FD for buffer0 (or the first buffer). This duplicate FD
will be able to access the same buffer object (for buffer0) as direct
access through the /dev/iio:deviceX chardev.

Also, there is no IIO_BUFFER_GET_BUFFER_COUNT ioctl() implemented, as the
index for each buffer (and the count) can be deduced from the
'/sys/bus/iio/devices/iio:deviceX/bufferY' folders (i.e. the number of
bufferY folders).

Used following C code to test this:
-------------------------------------------------------------------

 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
 #include <sys/ioctl.h>
 #include <fcntl.h>
 #include <errno.h>

 #define IIO_BUFFER_GET_FD_IOCTL      _IOWR('i', 0x91, int)

int main(int argc, char *argv[])
{
        int fd;
        int fd1;
        int ret;

        if ((fd = open("/dev/iio:device0", O_RDWR))<0) {
                fprintf(stderr, "Error open() %d errno %d\n",fd, errno);
                return -1;
        }

        fprintf(stderr, "Using FD %d\n", fd);

        fd1 = atoi(argv[1]);

        ret = ioctl(fd, IIO_BUFFER_GET_FD_IOCTL, &fd1);
        if (ret < 0) {
                fprintf(stderr, "Error for buffer %d ioctl() %d errno %d\n", fd1, ret, errno);
                close(fd);
                return -1;
        }

        fprintf(stderr, "Got FD %d\n", fd1);

        close(fd1);
        close(fd);

        return 0;
}
-------------------------------------------------------------------

Results are:
-------------------------------------------------------------------
 # ./test 0
 Using FD 3
 Got FD 4

 # ./test 1
 Using FD 3
 Got FD 4

 # ./test 2
 Using FD 3
 Got FD 4

 # ./test 3
 Using FD 3
 Got FD 4

 # ls /sys/bus/iio/devices/iio\:device0
 buffer  buffer0  buffer1  buffer2  buffer3  dev
 in_voltage_sampling_frequency  in_voltage_scale
 in_voltage_scale_available
 name  of_node  power  scan_elements  subsystem  uevent
-------------------------------------------------------------------

iio:device0 has some fake kfifo buffers attached to an IIO device.

Signed-off-by: Alexandru Ardelean <alexandru.ardelean@analog.com>
Link: https://lore.kernel.org/r/20210215104043.91251-21-alexandru.ardelean@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/iio_core.h            |  12 ++--
 drivers/iio/industrialio-buffer.c | 144 ++++++++++++++++++++++++++++++++++++--
 include/linux/iio/buffer_impl.h   |   5 ++
 include/linux/iio/iio-opaque.h    |   2 +
 include/uapi/linux/iio/buffer.h   |  10 +++
 5 files changed, 162 insertions(+), 11 deletions(-)
 create mode 100644 include/uapi/linux/iio/buffer.h

(limited to 'include/linux')

diff --git a/drivers/iio/iio_core.h b/drivers/iio/iio_core.h
index 7990c759f1f5..062fe16c6c49 100644
--- a/drivers/iio/iio_core.h
+++ b/drivers/iio/iio_core.h
@@ -64,16 +64,16 @@ ssize_t iio_format_value(char *buf, unsigned int type, int size, int *vals);
 #ifdef CONFIG_IIO_BUFFER
 struct poll_table_struct;
 
-__poll_t iio_buffer_poll(struct file *filp,
-			     struct poll_table_struct *wait);
-ssize_t iio_buffer_read_outer(struct file *filp, char __user *buf,
-			      size_t n, loff_t *f_ps);
+__poll_t iio_buffer_poll_wrapper(struct file *filp,
+				 struct poll_table_struct *wait);
+ssize_t iio_buffer_read_wrapper(struct file *filp, char __user *buf,
+				size_t n, loff_t *f_ps);
 
 int iio_buffers_alloc_sysfs_and_mask(struct iio_dev *indio_dev);
 void iio_buffers_free_sysfs_and_mask(struct iio_dev *indio_dev);
 
-#define iio_buffer_poll_addr (&iio_buffer_poll)
-#define iio_buffer_read_outer_addr (&iio_buffer_read_outer)
+#define iio_buffer_poll_addr (&iio_buffer_poll_wrapper)
+#define iio_buffer_read_outer_addr (&iio_buffer_read_wrapper)
 
 void iio_disable_all_buffers(struct iio_dev *indio_dev);
 void iio_buffer_wakeup_poll(struct iio_dev *indio_dev);
diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c
index b60c2e66bd1c..a48e494a9fbb 100644
--- a/drivers/iio/industrialio-buffer.c
+++ b/drivers/iio/industrialio-buffer.c
@@ -9,9 +9,11 @@
  * - Better memory allocation techniques?
  * - Alternative access techniques?
  */
+#include <linux/anon_inodes.h>
 #include <linux/kernel.h>
 #include <linux/export.h>
 #include <linux/device.h>
+#include <linux/file.h>
 #include <linux/fs.h>
 #include <linux/cdev.h>
 #include <linux/slab.h>
@@ -89,7 +91,7 @@ static bool iio_buffer_ready(struct iio_dev *indio_dev, struct iio_buffer *buf,
 }
 
 /**
- * iio_buffer_read_outer() - chrdev read for buffer access
+ * iio_buffer_read() - chrdev read for buffer access
  * @filp:	File structure pointer for the char device
  * @buf:	Destination buffer for iio buffer read
  * @n:		First n bytes to read
@@ -101,8 +103,8 @@ static bool iio_buffer_ready(struct iio_dev *indio_dev, struct iio_buffer *buf,
  * Return: negative values corresponding to error codes or ret != 0
  *	   for ending the reading activity
  **/
-ssize_t iio_buffer_read_outer(struct file *filp, char __user *buf,
-			      size_t n, loff_t *f_ps)
+static ssize_t iio_buffer_read(struct file *filp, char __user *buf,
+			       size_t n, loff_t *f_ps)
 {
 	struct iio_dev_buffer_pair *ib = filp->private_data;
 	struct iio_buffer *rb = ib->buffer;
@@ -168,8 +170,8 @@ ssize_t iio_buffer_read_outer(struct file *filp, char __user *buf,
  * Return: (EPOLLIN | EPOLLRDNORM) if data is available for reading
  *	   or 0 for other cases
  */
-__poll_t iio_buffer_poll(struct file *filp,
-			     struct poll_table_struct *wait)
+static __poll_t iio_buffer_poll(struct file *filp,
+				struct poll_table_struct *wait)
 {
 	struct iio_dev_buffer_pair *ib = filp->private_data;
 	struct iio_buffer *rb = ib->buffer;
@@ -184,6 +186,32 @@ __poll_t iio_buffer_poll(struct file *filp,
 	return 0;
 }
 
+ssize_t iio_buffer_read_wrapper(struct file *filp, char __user *buf,
+				size_t n, loff_t *f_ps)
+{
+	struct iio_dev_buffer_pair *ib = filp->private_data;
+	struct iio_buffer *rb = ib->buffer;
+
+	/* check if buffer was opened through new API */
+	if (test_bit(IIO_BUSY_BIT_POS, &rb->flags))
+		return -EBUSY;
+
+	return iio_buffer_read(filp, buf, n, f_ps);
+}
+
+__poll_t iio_buffer_poll_wrapper(struct file *filp,
+				 struct poll_table_struct *wait)
+{
+	struct iio_dev_buffer_pair *ib = filp->private_data;
+	struct iio_buffer *rb = ib->buffer;
+
+	/* check if buffer was opened through new API */
+	if (test_bit(IIO_BUSY_BIT_POS, &rb->flags))
+		return 0;
+
+	return iio_buffer_poll(filp, wait);
+}
+
 /**
  * iio_buffer_wakeup_poll - Wakes up the buffer waitqueue
  * @indio_dev: The IIO device
@@ -1344,6 +1372,96 @@ static void iio_buffer_unregister_legacy_sysfs_groups(struct iio_dev *indio_dev)
 	kfree(iio_dev_opaque->legacy_scan_el_group.attrs);
 }
 
+static int iio_buffer_chrdev_release(struct inode *inode, struct file *filep)
+{
+	struct iio_dev_buffer_pair *ib = filep->private_data;
+	struct iio_dev *indio_dev = ib->indio_dev;
+	struct iio_buffer *buffer = ib->buffer;
+
+	wake_up(&buffer->pollq);
+
+	kfree(ib);
+	clear_bit(IIO_BUSY_BIT_POS, &buffer->flags);
+	iio_device_put(indio_dev);
+
+	return 0;
+}
+
+static const struct file_operations iio_buffer_chrdev_fileops = {
+	.owner = THIS_MODULE,
+	.llseek = noop_llseek,
+	.read = iio_buffer_read,
+	.poll = iio_buffer_poll,
+	.release = iio_buffer_chrdev_release,
+};
+
+static long iio_device_buffer_getfd(struct iio_dev *indio_dev, unsigned long arg)
+{
+	struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev);
+	int __user *ival = (int __user *)arg;
+	struct iio_dev_buffer_pair *ib;
+	struct iio_buffer *buffer;
+	int fd, idx, ret;
+
+	if (copy_from_user(&idx, ival, sizeof(idx)))
+		return -EFAULT;
+
+	if (idx >= iio_dev_opaque->attached_buffers_cnt)
+		return -ENODEV;
+
+	iio_device_get(indio_dev);
+
+	buffer = iio_dev_opaque->attached_buffers[idx];
+
+	if (test_and_set_bit(IIO_BUSY_BIT_POS, &buffer->flags)) {
+		ret = -EBUSY;
+		goto error_iio_dev_put;
+	}
+
+	ib = kzalloc(sizeof(*ib), GFP_KERNEL);
+	if (!ib) {
+		ret = -ENOMEM;
+		goto error_clear_busy_bit;
+	}
+
+	ib->indio_dev = indio_dev;
+	ib->buffer = buffer;
+
+	fd = anon_inode_getfd("iio:buffer", &iio_buffer_chrdev_fileops,
+			      ib, O_RDWR | O_CLOEXEC);
+	if (fd < 0) {
+		ret = fd;
+		goto error_free_ib;
+	}
+
+	if (copy_to_user(ival, &fd, sizeof(fd))) {
+		put_unused_fd(fd);
+		ret = -EFAULT;
+		goto error_free_ib;
+	}
+
+	return fd;
+
+error_free_ib:
+	kfree(ib);
+error_clear_busy_bit:
+	clear_bit(IIO_BUSY_BIT_POS, &buffer->flags);
+error_iio_dev_put:
+	iio_device_put(indio_dev);
+	return ret;
+}
+
+static long iio_device_buffer_ioctl(struct iio_dev *indio_dev, struct file *filp,
+				    unsigned int cmd, unsigned long arg)
+{
+	switch (cmd) {
+	case IIO_BUFFER_GET_FD_IOCTL:
+		return iio_device_buffer_getfd(indio_dev, arg);
+	default:
+		return IIO_IOCTL_UNHANDLED;
+	}
+}
+
 static int __iio_buffer_alloc_sysfs_and_mask(struct iio_buffer *buffer,
 					     struct iio_dev *indio_dev,
 					     int index)
@@ -1473,6 +1591,7 @@ int iio_buffers_alloc_sysfs_and_mask(struct iio_dev *indio_dev)
 	struct iio_buffer *buffer;
 	int unwind_idx;
 	int ret, i;
+	size_t sz;
 
 	channels = indio_dev->channels;
 	if (channels) {
@@ -1494,6 +1613,18 @@ int iio_buffers_alloc_sysfs_and_mask(struct iio_dev *indio_dev)
 			goto error_unwind_sysfs_and_mask;
 		}
 	}
+	unwind_idx = iio_dev_opaque->attached_buffers_cnt - 1;
+
+	sz = sizeof(*(iio_dev_opaque->buffer_ioctl_handler));
+	iio_dev_opaque->buffer_ioctl_handler = kzalloc(sz, GFP_KERNEL);
+	if (!iio_dev_opaque->buffer_ioctl_handler) {
+		ret = -ENOMEM;
+		goto error_unwind_sysfs_and_mask;
+	}
+
+	iio_dev_opaque->buffer_ioctl_handler->ioctl = iio_device_buffer_ioctl;
+	iio_device_ioctl_handler_register(indio_dev,
+					  iio_dev_opaque->buffer_ioctl_handler);
 
 	return 0;
 
@@ -1515,6 +1646,9 @@ void iio_buffers_free_sysfs_and_mask(struct iio_dev *indio_dev)
 	if (!iio_dev_opaque->attached_buffers_cnt)
 		return;
 
+	iio_device_ioctl_handler_unregister(iio_dev_opaque->buffer_ioctl_handler);
+	kfree(iio_dev_opaque->buffer_ioctl_handler);
+
 	iio_buffer_unregister_legacy_sysfs_groups(indio_dev);
 
 	for (i = iio_dev_opaque->attached_buffers_cnt - 1; i >= 0; i--) {
diff --git a/include/linux/iio/buffer_impl.h b/include/linux/iio/buffer_impl.h
index 768b90c64412..245b32918ae1 100644
--- a/include/linux/iio/buffer_impl.h
+++ b/include/linux/iio/buffer_impl.h
@@ -6,6 +6,8 @@
 
 #ifdef CONFIG_IIO_BUFFER
 
+#include <uapi/linux/iio/buffer.h>
+
 struct iio_dev;
 struct iio_buffer;
 
@@ -72,6 +74,9 @@ struct iio_buffer {
 	/** @length: Number of datums in buffer. */
 	unsigned int length;
 
+	/** @flags: File ops flags including busy flag. */
+	unsigned long flags;
+
 	/**  @bytes_per_datum: Size of individual datum including timestamp. */
 	size_t bytes_per_datum;
 
diff --git a/include/linux/iio/iio-opaque.h b/include/linux/iio/iio-opaque.h
index b6ebc04af3e7..32addd5e790e 100644
--- a/include/linux/iio/iio-opaque.h
+++ b/include/linux/iio/iio-opaque.h
@@ -9,6 +9,7 @@
  * @event_interface:		event chrdevs associated with interrupt lines
  * @attached_buffers:		array of buffers statically attached by the driver
  * @attached_buffers_cnt:	number of buffers in the array of statically attached buffers
+ * @buffer_ioctl_handler:	ioctl() handler for this IIO device's buffer interface
  * @buffer_list:		list of all buffers currently attached
  * @channel_attr_list:		keep track of automatically created channel
  *				attributes
@@ -28,6 +29,7 @@ struct iio_dev_opaque {
 	struct iio_event_interface	*event_interface;
 	struct iio_buffer		**attached_buffers;
 	unsigned int			attached_buffers_cnt;
+	struct iio_ioctl_handler	*buffer_ioctl_handler;
 	struct list_head		buffer_list;
 	struct list_head		channel_attr_list;
 	struct attribute_group		chan_attr_group;
diff --git a/include/uapi/linux/iio/buffer.h b/include/uapi/linux/iio/buffer.h
new file mode 100644
index 000000000000..13939032b3f6
--- /dev/null
+++ b/include/uapi/linux/iio/buffer.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* industrial I/O buffer definitions needed both in and out of kernel
+ */
+
+#ifndef _UAPI_IIO_BUFFER_H_
+#define _UAPI_IIO_BUFFER_H_
+
+#define IIO_BUFFER_GET_FD_IOCTL			_IOWR('i', 0x91, int)
+
+#endif /* _UAPI_IIO_BUFFER_H_ */
-- 
cgit v1.2.3


From 0e41fd515f94dcfcc24b6e510d29528431e46f60 Mon Sep 17 00:00:00 2001
From: Ye Xiang <xiang.ye@intel.com>
Date: Mon, 1 Feb 2021 13:49:20 +0800
Subject: iio: hid-sensors: Move get sensitivity attribute to hid-sensor-common

No functional change has been made with this patch. The main intent here
is to reduce the code repeated across drivers to get the sensitivity
attribute.

In the current implementation, sensor_hub_input_get_attribute_info() is
called from multiple drivers to get attribute info for the sensitivity
field. Moving this to a common place will avoid code repetition.

Signed-off-by: Ye Xiang <xiang.ye@intel.com>
Acked-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Link: https://lore.kernel.org/r/20210201054921.18214-2-xiang.ye@intel.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/accel/hid-sensor-accel-3d.c            | 23 ++++++----------
 .../iio/common/hid-sensors/hid-sensor-attributes.c | 17 +++++++++++-
 drivers/iio/gyro/hid-sensor-gyro-3d.c              | 19 +++++--------
 drivers/iio/humidity/hid-sensor-humidity.c         | 16 +++++------
 drivers/iio/light/hid-sensor-als.c                 | 19 +++++--------
 drivers/iio/light/hid-sensor-prox.c                | 27 ++++++------------
 drivers/iio/magnetometer/hid-sensor-magn-3d.c      | 32 +++++++---------------
 drivers/iio/orientation/hid-sensor-incl-3d.c       | 19 +++++--------
 drivers/iio/orientation/hid-sensor-rotation.c      | 23 ++++++----------
 .../iio/position/hid-sensor-custom-intel-hinge.c   | 20 +++++---------
 drivers/iio/pressure/hid-sensor-press.c            | 19 +++++--------
 drivers/iio/temperature/hid-sensor-temperature.c   | 16 +++++------
 drivers/rtc/rtc-hid-sensor-time.c                  |  4 ++-
 include/linux/hid-sensor-hub.h                     |  4 ++-
 14 files changed, 107 insertions(+), 151 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/accel/hid-sensor-accel-3d.c b/drivers/iio/accel/hid-sensor-accel-3d.c
index 5d63ed19e6e2..2f9465cb382f 100644
--- a/drivers/iio/accel/hid-sensor-accel-3d.c
+++ b/drivers/iio/accel/hid-sensor-accel-3d.c
@@ -43,6 +43,10 @@ static const u32 accel_3d_addresses[ACCEL_3D_CHANNEL_MAX] = {
 	HID_USAGE_SENSOR_ACCEL_Z_AXIS
 };
 
+static const u32 accel_3d_sensitivity_addresses[] = {
+	HID_USAGE_SENSOR_DATA_ACCELERATION,
+};
+
 /* Channel definitions */
 static const struct iio_chan_spec accel_3d_channels[] = {
 	{
@@ -317,18 +321,6 @@ static int accel_3d_parse_report(struct platform_device *pdev,
 				&st->accel[CHANNEL_SCAN_INDEX_X],
 				&st->scale_pre_decml, &st->scale_post_decml);
 
-	/* Set Sensitivity field ids, when there is no individual modifier */
-	if (st->common_attributes.sensitivity.index < 0) {
-		sensor_hub_input_get_attribute_info(hsdev,
-			HID_FEATURE_REPORT, usage_id,
-			HID_USAGE_SENSOR_DATA_MOD_CHANGE_SENSITIVITY_ABS |
-			HID_USAGE_SENSOR_DATA_ACCELERATION,
-			&st->common_attributes.sensitivity);
-		dev_dbg(&pdev->dev, "Sensitivity index:report %d:%d\n",
-			st->common_attributes.sensitivity.index,
-			st->common_attributes.sensitivity.report_id);
-	}
-
 	return ret;
 }
 
@@ -366,8 +358,11 @@ static int hid_accel_3d_probe(struct platform_device *pdev)
 		channel_size = sizeof(gravity_channels);
 		indio_dev->num_channels = ARRAY_SIZE(gravity_channels);
 	}
-	ret = hid_sensor_parse_common_attributes(hsdev, hsdev->usage,
-					&accel_state->common_attributes);
+	ret = hid_sensor_parse_common_attributes(hsdev,
+						 hsdev->usage,
+						 &accel_state->common_attributes,
+						 accel_3d_sensitivity_addresses,
+						 ARRAY_SIZE(accel_3d_sensitivity_addresses));
 	if (ret) {
 		dev_err(&pdev->dev, "failed to setup common attributes\n");
 		return ret;
diff --git a/drivers/iio/common/hid-sensors/hid-sensor-attributes.c b/drivers/iio/common/hid-sensors/hid-sensor-attributes.c
index 5b822a4298a0..d349ace2e33f 100644
--- a/drivers/iio/common/hid-sensors/hid-sensor-attributes.c
+++ b/drivers/iio/common/hid-sensors/hid-sensor-attributes.c
@@ -448,12 +448,15 @@ EXPORT_SYMBOL(hid_sensor_batch_mode_supported);
 
 int hid_sensor_parse_common_attributes(struct hid_sensor_hub_device *hsdev,
 					u32 usage_id,
-					struct hid_sensor_common *st)
+					struct hid_sensor_common *st,
+					const u32 *sensitivity_addresses,
+					u32 sensitivity_addresses_len)
 {
 
 	struct hid_sensor_hub_attribute_info timestamp;
 	s32 value;
 	int ret;
+	int i;
 
 	hid_sensor_get_reporting_interval(hsdev, usage_id, st);
 
@@ -475,6 +478,18 @@ int hid_sensor_parse_common_attributes(struct hid_sensor_hub_device *hsdev,
 			HID_USAGE_SENSOR_PROP_SENSITIVITY_ABS,
 			 &st->sensitivity);
 
+	/*
+	 * Set Sensitivity field ids, when there is no individual modifier, will
+	 * check absolute sensitivity of data field
+	 */
+	for (i = 0; i < sensitivity_addresses_len && st->sensitivity.index < 0; i++) {
+		sensor_hub_input_get_attribute_info(hsdev,
+				HID_FEATURE_REPORT, usage_id,
+				HID_USAGE_SENSOR_DATA_MOD_CHANGE_SENSITIVITY_ABS |
+					sensitivity_addresses[i],
+				&st->sensitivity);
+	}
+
 	st->raw_hystersis = -1;
 
 	sensor_hub_input_get_attribute_info(hsdev,
diff --git a/drivers/iio/gyro/hid-sensor-gyro-3d.c b/drivers/iio/gyro/hid-sensor-gyro-3d.c
index fb0d678ece1a..dad26ee4fd1f 100644
--- a/drivers/iio/gyro/hid-sensor-gyro-3d.c
+++ b/drivers/iio/gyro/hid-sensor-gyro-3d.c
@@ -45,6 +45,10 @@ static const u32 gyro_3d_addresses[GYRO_3D_CHANNEL_MAX] = {
 	HID_USAGE_SENSOR_ANGL_VELOCITY_Z_AXIS
 };
 
+static const u32 gryo_3d_sensitivity_addresses[] = {
+	HID_USAGE_SENSOR_DATA_ANGL_VELOCITY,
+};
+
 /* Channel definitions */
 static const struct iio_chan_spec gyro_3d_channels[] = {
 	{
@@ -271,17 +275,6 @@ static int gyro_3d_parse_report(struct platform_device *pdev,
 				&st->gyro[CHANNEL_SCAN_INDEX_X],
 				&st->scale_pre_decml, &st->scale_post_decml);
 
-	/* Set Sensitivity field ids, when there is no individual modifier */
-	if (st->common_attributes.sensitivity.index < 0) {
-		sensor_hub_input_get_attribute_info(hsdev,
-			HID_FEATURE_REPORT, usage_id,
-			HID_USAGE_SENSOR_DATA_MOD_CHANGE_SENSITIVITY_ABS |
-			HID_USAGE_SENSOR_DATA_ANGL_VELOCITY,
-			&st->common_attributes.sensitivity);
-		dev_dbg(&pdev->dev, "Sensitivity index:report %d:%d\n",
-			st->common_attributes.sensitivity.index,
-			st->common_attributes.sensitivity.report_id);
-	}
 	return ret;
 }
 
@@ -305,7 +298,9 @@ static int hid_gyro_3d_probe(struct platform_device *pdev)
 
 	ret = hid_sensor_parse_common_attributes(hsdev,
 						HID_USAGE_SENSOR_GYRO_3D,
-						&gyro_state->common_attributes);
+						&gyro_state->common_attributes,
+						gryo_3d_sensitivity_addresses,
+						ARRAY_SIZE(gryo_3d_sensitivity_addresses));
 	if (ret) {
 		dev_err(&pdev->dev, "failed to setup common attributes\n");
 		return ret;
diff --git a/drivers/iio/humidity/hid-sensor-humidity.c b/drivers/iio/humidity/hid-sensor-humidity.c
index 52f605114ef7..ec88ae3f233d 100644
--- a/drivers/iio/humidity/hid-sensor-humidity.c
+++ b/drivers/iio/humidity/hid-sensor-humidity.c
@@ -22,6 +22,10 @@ struct hid_humidity_state {
 	int value_offset;
 };
 
+static const u32 humidity_sensitivity_addresses[] = {
+	HID_USAGE_SENSOR_ATMOSPHERIC_HUMIDITY,
+};
+
 /* Channel definitions */
 static const struct iio_chan_spec humidity_channels[] = {
 	{
@@ -174,14 +178,6 @@ static int humidity_parse_report(struct platform_device *pdev,
 						&st->scale_pre_decml,
 						&st->scale_post_decml);
 
-	/* Set Sensitivity field ids, when there is no individual modifier */
-	if (st->common_attributes.sensitivity.index < 0)
-		sensor_hub_input_get_attribute_info(hsdev,
-			HID_FEATURE_REPORT, usage_id,
-			HID_USAGE_SENSOR_DATA_MOD_CHANGE_SENSITIVITY_ABS |
-			HID_USAGE_SENSOR_ATMOSPHERIC_HUMIDITY,
-			&st->common_attributes.sensitivity);
-
 	return ret;
 }
 
@@ -210,7 +206,9 @@ static int hid_humidity_probe(struct platform_device *pdev)
 
 	ret = hid_sensor_parse_common_attributes(hsdev,
 					HID_USAGE_SENSOR_HUMIDITY,
-					&humid_st->common_attributes);
+					&humid_st->common_attributes,
+					humidity_sensitivity_addresses,
+					ARRAY_SIZE(humidity_sensitivity_addresses));
 	if (ret)
 		return ret;
 
diff --git a/drivers/iio/light/hid-sensor-als.c b/drivers/iio/light/hid-sensor-als.c
index 4093f2353d95..8bf6e9e0a0e0 100644
--- a/drivers/iio/light/hid-sensor-als.c
+++ b/drivers/iio/light/hid-sensor-als.c
@@ -39,6 +39,10 @@ struct als_state {
 	s64 timestamp;
 };
 
+static const u32 als_sensitivity_addresses[] = {
+	HID_USAGE_SENSOR_DATA_LIGHT,
+};
+
 /* Channel definitions */
 static const struct iio_chan_spec als_channels[] = {
 	{
@@ -252,17 +256,6 @@ static int als_parse_report(struct platform_device *pdev,
 				&st->als_illum,
 				&st->scale_pre_decml, &st->scale_post_decml);
 
-	/* Set Sensitivity field ids, when there is no individual modifier */
-	if (st->common_attributes.sensitivity.index < 0) {
-		sensor_hub_input_get_attribute_info(hsdev,
-			HID_FEATURE_REPORT, usage_id,
-			HID_USAGE_SENSOR_DATA_MOD_CHANGE_SENSITIVITY_ABS |
-			HID_USAGE_SENSOR_DATA_LIGHT,
-			&st->common_attributes.sensitivity);
-		dev_dbg(&pdev->dev, "Sensitivity index:report %d:%d\n",
-			st->common_attributes.sensitivity.index,
-			st->common_attributes.sensitivity.report_id);
-	}
 	return ret;
 }
 
@@ -285,7 +278,9 @@ static int hid_als_probe(struct platform_device *pdev)
 	als_state->common_attributes.pdev = pdev;
 
 	ret = hid_sensor_parse_common_attributes(hsdev, HID_USAGE_SENSOR_ALS,
-					&als_state->common_attributes);
+					&als_state->common_attributes,
+					als_sensitivity_addresses,
+					ARRAY_SIZE(als_sensitivity_addresses));
 	if (ret) {
 		dev_err(&pdev->dev, "failed to setup common attributes\n");
 		return ret;
diff --git a/drivers/iio/light/hid-sensor-prox.c b/drivers/iio/light/hid-sensor-prox.c
index 330cf359e0b8..4ab285a418d5 100644
--- a/drivers/iio/light/hid-sensor-prox.c
+++ b/drivers/iio/light/hid-sensor-prox.c
@@ -25,6 +25,11 @@ struct prox_state {
 	u32 human_presence;
 };
 
+static const u32 prox_sensitivity_addresses[] = {
+	HID_USAGE_SENSOR_HUMAN_PRESENCE,
+	HID_USAGE_SENSOR_DATA_PRESENCE,
+};
+
 /* Channel definitions */
 static const struct iio_chan_spec prox_channels[] = {
 	{
@@ -216,24 +221,6 @@ static int prox_parse_report(struct platform_device *pdev,
 	dev_dbg(&pdev->dev, "prox %x:%x\n", st->prox_attr.index,
 			st->prox_attr.report_id);
 
-	/* Set Sensitivity field ids, when there is no individual modifier */
-	if (st->common_attributes.sensitivity.index < 0) {
-		sensor_hub_input_get_attribute_info(hsdev,
-			HID_FEATURE_REPORT, usage_id,
-			HID_USAGE_SENSOR_DATA_MOD_CHANGE_SENSITIVITY_ABS |
-			HID_USAGE_SENSOR_DATA_PRESENCE,
-			&st->common_attributes.sensitivity);
-		dev_dbg(&pdev->dev, "Sensitivity index:report %d:%d\n",
-			st->common_attributes.sensitivity.index,
-			st->common_attributes.sensitivity.report_id);
-	}
-	if (st->common_attributes.sensitivity.index < 0)
-		sensor_hub_input_get_attribute_info(hsdev,
-			HID_FEATURE_REPORT, usage_id,
-			HID_USAGE_SENSOR_DATA_MOD_CHANGE_SENSITIVITY_ABS |
-			HID_USAGE_SENSOR_HUMAN_PRESENCE,
-			&st->common_attributes.sensitivity);
-
 	return ret;
 }
 
@@ -257,7 +244,9 @@ static int hid_prox_probe(struct platform_device *pdev)
 	prox_state->common_attributes.pdev = pdev;
 
 	ret = hid_sensor_parse_common_attributes(hsdev, HID_USAGE_SENSOR_PROX,
-					&prox_state->common_attributes);
+					&prox_state->common_attributes,
+					prox_sensitivity_addresses,
+					ARRAY_SIZE(prox_sensitivity_addresses));
 	if (ret) {
 		dev_err(&pdev->dev, "failed to setup common attributes\n");
 		return ret;
diff --git a/drivers/iio/magnetometer/hid-sensor-magn-3d.c b/drivers/iio/magnetometer/hid-sensor-magn-3d.c
index fa48044b7f5b..b78691523dd4 100644
--- a/drivers/iio/magnetometer/hid-sensor-magn-3d.c
+++ b/drivers/iio/magnetometer/hid-sensor-magn-3d.c
@@ -62,6 +62,11 @@ static const u32 magn_3d_addresses[MAGN_3D_CHANNEL_MAX] = {
 	HID_USAGE_SENSOR_TIME_TIMESTAMP,
 };
 
+static const u32 magn_3d_sensitivity_addresses[] = {
+	HID_USAGE_SENSOR_DATA_ORIENTATION,
+	HID_USAGE_SENSOR_ORIENT_MAGN_FLUX,
+};
+
 /* Channel definitions */
 static const struct iio_chan_spec magn_3d_channels[] = {
 	{
@@ -448,27 +453,6 @@ static int magn_3d_parse_report(struct platform_device *pdev,
 			&st->rot_attr.scale_pre_decml,
 			&st->rot_attr.scale_post_decml);
 
-	/* Set Sensitivity field ids, when there is no individual modifier */
-	if (st->magn_flux_attributes.sensitivity.index < 0) {
-		sensor_hub_input_get_attribute_info(hsdev,
-			HID_FEATURE_REPORT, usage_id,
-			HID_USAGE_SENSOR_DATA_MOD_CHANGE_SENSITIVITY_ABS |
-			HID_USAGE_SENSOR_DATA_ORIENTATION,
-			&st->magn_flux_attributes.sensitivity);
-		dev_dbg(&pdev->dev, "Sensitivity index:report %d:%d\n",
-			st->magn_flux_attributes.sensitivity.index,
-			st->magn_flux_attributes.sensitivity.report_id);
-	}
-	if (st->magn_flux_attributes.sensitivity.index < 0) {
-		sensor_hub_input_get_attribute_info(hsdev,
-			HID_FEATURE_REPORT, usage_id,
-			HID_USAGE_SENSOR_DATA_MOD_CHANGE_SENSITIVITY_ABS |
-			HID_USAGE_SENSOR_ORIENT_MAGN_FLUX,
-			&st->magn_flux_attributes.sensitivity);
-		dev_dbg(&pdev->dev, "Sensitivity index:report %d:%d\n",
-			st->magn_flux_attributes.sensitivity.index,
-			st->magn_flux_attributes.sensitivity.report_id);
-	}
 	if (st->rot_attributes.sensitivity.index < 0) {
 		sensor_hub_input_get_attribute_info(hsdev,
 			HID_FEATURE_REPORT, usage_id,
@@ -507,12 +491,16 @@ static int hid_magn_3d_probe(struct platform_device *pdev)
 
 	ret = hid_sensor_parse_common_attributes(hsdev,
 				HID_USAGE_SENSOR_COMPASS_3D,
-				&magn_state->magn_flux_attributes);
+				&magn_state->magn_flux_attributes,
+				magn_3d_sensitivity_addresses,
+				ARRAY_SIZE(magn_3d_sensitivity_addresses));
 	if (ret) {
 		dev_err(&pdev->dev, "failed to setup common attributes\n");
 		return ret;
 	}
 	magn_state->rot_attributes = magn_state->magn_flux_attributes;
+	/* sensitivity of rot_attribute is not the same as magn_flux_attributes */
+	magn_state->rot_attributes.sensitivity.index = -1;
 
 	ret = magn_3d_parse_report(pdev, hsdev,
 				&channels, &chan_count,
diff --git a/drivers/iio/orientation/hid-sensor-incl-3d.c b/drivers/iio/orientation/hid-sensor-incl-3d.c
index 52ebef30f9be..6e69f6e673cc 100644
--- a/drivers/iio/orientation/hid-sensor-incl-3d.c
+++ b/drivers/iio/orientation/hid-sensor-incl-3d.c
@@ -47,6 +47,10 @@ static const u32 incl_3d_addresses[INCLI_3D_CHANNEL_MAX] = {
 	HID_USAGE_SENSOR_ORIENT_TILT_Z
 };
 
+static const u32 incl_3d_sensitivity_addresses[] = {
+	HID_USAGE_SENSOR_DATA_ORIENTATION,
+};
+
 /* Channel definitions */
 static const struct iio_chan_spec incl_3d_channels[] = {
 	{
@@ -291,17 +295,6 @@ static int incl_3d_parse_report(struct platform_device *pdev,
 				&st->incl[CHANNEL_SCAN_INDEX_X],
 				&st->scale_pre_decml, &st->scale_post_decml);
 
-	/* Set Sensitivity field ids, when there is no individual modifier */
-	if (st->common_attributes.sensitivity.index < 0) {
-		sensor_hub_input_get_attribute_info(hsdev,
-			HID_FEATURE_REPORT, usage_id,
-			HID_USAGE_SENSOR_DATA_MOD_CHANGE_SENSITIVITY_ABS |
-			HID_USAGE_SENSOR_DATA_ORIENTATION,
-			&st->common_attributes.sensitivity);
-		dev_dbg(&pdev->dev, "Sensitivity index:report %d:%d\n",
-			st->common_attributes.sensitivity.index,
-			st->common_attributes.sensitivity.report_id);
-	}
 	return ret;
 }
 
@@ -327,7 +320,9 @@ static int hid_incl_3d_probe(struct platform_device *pdev)
 
 	ret = hid_sensor_parse_common_attributes(hsdev,
 				HID_USAGE_SENSOR_INCLINOMETER_3D,
-				&incl_state->common_attributes);
+				&incl_state->common_attributes,
+				incl_3d_sensitivity_addresses,
+				ARRAY_SIZE(incl_3d_sensitivity_addresses));
 	if (ret) {
 		dev_err(&pdev->dev, "failed to setup common attributes\n");
 		return ret;
diff --git a/drivers/iio/orientation/hid-sensor-rotation.c b/drivers/iio/orientation/hid-sensor-rotation.c
index c087d8f72a54..baa2ee353ba4 100644
--- a/drivers/iio/orientation/hid-sensor-rotation.c
+++ b/drivers/iio/orientation/hid-sensor-rotation.c
@@ -31,6 +31,10 @@ struct dev_rot_state {
 	s64 timestamp;
 };
 
+static const u32 rotation_sensitivity_addresses[] = {
+	HID_USAGE_SENSOR_DATA_ORIENTATION,
+};
+
 /* Channel definitions */
 static const struct iio_chan_spec dev_rot_channels[] = {
 	{
@@ -221,18 +225,6 @@ static int dev_rot_parse_report(struct platform_device *pdev,
 				&st->quaternion,
 				&st->scale_pre_decml, &st->scale_post_decml);
 
-	/* Set Sensitivity field ids, when there is no individual modifier */
-	if (st->common_attributes.sensitivity.index < 0) {
-		sensor_hub_input_get_attribute_info(hsdev,
-			HID_FEATURE_REPORT, usage_id,
-			HID_USAGE_SENSOR_DATA_MOD_CHANGE_SENSITIVITY_ABS |
-			HID_USAGE_SENSOR_DATA_ORIENTATION,
-			&st->common_attributes.sensitivity);
-		dev_dbg(&pdev->dev, "Sensitivity index:report %d:%d\n",
-			st->common_attributes.sensitivity.index,
-			st->common_attributes.sensitivity.report_id);
-	}
-
 	return 0;
 }
 
@@ -270,8 +262,11 @@ static int hid_dev_rot_probe(struct platform_device *pdev)
 		return -EINVAL;
 	}
 
-	ret = hid_sensor_parse_common_attributes(hsdev, hsdev->usage,
-				&rot_state->common_attributes);
+	ret = hid_sensor_parse_common_attributes(hsdev,
+						 hsdev->usage,
+						 &rot_state->common_attributes,
+						 rotation_sensitivity_addresses,
+						 ARRAY_SIZE(rotation_sensitivity_addresses));
 	if (ret) {
 		dev_err(&pdev->dev, "failed to setup common attributes\n");
 		return ret;
diff --git a/drivers/iio/position/hid-sensor-custom-intel-hinge.c b/drivers/iio/position/hid-sensor-custom-intel-hinge.c
index 64a7fa7db6af..fd77e7ee87f3 100644
--- a/drivers/iio/position/hid-sensor-custom-intel-hinge.c
+++ b/drivers/iio/position/hid-sensor-custom-intel-hinge.c
@@ -47,6 +47,10 @@ struct hinge_state {
 	u64 timestamp;
 };
 
+static const u32 hinge_sensitivity_addresses[] = {
+	HID_USAGE_SENSOR_DATA_FIELD_CUSTOM_VALUE(1),
+};
+
 /* Channel definitions */
 static const struct iio_chan_spec hinge_channels[] = {
 	{
@@ -251,18 +255,6 @@ static int hinge_parse_report(struct platform_device *pdev,
 			&st->hinge[CHANNEL_SCAN_INDEX_HINGE_ANGLE],
 			&st->scale_pre_decml, &st->scale_post_decml);
 
-	/* Set Sensitivity field ids, when there is no individual modifier */
-	if (st->common_attributes.sensitivity.index < 0) {
-		sensor_hub_input_get_attribute_info(hsdev,
-				HID_FEATURE_REPORT, usage_id,
-				HID_USAGE_SENSOR_DATA_MOD_CHANGE_SENSITIVITY_ABS |
-					HID_USAGE_SENSOR_DATA_FIELD_CUSTOM_VALUE(1),
-				&st->common_attributes.sensitivity);
-		dev_dbg(&pdev->dev, "Sensitivity index:report %d:%d\n",
-			st->common_attributes.sensitivity.index,
-			st->common_attributes.sensitivity.report_id);
-	}
-
 	return ret;
 }
 
@@ -289,7 +281,9 @@ static int hid_hinge_probe(struct platform_device *pdev)
 		st->labels[i] = hinge_labels[i];
 
 	ret = hid_sensor_parse_common_attributes(hsdev, hsdev->usage,
-						 &st->common_attributes);
+						 &st->common_attributes,
+						 hinge_sensitivity_addresses,
+						 ARRAY_SIZE(hinge_sensitivity_addresses));
 	if (ret) {
 		dev_err(&pdev->dev, "failed to setup common attributes\n");
 		return ret;
diff --git a/drivers/iio/pressure/hid-sensor-press.c b/drivers/iio/pressure/hid-sensor-press.c
index 5c458788f346..8cac2c94e75a 100644
--- a/drivers/iio/pressure/hid-sensor-press.c
+++ b/drivers/iio/pressure/hid-sensor-press.c
@@ -29,6 +29,10 @@ struct press_state {
 	int value_offset;
 };
 
+static const u32 press_sensitivity_addresses[] = {
+	HID_USAGE_SENSOR_DATA_ATMOSPHERIC_PRESSURE,
+};
+
 /* Channel definitions */
 static const struct iio_chan_spec press_channels[] = {
 	{
@@ -225,17 +229,6 @@ static int press_parse_report(struct platform_device *pdev,
 				&st->press_attr,
 				&st->scale_pre_decml, &st->scale_post_decml);
 
-	/* Set Sensitivity field ids, when there is no individual modifier */
-	if (st->common_attributes.sensitivity.index < 0) {
-		sensor_hub_input_get_attribute_info(hsdev,
-			HID_FEATURE_REPORT, usage_id,
-			HID_USAGE_SENSOR_DATA_MOD_CHANGE_SENSITIVITY_ABS |
-			HID_USAGE_SENSOR_DATA_ATMOSPHERIC_PRESSURE,
-			&st->common_attributes.sensitivity);
-		dev_dbg(&pdev->dev, "Sensitivity index:report %d:%d\n",
-			st->common_attributes.sensitivity.index,
-			st->common_attributes.sensitivity.report_id);
-	}
 	return ret;
 }
 
@@ -260,7 +253,9 @@ static int hid_press_probe(struct platform_device *pdev)
 
 	ret = hid_sensor_parse_common_attributes(hsdev,
 					HID_USAGE_SENSOR_PRESSURE,
-					&press_state->common_attributes);
+					&press_state->common_attributes,
+					press_sensitivity_addresses,
+					ARRAY_SIZE(press_sensitivity_addresses));
 	if (ret) {
 		dev_err(&pdev->dev, "failed to setup common attributes\n");
 		return ret;
diff --git a/drivers/iio/temperature/hid-sensor-temperature.c b/drivers/iio/temperature/hid-sensor-temperature.c
index 81688f1b932f..e3d38cbcf354 100644
--- a/drivers/iio/temperature/hid-sensor-temperature.c
+++ b/drivers/iio/temperature/hid-sensor-temperature.c
@@ -22,6 +22,10 @@ struct temperature_state {
 	int value_offset;
 };
 
+static const u32 temperature_sensitivity_addresses[] = {
+	HID_USAGE_SENSOR_DATA_ENVIRONMENTAL_TEMPERATURE,
+};
+
 /* Channel definitions */
 static const struct iio_chan_spec temperature_channels[] = {
 	{
@@ -171,14 +175,6 @@ static int temperature_parse_report(struct platform_device *pdev,
 				&st->temperature_attr,
 				&st->scale_pre_decml, &st->scale_post_decml);
 
-	/* Set Sensitivity field ids, when there is no individual modifier */
-	if (st->common_attributes.sensitivity.index < 0)
-		sensor_hub_input_get_attribute_info(hsdev,
-			HID_FEATURE_REPORT, usage_id,
-			HID_USAGE_SENSOR_DATA_MOD_CHANGE_SENSITIVITY_ABS |
-			HID_USAGE_SENSOR_DATA_ENVIRONMENTAL_TEMPERATURE,
-			&st->common_attributes.sensitivity);
-
 	return ret;
 }
 
@@ -207,7 +203,9 @@ static int hid_temperature_probe(struct platform_device *pdev)
 
 	ret = hid_sensor_parse_common_attributes(hsdev,
 					HID_USAGE_SENSOR_TEMPERATURE,
-					&temp_st->common_attributes);
+					&temp_st->common_attributes,
+					temperature_sensitivity_addresses,
+					ARRAY_SIZE(temperature_sensitivity_addresses));
 	if (ret)
 		return ret;
 
diff --git a/drivers/rtc/rtc-hid-sensor-time.c b/drivers/rtc/rtc-hid-sensor-time.c
index 1b42ee0758d2..47cd12db2356 100644
--- a/drivers/rtc/rtc-hid-sensor-time.c
+++ b/drivers/rtc/rtc-hid-sensor-time.c
@@ -238,7 +238,9 @@ static int hid_time_probe(struct platform_device *pdev)
 
 	ret = hid_sensor_parse_common_attributes(hsdev,
 				HID_USAGE_SENSOR_TIME,
-				&time_state->common_attributes);
+				&time_state->common_attributes,
+				NULL,
+				0);
 	if (ret) {
 		dev_err(&pdev->dev, "failed to setup common attributes!\n");
 		return ret;
diff --git a/include/linux/hid-sensor-hub.h b/include/linux/hid-sensor-hub.h
index 763802b2b8f9..637ec53a98a1 100644
--- a/include/linux/hid-sensor-hub.h
+++ b/include/linux/hid-sensor-hub.h
@@ -248,7 +248,9 @@ static inline int hid_sensor_convert_exponent(int unit_expo)
 
 int hid_sensor_parse_common_attributes(struct hid_sensor_hub_device *hsdev,
 					u32 usage_id,
-					struct hid_sensor_common *st);
+					struct hid_sensor_common *st,
+					const u32 *sensitivity_addresses,
+					u32 sensitivity_addresses_len);
 int hid_sensor_write_raw_hyst_value(struct hid_sensor_common *st,
 					int val1, int val2);
 int hid_sensor_read_raw_hyst_value(struct hid_sensor_common *st,
-- 
cgit v1.2.3


From 1c71a2863a0c56123e5a67880cf658083c0a0b1e Mon Sep 17 00:00:00 2001
From: Ye Xiang <xiang.ye@intel.com>
Date: Sun, 7 Feb 2021 15:00:46 +0800
Subject: iio: Add relative sensitivity support

Some HID sensors, such as the ALS sensor, may use relative sensitivity.
This patch adds relative sensitivity checking for all HID sensors.

Signed-off-by: Ye Xiang <xiang.ye@intel.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Link: https://lore.kernel.org/r/20210207070048.23935-2-xiang.ye@intel.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 .../iio/common/hid-sensors/hid-sensor-attributes.c | 74 ++++++++++++++++++++--
 drivers/iio/industrialio-core.c                    |  1 +
 include/linux/hid-sensor-hub.h                     |  5 ++
 include/linux/hid-sensor-ids.h                     |  1 +
 include/linux/iio/types.h                          |  1 +
 5 files changed, 78 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/common/hid-sensors/hid-sensor-attributes.c b/drivers/iio/common/hid-sensors/hid-sensor-attributes.c
index d349ace2e33f..cb52b4fd6bf7 100644
--- a/drivers/iio/common/hid-sensors/hid-sensor-attributes.c
+++ b/drivers/iio/common/hid-sensors/hid-sensor-attributes.c
@@ -263,6 +263,29 @@ int hid_sensor_read_raw_hyst_value(struct hid_sensor_common *st,
 }
 EXPORT_SYMBOL(hid_sensor_read_raw_hyst_value);
 
+int hid_sensor_read_raw_hyst_rel_value(struct hid_sensor_common *st, int *val1,
+				       int *val2)
+{
+	s32 value;
+	int ret;
+
+	ret = sensor_hub_get_feature(st->hsdev,
+				     st->sensitivity_rel.report_id,
+				     st->sensitivity_rel.index, sizeof(value),
+				     &value);
+	if (ret < 0 || value < 0) {
+		*val1 = *val2 = 0;
+		return -EINVAL;
+	}
+
+	convert_from_vtf_format(value, st->sensitivity_rel.size,
+				st->sensitivity_rel.unit_expo, val1, val2);
+
+	return IIO_VAL_INT_PLUS_MICRO;
+}
+EXPORT_SYMBOL(hid_sensor_read_raw_hyst_rel_value);
+
+
 int hid_sensor_write_raw_hyst_value(struct hid_sensor_common *st,
 					int val1, int val2)
 {
@@ -294,6 +317,37 @@ int hid_sensor_write_raw_hyst_value(struct hid_sensor_common *st,
 }
 EXPORT_SYMBOL(hid_sensor_write_raw_hyst_value);
 
+int hid_sensor_write_raw_hyst_rel_value(struct hid_sensor_common *st,
+					int val1, int val2)
+{
+	s32 value;
+	int ret;
+
+	if (val1 < 0 || val2 < 0)
+		return -EINVAL;
+
+	value = convert_to_vtf_format(st->sensitivity_rel.size,
+				st->sensitivity_rel.unit_expo,
+				val1, val2);
+	ret = sensor_hub_set_feature(st->hsdev, st->sensitivity_rel.report_id,
+				     st->sensitivity_rel.index, sizeof(value),
+				     &value);
+	if (ret < 0 || value < 0)
+		return -EINVAL;
+
+	ret = sensor_hub_get_feature(st->hsdev,
+				     st->sensitivity_rel.report_id,
+				     st->sensitivity_rel.index, sizeof(value),
+				     &value);
+	if (ret < 0 || value < 0)
+		return -EINVAL;
+
+	st->raw_hystersis = value;
+
+	return 0;
+}
+EXPORT_SYMBOL(hid_sensor_write_raw_hyst_rel_value);
+
 /*
  * This fuction applies the unit exponent to the scale.
  * For example:
@@ -478,16 +532,28 @@ int hid_sensor_parse_common_attributes(struct hid_sensor_hub_device *hsdev,
 			HID_USAGE_SENSOR_PROP_SENSITIVITY_ABS,
 			 &st->sensitivity);
 
+	sensor_hub_input_get_attribute_info(hsdev,
+			HID_FEATURE_REPORT, usage_id,
+			HID_USAGE_SENSOR_PROP_SENSITIVITY_REL_PCT,
+			&st->sensitivity_rel);
 	/*
 	 * Set Sensitivity field ids, when there is no individual modifier, will
-	 * check absolute sensitivity of data field
+	 * check absolute sensitivity and relative sensitivity of data field
 	 */
-	for (i = 0; i < sensitivity_addresses_len && st->sensitivity.index < 0; i++) {
-		sensor_hub_input_get_attribute_info(hsdev,
-				HID_FEATURE_REPORT, usage_id,
+	for (i = 0; i < sensitivity_addresses_len; i++) {
+		if (st->sensitivity.index < 0)
+			sensor_hub_input_get_attribute_info(
+				hsdev, HID_FEATURE_REPORT, usage_id,
 				HID_USAGE_SENSOR_DATA_MOD_CHANGE_SENSITIVITY_ABS |
 					sensitivity_addresses[i],
 				&st->sensitivity);
+
+		if (st->sensitivity_rel.index < 0)
+			sensor_hub_input_get_attribute_info(
+				hsdev, HID_FEATURE_REPORT, usage_id,
+				HID_USAGE_SENSOR_DATA_MOD_CHANGE_SENSITIVITY_REL_PCT |
+					sensitivity_addresses[i],
+				&st->sensitivity_rel);
 	}
 
 	st->raw_hystersis = -1;
diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index 2e2970f3e21c..cb2735d2ae4b 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -157,6 +157,7 @@ static const char * const iio_chan_info_postfix[] = {
 	[IIO_CHAN_INFO_PHASE] = "phase",
 	[IIO_CHAN_INFO_HARDWAREGAIN] = "hardwaregain",
 	[IIO_CHAN_INFO_HYSTERESIS] = "hysteresis",
+	[IIO_CHAN_INFO_HYSTERESIS_RELATIVE] = "hysteresis_relative",
 	[IIO_CHAN_INFO_INT_TIME] = "integration_time",
 	[IIO_CHAN_INFO_ENABLE] = "en",
 	[IIO_CHAN_INFO_CALIBHEIGHT] = "calibheight",
diff --git a/include/linux/hid-sensor-hub.h b/include/linux/hid-sensor-hub.h
index 637ec53a98a1..c27329e2a5ad 100644
--- a/include/linux/hid-sensor-hub.h
+++ b/include/linux/hid-sensor-hub.h
@@ -231,6 +231,7 @@ struct hid_sensor_common {
 	struct hid_sensor_hub_attribute_info report_state;
 	struct hid_sensor_hub_attribute_info power_state;
 	struct hid_sensor_hub_attribute_info sensitivity;
+	struct hid_sensor_hub_attribute_info sensitivity_rel;
 	struct hid_sensor_hub_attribute_info report_latency;
 	struct work_struct work;
 };
@@ -253,8 +254,12 @@ int hid_sensor_parse_common_attributes(struct hid_sensor_hub_device *hsdev,
 					u32 sensitivity_addresses_len);
 int hid_sensor_write_raw_hyst_value(struct hid_sensor_common *st,
 					int val1, int val2);
+int hid_sensor_write_raw_hyst_rel_value(struct hid_sensor_common *st, int val1,
+					int val2);
 int hid_sensor_read_raw_hyst_value(struct hid_sensor_common *st,
 					int *val1, int *val2);
+int hid_sensor_read_raw_hyst_rel_value(struct hid_sensor_common *st,
+				       int *val1, int *val2);
 int hid_sensor_write_samp_freq_value(struct hid_sensor_common *st,
 					int val1, int val2);
 int hid_sensor_read_samp_freq_value(struct hid_sensor_common *st,
diff --git a/include/linux/hid-sensor-ids.h b/include/linux/hid-sensor-ids.h
index 3bbdbccc5805..ac631159403a 100644
--- a/include/linux/hid-sensor-ids.h
+++ b/include/linux/hid-sensor-ids.h
@@ -149,6 +149,7 @@
 /* Per data field properties */
 #define HID_USAGE_SENSOR_DATA_MOD_NONE					0x00
 #define HID_USAGE_SENSOR_DATA_MOD_CHANGE_SENSITIVITY_ABS		0x1000
+#define HID_USAGE_SENSOR_DATA_MOD_CHANGE_SENSITIVITY_REL_PCT            0xE000
 
 /* Power state enumerations */
 #define HID_USAGE_SENSOR_PROP_POWER_STATE_UNDEFINED_ENUM	0x200850
diff --git a/include/linux/iio/types.h b/include/linux/iio/types.h
index 1e3ed6f55bca..5aa7f66d4345 100644
--- a/include/linux/iio/types.h
+++ b/include/linux/iio/types.h
@@ -50,6 +50,7 @@ enum iio_chan_info_enum {
 	IIO_CHAN_INFO_PHASE,
 	IIO_CHAN_INFO_HARDWAREGAIN,
 	IIO_CHAN_INFO_HYSTERESIS,
+	IIO_CHAN_INFO_HYSTERESIS_RELATIVE,
 	IIO_CHAN_INFO_INT_TIME,
 	IIO_CHAN_INFO_ENABLE,
 	IIO_CHAN_INFO_CALIBHEIGHT,
-- 
cgit v1.2.3


From aa29cf932fb345e111cd7fef04320846fa73e372 Mon Sep 17 00:00:00 2001
From: Alexandru Ardelean <alexandru.ardelean@analog.com>
Date: Fri, 19 Feb 2021 11:01:34 +0200
Subject: iio: adc: adi-axi-adc: fix typo in doc-string

The channels are of type iio_chan_spec, not axi_adc_chan_spec. They were of
the latter type in some earlier version, but the doc-string was not updated
when the type was renamed.

Fixes: ef04070692a21 ("iio: adc: adi-axi-adc: add support for AXI ADC IP core")
Signed-off-by: Alexandru Ardelean <alexandru.ardelean@analog.com>
Link: https://lore.kernel.org/r/20210219090134.48057-1-alexandru.ardelean@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/adc/adi-axi-adc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/iio/adc/adi-axi-adc.h b/include/linux/iio/adc/adi-axi-adc.h
index c5d48e1c2d36..52620e5b8052 100644
--- a/include/linux/iio/adc/adi-axi-adc.h
+++ b/include/linux/iio/adc/adi-axi-adc.h
@@ -15,7 +15,7 @@ struct iio_chan_spec;
  * struct adi_axi_adc_chip_info - Chip specific information
  * @name		Chip name
  * @id			Chip ID (usually product ID)
- * @channels		Channel specifications of type @struct axi_adc_chan_spec
+ * @channels		Channel specifications of type @struct iio_chan_spec
  * @num_channels	Number of @channels
  * @scale_table		Supported scales by the chip; tuples of 2 ints
  * @num_scales		Number of scales in the table
-- 
cgit v1.2.3


From 15aacc980dcb326ef33dfc32772faea1067f9178 Mon Sep 17 00:00:00 2001
From: Nuno Sa <nuno.sa@analog.com>
Date: Thu, 18 Feb 2021 12:40:39 +0100
Subject: iio: adis: add helpers for locking

Add some helpers to lock and unlock the device. As this is such a simple
change, we also update, in this patch, all the users that were already
using the lock.

Signed-off-by: Nuno Sa <nuno.sa@analog.com>
Link: https://lore.kernel.org/r/20210218114039.216091-5-nuno.sa@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/imu/adis16400.c  | 22 ++++++++++------------
 drivers/iio/imu/adis16475.c  | 10 +++++-----
 drivers/iio/imu/adis16480.c  | 15 +++++++--------
 include/linux/iio/imu/adis.h | 10 ++++++++++
 4 files changed, 32 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/imu/adis16400.c b/drivers/iio/imu/adis16400.c
index 54af2ed664f6..b1ec14bf439f 100644
--- a/drivers/iio/imu/adis16400.c
+++ b/drivers/iio/imu/adis16400.c
@@ -505,7 +505,6 @@ static int adis16400_write_raw(struct iio_dev *indio_dev,
 	struct iio_chan_spec const *chan, int val, int val2, long info)
 {
 	struct adis16400_state *st = iio_priv(indio_dev);
-	struct mutex *slock = &st->adis.state_lock;
 	int ret, sps;
 
 	switch (info) {
@@ -518,18 +517,18 @@ static int adis16400_write_raw(struct iio_dev *indio_dev,
 		 * Need to cache values so we can update if the frequency
 		 * changes.
 		 */
-		mutex_lock(slock);
+		adis_dev_lock(&st->adis);
 		st->filt_int = val;
 		/* Work out update to current value */
 		sps = st->variant->get_freq(st);
 		if (sps < 0) {
-			mutex_unlock(slock);
+			adis_dev_unlock(&st->adis);
 			return sps;
 		}
 
 		ret = __adis16400_set_filter(indio_dev, sps,
 			val * 1000 + val2 / 1000);
-		mutex_unlock(slock);
+		adis_dev_unlock(&st->adis);
 		return ret;
 	case IIO_CHAN_INFO_SAMP_FREQ:
 		sps = val * 1000 + val2 / 1000;
@@ -537,9 +536,9 @@ static int adis16400_write_raw(struct iio_dev *indio_dev,
 		if (sps <= 0)
 			return -EINVAL;
 
-		mutex_lock(slock);
+		adis_dev_lock(&st->adis);
 		ret = st->variant->set_freq(st, sps);
-		mutex_unlock(slock);
+		adis_dev_unlock(&st->adis);
 		return ret;
 	default:
 		return -EINVAL;
@@ -550,7 +549,6 @@ static int adis16400_read_raw(struct iio_dev *indio_dev,
 	struct iio_chan_spec const *chan, int *val, int *val2, long info)
 {
 	struct adis16400_state *st = iio_priv(indio_dev);
-	struct mutex *slock = &st->adis.state_lock;
 	int16_t val16;
 	int ret;
 
@@ -606,17 +604,17 @@ static int adis16400_read_raw(struct iio_dev *indio_dev,
 		*val = st->variant->temp_offset;
 		return IIO_VAL_INT;
 	case IIO_CHAN_INFO_LOW_PASS_FILTER_3DB_FREQUENCY:
-		mutex_lock(slock);
+		adis_dev_lock(&st->adis);
 		/* Need both the number of taps and the sampling frequency */
 		ret = __adis_read_reg_16(&st->adis,
 						ADIS16400_SENS_AVG,
 						&val16);
 		if (ret) {
-			mutex_unlock(slock);
+			adis_dev_unlock(&st->adis);
 			return ret;
 		}
 		ret = st->variant->get_freq(st);
-		mutex_unlock(slock);
+		adis_dev_unlock(&st->adis);
 		if (ret)
 			return ret;
 		ret /= adis16400_3db_divisors[val16 & 0x07];
@@ -624,9 +622,9 @@ static int adis16400_read_raw(struct iio_dev *indio_dev,
 		*val2 = (ret % 1000) * 1000;
 		return IIO_VAL_INT_PLUS_MICRO;
 	case IIO_CHAN_INFO_SAMP_FREQ:
-		mutex_lock(slock);
+		adis_dev_lock(&st->adis);
 		ret = st->variant->get_freq(st);
-		mutex_unlock(slock);
+		adis_dev_unlock(&st->adis);
 		if (ret)
 			return ret;
 		*val = ret / 1000;
diff --git a/drivers/iio/imu/adis16475.c b/drivers/iio/imu/adis16475.c
index 5759cd8ffbef..8f6bea4b6608 100644
--- a/drivers/iio/imu/adis16475.c
+++ b/drivers/iio/imu/adis16475.c
@@ -263,7 +263,7 @@ static int adis16475_get_freq(struct adis16475 *st, u32 *freq)
 	u16 dec;
 	u32 sample_rate = st->clk_freq;
 
-	mutex_lock(&st->adis.state_lock);
+	adis_dev_lock(&st->adis);
 
 	if (st->sync_mode == ADIS16475_SYNC_SCALED) {
 		u16 sync_scale;
@@ -279,13 +279,13 @@ static int adis16475_get_freq(struct adis16475 *st, u32 *freq)
 	if (ret)
 		goto error;
 
-	mutex_unlock(&st->adis.state_lock);
+	adis_dev_unlock(&st->adis);
 
 	*freq = DIV_ROUND_CLOSEST(sample_rate, dec + 1);
 
 	return 0;
 error:
-	mutex_unlock(&st->adis.state_lock);
+	adis_dev_unlock(&st->adis);
 	return ret;
 }
 
@@ -298,7 +298,7 @@ static int adis16475_set_freq(struct adis16475 *st, const u32 freq)
 	if (!freq)
 		return -EINVAL;
 
-	mutex_lock(&st->adis.state_lock);
+	adis_dev_lock(&st->adis);
 	/*
 	 * When using sync scaled mode, the input clock needs to be scaled so that we have
 	 * an IMU sample rate between (optimally) 1900 and 2100. After this, we can use the
@@ -366,7 +366,7 @@ static int adis16475_set_freq(struct adis16475 *st, const u32 freq)
 
 	return 0;
 error:
-	mutex_unlock(&st->adis.state_lock);
+	adis_dev_unlock(&st->adis);
 	return ret;
 }
 
diff --git a/drivers/iio/imu/adis16480.c b/drivers/iio/imu/adis16480.c
index c41b8ef1e250..f81b86690b76 100644
--- a/drivers/iio/imu/adis16480.c
+++ b/drivers/iio/imu/adis16480.c
@@ -329,7 +329,7 @@ static int adis16480_set_freq(struct iio_dev *indio_dev, int val, int val2)
 	if (t == 0)
 		return -EINVAL;
 
-	mutex_lock(&st->adis.state_lock);
+	adis_dev_lock(&st->adis);
 	/*
 	 * When using PPS mode, the input clock needs to be scaled so that we have an IMU
 	 * sample rate between (optimally) 4000 and 4250. After this, we can use the
@@ -386,7 +386,7 @@ static int adis16480_set_freq(struct iio_dev *indio_dev, int val, int val2)
 
 	ret = __adis_write_reg_16(&st->adis, ADIS16480_REG_DEC_RATE, t);
 error:
-	mutex_unlock(&st->adis.state_lock);
+	adis_dev_unlock(&st->adis);
 	return ret;
 }
 
@@ -397,7 +397,7 @@ static int adis16480_get_freq(struct iio_dev *indio_dev, int *val, int *val2)
 	int ret;
 	unsigned int freq, sample_rate = st->clk_freq;
 
-	mutex_lock(&st->adis.state_lock);
+	adis_dev_lock(&st->adis);
 
 	if (st->clk_mode == ADIS16480_CLK_PPS) {
 		u16 sync_scale;
@@ -413,7 +413,7 @@ static int adis16480_get_freq(struct iio_dev *indio_dev, int *val, int *val2)
 	if (ret)
 		goto error;
 
-	mutex_unlock(&st->adis.state_lock);
+	adis_dev_unlock(&st->adis);
 
 	freq = DIV_ROUND_CLOSEST(sample_rate, (t + 1));
 
@@ -422,7 +422,7 @@ static int adis16480_get_freq(struct iio_dev *indio_dev, int *val, int *val2)
 
 	return IIO_VAL_INT_PLUS_MICRO;
 error:
-	mutex_unlock(&st->adis.state_lock);
+	adis_dev_unlock(&st->adis);
 	return ret;
 }
 
@@ -598,7 +598,6 @@ static int adis16480_set_filter_freq(struct iio_dev *indio_dev,
 	const struct iio_chan_spec *chan, unsigned int freq)
 {
 	struct adis16480 *st = iio_priv(indio_dev);
-	struct mutex *slock = &st->adis.state_lock;
 	unsigned int enable_mask, offset, reg;
 	unsigned int diff, best_diff;
 	unsigned int i, best_freq;
@@ -609,7 +608,7 @@ static int adis16480_set_filter_freq(struct iio_dev *indio_dev,
 	offset = ad16480_filter_data[chan->scan_index][1];
 	enable_mask = BIT(offset + 2);
 
-	mutex_lock(slock);
+	adis_dev_lock(&st->adis);
 
 	ret = __adis_read_reg_16(&st->adis, reg, &val);
 	if (ret)
@@ -637,7 +636,7 @@ static int adis16480_set_filter_freq(struct iio_dev *indio_dev,
 
 	ret = __adis_write_reg_16(&st->adis, reg, val);
 out_unlock:
-	mutex_unlock(slock);
+	adis_dev_unlock(&st->adis);
 
 	return ret;
 }
diff --git a/include/linux/iio/imu/adis.h b/include/linux/iio/imu/adis.h
index 04e96d688ba9..f9b728d490b1 100644
--- a/include/linux/iio/imu/adis.h
+++ b/include/linux/iio/imu/adis.h
@@ -428,6 +428,16 @@ static inline int adis_initial_startup(struct adis *adis)
 	return ret;
 }
 
+static inline void adis_dev_lock(struct adis *adis)
+{
+	mutex_lock(&adis->state_lock);
+}
+
+static inline void adis_dev_unlock(struct adis *adis)
+{
+	mutex_unlock(&adis->state_lock);
+}
+
 int adis_single_conversion(struct iio_dev *indio_dev,
 	const struct iio_chan_spec *chan, unsigned int error_mask,
 	int *val);
-- 
cgit v1.2.3


From 7852546f524595245382a919e752468f73421451 Mon Sep 17 00:00:00 2001
From: Maor Gottlieb <maorg@nvidia.com>
Date: Thu, 4 Mar 2021 14:45:15 +0200
Subject: RDMA/mlx5: Fix query RoCE port

mlx5_is_roce_enabled returns the devlink RoCE init value, therefore it
should be used only when the driver is loaded. Instead we just need to
read the roce_en field.

In addition, rename mlx5_is_roce_enabled to mlx5_is_roce_init_enabled.

Fixes: 7a58779edd75 ("IB/mlx5: Improve query port for representor port")
Link: https://lore.kernel.org/r/20210304124517.1100608-2-leon@kernel.org
Signed-off-by: Maor Gottlieb <maorg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/infiniband/hw/mlx5/main.c | 6 +++---
 include/linux/mlx5/driver.h       | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 0d69a697d75f..4be7bccefaa4 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -499,7 +499,7 @@ static int mlx5_query_port_roce(struct ib_device *device, u8 port_num,
 	translate_eth_proto_oper(eth_prot_oper, &props->active_speed,
 				 &props->active_width, ext);
 
-	if (!dev->is_rep && mlx5_is_roce_enabled(mdev)) {
+	if (!dev->is_rep && dev->mdev->roce.roce_en) {
 		u16 qkey_viol_cntr;
 
 		props->port_cap_flags |= IB_PORT_CM_SUP;
@@ -4174,7 +4174,7 @@ static int mlx5_ib_roce_init(struct mlx5_ib_dev *dev)
 
 		/* Register only for native ports */
 		err = mlx5_add_netdev_notifier(dev, port_num);
-		if (err || dev->is_rep || !mlx5_is_roce_enabled(mdev))
+		if (err || dev->is_rep || !mlx5_is_roce_init_enabled(mdev))
 			/*
 			 * We don't enable ETH interface for
 			 * 1. IB representors
@@ -4711,7 +4711,7 @@ static int mlx5r_probe(struct auxiliary_device *adev,
 	dev->mdev = mdev;
 	dev->num_ports = num_ports;
 
-	if (ll == IB_LINK_LAYER_ETHERNET && !mlx5_is_roce_enabled(mdev))
+	if (ll == IB_LINK_LAYER_ETHERNET && !mlx5_is_roce_init_enabled(mdev))
 		profile = &raw_eth_profile;
 	else
 		profile = &pf_profile;
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 53b89631a1d9..ab07f09f2bad 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1226,7 +1226,7 @@ enum {
 	MLX5_TRIGGERED_CMD_COMP = (u64)1 << 32,
 };
 
-static inline bool mlx5_is_roce_enabled(struct mlx5_core_dev *dev)
+static inline bool mlx5_is_roce_init_enabled(struct mlx5_core_dev *dev)
 {
 	struct devlink *devlink = priv_to_devlink(dev);
 	union devlink_param_value val;
-- 
cgit v1.2.3


From f4dae54e486d528d4dd98df116e7a522bbf12667 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 11 Mar 2021 12:35:04 -0800
Subject: tcp: plug skb_still_in_host_queue() to TSQ

Jakub and Neil reported an increase of RTO timers whenever
TX completions are delayed a bit more (by increasing
NIC TX coalescing parameters)

The main issue is that the TCP stack has logic preventing a packet
from being retransmitted if the prior clone has not yet been
orphaned or freed.

This logic came with commit 1f3279ae0c13 ("tcp: avoid
retransmits of TCP packets hanging in host queues")

Thankfully, in the case skb_still_in_host_queue() detects
the initial clone is still in flight, it can use TSQ logic
that will eventually retry later, at the moment the clone
is freed or orphaned.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Neil Spring <ntspring@fb.com>
Reported-by: Jakub Kicinski <kuba@kernel.org>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  2 +-
 net/ipv4/tcp_output.c  | 12 ++++++++----
 2 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 0503c917d773..483e89348f78 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1140,7 +1140,7 @@ static inline bool skb_fclone_busy(const struct sock *sk,
 
 	return skb->fclone == SKB_FCLONE_ORIG &&
 	       refcount_read(&fclones->fclone_ref) > 1 &&
-	       fclones->skb2.sk == sk;
+	       READ_ONCE(fclones->skb2.sk) == sk;
 }
 
 /**
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index fbf140a770d8..0dbf208a4f2f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2775,13 +2775,17 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
  * a packet is still in a qdisc or driver queue.
  * In this case, there is very little point doing a retransmit !
  */
-static bool skb_still_in_host_queue(const struct sock *sk,
+static bool skb_still_in_host_queue(struct sock *sk,
 				    const struct sk_buff *skb)
 {
 	if (unlikely(skb_fclone_busy(sk, skb))) {
-		NET_INC_STATS(sock_net(sk),
-			      LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
-		return true;
+		set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
+		smp_mb__after_atomic();
+		if (skb_fclone_busy(sk, skb)) {
+			NET_INC_STATS(sock_net(sk),
+				      LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
+			return true;
+		}
 	}
 	return false;
 }
-- 
cgit v1.2.3


From ccf953d8f3d68e85e577e843fdcde8872b0a9769 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 10 Mar 2021 18:55:30 +0000
Subject: fb_defio: Remove custom address_space_operations

There's no need to give the page an address_space.  Leaving the
page->mapping as NULL will cause the VM to handle set_page_dirty()
the same way that it's handled now, and that was the only reason to
set the address_space in the first place.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20210310185530.1053320-1-willy@infradead.org
---
 drivers/video/fbdev/core/fb_defio.c | 35 -----------------------------------
 drivers/video/fbdev/core/fbmem.c    |  4 ----
 include/linux/fb.h                  |  3 ---
 3 files changed, 42 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/video/fbdev/core/fb_defio.c b/drivers/video/fbdev/core/fb_defio.c
index a591d291b231..b292887a2481 100644
--- a/drivers/video/fbdev/core/fb_defio.c
+++ b/drivers/video/fbdev/core/fb_defio.c
@@ -52,13 +52,6 @@ static vm_fault_t fb_deferred_io_fault(struct vm_fault *vmf)
 		return VM_FAULT_SIGBUS;
 
 	get_page(page);
-
-	if (vmf->vma->vm_file)
-		page->mapping = vmf->vma->vm_file->f_mapping;
-	else
-		printk(KERN_ERR "no mapping available\n");
-
-	BUG_ON(!page->mapping);
 	page->index = vmf->pgoff;
 
 	vmf->page = page;
@@ -151,17 +144,6 @@ static const struct vm_operations_struct fb_deferred_io_vm_ops = {
 	.page_mkwrite	= fb_deferred_io_mkwrite,
 };
 
-static int fb_deferred_io_set_page_dirty(struct page *page)
-{
-	if (!PageDirty(page))
-		SetPageDirty(page);
-	return 0;
-}
-
-static const struct address_space_operations fb_deferred_io_aops = {
-	.set_page_dirty = fb_deferred_io_set_page_dirty,
-};
-
 int fb_deferred_io_mmap(struct fb_info *info, struct vm_area_struct *vma)
 {
 	vma->vm_ops = &fb_deferred_io_vm_ops;
@@ -212,29 +194,12 @@ void fb_deferred_io_init(struct fb_info *info)
 }
 EXPORT_SYMBOL_GPL(fb_deferred_io_init);
 
-void fb_deferred_io_open(struct fb_info *info,
-			 struct inode *inode,
-			 struct file *file)
-{
-	file->f_mapping->a_ops = &fb_deferred_io_aops;
-}
-EXPORT_SYMBOL_GPL(fb_deferred_io_open);
-
 void fb_deferred_io_cleanup(struct fb_info *info)
 {
 	struct fb_deferred_io *fbdefio = info->fbdefio;
-	struct page *page;
-	int i;
 
 	BUG_ON(!fbdefio);
 	cancel_delayed_work_sync(&info->deferred_work);
-
-	/* clear out the mapping that we setup */
-	for (i = 0 ; i < info->fix.smem_len; i += PAGE_SIZE) {
-		page = fb_deferred_io_page(info, i);
-		page->mapping = NULL;
-	}
-
 	mutex_destroy(&fbdefio->lock);
 }
 EXPORT_SYMBOL_GPL(fb_deferred_io_cleanup);
diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c
index 06f5805de2de..372b52a2befa 100644
--- a/drivers/video/fbdev/core/fbmem.c
+++ b/drivers/video/fbdev/core/fbmem.c
@@ -1415,10 +1415,6 @@ __releases(&info->lock)
 		if (res)
 			module_put(info->fbops->owner);
 	}
-#ifdef CONFIG_FB_DEFERRED_IO
-	if (info->fbdefio)
-		fb_deferred_io_open(info, inode, file);
-#endif
 out:
 	unlock_fb_info(info);
 	if (res)
diff --git a/include/linux/fb.h b/include/linux/fb.h
index ecfbcc0553a5..a8dccd23c249 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -659,9 +659,6 @@ static inline void __fb_pad_aligned_buffer(u8 *dst, u32 d_pitch,
 /* drivers/video/fb_defio.c */
 int fb_deferred_io_mmap(struct fb_info *info, struct vm_area_struct *vma);
 extern void fb_deferred_io_init(struct fb_info *info);
-extern void fb_deferred_io_open(struct fb_info *info,
-				struct inode *inode,
-				struct file *file);
 extern void fb_deferred_io_cleanup(struct fb_info *info);
 extern int fb_deferred_io_fsync(struct file *file, loff_t start,
 				loff_t end, int datasync);
-- 
cgit v1.2.3


From 59079438a664559bb1f6f5fe85e306962ef9286e Mon Sep 17 00:00:00 2001
From: Mikhael Goikhman <migo@nvidia.com>
Date: Wed, 10 Mar 2021 23:09:09 -0800
Subject: net/mlx5: Remove unused mlx5_core_health member recover_work

The code related to health->recover_work was removed in
commit 63cbc552eebf ("net/mlx5: Handle SW reset of FW in error flow")

Fix struct mlx5_core_health accordingly.

Signed-off-by: Mikhael Goikhman <migo@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/driver.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 53b89631a1d9..8fe51b4a781e 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -438,7 +438,6 @@ struct mlx5_core_health {
 	unsigned long			flags;
 	struct work_struct		fatal_report_work;
 	struct work_struct		report_work;
-	struct delayed_work		recover_work;
 	struct devlink_health_reporter *fw_reporter;
 	struct devlink_health_reporter *fw_fatal_reporter;
 };
-- 
cgit v1.2.3


From 59c904c8fffd903c1dae5fc6a402b88fa6dfc874 Mon Sep 17 00:00:00 2001
From: Mark Bloch <mbloch@nvidia.com>
Date: Wed, 10 Mar 2021 23:09:11 -0800
Subject: net/mlx5: E-Switch, Add eswitch pointer to each representor

Store the managing E-Switch of each representor. This will be used
when a representor is created on eswitch manager 0 but the vport
belongs to eswitch manager 1.

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 1 +
 include/linux/mlx5/eswitch.h                               | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index fd5f8b830584..f6c0e7e05ad5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -3153,6 +3153,7 @@ void mlx5_eswitch_register_vport_reps(struct mlx5_eswitch *esw,
 	esw->offloads.rep_ops[rep_type] = ops;
 	mlx5_esw_for_all_reps(esw, i, rep) {
 		if (likely(mlx5_eswitch_vport_has_rep(esw, i))) {
+			rep->esw = esw;
 			rep_data = &rep->rep_data[rep_type];
 			atomic_set(&rep_data->state, REP_REGISTERED);
 		}
diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h
index 994c2c8cb4fd..72d480df2a03 100644
--- a/include/linux/mlx5/eswitch.h
+++ b/include/linux/mlx5/eswitch.h
@@ -48,6 +48,7 @@ struct mlx5_eswitch_rep {
 	/* Only IB rep is using vport_index */
 	u16		       vport_index;
 	u32		       vlan_refcount;
+	struct                 mlx5_eswitch *esw;
 };
 
 void mlx5_eswitch_register_vport_reps(struct mlx5_eswitch *esw,
-- 
cgit v1.2.3


From 3a46f4fb55ffd46e475e3fc53b1252f722cf647e Mon Sep 17 00:00:00 2001
From: Mark Bloch <mbloch@nvidia.com>
Date: Wed, 10 Mar 2021 23:09:13 -0800
Subject: net/mlx5: E-Switch, Refactor send to vport to be more generic

Now that each representor stores a pointer to the managing E-Switch,
use that information when creating the send-to-vport rules.

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/infiniband/hw/mlx5/ib_rep.c                  |  3 +--
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c     |  3 +--
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c   | 20 ++++++++++++--------
 include/linux/mlx5/eswitch.h                         |  4 ++--
 4 files changed, 16 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c
index 4eae7131b0ce..db5de720bb12 100644
--- a/drivers/infiniband/hw/mlx5/ib_rep.c
+++ b/drivers/infiniband/hw/mlx5/ib_rep.c
@@ -123,8 +123,7 @@ struct mlx5_flow_handle *create_flow_rule_vport_sq(struct mlx5_ib_dev *dev,
 
 	rep = dev->port[port - 1].rep;
 
-	return mlx5_eswitch_add_send_to_vport_rule(esw, rep->vport,
-						   sq->base.mqp.qpn);
+	return mlx5_eswitch_add_send_to_vport_rule(esw, rep, sq->base.mqp.qpn);
 }
 
 static int mlx5r_rep_probe(struct auxiliary_device *adev,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index a132fff7a980..3d6c2bce67d2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -411,8 +411,7 @@ static int mlx5e_sqs2vport_start(struct mlx5_eswitch *esw,
 		}
 
 		/* Add re-inject rule to the PF/representor sqs */
-		flow_rule = mlx5_eswitch_add_send_to_vport_rule(esw,
-								rep->vport,
+		flow_rule = mlx5_eswitch_add_send_to_vport_rule(esw, rep,
 								sqns_array[i]);
 		if (IS_ERR(flow_rule)) {
 			err = PTR_ERR(flow_rule);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index f6c0e7e05ad5..6090b2609089 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -1036,7 +1036,8 @@ out:
 }
 
 struct mlx5_flow_handle *
-mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *esw, u16 vport,
+mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *on_esw,
+				    struct mlx5_eswitch_rep *rep,
 				    u32 sqn)
 {
 	struct mlx5_flow_act flow_act = {0};
@@ -1054,27 +1055,30 @@ mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *esw, u16 vport,
 	misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters);
 	MLX5_SET(fte_match_set_misc, misc, source_sqn, sqn);
 	/* source vport is the esw manager */
-	MLX5_SET(fte_match_set_misc, misc, source_port, esw->manager_vport);
-	if (MLX5_CAP_ESW(esw->dev, merged_eswitch))
+	MLX5_SET(fte_match_set_misc, misc, source_port, rep->esw->manager_vport);
+	if (MLX5_CAP_ESW(on_esw->dev, merged_eswitch))
 		MLX5_SET(fte_match_set_misc, misc, source_eswitch_owner_vhca_id,
-			 MLX5_CAP_GEN(esw->dev, vhca_id));
+			 MLX5_CAP_GEN(rep->esw->dev, vhca_id));
 
 	misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters);
 	MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_sqn);
 	MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_port);
-	if (MLX5_CAP_ESW(esw->dev, merged_eswitch))
+	if (MLX5_CAP_ESW(on_esw->dev, merged_eswitch))
 		MLX5_SET_TO_ONES(fte_match_set_misc, misc,
 				 source_eswitch_owner_vhca_id);
 
 	spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS;
 	dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
-	dest.vport.num = vport;
+	dest.vport.num = rep->vport;
+	dest.vport.vhca_id = MLX5_CAP_GEN(rep->esw->dev, vhca_id);
+	dest.vport.flags |= MLX5_FLOW_DEST_VPORT_VHCA_ID;
 	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
 
-	flow_rule = mlx5_add_flow_rules(esw->fdb_table.offloads.slow_fdb,
+	flow_rule = mlx5_add_flow_rules(on_esw->fdb_table.offloads.slow_fdb,
 					spec, &flow_act, &dest, 1);
 	if (IS_ERR(flow_rule))
-		esw_warn(esw->dev, "FDB: Failed to add send to vport rule err %ld\n", PTR_ERR(flow_rule));
+		esw_warn(on_esw->dev, "FDB: Failed to add send to vport rule err %ld\n",
+			 PTR_ERR(flow_rule));
 out:
 	kvfree(spec);
 	return flow_rule;
diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h
index 72d480df2a03..2ec0527991c8 100644
--- a/include/linux/mlx5/eswitch.h
+++ b/include/linux/mlx5/eswitch.h
@@ -62,8 +62,8 @@ struct mlx5_eswitch_rep *mlx5_eswitch_vport_rep(struct mlx5_eswitch *esw,
 						u16 vport_num);
 void *mlx5_eswitch_uplink_get_proto_dev(struct mlx5_eswitch *esw, u8 rep_type);
 struct mlx5_flow_handle *
-mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *esw,
-				    u16 vport_num, u32 sqn);
+mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *on_esw,
+				    struct mlx5_eswitch_rep *rep, u32 sqn);
 
 u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev);
 
-- 
cgit v1.2.3


From c3e666f1ada9cbfbe5465f122f9a2d63ddfd25ed Mon Sep 17 00:00:00 2001
From: Mark Bloch <mbloch@nvidia.com>
Date: Wed, 10 Mar 2021 23:09:14 -0800
Subject: net/mlx5: Add IFC bits needed for single FDB mode

Currently we operate in a mode where each eswitch manager has a separate
FDB. In order to combine these multiple FDBs we expose new caps to allow
this:

- Set a root flow table which isn't native.
- Set a different FDB selection mode when in LAG mode.

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index df5d91c8b2d4..3ee7a86f39e4 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -806,9 +806,11 @@ struct mlx5_ifc_e_switch_cap_bits {
 	u8         vport_svlan_insert[0x1];
 	u8         vport_cvlan_insert_if_not_exist[0x1];
 	u8         vport_cvlan_insert_overwrite[0x1];
-	u8         reserved_at_5[0x3];
+	u8         reserved_at_5[0x2];
+	u8         esw_shared_ingress_acl[0x1];
 	u8         esw_uplink_ingress_acl[0x1];
-	u8         reserved_at_9[0x10];
+	u8         root_ft_on_other_esw[0x1];
+	u8         reserved_at_a[0xf];
 	u8         esw_functions_changed[0x1];
 	u8         reserved_at_1a[0x1];
 	u8         ecpf_vport_exists[0x1];
@@ -1502,7 +1504,8 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         reserved_at_270[0x6];
 	u8         lag_dct[0x2];
 	u8         lag_tx_port_affinity[0x1];
-	u8         reserved_at_279[0x2];
+	u8         lag_native_fdb_selection[0x1];
+	u8         reserved_at_27a[0x1];
 	u8         lag_master[0x1];
 	u8         num_lag_ports[0x4];
 
@@ -10036,14 +10039,19 @@ struct mlx5_ifc_set_flow_table_root_in_bits {
 	u8         reserved_at_60[0x20];
 
 	u8         table_type[0x8];
-	u8         reserved_at_88[0x18];
+	u8         reserved_at_88[0x7];
+	u8         table_of_other_vport[0x1];
+	u8         table_vport_number[0x10];
 
 	u8         reserved_at_a0[0x8];
 	u8         table_id[0x18];
 
 	u8         reserved_at_c0[0x8];
 	u8         underlay_qpn[0x18];
-	u8         reserved_at_e0[0x120];
+	u8         table_eswitch_owner_vhca_id_valid[0x1];
+	u8         reserved_at_e1[0xf];
+	u8         table_eswitch_owner_vhca_id[0x10];
+	u8         reserved_at_100[0x100];
 };
 
 enum {
@@ -10273,7 +10281,8 @@ struct mlx5_ifc_dcbx_param_bits {
 };
 
 struct mlx5_ifc_lagc_bits {
-	u8         reserved_at_0[0x1d];
+	u8         fdb_selection_mode[0x1];
+	u8         reserved_at_1[0x1c];
 	u8         lag_state[0x3];
 
 	u8         reserved_at_20[0x14];
-- 
cgit v1.2.3


From 26bf30902c10473ba38f220d3401a61c56d8db3b Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@nvidia.com>
Date: Wed, 10 Mar 2021 23:09:15 -0800
Subject: net/mlx5: Use order-0 allocations for EQs

Currently we are allocating high-order pages for EQs. In case of a
fragmented system, VF hot remove/add in VMs for example, there isn't
enough contiguous memory for EQ allocation, which results in the VM
crashing.
Therefore, use order-0 fragments for the EQ allocations instead.

Performance tests:
ConnectX-5 100Gbps, CPU: Intel(R) Xeon(R) CPU E5-2697 v3 @ 2.60GHz
Performance tests show no noticeable degradation.

Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 .../net/ethernet/mellanox/mlx5/core/en/health.c    |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/eq.c       | 27 +++++++++++++---------
 drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h   | 15 ++++++++----
 drivers/net/ethernet/mellanox/mlx5/core/wq.c       |  5 ----
 include/linux/mlx5/driver.h                        |  5 ++++
 5 files changed, 32 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/health.c b/drivers/net/ethernet/mellanox/mlx5/core/en/health.c
index 84e501e057b4..6f4e6c34b2a2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/health.c
@@ -128,7 +128,7 @@ int mlx5e_health_eq_diag_fmsg(struct mlx5_eq_comp *eq, struct devlink_fmsg *fmsg
 	if (err)
 		return err;
 
-	err = devlink_fmsg_u32_pair_put(fmsg, "size", eq->core.nent);
+	err = devlink_fmsg_u32_pair_put(fmsg, "size", eq_get_size(&eq->core));
 	if (err)
 		return err;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 174dfbc996c6..4e8381030d77 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -271,7 +271,7 @@ static void init_eq_buf(struct mlx5_eq *eq)
 	struct mlx5_eqe *eqe;
 	int i;
 
-	for (i = 0; i < eq->nent; i++) {
+	for (i = 0; i < eq_get_size(eq); i++) {
 		eqe = get_eqe(eq, i);
 		eqe->owner = MLX5_EQE_OWNER_INIT_VAL;
 	}
@@ -281,8 +281,10 @@ static int
 create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
 	      struct mlx5_eq_param *param)
 {
+	u8 log_eq_size = order_base_2(param->nent + MLX5_NUM_SPARE_EQE);
 	struct mlx5_cq_table *cq_table = &eq->cq_table;
 	u32 out[MLX5_ST_SZ_DW(create_eq_out)] = {0};
+	u8 log_eq_stride = ilog2(MLX5_EQE_SIZE);
 	struct mlx5_priv *priv = &dev->priv;
 	u8 vecidx = param->irq_index;
 	__be64 *pas;
@@ -297,16 +299,18 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
 	spin_lock_init(&cq_table->lock);
 	INIT_RADIX_TREE(&cq_table->tree, GFP_ATOMIC);
 
-	eq->nent = roundup_pow_of_two(param->nent + MLX5_NUM_SPARE_EQE);
 	eq->cons_index = 0;
-	err = mlx5_buf_alloc(dev, eq->nent * MLX5_EQE_SIZE, &eq->buf);
+
+	err = mlx5_frag_buf_alloc_node(dev, wq_get_byte_sz(log_eq_size, log_eq_stride),
+				       &eq->frag_buf, dev->priv.numa_node);
 	if (err)
 		return err;
 
+	mlx5_init_fbc(eq->frag_buf.frags, log_eq_stride, log_eq_size, &eq->fbc);
 	init_eq_buf(eq);
 
 	inlen = MLX5_ST_SZ_BYTES(create_eq_in) +
-		MLX5_FLD_SZ_BYTES(create_eq_in, pas[0]) * eq->buf.npages;
+		MLX5_FLD_SZ_BYTES(create_eq_in, pas[0]) * eq->frag_buf.npages;
 
 	in = kvzalloc(inlen, GFP_KERNEL);
 	if (!in) {
@@ -315,7 +319,7 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
 	}
 
 	pas = (__be64 *)MLX5_ADDR_OF(create_eq_in, in, pas);
-	mlx5_fill_page_array(&eq->buf, pas);
+	mlx5_fill_page_frag_array(&eq->frag_buf, pas);
 
 	MLX5_SET(create_eq_in, in, opcode, MLX5_CMD_OP_CREATE_EQ);
 	if (!param->mask[0] && MLX5_CAP_GEN(dev, log_max_uctx))
@@ -326,11 +330,11 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
 				 param->mask[i]);
 
 	eqc = MLX5_ADDR_OF(create_eq_in, in, eq_context_entry);
-	MLX5_SET(eqc, eqc, log_eq_size, ilog2(eq->nent));
+	MLX5_SET(eqc, eqc, log_eq_size, eq->fbc.log_sz);
 	MLX5_SET(eqc, eqc, uar_page, priv->uar->index);
 	MLX5_SET(eqc, eqc, intr, vecidx);
 	MLX5_SET(eqc, eqc, log_page_size,
-		 eq->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
+		 eq->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
 
 	err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out));
 	if (err)
@@ -356,7 +360,7 @@ err_in:
 	kvfree(in);
 
 err_buf:
-	mlx5_buf_free(dev, &eq->buf);
+	mlx5_frag_buf_free(dev, &eq->frag_buf);
 	return err;
 }
 
@@ -413,7 +417,7 @@ static int destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
 			       eq->eqn);
 	synchronize_irq(eq->irqn);
 
-	mlx5_buf_free(dev, &eq->buf);
+	mlx5_frag_buf_free(dev, &eq->frag_buf);
 
 	return err;
 }
@@ -764,10 +768,11 @@ EXPORT_SYMBOL(mlx5_eq_destroy_generic);
 struct mlx5_eqe *mlx5_eq_get_eqe(struct mlx5_eq *eq, u32 cc)
 {
 	u32 ci = eq->cons_index + cc;
+	u32 nent = eq_get_size(eq);
 	struct mlx5_eqe *eqe;
 
-	eqe = get_eqe(eq, ci & (eq->nent - 1));
-	eqe = ((eqe->owner & 1) ^ !!(ci & eq->nent)) ? NULL : eqe;
+	eqe = get_eqe(eq, ci & (nent - 1));
+	eqe = ((eqe->owner & 1) ^ !!(ci & nent)) ? NULL : eqe;
 	/* Make sure we read EQ entry contents after we've
 	 * checked the ownership bit.
 	 */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h
index 81f2cc4ca1da..f607a3858ef5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h
@@ -22,15 +22,15 @@ struct mlx5_cq_table {
 };
 
 struct mlx5_eq {
+	struct mlx5_frag_buf_ctrl fbc;
+	struct mlx5_frag_buf    frag_buf;
 	struct mlx5_core_dev    *dev;
 	struct mlx5_cq_table    cq_table;
 	__be32 __iomem	        *doorbell;
 	u32                     cons_index;
-	struct mlx5_frag_buf    buf;
 	unsigned int            vecidx;
 	unsigned int            irqn;
 	u8                      eqn;
-	int                     nent;
 	struct mlx5_rsc_debug   *dbg;
 };
 
@@ -47,16 +47,21 @@ struct mlx5_eq_comp {
 	struct list_head        list;
 };
 
+static inline u32 eq_get_size(struct mlx5_eq *eq)
+{
+	return eq->fbc.sz_m1 + 1;
+}
+
 static inline struct mlx5_eqe *get_eqe(struct mlx5_eq *eq, u32 entry)
 {
-	return mlx5_buf_offset(&eq->buf, entry * MLX5_EQE_SIZE);
+	return mlx5_frag_buf_get_wqe(&eq->fbc, entry);
 }
 
 static inline struct mlx5_eqe *next_eqe_sw(struct mlx5_eq *eq)
 {
-	struct mlx5_eqe *eqe = get_eqe(eq, eq->cons_index & (eq->nent - 1));
+	struct mlx5_eqe *eqe = get_eqe(eq, eq->cons_index & eq->fbc.sz_m1);
 
-	return ((eqe->owner & 1) ^ !!(eq->cons_index & eq->nent)) ? NULL : eqe;
+	return (eqe->owner ^ (eq->cons_index >> eq->fbc.log_sz)) & 1 ? NULL : eqe;
 }
 
 static inline void eq_update_ci(struct mlx5_eq *eq, int arm)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.c b/drivers/net/ethernet/mellanox/mlx5/core/wq.c
index 01f075fac276..3091dd014650 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/wq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.c
@@ -34,11 +34,6 @@
 #include "wq.h"
 #include "mlx5_core.h"
 
-static u32 wq_get_byte_sz(u8 log_sz, u8 log_stride)
-{
-	return ((u32)1 << log_sz) << log_stride;
-}
-
 int mlx5_wq_cyc_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
 		       void *wqc, struct mlx5_wq_cyc *wq,
 		       struct mlx5_wq_ctrl *wq_ctrl)
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 8fe51b4a781e..5c0422930b01 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -873,6 +873,11 @@ static inline u32 mlx5_base_mkey(const u32 key)
 	return key & 0xffffff00u;
 }
 
+static inline u32 wq_get_byte_sz(u8 log_sz, u8 log_stride)
+{
+	return ((u32)1 << log_sz) << log_stride;
+}
+
 static inline void mlx5_init_fbc_offset(struct mlx5_buf_list *frags,
 					u8 log_stride, u8 log_sz,
 					u16 strides_offset,
-- 
cgit v1.2.3


From 4260c4067fbba55a90037fe3ee32eff087749f83 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Date: Wed, 24 Feb 2021 13:03:13 -0600
Subject: f2fs: Replace one-element array with flexible-array member
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is a regular need in the kernel to provide a way to declare having
a dynamically sized set of trailing elements in a structure. Kernel code
should always use “flexible array members”[1] for these cases. The older
style of one-element or zero-length arrays should no longer be used[2].

Refactor the code according to the use of a flexible-array member in
struct f2fs_checkpoint, instead of a one-element array.

Notice that a temporary pointer to void '*tmp_ptr' was used in order to
fix the following errors when using a flexible array instead of a one
element array in struct f2fs_checkpoint:

  CC [M]  fs/f2fs/dir.o
In file included from fs/f2fs/dir.c:13:
fs/f2fs/f2fs.h: In function ‘__bitmap_ptr’:
fs/f2fs/f2fs.h:2227:40: error: invalid use of flexible array member
 2227 |   return &ckpt->sit_nat_version_bitmap + offset + sizeof(__le32);
      |                                        ^
fs/f2fs/f2fs.h:2227:49: error: invalid use of flexible array member
 2227 |   return &ckpt->sit_nat_version_bitmap + offset + sizeof(__le32);
      |                                                 ^
fs/f2fs/f2fs.h:2238:40: error: invalid use of flexible array member
 2238 |   return &ckpt->sit_nat_version_bitmap + offset;
      |                                        ^
make[2]: *** [scripts/Makefile.build:287: fs/f2fs/dir.o] Error 1
make[1]: *** [scripts/Makefile.build:530: fs/f2fs] Error 2
make: *** [Makefile:1819: fs] Error 2

[1] https://en.wikipedia.org/wiki/Flexible_array_member
[2] https://www.kernel.org/doc/html/v5.9/process/deprecated.html#zero-length-and-one-element-arrays

Link: https://github.com/KSPP/linux/issues/79
Build-tested-by: kernel test robot <lkp@intel.com>
Link: https://lore.kernel.org/lkml/603647e4.DeEFbl4eqljuwAUe%25lkp@intel.com/
Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/f2fs.h          | 5 +++--
 include/linux/f2fs_fs.h | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index cccdfb1a40ab..99e243fd26d5 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -2215,6 +2215,7 @@ static inline block_t __cp_payload(struct f2fs_sb_info *sbi)
 static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag)
 {
 	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+	void *tmp_ptr = &ckpt->sit_nat_version_bitmap;
 	int offset;
 
 	if (is_set_ckpt_flags(sbi, CP_LARGE_NAT_BITMAP_FLAG)) {
@@ -2224,7 +2225,7 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag)
 		 * if large_nat_bitmap feature is enabled, leave checksum
 		 * protection for all nat/sit bitmaps.
 		 */
-		return &ckpt->sit_nat_version_bitmap + offset + sizeof(__le32);
+		return tmp_ptr + offset + sizeof(__le32);
 	}
 
 	if (__cp_payload(sbi) > 0) {
@@ -2235,7 +2236,7 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag)
 	} else {
 		offset = (flag == NAT_BITMAP) ?
 			le32_to_cpu(ckpt->sit_ver_bitmap_bytesize) : 0;
-		return &ckpt->sit_nat_version_bitmap + offset;
+		return tmp_ptr + offset;
 	}
 }
 
diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index c6cc0a566ef5..5487a80617a3 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -168,7 +168,7 @@ struct f2fs_checkpoint {
 	unsigned char alloc_type[MAX_ACTIVE_LOGS];
 
 	/* SIT and NAT version bitmap */
-	unsigned char sit_nat_version_bitmap[1];
+	unsigned char sit_nat_version_bitmap[];
 } __packed;
 
 #define CP_CHKSUM_OFFSET	4092	/* default chksum offset in checkpoint */
-- 
cgit v1.2.3


From 1bb73841ea7a88765db7f641a90120490f1f4aee Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Tue, 2 Mar 2021 07:21:33 +0100
Subject: PCI: Remove MicroGate SyncLink device IDs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The drivers were removed in a1f714b44e34 (tty: Remove redundant synclink
driver) and 3d608a591b2b (tty: Remove redundant synclinkmp driver).

Remove the PCI device ID entries as well.

Link: https://lore.kernel.org/r/20210302062214.29627-3-jslaby@suse.cz
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Krzysztof Wilczyński <kw@linux.com>
---
 include/linux/pci_ids.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index a76ccb697bef..8a18517696c1 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2065,8 +2065,6 @@
 #define PCI_DEVICE_ID_EXAR_XR17V358	0x0358
 
 #define PCI_VENDOR_ID_MICROGATE		0x13c0
-#define PCI_DEVICE_ID_MICROGATE_USC	0x0010
-#define PCI_DEVICE_ID_MICROGATE_SCA	0x0030
 
 #define PCI_VENDOR_ID_3WARE		0x13C1
 #define PCI_DEVICE_ID_3WARE_1000	0x1000
-- 
cgit v1.2.3


From a3222a2da0a2d6c7682252d4bfdff05721a82b95 Mon Sep 17 00:00:00 2001
From: Maor Dickman <maord@nvidia.com>
Date: Sun, 24 Jan 2021 15:56:36 +0200
Subject: net/mlx5e: Allow to match on ICMP parameters

Support matching on ICMPv4/6 type and code parameters using misc3
section of match parameters.

Signed-off-by: Maor Dickman <maord@nvidia.com>
Reviewed-by: Roi Dayan <roid@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 47 +++++++++++++++++++++++++
 include/linux/mlx5/device.h                     |  2 ++
 2 files changed, 49 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 121f0a744e55..54ea0dae7ded 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -1961,6 +1961,10 @@ static int __parse_cls_flower(struct mlx5e_priv *priv,
 				    misc_parameters);
 	void *misc_v = MLX5_ADDR_OF(fte_match_param, spec->match_value,
 				    misc_parameters);
+	void *misc_c_3 = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
+				    misc_parameters_3);
+	void *misc_v_3 = MLX5_ADDR_OF(fte_match_param, spec->match_value,
+				    misc_parameters_3);
 	struct flow_rule *rule = flow_cls_offload_flow_rule(f);
 	struct flow_dissector *dissector = rule->match.dissector;
 	u16 addr_type = 0;
@@ -1990,6 +1994,7 @@ static int __parse_cls_flower(struct mlx5e_priv *priv,
 	      BIT(FLOW_DISSECTOR_KEY_CT) |
 	      BIT(FLOW_DISSECTOR_KEY_ENC_IP) |
 	      BIT(FLOW_DISSECTOR_KEY_ENC_OPTS) |
+	      BIT(FLOW_DISSECTOR_KEY_ICMP) |
 	      BIT(FLOW_DISSECTOR_KEY_MPLS))) {
 		NL_SET_ERR_MSG_MOD(extack, "Unsupported key");
 		netdev_dbg(priv->netdev, "Unsupported key used: 0x%x\n",
@@ -2309,7 +2314,49 @@ static int __parse_cls_flower(struct mlx5e_priv *priv,
 		if (match.mask->flags)
 			*match_level = MLX5_MATCH_L4;
 	}
+	if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ICMP)) {
+		struct flow_match_icmp match;
 
+		flow_rule_match_icmp(rule, &match);
+		switch (ip_proto) {
+		case IPPROTO_ICMP:
+			if (!(MLX5_CAP_GEN(priv->mdev, flex_parser_protocols) &
+			      MLX5_FLEX_PROTO_ICMP))
+				return -EOPNOTSUPP;
+			MLX5_SET(fte_match_set_misc3, misc_c_3, icmp_type,
+				 match.mask->type);
+			MLX5_SET(fte_match_set_misc3, misc_v_3, icmp_type,
+				 match.key->type);
+			MLX5_SET(fte_match_set_misc3, misc_c_3, icmp_code,
+				 match.mask->code);
+			MLX5_SET(fte_match_set_misc3, misc_v_3, icmp_code,
+				 match.key->code);
+			break;
+		case IPPROTO_ICMPV6:
+			if (!(MLX5_CAP_GEN(priv->mdev, flex_parser_protocols) &
+			      MLX5_FLEX_PROTO_ICMPV6))
+				return -EOPNOTSUPP;
+			MLX5_SET(fte_match_set_misc3, misc_c_3, icmpv6_type,
+				 match.mask->type);
+			MLX5_SET(fte_match_set_misc3, misc_v_3, icmpv6_type,
+				 match.key->type);
+			MLX5_SET(fte_match_set_misc3, misc_c_3, icmpv6_code,
+				 match.mask->code);
+			MLX5_SET(fte_match_set_misc3, misc_v_3, icmpv6_code,
+				 match.key->code);
+			break;
+		default:
+			NL_SET_ERR_MSG_MOD(extack,
+					   "Code and type matching only with ICMP and ICMPv6");
+			netdev_err(priv->netdev,
+				   "Code and type matching only with ICMP and ICMPv6\n");
+			return -EINVAL;
+		}
+		if (match.mask->code || match.mask->type) {
+			*match_level = MLX5_MATCH_L4;
+			spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS_3;
+		}
+	}
 	return 0;
 }
 
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index dc3d2508f5c6..92a029a800a0 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -1142,6 +1142,8 @@ enum mlx5_flex_parser_protos {
 	MLX5_FLEX_PROTO_GENEVE	      = 1 << 3,
 	MLX5_FLEX_PROTO_CW_MPLS_GRE   = 1 << 4,
 	MLX5_FLEX_PROTO_CW_MPLS_UDP   = 1 << 5,
+	MLX5_FLEX_PROTO_ICMP	      = 1 << 8,
+	MLX5_FLEX_PROTO_ICMPV6	      = 1 << 9,
 };
 
 /* MLX5 DEV CAPs */
-- 
cgit v1.2.3


From f90fc37f289cd0886ef3a12b2ea33b93b8d9d360 Mon Sep 17 00:00:00 2001
From: Lee Jones <lee.jones@linaro.org>
Date: Fri, 12 Mar 2021 11:09:08 +0000
Subject: ptp_pch: Move 'pch_*()' prototypes to shared header
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes the following W=1 kernel build warning(s):

 drivers/ptp/ptp_pch.c:193:6: warning: no previous prototype for ‘pch_ch_control_write’ [-Wmissing-prototypes]
 drivers/ptp/ptp_pch.c:201:5: warning: no previous prototype for ‘pch_ch_event_read’ [-Wmissing-prototypes]
 drivers/ptp/ptp_pch.c:212:6: warning: no previous prototype for ‘pch_ch_event_write’ [-Wmissing-prototypes]
 drivers/ptp/ptp_pch.c:220:5: warning: no previous prototype for ‘pch_src_uuid_lo_read’ [-Wmissing-prototypes]
 drivers/ptp/ptp_pch.c:231:5: warning: no previous prototype for ‘pch_src_uuid_hi_read’ [-Wmissing-prototypes]
 drivers/ptp/ptp_pch.c:242:5: warning: no previous prototype for ‘pch_rx_snap_read’ [-Wmissing-prototypes]
 drivers/ptp/ptp_pch.c:259:5: warning: no previous prototype for ‘pch_tx_snap_read’ [-Wmissing-prototypes]
 drivers/ptp/ptp_pch.c:300:5: warning: no previous prototype for ‘pch_set_station_address’ [-Wmissing-prototypes]

Cc: Richard Cochran <richardcochran@gmail.com> (maintainer:PTP HARDWARE CLOCK SUPPORT)
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Flavio Suligoi <f.suligoi@asem.it>
Cc: netdev@vger.kernel.org
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h    |  8 --------
 .../net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c   |  1 +
 drivers/ptp/ptp_pch.c                              |  1 +
 include/linux/ptp_pch.h                            | 22 ++++++++++++++++++++++
 4 files changed, 24 insertions(+), 8 deletions(-)
 create mode 100644 include/linux/ptp_pch.h

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h
index 3ce4899a0417..a6823c4d355d 100644
--- a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h
+++ b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h
@@ -612,14 +612,6 @@ void pch_gbe_free_tx_resources(struct pch_gbe_adapter *adapter,
 void pch_gbe_free_rx_resources(struct pch_gbe_adapter *adapter,
 			       struct pch_gbe_rx_ring *rx_ring);
 void pch_gbe_update_stats(struct pch_gbe_adapter *adapter);
-void pch_ch_control_write(struct pci_dev *pdev, u32 val);
-u32 pch_ch_event_read(struct pci_dev *pdev);
-void pch_ch_event_write(struct pci_dev *pdev, u32 val);
-u32 pch_src_uuid_lo_read(struct pci_dev *pdev);
-u32 pch_src_uuid_hi_read(struct pci_dev *pdev);
-u64 pch_rx_snap_read(struct pci_dev *pdev);
-u64 pch_tx_snap_read(struct pci_dev *pdev);
-int pch_set_station_address(u8 *addr, struct pci_dev *pdev);
 
 /* pch_gbe_param.c */
 void pch_gbe_check_options(struct pch_gbe_adapter *adapter);
diff --git a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
index 140cee7c459d..334af49e5add 100644
--- a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
+++ b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
@@ -11,6 +11,7 @@
 #include <linux/module.h>
 #include <linux/net_tstamp.h>
 #include <linux/ptp_classify.h>
+#include <linux/ptp_pch.h>
 #include <linux/gpio.h>
 
 #define DRV_VERSION     "1.01"
diff --git a/drivers/ptp/ptp_pch.c b/drivers/ptp/ptp_pch.c
index f7ff7230623e..fa4417ad02e0 100644
--- a/drivers/ptp/ptp_pch.c
+++ b/drivers/ptp/ptp_pch.c
@@ -18,6 +18,7 @@
 #include <linux/module.h>
 #include <linux/pci.h>
 #include <linux/ptp_clock_kernel.h>
+#include <linux/ptp_pch.h>
 #include <linux/slab.h>
 
 #define STATION_ADDR_LEN	20
diff --git a/include/linux/ptp_pch.h b/include/linux/ptp_pch.h
new file mode 100644
index 000000000000..51818198c292
--- /dev/null
+++ b/include/linux/ptp_pch.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * PTP PCH
+ *
+ * Copyright 2019 Linaro Ltd.
+ *
+ * Author Lee Jones <lee.jones@linaro.org>
+ */
+
+#ifndef _PTP_PCH_H_
+#define _PTP_PCH_H_
+
+void pch_ch_control_write(struct pci_dev *pdev, u32 val);
+u32  pch_ch_event_read(struct pci_dev *pdev);
+void pch_ch_event_write(struct pci_dev *pdev, u32 val);
+u32  pch_src_uuid_lo_read(struct pci_dev *pdev);
+u32  pch_src_uuid_hi_read(struct pci_dev *pdev);
+u64  pch_rx_snap_read(struct pci_dev *pdev);
+u64  pch_tx_snap_read(struct pci_dev *pdev);
+int  pch_set_station_address(u8 *addr, struct pci_dev *pdev);
+
+#endif /* _PTP_PCH_H_ */
-- 
cgit v1.2.3


From 34dc2efb39a231280fd6696a59bbe712bf3c5c4a Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 12 Mar 2021 21:07:01 -0800
Subject: memblock: fix section mismatch warning

The inlining logic in clang-13 is rewritten to often not inline some
functions that were inlined by all earlier compilers.

In case of the memblock interfaces, this exposed a harmless bug of a
missing __init annotation:

WARNING: modpost: vmlinux.o(.text+0x507c0a): Section mismatch in reference from the function memblock_bottom_up() to the variable .meminit.data:memblock
The function memblock_bottom_up() references
the variable __meminitdata memblock.
This is often because memblock_bottom_up lacks a __meminitdata
annotation or the annotation of memblock is wrong.

Interestingly, these annotations were present originally, but got removed
with the explanation that the __init annotation prevents the function from
getting inlined.  I checked this again and found that while this is the
case with clang, gcc (version 7 through 10, did not test others) does
inline the functions regardless.

As the previous change was apparently intended to help the clang builds,
reverting it to help the newer clang versions seems appropriate as well.
gcc builds don't seem to care either way.

Link: https://lkml.kernel.org/r/20210225133808.2188581-1-arnd@kernel.org
Fixes: 5bdba520c1b3 ("mm: memblock: drop __init from memblock functions to make it inline")
Reference: 2cfb3665e864 ("include/linux/memblock.h: add __init to memblock_set_bottom_up()")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Mike Rapoport <rppt@linux.ibm.com>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Faiyaz Mohammed <faiyazm@codeaurora.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Aslan Bakirov <aslan@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memblock.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index c88bc24e31aa..d13e3cd938b4 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -460,7 +460,7 @@ static inline void memblock_free_late(phys_addr_t base, phys_addr_t size)
 /*
  * Set the allocation direction to bottom-up or top-down.
  */
-static inline void memblock_set_bottom_up(bool enable)
+static inline __init void memblock_set_bottom_up(bool enable)
 {
 	memblock.bottom_up = enable;
 }
@@ -470,7 +470,7 @@ static inline void memblock_set_bottom_up(bool enable)
  * if this is true, that said, memblock will allocate memory
  * in bottom-up direction.
  */
-static inline bool memblock_bottom_up(void)
+static inline __init bool memblock_bottom_up(void)
 {
 	return memblock.bottom_up;
 }
-- 
cgit v1.2.3


From cbf78d85079cee662c45749ef4f744d41be85d48 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 12 Mar 2021 21:07:04 -0800
Subject: stop_machine: mark helpers __always_inline

With clang-13, some functions only get partially inlined, with a
specialized version referring to a global variable.  This triggers a
harmless build-time check for the intel-rng driver:

WARNING: modpost: drivers/char/hw_random/intel-rng.o(.text+0xe): Section mismatch in reference from the function stop_machine() to the function .init.text:intel_rng_hw_init()
The function stop_machine() references
the function __init intel_rng_hw_init().
This is often because stop_machine lacks a __init
annotation or the annotation of intel_rng_hw_init is wrong.

In this instance, an easy workaround is to force the stop_machine()
function to be inline, along with related interfaces that did not show the
same behavior at the moment, but theoretically could.

The combination of the two patches listed below triggers the behavior in
clang-13, but individually these commits are correct.

Link: https://lkml.kernel.org/r/20210225130153.1956990-1-arnd@kernel.org
Fixes: fe5595c07400 ("stop_machine: Provide stop_machine_cpuslocked()")
Fixes: ee527cd3a20c ("Use stop_machine_run in the Intel RNG driver")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/stop_machine.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index 30577c3aecf8..46fb3ebdd16e 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -128,7 +128,7 @@ int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
 				   const struct cpumask *cpus);
 #else	/* CONFIG_SMP || CONFIG_HOTPLUG_CPU */
 
-static inline int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data,
+static __always_inline int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data,
 					  const struct cpumask *cpus)
 {
 	unsigned long flags;
@@ -139,14 +139,15 @@ static inline int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data,
 	return ret;
 }
 
-static inline int stop_machine(cpu_stop_fn_t fn, void *data,
-			       const struct cpumask *cpus)
+static __always_inline int
+stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
 {
 	return stop_machine_cpuslocked(fn, data, cpus);
 }
 
-static inline int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
-						 const struct cpumask *cpus)
+static __always_inline int
+stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
+			       const struct cpumask *cpus)
 {
 	return stop_machine(fn, data, cpus);
 }
-- 
cgit v1.2.3


From 82e69a121be4b1597ce758534816a8ee04c8b761 Mon Sep 17 00:00:00 2001
From: Fenghua Yu <fenghua.yu@intel.com>
Date: Fri, 12 Mar 2021 21:07:15 -0800
Subject: mm/fork: clear PASID for new mm

When a new mm is created, its PASID should be cleared, i.e.  the PASID is
initialized to its init state 0 on both ARM and X86.

This patch was part of the series introducing mm->pasid, but got lost
along the way [1].  It still makes sense to have it, because each address
space has a different PASID.  And the IOMMU code in
iommu_sva_alloc_pasid() expects the pasid field of a new mm struct to be
cleared.

[1] https://lore.kernel.org/linux-iommu/YDgh53AcQHT+T3L0@otcwcpicx3.sc.intel.com/

Link: https://lkml.kernel.org/r/20210302103837.2562625-1-jean-philippe@linaro.org
Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Cc: Jacob Pan <jacob.jun.pan@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_types.h | 1 +
 kernel/fork.c            | 8 ++++++++
 2 files changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 0974ad501a47..6613b26a8894 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -23,6 +23,7 @@
 #endif
 #define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1))
 
+#define INIT_PASID	0
 
 struct address_space;
 struct mem_cgroup;
diff --git a/kernel/fork.c b/kernel/fork.c
index d3171e8e88e5..54cc905e5fe0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -994,6 +994,13 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
 #endif
 }
 
+static void mm_init_pasid(struct mm_struct *mm)
+{
+#ifdef CONFIG_IOMMU_SUPPORT
+	mm->pasid = INIT_PASID;
+#endif
+}
+
 static void mm_init_uprobes_state(struct mm_struct *mm)
 {
 #ifdef CONFIG_UPROBES
@@ -1024,6 +1031,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	mm_init_cpumask(mm);
 	mm_init_aio(mm);
 	mm_init_owner(mm, p);
+	mm_init_pasid(mm);
 	RCU_INIT_POINTER(mm->exe_file, NULL);
 	mmu_notifier_subscriptions_init(mm);
 	init_tlb_flush_pending(mm);
-- 
cgit v1.2.3


From 97a7e4733b9b221d012ae68fcd3b3251febf6341 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Fri, 12 Mar 2021 21:07:26 -0800
Subject: mm: introduce page_needs_cow_for_dma() for deciding whether cow

We've got quite a few places (pte, pmd, pud) that explicitly checked
against whether we should break the cow right now during fork().  It's
easier to provide a helper, especially before we work the same thing on
hugetlbfs.

Since we'll reference is_cow_mapping() in mm.h, move it there too.
Actually it suits mm.h more since internal.h is mm/ only, but mm.h is
exported to the whole kernel.  With that we should expect another patch to
use is_cow_mapping() whenever we can across the kernel since we do use it
quite a lot but it's always done with raw code against VM_* flags.

Link: https://lkml.kernel.org/r/20210217233547.93892-4-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: David Airlie <airlied@linux.ie>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Gal Pressman <galpress@amazon.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Kirill Shutemov <kirill@shutemov.name>
Cc: Kirill Tkhai <ktkhai@virtuozzo.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Roland Scheidegger <sroland@vmware.com>
Cc: VMware Graphics <linux-graphics-maintainer@vmware.com>
Cc: Wei Zhang <wzam@amazon.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 21 +++++++++++++++++++++
 mm/huge_memory.c   |  8 ++------
 mm/internal.h      |  5 -----
 mm/memory.c        |  8 +-------
 4 files changed, 24 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 77e64e3eac80..64a71bf20536 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1300,6 +1300,27 @@ static inline bool page_maybe_dma_pinned(struct page *page)
 		GUP_PIN_COUNTING_BIAS;
 }
 
+static inline bool is_cow_mapping(vm_flags_t flags)
+{
+	return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
+}
+
+/*
+ * This should most likely only be called during fork() to see whether we
+ * should break the cow immediately for a page on the src mm.
+ */
+static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma,
+					  struct page *page)
+{
+	if (!is_cow_mapping(vma->vm_flags))
+		return false;
+
+	if (!atomic_read(&vma->vm_mm->has_pinned))
+		return false;
+
+	return page_maybe_dma_pinned(page);
+}
+
 #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
 #define SECTION_IN_PAGE_FLAGS
 #endif
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 395c75111d33..da1d63a41aec 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1100,9 +1100,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	 * best effort that the pinned pages won't be replaced by another
 	 * random page during the coming copy-on-write.
 	 */
-	if (unlikely(is_cow_mapping(vma->vm_flags) &&
-		     atomic_read(&src_mm->has_pinned) &&
-		     page_maybe_dma_pinned(src_page))) {
+	if (unlikely(page_needs_cow_for_dma(vma, src_page))) {
 		pte_free(dst_mm, pgtable);
 		spin_unlock(src_ptl);
 		spin_unlock(dst_ptl);
@@ -1214,9 +1212,7 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	}
 
 	/* Please refer to comments in copy_huge_pmd() */
-	if (unlikely(is_cow_mapping(vma->vm_flags) &&
-		     atomic_read(&src_mm->has_pinned) &&
-		     page_maybe_dma_pinned(pud_page(pud)))) {
+	if (unlikely(page_needs_cow_for_dma(vma, pud_page(pud)))) {
 		spin_unlock(src_ptl);
 		spin_unlock(dst_ptl);
 		__split_huge_pud(vma, src_pud, addr);
diff --git a/mm/internal.h b/mm/internal.h
index 9902648f2206..1432feec62df 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -296,11 +296,6 @@ static inline unsigned int buddy_order(struct page *page)
  */
 #define buddy_order_unsafe(page)	READ_ONCE(page_private(page))
 
-static inline bool is_cow_mapping(vm_flags_t flags)
-{
-	return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
-}
-
 /*
  * These three helpers classifies VMAs for virtual memory accounting.
  */
diff --git a/mm/memory.c b/mm/memory.c
index c8e357627318..523230005db1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -809,12 +809,8 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
 		  pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
 		  struct page **prealloc, pte_t pte, struct page *page)
 {
-	struct mm_struct *src_mm = src_vma->vm_mm;
 	struct page *new_page;
 
-	if (!is_cow_mapping(src_vma->vm_flags))
-		return 1;
-
 	/*
 	 * What we want to do is to check whether this page may
 	 * have been pinned by the parent process.  If so,
@@ -828,9 +824,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
 	 * the page count. That might give false positives for
 	 * for pinning, but it will work correctly.
 	 */
-	if (likely(!atomic_read(&src_mm->has_pinned)))
-		return 1;
-	if (likely(!page_maybe_dma_pinned(page)))
+	if (likely(!page_needs_cow_for_dma(src_vma, page)))
 		return 1;
 
 	new_page = *prealloc;
-- 
cgit v1.2.3


From 97e4910232fa1f81e806aa60c25a0450276d99a2 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 12 Mar 2021 21:07:47 -0800
Subject: linux/compiler-clang.h: define HAVE_BUILTIN_BSWAP*

Separating compiler-clang.h from compiler-gcc.h inadvertently dropped the
definitions of the three HAVE_BUILTIN_BSWAP macros, which requires falling
back to the open-coded version and hoping that the compiler detects it.

Since all versions of clang support the __builtin_bswap interfaces, add
back the flags and have the headers pick these up automatically.

This results in a 4% improvement of compilation speed for arm defconfig.

Note: it might also be worth revisiting which architectures set
CONFIG_ARCH_USE_BUILTIN_BSWAP for one compiler or the other, today this is
set on six architectures (arm32, csky, mips, powerpc, s390, x86), while
another ten architectures define custom helpers (alpha, arc, ia64, m68k,
mips, nios2, parisc, sh, sparc, xtensa), and the rest (arm64, h8300,
hexagon, microblaze, nds32, openrisc, riscv) just get the unoptimized
version and rely on the compiler to detect it.

A long time ago, the compiler builtins were architecture specific, but
nowadays, all compilers that are able to build the kernel have correct
implementations of them, though some may not be as optimized as the inline
asm versions.

The patch that dropped the optimization landed in v4.19, so as discussed
it would be fairly safe to backport this revert to the 4.19/5.4/5.10
stable kernels, but there is a remaining risk of regressions, and it has
no known side-effects besides compile speed.

Link: https://lkml.kernel.org/r/20210226161151.2629097-1-arnd@kernel.org
Link: https://lore.kernel.org/lkml/20210225164513.3667778-1-arnd@kernel.org/
Fixes: 815f0ddb346c ("include/linux/compiler*.h: make compiler-*.h mutually exclusive")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Acked-by: Miguel Ojeda <ojeda@kernel.org>
Acked-by: Nick Desaulniers <ndesaulniers@google.com>
Acked-by: Luc Van Oostenryck <luc.vanoostenryck@gmail.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Nick Hu <nickhu@andestech.com>
Cc: Greentime Hu <green.hu@gmail.com>
Cc: Vincent Chen <deanbo422@gmail.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Guo Ren <guoren@kernel.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Sami Tolvanen <samitolvanen@google.com>
Cc: Marco Elver <elver@google.com>
Cc: Arvind Sankar <nivedita@alum.mit.edu>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compiler-clang.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h
index 04c0a5a717f7..d217c382b02d 100644
--- a/include/linux/compiler-clang.h
+++ b/include/linux/compiler-clang.h
@@ -31,6 +31,12 @@
 #define __no_sanitize_thread
 #endif
 
+#if defined(CONFIG_ARCH_USE_BUILTIN_BSWAP)
+#define __HAVE_BUILTIN_BSWAP32__
+#define __HAVE_BUILTIN_BSWAP64__
+#define __HAVE_BUILTIN_BSWAP16__
+#endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */
+
 #if __has_feature(undefined_behavior_sanitizer)
 /* GCC does not have __SANITIZE_UNDEFINED__ */
 #define __no_sanitize_undefined \
-- 
cgit v1.2.3


From 149fc787353f65b7e72e05e7b75d34863266c3e2 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 12 Mar 2021 21:08:03 -0800
Subject: include/linux/sched/mm.h: use rcu_dereference in in_vfork()

Fix a sparse warning by using rcu_dereference().  Technically this is a
bug and a sufficiently aggressive compiler could reload the `real_parent'
pointer outside the protection of the rcu lock (and access freed memory),
but I think it's pretty unlikely to happen.

Link: https://lkml.kernel.org/r/20210221194207.1351703-1-willy@infradead.org
Fixes: b18dc5f291c0 ("mm, oom: skip vforked tasks from being selected")
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched/mm.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 1ae08b8462a4..90b2a0bce11c 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -140,7 +140,8 @@ static inline bool in_vfork(struct task_struct *tsk)
 	 * another oom-unkillable task does this it should blame itself.
 	 */
 	rcu_read_lock();
-	ret = tsk->vfork_done && tsk->real_parent->mm == tsk->mm;
+	ret = tsk->vfork_done &&
+			rcu_dereference(tsk->real_parent)->mm == tsk->mm;
 	rcu_read_unlock();
 
 	return ret;
-- 
cgit v1.2.3


From be6c8982e4ab9a41907555f601b711a7e2a17d4c Mon Sep 17 00:00:00 2001
From: Zhou Guanghui <zhouguanghui1@huawei.com>
Date: Fri, 12 Mar 2021 21:08:30 -0800
Subject: mm/memcg: rename mem_cgroup_split_huge_fixup to split_page_memcg and
 add nr_pages argument

Rename mem_cgroup_split_huge_fixup to split_page_memcg and explicitly pass
in page number argument.

In this way, the interface name is more common and can be used by
potential users.  In addition, the complete info(memcg and flag) of the
memcg needs to be set to the tail pages.

Link: https://lkml.kernel.org/r/20210304074053.65527-2-zhouguanghui1@huawei.com
Signed-off-by: Zhou Guanghui <zhouguanghui1@huawei.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: Tianhong Ding <dingtianhong@huawei.com>
Cc: Weilong Chen <chenweilong@huawei.com>
Cc: Rui Xiang <rui.xiang@huawei.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h |  6 ++----
 mm/huge_memory.c           |  2 +-
 mm/memcontrol.c            | 15 ++++++---------
 3 files changed, 9 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index e6dc793d587d..0c04d39a7967 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1061,9 +1061,7 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm,
 	rcu_read_unlock();
 }
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-void mem_cgroup_split_huge_fixup(struct page *head);
-#endif
+void split_page_memcg(struct page *head, unsigned int nr);
 
 #else /* CONFIG_MEMCG */
 
@@ -1400,7 +1398,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
 	return 0;
 }
 
-static inline void mem_cgroup_split_huge_fixup(struct page *head)
+static inline void split_page_memcg(struct page *head, unsigned int nr)
 {
 }
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index da1d63a41aec..ae907a9c2050 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2467,7 +2467,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
 	int i;
 
 	/* complete memcg works before add pages to LRU */
-	mem_cgroup_split_huge_fixup(head);
+	split_page_memcg(head, nr);
 
 	if (PageAnon(head) && PageSwapCache(head)) {
 		swp_entry_t entry = { .val = page_private(head) };
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 845eec01ef9d..e064ac0d850a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3287,24 +3287,21 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
 
 #endif /* CONFIG_MEMCG_KMEM */
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 /*
- * Because page_memcg(head) is not set on compound tails, set it now.
+ * Because page_memcg(head) is not set on tails, set it now.
  */
-void mem_cgroup_split_huge_fixup(struct page *head)
+void split_page_memcg(struct page *head, unsigned int nr)
 {
 	struct mem_cgroup *memcg = page_memcg(head);
 	int i;
 
-	if (mem_cgroup_disabled())
+	if (mem_cgroup_disabled() || !memcg)
 		return;
 
-	for (i = 1; i < HPAGE_PMD_NR; i++) {
-		css_get(&memcg->css);
-		head[i].memcg_data = (unsigned long)memcg;
-	}
+	for (i = 1; i < nr; i++)
+		head[i].memcg_data = head->memcg_data;
+	css_get_many(&memcg->css, nr - 1);
 }
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 #ifdef CONFIG_MEMCG_SWAP
 /**
-- 
cgit v1.2.3


From bc22ed2ea1121f9d9ba3f85c524cb857d54a2d00 Mon Sep 17 00:00:00 2001
From: Xianting Tian <xianting_tian@126.com>
Date: Sat, 20 Feb 2021 11:28:18 -0500
Subject: virtio: remove export for virtio_config_{enable, disable}

virtio_config_enable(), virtio_config_disable() are only used inside
drivers/virtio/virtio.c, so there is no need to export the symbols.

Signed-off-by: Xianting Tian <xianting_tian@126.com>
Link: https://lore.kernel.org/r/1613838498-8791-1-git-send-email-xianting_tian@126.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
---
 drivers/virtio/virtio.c | 6 ++----
 include/linux/virtio.h  | 2 --
 2 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
index 42e09cc1b8ac..4b15c00c0a0a 100644
--- a/drivers/virtio/virtio.c
+++ b/drivers/virtio/virtio.c
@@ -141,15 +141,14 @@ void virtio_config_changed(struct virtio_device *dev)
 }
 EXPORT_SYMBOL_GPL(virtio_config_changed);
 
-void virtio_config_disable(struct virtio_device *dev)
+static void virtio_config_disable(struct virtio_device *dev)
 {
 	spin_lock_irq(&dev->config_lock);
 	dev->config_enabled = false;
 	spin_unlock_irq(&dev->config_lock);
 }
-EXPORT_SYMBOL_GPL(virtio_config_disable);
 
-void virtio_config_enable(struct virtio_device *dev)
+static void virtio_config_enable(struct virtio_device *dev)
 {
 	spin_lock_irq(&dev->config_lock);
 	dev->config_enabled = true;
@@ -158,7 +157,6 @@ void virtio_config_enable(struct virtio_device *dev)
 	dev->config_change_pending = false;
 	spin_unlock_irq(&dev->config_lock);
 }
-EXPORT_SYMBOL_GPL(virtio_config_enable);
 
 void virtio_add_status(struct virtio_device *dev, unsigned int status)
 {
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 55ea329fe72a..b1894e0323fa 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -132,8 +132,6 @@ bool is_virtio_device(struct device *dev);
 void virtio_break_device(struct virtio_device *dev);
 
 void virtio_config_changed(struct virtio_device *dev);
-void virtio_config_disable(struct virtio_device *dev);
-void virtio_config_enable(struct virtio_device *dev);
 int virtio_finalize_features(struct virtio_device *dev);
 #ifdef CONFIG_PM_SLEEP
 int virtio_device_freeze(struct virtio_device *dev);
-- 
cgit v1.2.3


From e3305138da47f0ae2241e5daa18af276e1e54457 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <alobakin@pm.me>
Date: Sun, 14 Mar 2021 11:11:14 +0000
Subject: skbuff: make __skb_header_pointer()'s data argument const

The function never modifies the input buffer, so 'data' argument
can be marked as const.
This implies one harmless cast-away.

Signed-off-by: Alexander Lobakin <alobakin@pm.me>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 483e89348f78..d6ea3dc3eddb 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3678,11 +3678,11 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset, int len,
 		    __wsum csum);
 
 static inline void * __must_check
-__skb_header_pointer(const struct sk_buff *skb, int offset,
-		     int len, void *data, int hlen, void *buffer)
+__skb_header_pointer(const struct sk_buff *skb, int offset, int len,
+		     const void *data, int hlen, void *buffer)
 {
 	if (hlen - offset >= len)
-		return data + offset;
+		return (void *)data + offset;
 
 	if (!skb ||
 	    skb_copy_bits(skb, offset, buffer, len) < 0)
-- 
cgit v1.2.3


From f96533cded173b3b019001a505a746c3cd8fc323 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <alobakin@pm.me>
Date: Sun, 14 Mar 2021 11:11:23 +0000
Subject: flow_dissector: constify raw input data argument

Flow Dissector code never modifies the input buffer, neither skb nor
raw data.
Make 'data' argument const for all of the Flow dissector's functions.

Signed-off-by: Alexander Lobakin <alobakin@pm.me>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h       | 15 +++++++--------
 include/net/flow_dissector.h |  2 +-
 net/core/flow_dissector.c    | 41 ++++++++++++++++++++++-------------------
 3 files changed, 30 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index d6ea3dc3eddb..46c61e127e9f 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1292,10 +1292,10 @@ __skb_set_sw_hash(struct sk_buff *skb, __u32 hash, bool is_l4)
 void __skb_get_hash(struct sk_buff *skb);
 u32 __skb_get_hash_symmetric(const struct sk_buff *skb);
 u32 skb_get_poff(const struct sk_buff *skb);
-u32 __skb_get_poff(const struct sk_buff *skb, void *data,
+u32 __skb_get_poff(const struct sk_buff *skb, const void *data,
 		   const struct flow_keys_basic *keys, int hlen);
 __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
-			    void *data, int hlen_proto);
+			    const void *data, int hlen_proto);
 
 static inline __be32 skb_flow_get_ports(const struct sk_buff *skb,
 					int thoff, u8 ip_proto)
@@ -1314,9 +1314,8 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
 bool __skb_flow_dissect(const struct net *net,
 			const struct sk_buff *skb,
 			struct flow_dissector *flow_dissector,
-			void *target_container,
-			void *data, __be16 proto, int nhoff, int hlen,
-			unsigned int flags);
+			void *target_container, const void *data,
+			__be16 proto, int nhoff, int hlen, unsigned int flags);
 
 static inline bool skb_flow_dissect(const struct sk_buff *skb,
 				    struct flow_dissector *flow_dissector,
@@ -1338,9 +1337,9 @@ static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb,
 static inline bool
 skb_flow_dissect_flow_keys_basic(const struct net *net,
 				 const struct sk_buff *skb,
-				 struct flow_keys_basic *flow, void *data,
-				 __be16 proto, int nhoff, int hlen,
-				 unsigned int flags)
+				 struct flow_keys_basic *flow,
+				 const void *data, __be16 proto,
+				 int nhoff, int hlen, unsigned int flags)
 {
 	memset(flow, 0, sizeof(*flow));
 	return __skb_flow_dissect(net, skb, &flow_keys_basic_dissector, flow,
diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index bf00e71816ed..ffd386ea0dbb 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -350,7 +350,7 @@ static inline bool flow_keys_have_l4(const struct flow_keys *keys)
 u32 flow_hash_from_keys(struct flow_keys *keys);
 void skb_flow_get_icmp_tci(const struct sk_buff *skb,
 			   struct flow_dissector_key_icmp *key_icmp,
-			   void *data, int thoff, int hlen);
+			   const void *data, int thoff, int hlen);
 
 static inline bool dissector_uses_key(const struct flow_dissector *flow_dissector,
 				      enum flow_dissector_key_id key_id)
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 2ef2224b3bff..2ed380d096ce 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -114,7 +114,7 @@ int flow_dissector_bpf_prog_attach_check(struct net *net,
  * is the protocol port offset returned from proto_ports_offset
  */
 __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
-			    void *data, int hlen)
+			    const void *data, int hlen)
 {
 	int poff = proto_ports_offset(ip_proto);
 
@@ -161,7 +161,7 @@ static bool icmp_has_id(u8 type)
  */
 void skb_flow_get_icmp_tci(const struct sk_buff *skb,
 			   struct flow_dissector_key_icmp *key_icmp,
-			   void *data, int thoff, int hlen)
+			   const void *data, int thoff, int hlen)
 {
 	struct icmphdr *ih, _ih;
 
@@ -187,8 +187,8 @@ EXPORT_SYMBOL(skb_flow_get_icmp_tci);
  */
 static void __skb_flow_dissect_icmp(const struct sk_buff *skb,
 				    struct flow_dissector *flow_dissector,
-				    void *target_container,
-				    void *data, int thoff, int hlen)
+				    void *target_container, const void *data,
+				    int thoff, int hlen)
 {
 	struct flow_dissector_key_icmp *key_icmp;
 
@@ -409,8 +409,8 @@ EXPORT_SYMBOL(skb_flow_dissect_hash);
 static enum flow_dissect_ret
 __skb_flow_dissect_mpls(const struct sk_buff *skb,
 			struct flow_dissector *flow_dissector,
-			void *target_container, void *data, int nhoff, int hlen,
-			int lse_index, bool *entropy_label)
+			void *target_container, const void *data, int nhoff,
+			int hlen, int lse_index, bool *entropy_label)
 {
 	struct mpls_label *hdr, _hdr;
 	u32 entry, label, bos;
@@ -467,7 +467,8 @@ __skb_flow_dissect_mpls(const struct sk_buff *skb,
 static enum flow_dissect_ret
 __skb_flow_dissect_arp(const struct sk_buff *skb,
 		       struct flow_dissector *flow_dissector,
-		       void *target_container, void *data, int nhoff, int hlen)
+		       void *target_container, const void *data,
+		       int nhoff, int hlen)
 {
 	struct flow_dissector_key_arp *key_arp;
 	struct {
@@ -523,7 +524,7 @@ static enum flow_dissect_ret
 __skb_flow_dissect_gre(const struct sk_buff *skb,
 		       struct flow_dissector_key_control *key_control,
 		       struct flow_dissector *flow_dissector,
-		       void *target_container, void *data,
+		       void *target_container, const void *data,
 		       __be16 *p_proto, int *p_nhoff, int *p_hlen,
 		       unsigned int flags)
 {
@@ -663,8 +664,8 @@ __skb_flow_dissect_gre(const struct sk_buff *skb,
 static enum flow_dissect_ret
 __skb_flow_dissect_batadv(const struct sk_buff *skb,
 			  struct flow_dissector_key_control *key_control,
-			  void *data, __be16 *p_proto, int *p_nhoff, int hlen,
-			  unsigned int flags)
+			  const void *data, __be16 *p_proto, int *p_nhoff,
+			  int hlen, unsigned int flags)
 {
 	struct {
 		struct batadv_unicast_packet batadv_unicast;
@@ -695,7 +696,8 @@ __skb_flow_dissect_batadv(const struct sk_buff *skb,
 static void
 __skb_flow_dissect_tcp(const struct sk_buff *skb,
 		       struct flow_dissector *flow_dissector,
-		       void *target_container, void *data, int thoff, int hlen)
+		       void *target_container, const void *data,
+		       int thoff, int hlen)
 {
 	struct flow_dissector_key_tcp *key_tcp;
 	struct tcphdr *th, _th;
@@ -719,8 +721,8 @@ __skb_flow_dissect_tcp(const struct sk_buff *skb,
 static void
 __skb_flow_dissect_ports(const struct sk_buff *skb,
 			 struct flow_dissector *flow_dissector,
-			 void *target_container, void *data, int nhoff,
-			 u8 ip_proto, int hlen)
+			 void *target_container, const void *data,
+			 int nhoff, u8 ip_proto, int hlen)
 {
 	enum flow_dissector_key_id dissector_ports = FLOW_DISSECTOR_KEY_MAX;
 	struct flow_dissector_key_ports *key_ports;
@@ -744,7 +746,8 @@ __skb_flow_dissect_ports(const struct sk_buff *skb,
 static void
 __skb_flow_dissect_ipv4(const struct sk_buff *skb,
 			struct flow_dissector *flow_dissector,
-			void *target_container, void *data, const struct iphdr *iph)
+			void *target_container, const void *data,
+			const struct iphdr *iph)
 {
 	struct flow_dissector_key_ip *key_ip;
 
@@ -761,7 +764,8 @@ __skb_flow_dissect_ipv4(const struct sk_buff *skb,
 static void
 __skb_flow_dissect_ipv6(const struct sk_buff *skb,
 			struct flow_dissector *flow_dissector,
-			void *target_container, void *data, const struct ipv6hdr *iph)
+			void *target_container, const void *data,
+			const struct ipv6hdr *iph)
 {
 	struct flow_dissector_key_ip *key_ip;
 
@@ -908,9 +912,8 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
 bool __skb_flow_dissect(const struct net *net,
 			const struct sk_buff *skb,
 			struct flow_dissector *flow_dissector,
-			void *target_container,
-			void *data, __be16 proto, int nhoff, int hlen,
-			unsigned int flags)
+			void *target_container, const void *data,
+			__be16 proto, int nhoff, int hlen, unsigned int flags)
 {
 	struct flow_dissector_key_control *key_control;
 	struct flow_dissector_key_basic *key_basic;
@@ -1642,7 +1645,7 @@ __u32 skb_get_hash_perturb(const struct sk_buff *skb,
 }
 EXPORT_SYMBOL(skb_get_hash_perturb);
 
-u32 __skb_get_poff(const struct sk_buff *skb, void *data,
+u32 __skb_get_poff(const struct sk_buff *skb, const void *data,
 		   const struct flow_keys_basic *keys, int hlen)
 {
 	u32 poff = keys->control.thoff;
-- 
cgit v1.2.3


From 805a25f3a1bdf4aafd0af412ce1e47d0cb6c7628 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <alobakin@pm.me>
Date: Sun, 14 Mar 2021 11:11:32 +0000
Subject: linux/etherdevice.h: misc trailing whitespace cleanup

Caught by the text editor. Fix it separately from the actual changes.

Signed-off-by: Alexander Lobakin <alobakin@pm.me>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/etherdevice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 2e5debc0373c..bcb2f81baafb 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -11,7 +11,7 @@
  * Authors:	Ross Biro
  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  *
- *		Relocated to include/linux where it belongs by Alan Cox 
+ *		Relocated to include/linux where it belongs by Alan Cox
  *							<gw4pts@gw4pts.ampr.org>
  */
 #ifndef _LINUX_ETHERDEVICE_H
-- 
cgit v1.2.3


From 59753ce8b196de60211a989c75ece8aeb0d9d57c Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <alobakin@pm.me>
Date: Sun, 14 Mar 2021 11:11:41 +0000
Subject: ethernet: constify eth_get_headlen()'s data argument

It's used only for flow dissection, which now takes constant data
pointers.

Signed-off-by: Alexander Lobakin <alobakin@pm.me>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/etherdevice.h | 2 +-
 net/ethernet/eth.c          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index bcb2f81baafb..330345b1be54 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -29,7 +29,7 @@ struct device;
 int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr);
 unsigned char *arch_get_platform_mac_address(void);
 int nvmem_get_mac_address(struct device *dev, void *addrbuf);
-u32 eth_get_headlen(const struct net_device *dev, void *data, unsigned int len);
+u32 eth_get_headlen(const struct net_device *dev, const void *data, u32 len);
 __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev);
 extern const struct header_ops eth_header_ops;
 
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 4106373180c6..e01cf766d2c5 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -122,7 +122,7 @@ EXPORT_SYMBOL(eth_header);
  * Make a best effort attempt to pull the length for all of the headers for
  * a given frame in a linear buffer.
  */
-u32 eth_get_headlen(const struct net_device *dev, void *data, unsigned int len)
+u32 eth_get_headlen(const struct net_device *dev, const void *data, u32 len)
 {
 	const unsigned int flags = FLOW_DISSECTOR_F_PARSE_1ST_FRAG;
 	const struct ethhdr *eth = (const struct ethhdr *)data;
-- 
cgit v1.2.3


From d206121faf8bb2239cd970af0bd32f5203780427 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <alobakin@pm.me>
Date: Sun, 14 Mar 2021 11:11:50 +0000
Subject: skbuff: micro-optimize {,__}skb_header_pointer()

{,__}skb_header_pointer() helpers exist mainly for preventing
accesses-beyond-end of the linear data.
In the vast majority of cases, they bail out on the first condition.
All code going after is mostly a fallback.
Mark the most common branch as 'likely' one to move it in-line.
Also, skb_copy_bits() can return negative values only when the input
arguments are invalid, e.g. offset is greater than skb->len. It can
be safely marked as 'unlikely' branch, assuming that hotpath code
provides sane input to not fail here.

These two bump the throughput with a single Flow Dissector pass on
every packet (e.g. with RPS or driver that uses eth_get_headlen())
by 20 Mbps per flow/core.

Signed-off-by: Alexander Lobakin <alobakin@pm.me>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 46c61e127e9f..ecc029674ae4 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3680,11 +3680,10 @@ static inline void * __must_check
 __skb_header_pointer(const struct sk_buff *skb, int offset, int len,
 		     const void *data, int hlen, void *buffer)
 {
-	if (hlen - offset >= len)
+	if (likely(hlen - offset >= len))
 		return (void *)data + offset;
 
-	if (!skb ||
-	    skb_copy_bits(skb, offset, buffer, len) < 0)
+	if (!skb || unlikely(skb_copy_bits(skb, offset, buffer, len) < 0))
 		return NULL;
 
 	return buffer;
-- 
cgit v1.2.3


From c4d57c22ac65bd503716062a06fad55a01569cac Mon Sep 17 00:00:00 2001
From: Matthias Schiffer <matthias.schiffer@ew.tq-group.com>
Date: Wed, 3 Mar 2021 10:54:19 +0100
Subject: power: supply: bq27xxx: fix power_avg for newer ICs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On all newer bq27xxx ICs, the AveragePower register contains a signed
value; in addition to handling the raw value as unsigned, the driver
code also didn't convert it to µW as expected.

At least for the BQ28Z610, the reference manual incorrectly states that
the value is in units of 1mW and not 10mW. I have no way of knowing
whether the manuals of other supported ICs contain the same error, or if
there are models that actually use 1mW. At least, the new code shouldn't
be *less* correct than the old version for any device.

power_avg is removed from the cache structure, so we don't have to
extend it to store both a signed value and an error code. Always getting
an up-to-date value may be desirable anyways, as it avoids inconsistent
current and power readings when switching between charging and
discharging.

Signed-off-by: Matthias Schiffer <matthias.schiffer@ew.tq-group.com>
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 drivers/power/supply/bq27xxx_battery.c | 51 ++++++++++++++++++----------------
 include/linux/power/bq27xxx_battery.h  |  1 -
 2 files changed, 27 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/power/supply/bq27xxx_battery.c b/drivers/power/supply/bq27xxx_battery.c
index cb6ebd2f905e..20e1dc8a87cf 100644
--- a/drivers/power/supply/bq27xxx_battery.c
+++ b/drivers/power/supply/bq27xxx_battery.c
@@ -1661,27 +1661,6 @@ static int bq27xxx_battery_read_time(struct bq27xxx_device_info *di, u8 reg)
 	return tval * 60;
 }
 
-/*
- * Read an average power register.
- * Return < 0 if something fails.
- */
-static int bq27xxx_battery_read_pwr_avg(struct bq27xxx_device_info *di)
-{
-	int tval;
-
-	tval = bq27xxx_read(di, BQ27XXX_REG_AP, false);
-	if (tval < 0) {
-		dev_err(di->dev, "error reading average power register  %02x: %d\n",
-			BQ27XXX_REG_AP, tval);
-		return tval;
-	}
-
-	if (di->opts & BQ27XXX_O_ZERO)
-		return (tval * BQ27XXX_POWER_CONSTANT) / BQ27XXX_RS;
-	else
-		return tval;
-}
-
 /*
  * Returns true if a battery over temperature condition is detected
  */
@@ -1769,8 +1748,6 @@ void bq27xxx_battery_update(struct bq27xxx_device_info *di)
 		}
 		if (di->regs[BQ27XXX_REG_CYCT] != INVALID_REG_ADDR)
 			cache.cycle_count = bq27xxx_battery_read_cyct(di);
-		if (di->regs[BQ27XXX_REG_AP] != INVALID_REG_ADDR)
-			cache.power_avg = bq27xxx_battery_read_pwr_avg(di);
 
 		/* We only have to read charge design full once */
 		if (di->charge_design_full <= 0)
@@ -1833,6 +1810,32 @@ static int bq27xxx_battery_current(struct bq27xxx_device_info *di,
 	return 0;
 }
 
+/*
+ * Get the average power in µW
+ * Return < 0 if something fails.
+ */
+static int bq27xxx_battery_pwr_avg(struct bq27xxx_device_info *di,
+				   union power_supply_propval *val)
+{
+	int power;
+
+	power = bq27xxx_read(di, BQ27XXX_REG_AP, false);
+	if (power < 0) {
+		dev_err(di->dev,
+			"error reading average power register %02x: %d\n",
+			BQ27XXX_REG_AP, power);
+		return power;
+	}
+
+	if (di->opts & BQ27XXX_O_ZERO)
+		val->intval = (power * BQ27XXX_POWER_CONSTANT) / BQ27XXX_RS;
+	else
+		/* Other gauges return a signed value in units of 10mW */
+		val->intval = (int)((s16)power) * 10000;
+
+	return 0;
+}
+
 static int bq27xxx_battery_status(struct bq27xxx_device_info *di,
 				  union power_supply_propval *val)
 {
@@ -2020,7 +2023,7 @@ static int bq27xxx_battery_get_property(struct power_supply *psy,
 		ret = bq27xxx_simple_value(di->cache.energy, val);
 		break;
 	case POWER_SUPPLY_PROP_POWER_AVG:
-		ret = bq27xxx_simple_value(di->cache.power_avg, val);
+		ret = bq27xxx_battery_pwr_avg(di, val);
 		break;
 	case POWER_SUPPLY_PROP_HEALTH:
 		ret = bq27xxx_simple_value(di->cache.health, val);
diff --git a/include/linux/power/bq27xxx_battery.h b/include/linux/power/bq27xxx_battery.h
index 111a40d0d3d5..8d5f4f40fb41 100644
--- a/include/linux/power/bq27xxx_battery.h
+++ b/include/linux/power/bq27xxx_battery.h
@@ -53,7 +53,6 @@ struct bq27xxx_reg_cache {
 	int capacity;
 	int energy;
 	int flags;
-	int power_avg;
 	int health;
 };
 
-- 
cgit v1.2.3


From c9570d4a5efd04479b3cd09c39b571eb031d94f4 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzk@kernel.org>
Date: Thu, 31 Dec 2020 09:52:52 +0100
Subject: extcon: Add stubs for extcon_register_notifier_all() functions

Add stubs for extcon_register_notifier_all() function for !CONFIG_EXTCON
case.  This is useful for compile testing and for drivers which use
EXTCON but do not require it (therefore do not depend on CONFIG_EXTCON).

Fixes: 815429b39d94 ("extcon: Add new extcon_register_notifier_all() to monitor all external connectors")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Krzysztof Kozlowski <krzk@kernel.org>
Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
---
 include/linux/extcon.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/extcon.h b/include/linux/extcon.h
index fd183fb9c20f..0c19010da77f 100644
--- a/include/linux/extcon.h
+++ b/include/linux/extcon.h
@@ -271,6 +271,29 @@ static inline  void devm_extcon_unregister_notifier(struct device *dev,
 				struct extcon_dev *edev, unsigned int id,
 				struct notifier_block *nb) { }
 
+static inline int extcon_register_notifier_all(struct extcon_dev *edev,
+					       struct notifier_block *nb)
+{
+	return 0;
+}
+
+static inline int extcon_unregister_notifier_all(struct extcon_dev *edev,
+						 struct notifier_block *nb)
+{
+	return 0;
+}
+
+static inline int devm_extcon_register_notifier_all(struct device *dev,
+						    struct extcon_dev *edev,
+						    struct notifier_block *nb)
+{
+	return 0;
+}
+
+static inline void devm_extcon_unregister_notifier_all(struct device *dev,
+						       struct extcon_dev *edev,
+						       struct notifier_block *nb) { }
+
 static inline struct extcon_dev *extcon_get_extcon_dev(const char *extcon_name)
 {
 	return ERR_PTR(-ENODEV);
-- 
cgit v1.2.3


From 58483761810087e5ffdf36e84ac1bf26df909097 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Sun, 14 Mar 2021 12:13:29 +0100
Subject: thermal/drivers/core: Use a char pointer for the cooling device name

We want to have any kind of name for the cooling devices as we no
longer want to rely on auto-numbering. Let's replace the cooling
device's fixed array by a char pointer to be allocated dynamically
when registering the cooling device, so we don't limit the length of
the name.

Rework the error path at the same time as we have to rollback the
allocations in case of error.

Tested with a dummy device having the name:
 "Llanfairpwllgwyngyllgogerychwyrndrobwllllantysiliogogogoch"

A village on the island of Anglesey (Wales), known to have the longest
name in Europe.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Tested-by: Ido Schimmel <idosch@nvidia.com>
Link: https://lore.kernel.org/r/20210314111333.16551-1-daniel.lezcano@linaro.org
---
 drivers/net/ethernet/mellanox/mlxsw/core_thermal.c |  2 +-
 drivers/thermal/thermal_core.c                     | 38 +++++++++++++---------
 include/linux/thermal.h                            |  2 +-
 3 files changed, 24 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
index bf85ce9835d7..7447c2a73cbd 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
@@ -141,7 +141,7 @@ static int mlxsw_get_cooling_device_idx(struct mlxsw_thermal *thermal,
 	/* Allow mlxsw thermal zone binding to an external cooling device */
 	for (i = 0; i < ARRAY_SIZE(mlxsw_thermal_external_allowed_cdev); i++) {
 		if (strnstr(cdev->type, mlxsw_thermal_external_allowed_cdev[i],
-			    sizeof(cdev->type)))
+			    strlen(cdev->type)))
 			return 0;
 	}
 
diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 996c038f83a4..c8d4010940ef 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -960,10 +960,7 @@ __thermal_cooling_device_register(struct device_node *np,
 {
 	struct thermal_cooling_device *cdev;
 	struct thermal_zone_device *pos = NULL;
-	int result;
-
-	if (type && strlen(type) >= THERMAL_NAME_LENGTH)
-		return ERR_PTR(-EINVAL);
+	int ret;
 
 	if (!ops || !ops->get_max_state || !ops->get_cur_state ||
 	    !ops->set_cur_state)
@@ -973,14 +970,17 @@ __thermal_cooling_device_register(struct device_node *np,
 	if (!cdev)
 		return ERR_PTR(-ENOMEM);
 
-	result = ida_simple_get(&thermal_cdev_ida, 0, 0, GFP_KERNEL);
-	if (result < 0) {
-		kfree(cdev);
-		return ERR_PTR(result);
+	ret = ida_simple_get(&thermal_cdev_ida, 0, 0, GFP_KERNEL);
+	if (ret < 0)
+		goto out_kfree_cdev;
+	cdev->id = ret;
+
+	cdev->type = kstrdup(type ? type : "", GFP_KERNEL);
+	if (!cdev->type) {
+		ret = -ENOMEM;
+		goto out_ida_remove;
 	}
 
-	cdev->id = result;
-	strlcpy(cdev->type, type ? : "", sizeof(cdev->type));
 	mutex_init(&cdev->lock);
 	INIT_LIST_HEAD(&cdev->thermal_instances);
 	cdev->np = np;
@@ -990,12 +990,9 @@ __thermal_cooling_device_register(struct device_node *np,
 	cdev->devdata = devdata;
 	thermal_cooling_device_setup_sysfs(cdev);
 	dev_set_name(&cdev->device, "cooling_device%d", cdev->id);
-	result = device_register(&cdev->device);
-	if (result) {
-		ida_simple_remove(&thermal_cdev_ida, cdev->id);
-		put_device(&cdev->device);
-		return ERR_PTR(result);
-	}
+	ret = device_register(&cdev->device);
+	if (ret)
+		goto out_kfree_type;
 
 	/* Add 'this' new cdev to the global cdev list */
 	mutex_lock(&thermal_list_lock);
@@ -1013,6 +1010,14 @@ __thermal_cooling_device_register(struct device_node *np,
 	mutex_unlock(&thermal_list_lock);
 
 	return cdev;
+
+out_kfree_type:
+	kfree(cdev->type);
+	put_device(&cdev->device);
+out_ida_remove:
+	ida_simple_remove(&thermal_cdev_ida, cdev->id);
+out_kfree_cdev:
+	return ERR_PTR(ret);
 }
 
 /**
@@ -1171,6 +1176,7 @@ void thermal_cooling_device_unregister(struct thermal_cooling_device *cdev)
 	ida_simple_remove(&thermal_cdev_ida, cdev->id);
 	device_del(&cdev->device);
 	thermal_cooling_device_destroy_sysfs(cdev);
+	kfree(cdev->type);
 	put_device(&cdev->device);
 }
 EXPORT_SYMBOL_GPL(thermal_cooling_device_unregister);
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 6ac7bb1d2b1f..169502164364 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -91,7 +91,7 @@ struct thermal_cooling_device_ops {
 
 struct thermal_cooling_device {
 	int id;
-	char type[THERMAL_NAME_LENGTH];
+	char *type;
 	struct device device;
 	struct device_node *np;
 	void *devdata;
-- 
cgit v1.2.3


From a74f681c3710b47a093d910ca7c6666b3d1e3a2c Mon Sep 17 00:00:00 2001
From: Yangtao Li <tiny.windzz@gmail.com>
Date: Sun, 14 Mar 2021 19:33:54 +0300
Subject: opp: Add devres wrapper for dev_pm_opp_set_clkname

Add devres wrapper for dev_pm_opp_set_clkname() to simplify drivers code.

Signed-off-by: Yangtao Li <tiny.windzz@gmail.com>
Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c     | 27 +++++++++++++++++++++++++++
 include/linux/pm_opp.h |  6 ++++++
 2 files changed, 33 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 1556998425d5..61c48b2d4679 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -2119,6 +2119,33 @@ void dev_pm_opp_put_clkname(struct opp_table *opp_table)
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_put_clkname);
 
+static void devm_pm_opp_clkname_release(void *data)
+{
+	dev_pm_opp_put_clkname(data);
+}
+
+/**
+ * devm_pm_opp_set_clkname() - Set clk name for the device
+ * @dev: Device for which clk name is being set.
+ * @name: Clk name.
+ *
+ * This is a resource-managed variant of dev_pm_opp_set_clkname().
+ *
+ * Return: 0 on success and errno otherwise.
+ */
+int devm_pm_opp_set_clkname(struct device *dev, const char *name)
+{
+	struct opp_table *opp_table;
+
+	opp_table = dev_pm_opp_set_clkname(dev, name);
+	if (IS_ERR(opp_table))
+		return PTR_ERR(opp_table);
+
+	return devm_add_action_or_reset(dev, devm_pm_opp_clkname_release,
+					opp_table);
+}
+EXPORT_SYMBOL_GPL(devm_pm_opp_set_clkname);
+
 /**
  * dev_pm_opp_register_set_opp_helper() - Register custom set OPP helper
  * @dev: Device for which the helper is getting registered.
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index c0371efa4a0f..0583d775aa5a 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -150,6 +150,7 @@ struct opp_table *dev_pm_opp_set_regulators(struct device *dev, const char * con
 void dev_pm_opp_put_regulators(struct opp_table *opp_table);
 struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char *name);
 void dev_pm_opp_put_clkname(struct opp_table *opp_table);
+int devm_pm_opp_set_clkname(struct device *dev, const char *name);
 struct opp_table *dev_pm_opp_register_set_opp_helper(struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data));
 void dev_pm_opp_unregister_set_opp_helper(struct opp_table *opp_table);
 struct opp_table *devm_pm_opp_register_set_opp_helper(struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data));
@@ -355,6 +356,11 @@ static inline struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const
 
 static inline void dev_pm_opp_put_clkname(struct opp_table *opp_table) {}
 
+static inline int devm_pm_opp_set_clkname(struct device *dev, const char *name)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, const char **names, struct device ***virt_devs)
 {
 	return ERR_PTR(-EOPNOTSUPP);
-- 
cgit v1.2.3


From 32aee78bc5184c7a51a081939721e97cfad4a44e Mon Sep 17 00:00:00 2001
From: Yangtao Li <tiny.windzz@gmail.com>
Date: Sun, 14 Mar 2021 19:33:55 +0300
Subject: opp: Add devres wrapper for dev_pm_opp_set_regulators

Add devres wrapper for dev_pm_opp_set_regulators() to simplify drivers
code.

Signed-off-by: Yangtao Li <tiny.windzz@gmail.com>
Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c     | 30 ++++++++++++++++++++++++++++++
 include/linux/pm_opp.h |  8 ++++++++
 2 files changed, 38 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 61c48b2d4679..d77ec2c55783 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -2047,6 +2047,36 @@ put_opp_table:
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_put_regulators);
 
+static void devm_pm_opp_regulators_release(void *data)
+{
+	dev_pm_opp_put_regulators(data);
+}
+
+/**
+ * devm_pm_opp_set_regulators() - Set regulator names for the device
+ * @dev: Device for which regulator name is being set.
+ * @names: Array of pointers to the names of the regulator.
+ * @count: Number of regulators.
+ *
+ * This is a resource-managed variant of dev_pm_opp_set_regulators().
+ *
+ * Return: 0 on success and errno otherwise.
+ */
+int devm_pm_opp_set_regulators(struct device *dev,
+			       const char * const names[],
+			       unsigned int count)
+{
+	struct opp_table *opp_table;
+
+	opp_table = dev_pm_opp_set_regulators(dev, names, count);
+	if (IS_ERR(opp_table))
+		return PTR_ERR(opp_table);
+
+	return devm_add_action_or_reset(dev, devm_pm_opp_regulators_release,
+					opp_table);
+}
+EXPORT_SYMBOL_GPL(devm_pm_opp_set_regulators);
+
 /**
  * dev_pm_opp_set_clkname() - Set clk name for the device
  * @dev: Device for which clk name is being set.
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 0583d775aa5a..3e667af57211 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -148,6 +148,7 @@ struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, const char *name)
 void dev_pm_opp_put_prop_name(struct opp_table *opp_table);
 struct opp_table *dev_pm_opp_set_regulators(struct device *dev, const char * const names[], unsigned int count);
 void dev_pm_opp_put_regulators(struct opp_table *opp_table);
+int devm_pm_opp_set_regulators(struct device *dev, const char * const names[], unsigned int count);
 struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char *name);
 void dev_pm_opp_put_clkname(struct opp_table *opp_table);
 int devm_pm_opp_set_clkname(struct device *dev, const char *name);
@@ -349,6 +350,13 @@ static inline struct opp_table *dev_pm_opp_set_regulators(struct device *dev, co
 
 static inline void dev_pm_opp_put_regulators(struct opp_table *opp_table) {}
 
+static inline int devm_pm_opp_set_regulators(struct device *dev,
+					     const char * const names[],
+					     unsigned int count)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char *name)
 {
 	return ERR_PTR(-EOPNOTSUPP);
-- 
cgit v1.2.3


From 9c4f220f3dc260e325c92e8588ade2affcb6528c Mon Sep 17 00:00:00 2001
From: Yangtao Li <tiny.windzz@gmail.com>
Date: Sun, 14 Mar 2021 19:33:56 +0300
Subject: opp: Add devres wrapper for dev_pm_opp_set_supported_hw

Add devres wrapper for dev_pm_opp_set_supported_hw() to simplify drivers
code.

Signed-off-by: Yangtao Li <tiny.windzz@gmail.com>
Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c     | 29 +++++++++++++++++++++++++++++
 include/linux/pm_opp.h |  8 ++++++++
 2 files changed, 37 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index d77ec2c55783..956ea22a21d6 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -1857,6 +1857,35 @@ void dev_pm_opp_put_supported_hw(struct opp_table *opp_table)
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_put_supported_hw);
 
+static void devm_pm_opp_supported_hw_release(void *data)
+{
+	dev_pm_opp_put_supported_hw(data);
+}
+
+/**
+ * devm_pm_opp_set_supported_hw() - Set supported platforms
+ * @dev: Device for which supported-hw has to be set.
+ * @versions: Array of hierarchy of versions to match.
+ * @count: Number of elements in the array.
+ *
+ * This is a resource-managed variant of dev_pm_opp_set_supported_hw().
+ *
+ * Return: 0 on success and errno otherwise.
+ */
+int devm_pm_opp_set_supported_hw(struct device *dev, const u32 *versions,
+				 unsigned int count)
+{
+	struct opp_table *opp_table;
+
+	opp_table = dev_pm_opp_set_supported_hw(dev, versions, count);
+	if (IS_ERR(opp_table))
+		return PTR_ERR(opp_table);
+
+	return devm_add_action_or_reset(dev, devm_pm_opp_supported_hw_release,
+					opp_table);
+}
+EXPORT_SYMBOL_GPL(devm_pm_opp_set_supported_hw);
+
 /**
  * dev_pm_opp_set_prop_name() - Set prop-extn name
  * @dev: Device for which the prop-name has to be set.
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 3e667af57211..e455b187e405 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -144,6 +144,7 @@ int dev_pm_opp_unregister_notifier(struct device *dev, struct notifier_block *nb
 
 struct opp_table *dev_pm_opp_set_supported_hw(struct device *dev, const u32 *versions, unsigned int count);
 void dev_pm_opp_put_supported_hw(struct opp_table *opp_table);
+int devm_pm_opp_set_supported_hw(struct device *dev, const u32 *versions, unsigned int count);
 struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, const char *name);
 void dev_pm_opp_put_prop_name(struct opp_table *opp_table);
 struct opp_table *dev_pm_opp_set_regulators(struct device *dev, const char * const names[], unsigned int count);
@@ -321,6 +322,13 @@ static inline struct opp_table *dev_pm_opp_set_supported_hw(struct device *dev,
 
 static inline void dev_pm_opp_put_supported_hw(struct opp_table *opp_table) {}
 
+static inline int devm_pm_opp_set_supported_hw(struct device *dev,
+					       const u32 *versions,
+					       unsigned int count)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline struct opp_table *dev_pm_opp_register_set_opp_helper(struct device *dev,
 			int (*set_opp)(struct dev_pm_set_opp_data *data))
 {
-- 
cgit v1.2.3


From 3d5cfbb69508db7d092475be01c66edc86066717 Mon Sep 17 00:00:00 2001
From: Yangtao Li <tiny.windzz@gmail.com>
Date: Sun, 14 Mar 2021 19:33:57 +0300
Subject: opp: Add devres wrapper for dev_pm_opp_of_add_table

Add devres wrapper for dev_pm_opp_of_add_table() to simplify drivers
code.

Signed-off-by: Yangtao Li <tiny.windzz@gmail.com>
Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/of.c       | 36 ++++++++++++++++++++++++++++++++++++
 include/linux/pm_opp.h |  6 ++++++
 2 files changed, 42 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/opp/of.c b/drivers/opp/of.c
index f480c10e6314..c582a9ca397b 100644
--- a/drivers/opp/of.c
+++ b/drivers/opp/of.c
@@ -1104,6 +1104,42 @@ static int _of_add_table_indexed(struct device *dev, int index, bool getclk)
 	return ret;
 }
 
+static void devm_pm_opp_of_table_release(void *data)
+{
+	dev_pm_opp_of_remove_table(data);
+}
+
+/**
+ * devm_pm_opp_of_add_table() - Initialize opp table from device tree
+ * @dev:	device pointer used to lookup OPP table.
+ *
+ * Register the initial OPP table with the OPP library for given device.
+ *
+ * The opp_table structure will be freed after the device is destroyed.
+ *
+ * Return:
+ * 0		On success OR
+ *		Duplicate OPPs (both freq and volt are same) and opp->available
+ * -EEXIST	Freq are same and volt are different OR
+ *		Duplicate OPPs (both freq and volt are same) and !opp->available
+ * -ENOMEM	Memory allocation failure
+ * -ENODEV	when 'operating-points' property is not found or is invalid data
+ *		in device node.
+ * -ENODATA	when empty 'operating-points' property is found
+ * -EINVAL	when invalid entries are found in opp-v2 table
+ */
+int devm_pm_opp_of_add_table(struct device *dev)
+{
+	int ret;
+
+	ret = dev_pm_opp_of_add_table(dev);
+	if (ret)
+		return ret;
+
+	return devm_add_action_or_reset(dev, devm_pm_opp_of_table_release, dev);
+}
+EXPORT_SYMBOL_GPL(devm_pm_opp_of_add_table);
+
 /**
  * dev_pm_opp_of_add_table() - Initialize opp table from device tree
  * @dev:	device pointer used to lookup OPP table.
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index e455b187e405..3f8894012429 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -441,6 +441,7 @@ int dev_pm_opp_of_add_table(struct device *dev);
 int dev_pm_opp_of_add_table_indexed(struct device *dev, int index);
 int dev_pm_opp_of_add_table_noclk(struct device *dev, int index);
 void dev_pm_opp_of_remove_table(struct device *dev);
+int devm_pm_opp_of_add_table(struct device *dev);
 int dev_pm_opp_of_cpumask_add_table(const struct cpumask *cpumask);
 void dev_pm_opp_of_cpumask_remove_table(const struct cpumask *cpumask);
 int dev_pm_opp_of_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask);
@@ -473,6 +474,11 @@ static inline void dev_pm_opp_of_remove_table(struct device *dev)
 {
 }
 
+static inline int devm_pm_opp_of_add_table(struct device *dev)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline int dev_pm_opp_of_cpumask_add_table(const struct cpumask *cpumask)
 {
 	return -EOPNOTSUPP;
-- 
cgit v1.2.3


From eedb0b12d091a21909b5e84d9f3e5e649305bd12 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 28 Jan 2021 14:53:22 +0100
Subject: dma-mapping: add a dma_mmap_pages helper

Add a helper to map memory allocated using dma_alloc_pages into
a user address space, similar to the dma_mmap_attrs function for
coherent allocations.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Tomasz Figa <tfiga@chromium.org>
Tested-by: Ricardo Ribalda <ribalda@chromium.org>
---
 Documentation/core-api/dma-api.rst | 10 ++++++++++
 include/linux/dma-mapping.h        |  2 ++
 kernel/dma/mapping.c               | 13 +++++++++++++
 3 files changed, 25 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/core-api/dma-api.rst b/Documentation/core-api/dma-api.rst
index e6d23f117308..157a474ae544 100644
--- a/Documentation/core-api/dma-api.rst
+++ b/Documentation/core-api/dma-api.rst
@@ -563,6 +563,16 @@ Free a region of memory previously allocated using dma_alloc_pages().
 dev, size, dma_handle and dir must all be the same as those passed into
 dma_alloc_pages().  page must be the pointer returned by dma_alloc_pages().
 
+::
+
+	int
+	dma_mmap_pages(struct device *dev, struct vm_area_struct *vma,
+		       size_t size, struct page *page)
+
+Map an allocation returned from dma_alloc_pages() into a user address space.
+dev and size must be the same as those passed into dma_alloc_pages().
+page must be the pointer returned by dma_alloc_pages().
+
 ::
 
 	void *
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 2a984cb4d1e0..2b8dce756e1f 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -263,6 +263,8 @@ struct page *dma_alloc_pages(struct device *dev, size_t size,
 		dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp);
 void dma_free_pages(struct device *dev, size_t size, struct page *page,
 		dma_addr_t dma_handle, enum dma_data_direction dir);
+int dma_mmap_pages(struct device *dev, struct vm_area_struct *vma,
+		size_t size, struct page *page);
 
 static inline void *dma_alloc_noncoherent(struct device *dev, size_t size,
 		dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp)
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index b6a633679933..9ce86c77651c 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -517,6 +517,19 @@ void dma_free_pages(struct device *dev, size_t size, struct page *page,
 }
 EXPORT_SYMBOL_GPL(dma_free_pages);
 
+int dma_mmap_pages(struct device *dev, struct vm_area_struct *vma,
+		size_t size, struct page *page)
+{
+	unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
+	if (vma->vm_pgoff >= count || vma_pages(vma) > count - vma->vm_pgoff)
+		return -ENXIO;
+	return remap_pfn_range(vma, vma->vm_start,
+			       page_to_pfn(page) + vma->vm_pgoff,
+			       vma_pages(vma) << PAGE_SHIFT, vma->vm_page_prot);
+}
+EXPORT_SYMBOL_GPL(dma_mmap_pages);
+
 int dma_supported(struct device *dev, u64 mask)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
-- 
cgit v1.2.3


From 7d5b5738d1514e9dd8ed452660e2a4d25beb9483 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 28 Jan 2021 14:54:18 +0100
Subject: dma-mapping: add a dma_alloc_noncontiguous API

Add a new API that returns a potentially virtually non-contiguous sg_table
and a DMA address.  This API is only properly implemented for dma-iommu
and will simply return a contiguous chunk as a fallback.

The intent is that drivers can use this API if either:

 - no kernel mapping or only temporary kernel mappings are required.
   That is as a better replacement for DMA_ATTR_NO_KERNEL_MAPPING
 - a kernel mapping is required for cached and DMA mapped pages, but
   the driver also needs the pages to e.g. map them to userspace.
   In that sense it is a replacement for some aspects of the recently
   removed and never fully implemented DMA_ATTR_NON_CONSISTENT

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Tomasz Figa <tfiga@chromium.org>
Tested-by: Ricardo Ribalda <ribalda@chromium.org>
---
 Documentation/core-api/dma-api.rst |  78 +++++++++++++++++++++++++++
 include/linux/dma-map-ops.h        |  19 +++++++
 include/linux/dma-mapping.h        |  32 +++++++++++
 kernel/dma/mapping.c               | 106 +++++++++++++++++++++++++++++++++++++
 4 files changed, 235 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/core-api/dma-api.rst b/Documentation/core-api/dma-api.rst
index 157a474ae544..00a1d4fa3f9e 100644
--- a/Documentation/core-api/dma-api.rst
+++ b/Documentation/core-api/dma-api.rst
@@ -594,6 +594,84 @@ dev, size, dma_handle and dir must all be the same as those passed into
 dma_alloc_noncoherent().  cpu_addr must be the virtual address returned by
 dma_alloc_noncoherent().
 
+::
+
+	struct sg_table *
+	dma_alloc_noncontiguous(struct device *dev, size_t size,
+				enum dma_data_direction dir, gfp_t gfp,
+				unsigned long attrs);
+
+This routine allocates <size> bytes of non-coherent and possibly non-contiguous
+memory.  It returns a pointer to struct sg_table that describes the allocated
+and DMA mapped memory, or NULL if the allocation failed. The resulting memory
+can be used for anything a struct page mapped into a scatterlist is suitable for.
+
+The returned sg_table is guaranteed to have 1 single DMA mapped segment as
+indicated by sgt->nents, but it might have multiple CPU side segments as
+indicated by sgt->orig_nents.
+
+The dir parameter specifies whether data is read and/or written by the device,
+see dma_map_single() for details.
+
+The gfp parameter allows the caller to specify the ``GFP_`` flags (see
+kmalloc()) for the allocation, but rejects flags used to specify a memory
+zone such as GFP_DMA or GFP_HIGHMEM.
+
+The attrs argument must be either 0 or DMA_ATTR_ALLOC_SINGLE_PAGES.
+
+Before giving the memory to the device, dma_sync_sgtable_for_device() needs
+to be called, and before reading memory written by the device,
+dma_sync_sgtable_for_cpu(), just like for streaming DMA mappings that are
+reused.
+
+::
+
+	void
+	dma_free_noncontiguous(struct device *dev, size_t size,
+			       struct sg_table *sgt,
+			       enum dma_data_direction dir)
+
+Free memory previously allocated using dma_alloc_noncontiguous().  dev, size,
+and dir must all be the same as those passed into dma_alloc_noncontiguous().
+sgt must be the pointer returned by dma_alloc_noncontiguous().
+
+::
+
+	void *
+	dma_vmap_noncontiguous(struct device *dev, size_t size,
+		struct sg_table *sgt)
+
+Return a contiguous kernel mapping for an allocation returned from
+dma_alloc_noncontiguous().  dev and size must be the same as those passed into
+dma_alloc_noncontiguous().  sgt must be the pointer returned by
+dma_alloc_noncontiguous().
+
+Once a non-contiguous allocation is mapped using this function, the
+flush_kernel_vmap_range() and invalidate_kernel_vmap_range() APIs must be used
+to manage the coherency between the kernel mapping, the device and user space
+mappings (if any).
+
+::
+
+	void
+	dma_vunmap_noncontiguous(struct device *dev, void *vaddr)
+
+Unmap a kernel mapping returned by dma_vmap_noncontiguous().  dev must be the
+same as the one passed into dma_alloc_noncontiguous().  vaddr must be the pointer
+returned by dma_vmap_noncontiguous().
+
+
+::
+
+	int
+	dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma,
+			       size_t size, struct sg_table *sgt)
+
+Map an allocation returned from dma_alloc_noncontiguous() into a user address
+space.  dev and size must be the same as those passed into
+dma_alloc_noncontiguous().  sgt must be the pointer returned by
+dma_alloc_noncontiguous().
+
 ::
 
 	int
diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 51872e736e7b..0d53a96a3d64 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -22,6 +22,11 @@ struct dma_map_ops {
 			gfp_t gfp);
 	void (*free_pages)(struct device *dev, size_t size, struct page *vaddr,
 			dma_addr_t dma_handle, enum dma_data_direction dir);
+	struct sg_table *(*alloc_noncontiguous)(struct device *dev, size_t size,
+			enum dma_data_direction dir, gfp_t gfp,
+			unsigned long attrs);
+	void (*free_noncontiguous)(struct device *dev, size_t size,
+			struct sg_table *sgt, enum dma_data_direction dir);
 	int (*mmap)(struct device *, struct vm_area_struct *,
 			void *, dma_addr_t, size_t, unsigned long attrs);
 
@@ -198,6 +203,20 @@ static inline int dma_mmap_from_global_coherent(struct vm_area_struct *vma,
 }
 #endif /* CONFIG_DMA_DECLARE_COHERENT */
 
+/*
+ * This is the actual return value from the ->alloc_noncontiguous method.
+ * The users of the DMA API should only care about the sg_table, but to make
+ * the DMA-API internal vmapping and freeing easier we stash away the page
+ * array as well (except for the fallback case).  This can go away any time,
+ * e.g. when a vmap-variant that takes a scatterlist comes along.
+ */
+struct dma_sgt_handle {
+	struct sg_table sgt;
+	struct page **pages;
+};
+#define sgt_handle(sgt) \
+	container_of((sgt), struct dma_sgt_handle, sgt)
+
 int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt,
 		void *cpu_addr, dma_addr_t dma_addr, size_t size,
 		unsigned long attrs);
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 2b8dce756e1f..954847f9a3e0 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -144,6 +144,15 @@ u64 dma_get_required_mask(struct device *dev);
 size_t dma_max_mapping_size(struct device *dev);
 bool dma_need_sync(struct device *dev, dma_addr_t dma_addr);
 unsigned long dma_get_merge_boundary(struct device *dev);
+struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size,
+		enum dma_data_direction dir, gfp_t gfp, unsigned long attrs);
+void dma_free_noncontiguous(struct device *dev, size_t size,
+		struct sg_table *sgt, enum dma_data_direction dir);
+void *dma_vmap_noncontiguous(struct device *dev, size_t size,
+		struct sg_table *sgt);
+void dma_vunmap_noncontiguous(struct device *dev, void *vaddr);
+int dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma,
+		size_t size, struct sg_table *sgt);
 #else /* CONFIG_HAS_DMA */
 static inline dma_addr_t dma_map_page_attrs(struct device *dev,
 		struct page *page, size_t offset, size_t size,
@@ -257,6 +266,29 @@ static inline unsigned long dma_get_merge_boundary(struct device *dev)
 {
 	return 0;
 }
+static inline struct sg_table *dma_alloc_noncontiguous(struct device *dev,
+		size_t size, enum dma_data_direction dir, gfp_t gfp,
+		unsigned long attrs)
+{
+	return NULL;
+}
+static inline void dma_free_noncontiguous(struct device *dev, size_t size,
+		struct sg_table *sgt, enum dma_data_direction dir)
+{
+}
+static inline void *dma_vmap_noncontiguous(struct device *dev, size_t size,
+		struct sg_table *sgt)
+{
+	return NULL;
+}
+static inline void dma_vunmap_noncontiguous(struct device *dev, void *vaddr)
+{
+}
+static inline int dma_mmap_noncontiguous(struct device *dev,
+		struct vm_area_struct *vma, size_t size, struct sg_table *sgt)
+{
+	return -EINVAL;
+}
 #endif /* CONFIG_HAS_DMA */
 
 struct page *dma_alloc_pages(struct device *dev, size_t size,
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 07f964ebcda1..2b06a809d0b9 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -539,6 +539,112 @@ int dma_mmap_pages(struct device *dev, struct vm_area_struct *vma,
 }
 EXPORT_SYMBOL_GPL(dma_mmap_pages);
 
+static struct sg_table *alloc_single_sgt(struct device *dev, size_t size,
+		enum dma_data_direction dir, gfp_t gfp)
+{
+	struct sg_table *sgt;
+	struct page *page;
+
+	sgt = kmalloc(sizeof(*sgt), gfp);
+	if (!sgt)
+		return NULL;
+	if (sg_alloc_table(sgt, 1, gfp))
+		goto out_free_sgt;
+	page = __dma_alloc_pages(dev, size, &sgt->sgl->dma_address, dir, gfp);
+	if (!page)
+		goto out_free_table;
+	sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0);
+	sg_dma_len(sgt->sgl) = sgt->sgl->length;
+	return sgt;
+out_free_table:
+	sg_free_table(sgt);
+out_free_sgt:
+	kfree(sgt);
+	return NULL;
+}
+
+struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size,
+		enum dma_data_direction dir, gfp_t gfp, unsigned long attrs)
+{
+	const struct dma_map_ops *ops = get_dma_ops(dev);
+	struct sg_table *sgt;
+
+	if (WARN_ON_ONCE(attrs & ~DMA_ATTR_ALLOC_SINGLE_PAGES))
+		return NULL;
+
+	if (ops && ops->alloc_noncontiguous)
+		sgt = ops->alloc_noncontiguous(dev, size, dir, gfp, attrs);
+	else
+		sgt = alloc_single_sgt(dev, size, dir, gfp);
+
+	if (sgt) {
+		sgt->nents = 1;
+		debug_dma_map_sg(dev, sgt->sgl, sgt->orig_nents, 1, dir);
+	}
+	return sgt;
+}
+EXPORT_SYMBOL_GPL(dma_alloc_noncontiguous);
+
+static void free_single_sgt(struct device *dev, size_t size,
+		struct sg_table *sgt, enum dma_data_direction dir)
+{
+	__dma_free_pages(dev, size, sg_page(sgt->sgl), sgt->sgl->dma_address,
+			 dir);
+	sg_free_table(sgt);
+	kfree(sgt);
+}
+
+void dma_free_noncontiguous(struct device *dev, size_t size,
+		struct sg_table *sgt, enum dma_data_direction dir)
+{
+	const struct dma_map_ops *ops = get_dma_ops(dev);
+
+	debug_dma_unmap_sg(dev, sgt->sgl, sgt->orig_nents, dir);
+	if (ops && ops->free_noncontiguous)
+		ops->free_noncontiguous(dev, size, sgt, dir);
+	else
+		free_single_sgt(dev, size, sgt, dir);
+}
+EXPORT_SYMBOL_GPL(dma_free_noncontiguous);
+
+void *dma_vmap_noncontiguous(struct device *dev, size_t size,
+		struct sg_table *sgt)
+{
+	const struct dma_map_ops *ops = get_dma_ops(dev);
+	unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
+	if (ops && ops->alloc_noncontiguous)
+		return vmap(sgt_handle(sgt)->pages, count, VM_MAP, PAGE_KERNEL);
+	return page_address(sg_page(sgt->sgl));
+}
+EXPORT_SYMBOL_GPL(dma_vmap_noncontiguous);
+
+void dma_vunmap_noncontiguous(struct device *dev, void *vaddr)
+{
+	const struct dma_map_ops *ops = get_dma_ops(dev);
+
+	if (ops && ops->alloc_noncontiguous)
+		vunmap(vaddr);
+}
+EXPORT_SYMBOL_GPL(dma_vunmap_noncontiguous);
+
+int dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma,
+		size_t size, struct sg_table *sgt)
+{
+	const struct dma_map_ops *ops = get_dma_ops(dev);
+
+	if (ops && ops->alloc_noncontiguous) {
+		unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
+		if (vma->vm_pgoff >= count ||
+		    vma_pages(vma) > count - vma->vm_pgoff)
+			return -ENXIO;
+		return vm_map_pages(vma, sgt_handle(sgt)->pages, count);
+	}
+	return dma_mmap_pages(dev, vma, size, sg_page(sgt->sgl));
+}
+EXPORT_SYMBOL_GPL(dma_mmap_noncontiguous);
+
 int dma_supported(struct device *dev, u64 mask)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
-- 
cgit v1.2.3


From d3d40f237480abf3268956daf18cdc56edd32834 Mon Sep 17 00:00:00 2001
From: Mark Tomlinson <mark.tomlinson@alliedtelesis.co.nz>
Date: Mon, 8 Mar 2021 14:24:12 +1300
Subject: Revert "netfilter: x_tables: Switch synchronization to RCU"

This reverts commit cc00bcaa589914096edef7fb87ca5cee4a166b5c.

This (and the preceding) patch basically re-implemented the RCU
mechanisms of patch 784544739a25. That patch was replaced because of the
performance problems that it created when replacing tables. Now, we have
the same issue: the call to synchronize_rcu() makes replacing tables
slower by as much as an order of magnitude.

Prior to using RCU a script calling "iptables" approx. 200 times was
taking 1.16s. With RCU this increased to 11.59s.

Revert these patches and fix the issue in a different way.

Signed-off-by: Mark Tomlinson <mark.tomlinson@alliedtelesis.co.nz>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h |  5 +---
 net/ipv4/netfilter/arp_tables.c    | 14 +++++------
 net/ipv4/netfilter/ip_tables.c     | 14 +++++------
 net/ipv6/netfilter/ip6_tables.c    | 14 +++++------
 net/netfilter/x_tables.c           | 49 ++++++++++++++++++++++++++------------
 5 files changed, 56 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 8ebb64193757..5deb099d156d 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -227,7 +227,7 @@ struct xt_table {
 	unsigned int valid_hooks;
 
 	/* Man behind the curtain... */
-	struct xt_table_info __rcu *private;
+	struct xt_table_info *private;
 
 	/* Set this to THIS_MODULE if you are a module, otherwise NULL */
 	struct module *me;
@@ -448,9 +448,6 @@ xt_get_per_cpu_counter(struct xt_counters *cnt, unsigned int cpu)
 
 struct nf_hook_ops *xt_hook_ops_alloc(const struct xt_table *, nf_hookfn *);
 
-struct xt_table_info
-*xt_table_get_private_protected(const struct xt_table *table);
-
 #ifdef CONFIG_COMPAT
 #include <net/compat.h>
 
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 563b62b76a5f..d1e04d2b5170 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -203,7 +203,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 
 	local_bh_disable();
 	addend = xt_write_recseq_begin();
-	private = rcu_access_pointer(table->private);
+	private = READ_ONCE(table->private); /* Address dependency. */
 	cpu     = smp_processor_id();
 	table_base = private->entries;
 	jumpstack  = (struct arpt_entry **)private->jumpstack[cpu];
@@ -649,7 +649,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
-	const struct xt_table_info *private = xt_table_get_private_protected(table);
+	const struct xt_table_info *private = table->private;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	 * (other than comefrom, which userspace doesn't care
@@ -673,7 +673,7 @@ static int copy_entries_to_user(unsigned int total_size,
 	unsigned int off, num;
 	const struct arpt_entry *e;
 	struct xt_counters *counters;
-	struct xt_table_info *private = xt_table_get_private_protected(table);
+	struct xt_table_info *private = table->private;
 	int ret = 0;
 	void *loc_cpu_entry;
 
@@ -807,7 +807,7 @@ static int get_info(struct net *net, void __user *user, const int *len)
 	t = xt_request_find_table_lock(net, NFPROTO_ARP, name);
 	if (!IS_ERR(t)) {
 		struct arpt_getinfo info;
-		const struct xt_table_info *private = xt_table_get_private_protected(t);
+		const struct xt_table_info *private = t->private;
 #ifdef CONFIG_COMPAT
 		struct xt_table_info tmp;
 
@@ -860,7 +860,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr,
 
 	t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
 	if (!IS_ERR(t)) {
-		const struct xt_table_info *private = xt_table_get_private_protected(t);
+		const struct xt_table_info *private = t->private;
 
 		if (get.size == private->size)
 			ret = copy_entries_to_user(private->size,
@@ -1017,7 +1017,7 @@ static int do_add_counters(struct net *net, sockptr_t arg, unsigned int len)
 	}
 
 	local_bh_disable();
-	private = xt_table_get_private_protected(t);
+	private = t->private;
 	if (private->number != tmp.num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
@@ -1330,7 +1330,7 @@ static int compat_copy_entries_to_user(unsigned int total_size,
 				       void __user *userptr)
 {
 	struct xt_counters *counters;
-	const struct xt_table_info *private = xt_table_get_private_protected(table);
+	const struct xt_table_info *private = table->private;
 	void __user *pos;
 	unsigned int size;
 	int ret = 0;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 6e2851f8d3a3..f15bc21d7301 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -258,7 +258,7 @@ ipt_do_table(struct sk_buff *skb,
 	WARN_ON(!(table->valid_hooks & (1 << hook)));
 	local_bh_disable();
 	addend = xt_write_recseq_begin();
-	private = rcu_access_pointer(table->private);
+	private = READ_ONCE(table->private); /* Address dependency. */
 	cpu        = smp_processor_id();
 	table_base = private->entries;
 	jumpstack  = (struct ipt_entry **)private->jumpstack[cpu];
@@ -791,7 +791,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
-	const struct xt_table_info *private = xt_table_get_private_protected(table);
+	const struct xt_table_info *private = table->private;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -815,7 +815,7 @@ copy_entries_to_user(unsigned int total_size,
 	unsigned int off, num;
 	const struct ipt_entry *e;
 	struct xt_counters *counters;
-	const struct xt_table_info *private = xt_table_get_private_protected(table);
+	const struct xt_table_info *private = table->private;
 	int ret = 0;
 	const void *loc_cpu_entry;
 
@@ -964,7 +964,7 @@ static int get_info(struct net *net, void __user *user, const int *len)
 	t = xt_request_find_table_lock(net, AF_INET, name);
 	if (!IS_ERR(t)) {
 		struct ipt_getinfo info;
-		const struct xt_table_info *private = xt_table_get_private_protected(t);
+		const struct xt_table_info *private = t->private;
 #ifdef CONFIG_COMPAT
 		struct xt_table_info tmp;
 
@@ -1018,7 +1018,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr,
 
 	t = xt_find_table_lock(net, AF_INET, get.name);
 	if (!IS_ERR(t)) {
-		const struct xt_table_info *private = xt_table_get_private_protected(t);
+		const struct xt_table_info *private = t->private;
 		if (get.size == private->size)
 			ret = copy_entries_to_user(private->size,
 						   t, uptr->entrytable);
@@ -1173,7 +1173,7 @@ do_add_counters(struct net *net, sockptr_t arg, unsigned int len)
 	}
 
 	local_bh_disable();
-	private = xt_table_get_private_protected(t);
+	private = t->private;
 	if (private->number != tmp.num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
@@ -1543,7 +1543,7 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
 			    void __user *userptr)
 {
 	struct xt_counters *counters;
-	const struct xt_table_info *private = xt_table_get_private_protected(table);
+	const struct xt_table_info *private = table->private;
 	void __user *pos;
 	unsigned int size;
 	int ret = 0;
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index c4f532f4d311..2e2119bfcf13 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -280,7 +280,7 @@ ip6t_do_table(struct sk_buff *skb,
 
 	local_bh_disable();
 	addend = xt_write_recseq_begin();
-	private = rcu_access_pointer(table->private);
+	private = READ_ONCE(table->private); /* Address dependency. */
 	cpu        = smp_processor_id();
 	table_base = private->entries;
 	jumpstack  = (struct ip6t_entry **)private->jumpstack[cpu];
@@ -807,7 +807,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
-	const struct xt_table_info *private = xt_table_get_private_protected(table);
+	const struct xt_table_info *private = table->private;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -831,7 +831,7 @@ copy_entries_to_user(unsigned int total_size,
 	unsigned int off, num;
 	const struct ip6t_entry *e;
 	struct xt_counters *counters;
-	const struct xt_table_info *private = xt_table_get_private_protected(table);
+	const struct xt_table_info *private = table->private;
 	int ret = 0;
 	const void *loc_cpu_entry;
 
@@ -980,7 +980,7 @@ static int get_info(struct net *net, void __user *user, const int *len)
 	t = xt_request_find_table_lock(net, AF_INET6, name);
 	if (!IS_ERR(t)) {
 		struct ip6t_getinfo info;
-		const struct xt_table_info *private = xt_table_get_private_protected(t);
+		const struct xt_table_info *private = t->private;
 #ifdef CONFIG_COMPAT
 		struct xt_table_info tmp;
 
@@ -1035,7 +1035,7 @@ get_entries(struct net *net, struct ip6t_get_entries __user *uptr,
 
 	t = xt_find_table_lock(net, AF_INET6, get.name);
 	if (!IS_ERR(t)) {
-		struct xt_table_info *private = xt_table_get_private_protected(t);
+		struct xt_table_info *private = t->private;
 		if (get.size == private->size)
 			ret = copy_entries_to_user(private->size,
 						   t, uptr->entrytable);
@@ -1189,7 +1189,7 @@ do_add_counters(struct net *net, sockptr_t arg, unsigned int len)
 	}
 
 	local_bh_disable();
-	private = xt_table_get_private_protected(t);
+	private = t->private;
 	if (private->number != tmp.num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
@@ -1552,7 +1552,7 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
 			    void __user *userptr)
 {
 	struct xt_counters *counters;
-	const struct xt_table_info *private = xt_table_get_private_protected(table);
+	const struct xt_table_info *private = table->private;
 	void __user *pos;
 	unsigned int size;
 	int ret = 0;
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index bce6ca203d46..7df3aef39c5c 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -1351,14 +1351,6 @@ struct xt_counters *xt_counters_alloc(unsigned int counters)
 }
 EXPORT_SYMBOL(xt_counters_alloc);
 
-struct xt_table_info
-*xt_table_get_private_protected(const struct xt_table *table)
-{
-	return rcu_dereference_protected(table->private,
-					 mutex_is_locked(&xt[table->af].mutex));
-}
-EXPORT_SYMBOL(xt_table_get_private_protected);
-
 struct xt_table_info *
 xt_replace_table(struct xt_table *table,
 	      unsigned int num_counters,
@@ -1366,6 +1358,7 @@ xt_replace_table(struct xt_table *table,
 	      int *error)
 {
 	struct xt_table_info *private;
+	unsigned int cpu;
 	int ret;
 
 	ret = xt_jumpstack_alloc(newinfo);
@@ -1375,20 +1368,47 @@ xt_replace_table(struct xt_table *table,
 	}
 
 	/* Do the substitution. */
-	private = xt_table_get_private_protected(table);
+	local_bh_disable();
+	private = table->private;
 
 	/* Check inside lock: is the old number correct? */
 	if (num_counters != private->number) {
 		pr_debug("num_counters != table->private->number (%u/%u)\n",
 			 num_counters, private->number);
+		local_bh_enable();
 		*error = -EAGAIN;
 		return NULL;
 	}
 
 	newinfo->initial_entries = private->initial_entries;
+	/*
+	 * Ensure contents of newinfo are visible before assigning to
+	 * private.
+	 */
+	smp_wmb();
+	table->private = newinfo;
+
+	/* make sure all cpus see new ->private value */
+	smp_wmb();
 
-	rcu_assign_pointer(table->private, newinfo);
-	synchronize_rcu();
+	/*
+	 * Even though table entries have now been swapped, other CPU's
+	 * may still be using the old entries...
+	 */
+	local_bh_enable();
+
+	/* ... so wait for even xt_recseq on all cpus */
+	for_each_possible_cpu(cpu) {
+		seqcount_t *s = &per_cpu(xt_recseq, cpu);
+		u32 seq = raw_read_seqcount(s);
+
+		if (seq & 1) {
+			do {
+				cond_resched();
+				cpu_relax();
+			} while (seq == raw_read_seqcount(s));
+		}
+	}
 
 	audit_log_nfcfg(table->name, table->af, private->number,
 			!private->number ? AUDIT_XT_OP_REGISTER :
@@ -1424,12 +1444,12 @@ struct xt_table *xt_register_table(struct net *net,
 	}
 
 	/* Simplifies replace_table code. */
-	rcu_assign_pointer(table->private, bootstrap);
+	table->private = bootstrap;
 
 	if (!xt_replace_table(table, 0, newinfo, &ret))
 		goto unlock;
 
-	private = xt_table_get_private_protected(table);
+	private = table->private;
 	pr_debug("table->private->number = %u\n", private->number);
 
 	/* save number of initial entries */
@@ -1452,8 +1472,7 @@ void *xt_unregister_table(struct xt_table *table)
 	struct xt_table_info *private;
 
 	mutex_lock(&xt[table->af].mutex);
-	private = xt_table_get_private_protected(table);
-	RCU_INIT_POINTER(table->private, NULL);
+	private = table->private;
 	list_del(&table->list);
 	mutex_unlock(&xt[table->af].mutex);
 	audit_log_nfcfg(table->name, table->af, private->number,
-- 
cgit v1.2.3


From 175e476b8cdf2a4de7432583b49c871345e4f8a1 Mon Sep 17 00:00:00 2001
From: Mark Tomlinson <mark.tomlinson@alliedtelesis.co.nz>
Date: Mon, 8 Mar 2021 14:24:13 +1300
Subject: netfilter: x_tables: Use correct memory barriers.

When a new table value was assigned, it was followed by a write memory
barrier. This ensured that all writes before this point would complete
before any writes after this point. However, to determine whether the
rules are unused, the sequence counter is read. To ensure that all
writes have been done before these reads, a full memory barrier is
needed, not just a write memory barrier. The same argument applies when
incrementing the counter, before the rules are read.

Changing to using smp_mb() instead of smp_wmb() fixes the kernel panic
reported in cc00bcaa5899 (which is still present), while still
maintaining the same speed of replacing tables.

The smp_mb() barriers potentially slow the packet path, however testing
has shown no measurable change in performance on a 4-core MIPS64
platform.

Fixes: 7f5c6d4f665b ("netfilter: get rid of atomic ops in fast path")
Signed-off-by: Mark Tomlinson <mark.tomlinson@alliedtelesis.co.nz>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h | 2 +-
 net/netfilter/x_tables.c           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 5deb099d156d..8ec48466410a 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -376,7 +376,7 @@ static inline unsigned int xt_write_recseq_begin(void)
 	 * since addend is most likely 1
 	 */
 	__this_cpu_add(xt_recseq.sequence, addend);
-	smp_wmb();
+	smp_mb();
 
 	return addend;
 }
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 7df3aef39c5c..6bd31a7a27fc 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -1389,7 +1389,7 @@ xt_replace_table(struct xt_table *table,
 	table->private = newinfo;
 
 	/* make sure all cpus see new ->private value */
-	smp_wmb();
+	smp_mb();
 
 	/*
 	 * Even though table entries have now been swapped, other CPU's
-- 
cgit v1.2.3


From 9cb24ea051857f2a7ab85c42842c5baa40497e53 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sun, 14 Mar 2021 18:24:02 +0300
Subject: atm: delete include/linux/atm_suni.h

This file has been effectively empty since 2.3.99-pre3 !

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/atm/fore200e.c   |  1 -
 drivers/atm/suni.c       |  1 -
 include/linux/atm_suni.h | 12 ------------
 3 files changed, 14 deletions(-)
 delete mode 100644 include/linux/atm_suni.h

(limited to 'include/linux')

diff --git a/drivers/atm/fore200e.c b/drivers/atm/fore200e.c
index 9a70bee84125..0b9c99c3d218 100644
--- a/drivers/atm/fore200e.c
+++ b/drivers/atm/fore200e.c
@@ -21,7 +21,6 @@
 #include <linux/module.h>
 #include <linux/atmdev.h>
 #include <linux/sonet.h>
-#include <linux/atm_suni.h>
 #include <linux/dma-mapping.h>
 #include <linux/delay.h>
 #include <linux/firmware.h>
diff --git a/drivers/atm/suni.c b/drivers/atm/suni.c
index c920a8c52925..21e5acc766b8 100644
--- a/drivers/atm/suni.c
+++ b/drivers/atm/suni.c
@@ -21,7 +21,6 @@
 #include <linux/timer.h>
 #include <linux/init.h>
 #include <linux/capability.h>
-#include <linux/atm_suni.h>
 #include <linux/slab.h>
 #include <asm/param.h>
 #include <linux/uaccess.h>
diff --git a/include/linux/atm_suni.h b/include/linux/atm_suni.h
deleted file mode 100644
index 84f3aab54468..000000000000
--- a/include/linux/atm_suni.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* atm_suni.h - Driver-specific declarations of the SUNI driver (for use by
-		driver-specific utilities) */
-
-/* Written 1998,2000 by Werner Almesberger, EPFL ICA */
-
-
-#ifndef LINUX_ATM_SUNI_H
-#define LINUX_ATM_SUNI_H
-
-/* everything obsoleted */
-
-#endif
-- 
cgit v1.2.3


From 07a4bc51fc732b3618fd46dc51609948933064a4 Mon Sep 17 00:00:00 2001
From: Ong Boon Leong <boon.leong.ong@intel.com>
Date: Mon, 15 Mar 2021 13:27:06 +0800
Subject: net: pcs: rearrange C73 functions to prepare for C37 support later

The current implementation for XPCS is validated for C73, so we rename its
functions to have a _c73 suffix and introduce a set of functions that use the
an_mode flag to switch between C73 and C37 AN later.

Signed-off-by: Ong Boon Leong <boon.leong.ong@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/pcs/pcs-xpcs.c   | 94 +++++++++++++++++++++++++++++++-------------
 include/linux/pcs/pcs-xpcs.h |  4 ++
 2 files changed, 70 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/pcs/pcs-xpcs.c b/drivers/net/pcs/pcs-xpcs.c
index 1aa9903d602e..10def2d98696 100644
--- a/drivers/net/pcs/pcs-xpcs.c
+++ b/drivers/net/pcs/pcs-xpcs.c
@@ -125,22 +125,26 @@ static struct xpcs_id {
 	u32 mask;
 	const int *supported;
 	const phy_interface_t *interface;
+	int an_mode;
 } xpcs_id_list[] = {
 	{
 		.id = SYNOPSYS_XPCS_USXGMII_ID,
 		.mask = SYNOPSYS_XPCS_MASK,
 		.supported = xpcs_usxgmii_features,
 		.interface = xpcs_usxgmii_interfaces,
+		.an_mode = DW_AN_C73,
 	}, {
 		.id = SYNOPSYS_XPCS_10GKR_ID,
 		.mask = SYNOPSYS_XPCS_MASK,
 		.supported = xpcs_10gkr_features,
 		.interface = xpcs_10gkr_interfaces,
+		.an_mode = DW_AN_C73,
 	}, {
 		.id = SYNOPSYS_XPCS_XLGMII_ID,
 		.mask = SYNOPSYS_XPCS_MASK,
 		.supported = xpcs_xlgmii_features,
 		.interface = xpcs_xlgmii_interfaces,
+		.an_mode = DW_AN_C73,
 	},
 };
 
@@ -195,9 +199,17 @@ static int xpcs_poll_reset(struct mdio_xpcs_args *xpcs, int dev)
 	return (ret & MDIO_CTRL1_RESET) ? -ETIMEDOUT : 0;
 }
 
-static int xpcs_soft_reset(struct mdio_xpcs_args *xpcs, int dev)
+static int xpcs_soft_reset(struct mdio_xpcs_args *xpcs)
 {
-	int ret;
+	int ret, dev;
+
+	switch (xpcs->an_mode) {
+	case DW_AN_C73:
+		dev = MDIO_MMD_PCS;
+		break;
+	default:
+		return -1;
+	}
 
 	ret = xpcs_write(xpcs, dev, MDIO_CTRL1, MDIO_CTRL1_RESET);
 	if (ret < 0)
@@ -212,8 +224,8 @@ static int xpcs_soft_reset(struct mdio_xpcs_args *xpcs, int dev)
 		dev_warn(&(__xpcs)->bus->dev, ##__args); \
 })
 
-static int xpcs_read_fault(struct mdio_xpcs_args *xpcs,
-			   struct phylink_link_state *state)
+static int xpcs_read_fault_c73(struct mdio_xpcs_args *xpcs,
+			       struct phylink_link_state *state)
 {
 	int ret;
 
@@ -263,7 +275,7 @@ static int xpcs_read_fault(struct mdio_xpcs_args *xpcs,
 	return 0;
 }
 
-static int xpcs_read_link(struct mdio_xpcs_args *xpcs, bool an)
+static int xpcs_read_link_c73(struct mdio_xpcs_args *xpcs, bool an)
 {
 	bool link = true;
 	int ret;
@@ -357,7 +369,7 @@ static int xpcs_config_usxgmii(struct mdio_xpcs_args *xpcs, int speed)
 	return xpcs_write_vpcs(xpcs, MDIO_CTRL1, ret | DW_USXGMII_RST);
 }
 
-static int xpcs_config_aneg_c73(struct mdio_xpcs_args *xpcs)
+static int _xpcs_config_aneg_c73(struct mdio_xpcs_args *xpcs)
 {
 	int ret, adv;
 
@@ -401,11 +413,11 @@ static int xpcs_config_aneg_c73(struct mdio_xpcs_args *xpcs)
 	return xpcs_write(xpcs, MDIO_MMD_AN, DW_SR_AN_ADV1, adv);
 }
 
-static int xpcs_config_aneg(struct mdio_xpcs_args *xpcs)
+static int xpcs_config_aneg_c73(struct mdio_xpcs_args *xpcs)
 {
 	int ret;
 
-	ret = xpcs_config_aneg_c73(xpcs);
+	ret = _xpcs_config_aneg_c73(xpcs);
 	if (ret < 0)
 		return ret;
 
@@ -418,8 +430,8 @@ static int xpcs_config_aneg(struct mdio_xpcs_args *xpcs)
 	return xpcs_write(xpcs, MDIO_MMD_AN, MDIO_CTRL1, ret);
 }
 
-static int xpcs_aneg_done(struct mdio_xpcs_args *xpcs,
-			  struct phylink_link_state *state)
+static int xpcs_aneg_done_c73(struct mdio_xpcs_args *xpcs,
+			      struct phylink_link_state *state)
 {
 	int ret;
 
@@ -434,7 +446,7 @@ static int xpcs_aneg_done(struct mdio_xpcs_args *xpcs,
 
 		/* Check if Aneg outcome is valid */
 		if (!(ret & DW_C73_AN_ADV_SF)) {
-			xpcs_config_aneg(xpcs);
+			xpcs_config_aneg_c73(xpcs);
 			return 0;
 		}
 
@@ -444,8 +456,8 @@ static int xpcs_aneg_done(struct mdio_xpcs_args *xpcs,
 	return 0;
 }
 
-static int xpcs_read_lpa(struct mdio_xpcs_args *xpcs,
-			 struct phylink_link_state *state)
+static int xpcs_read_lpa_c73(struct mdio_xpcs_args *xpcs,
+			     struct phylink_link_state *state)
 {
 	int ret;
 
@@ -493,8 +505,8 @@ static int xpcs_read_lpa(struct mdio_xpcs_args *xpcs,
 	return 0;
 }
 
-static void xpcs_resolve_lpa(struct mdio_xpcs_args *xpcs,
-			     struct phylink_link_state *state)
+static void xpcs_resolve_lpa_c73(struct mdio_xpcs_args *xpcs,
+				 struct phylink_link_state *state)
 {
 	int max_speed = xpcs_get_max_usxgmii_speed(state->lp_advertising);
 
@@ -590,27 +602,33 @@ static int xpcs_config(struct mdio_xpcs_args *xpcs,
 {
 	int ret;
 
-	if (state->an_enabled) {
-		ret = xpcs_config_aneg(xpcs);
-		if (ret)
-			return ret;
+	switch (xpcs->an_mode) {
+	case DW_AN_C73:
+		if (state->an_enabled) {
+			ret = xpcs_config_aneg_c73(xpcs);
+			if (ret)
+				return ret;
+		}
+		break;
+	default:
+		return -1;
 	}
 
 	return 0;
 }
 
-static int xpcs_get_state(struct mdio_xpcs_args *xpcs,
-			  struct phylink_link_state *state)
+static int xpcs_get_state_c73(struct mdio_xpcs_args *xpcs,
+			      struct phylink_link_state *state)
 {
 	int ret;
 
 	/* Link needs to be read first ... */
-	state->link = xpcs_read_link(xpcs, state->an_enabled) > 0 ? 1 : 0;
+	state->link = xpcs_read_link_c73(xpcs, state->an_enabled) > 0 ? 1 : 0;
 
 	/* ... and then we check the faults. */
-	ret = xpcs_read_fault(xpcs, state);
+	ret = xpcs_read_fault_c73(xpcs, state);
 	if (ret) {
-		ret = xpcs_soft_reset(xpcs, MDIO_MMD_PCS);
+		ret = xpcs_soft_reset(xpcs);
 		if (ret)
 			return ret;
 
@@ -619,10 +637,10 @@ static int xpcs_get_state(struct mdio_xpcs_args *xpcs,
 		return xpcs_config(xpcs, state);
 	}
 
-	if (state->an_enabled && xpcs_aneg_done(xpcs, state)) {
+	if (state->an_enabled && xpcs_aneg_done_c73(xpcs, state)) {
 		state->an_complete = true;
-		xpcs_read_lpa(xpcs, state);
-		xpcs_resolve_lpa(xpcs, state);
+		xpcs_read_lpa_c73(xpcs, state);
+		xpcs_resolve_lpa_c73(xpcs, state);
 	} else if (state->an_enabled) {
 		state->link = 0;
 	} else if (state->link) {
@@ -632,6 +650,24 @@ static int xpcs_get_state(struct mdio_xpcs_args *xpcs,
 	return 0;
 }
 
+static int xpcs_get_state(struct mdio_xpcs_args *xpcs,
+			  struct phylink_link_state *state)
+{
+	int ret;
+
+	switch (xpcs->an_mode) {
+	case DW_AN_C73:
+		ret = xpcs_get_state_c73(xpcs, state);
+		if (ret)
+			return ret;
+		break;
+	default:
+		return -1;
+	}
+
+	return 0;
+}
+
 static int xpcs_link_up(struct mdio_xpcs_args *xpcs, int speed,
 			phy_interface_t interface)
 {
@@ -676,6 +712,8 @@ static bool xpcs_check_features(struct mdio_xpcs_args *xpcs,
 	for (i = 0; match->supported[i] != __ETHTOOL_LINK_MODE_MASK_NBITS; i++)
 		set_bit(match->supported[i], xpcs->supported);
 
+	xpcs->an_mode = match->an_mode;
+
 	return true;
 }
 
@@ -692,7 +730,7 @@ static int xpcs_probe(struct mdio_xpcs_args *xpcs, phy_interface_t interface)
 			match = entry;
 
 			if (xpcs_check_features(xpcs, match, interface))
-				return xpcs_soft_reset(xpcs, MDIO_MMD_PCS);
+				return xpcs_soft_reset(xpcs);
 		}
 	}
 
diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h
index 351c1c9aedc5..a04e57c25fea 100644
--- a/include/linux/pcs/pcs-xpcs.h
+++ b/include/linux/pcs/pcs-xpcs.h
@@ -10,10 +10,14 @@
 #include <linux/phy.h>
 #include <linux/phylink.h>
 
+/* AN mode */
+#define DW_AN_C73			1
+
 struct mdio_xpcs_args {
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(supported);
 	struct mii_bus *bus;
 	int addr;
+	int an_mode;
 };
 
 struct mdio_xpcs_ops {
-- 
cgit v1.2.3


From b97b5331b8ab7f60fb880e0c31c9b09b73d2fa4e Mon Sep 17 00:00:00 2001
From: Ong Boon Leong <boon.leong.ong@intel.com>
Date: Mon, 15 Mar 2021 13:27:07 +0800
Subject: net: pcs: add C37 SGMII AN support for intel mGbE controller

XPCS IP supports C37 SGMII AN process and it is used in intel multi-GbE
controller as MAC-side SGMII.

Signed-off-by: Ong Boon Leong <boon.leong.ong@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/pcs/pcs-xpcs.c   | 167 ++++++++++++++++++++++++++++++++++++++++++-
 include/linux/pcs/pcs-xpcs.h |   1 +
 2 files changed, 167 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/pcs/pcs-xpcs.c b/drivers/net/pcs/pcs-xpcs.c
index 10def2d98696..944ba105cac1 100644
--- a/drivers/net/pcs/pcs-xpcs.c
+++ b/drivers/net/pcs/pcs-xpcs.c
@@ -15,6 +15,7 @@
 #define SYNOPSYS_XPCS_USXGMII_ID	0x7996ced0
 #define SYNOPSYS_XPCS_10GKR_ID		0x7996ced0
 #define SYNOPSYS_XPCS_XLGMII_ID		0x7996ced0
+#define SYNOPSYS_XPCS_SGMII_ID		0x7996ced0
 #define SYNOPSYS_XPCS_MASK		0xffffffff
 
 /* Vendor regs access */
@@ -57,6 +58,34 @@
 #define DW_C73_2500KX			BIT(0)
 #define DW_C73_5000KR			BIT(1)
 
+/* Clause 37 Defines */
+/* VR MII MMD registers offsets */
+#define DW_VR_MII_DIG_CTRL1		0x8000
+#define DW_VR_MII_AN_CTRL		0x8001
+#define DW_VR_MII_AN_INTR_STS		0x8002
+
+/* VR_MII_DIG_CTRL1 */
+#define DW_VR_MII_DIG_CTRL1_MAC_AUTO_SW		BIT(9)
+
+/* VR_MII_AN_CTRL */
+#define DW_VR_MII_AN_CTRL_TX_CONFIG_SHIFT	3
+#define DW_VR_MII_TX_CONFIG_MASK		BIT(3)
+#define DW_VR_MII_TX_CONFIG_PHY_SIDE_SGMII	0x1
+#define DW_VR_MII_TX_CONFIG_MAC_SIDE_SGMII	0x0
+#define DW_VR_MII_AN_CTRL_PCS_MODE_SHIFT	1
+#define DW_VR_MII_PCS_MODE_MASK			GENMASK(2, 1)
+#define DW_VR_MII_PCS_MODE_C37_1000BASEX	0x0
+#define DW_VR_MII_PCS_MODE_C37_SGMII		0x2
+
+/* VR_MII_AN_INTR_STS */
+#define DW_VR_MII_AN_STS_C37_ANSGM_FD		BIT(1)
+#define DW_VR_MII_AN_STS_C37_ANSGM_SP_SHIFT	2
+#define DW_VR_MII_AN_STS_C37_ANSGM_SP		GENMASK(3, 2)
+#define DW_VR_MII_C37_ANSGM_SP_10		0x0
+#define DW_VR_MII_C37_ANSGM_SP_100		0x1
+#define DW_VR_MII_C37_ANSGM_SP_1000		0x2
+#define DW_VR_MII_C37_ANSGM_SP_LNKSTS		BIT(4)
+
 static const int xpcs_usxgmii_features[] = {
 	ETHTOOL_LINK_MODE_Pause_BIT,
 	ETHTOOL_LINK_MODE_Asym_Pause_BIT,
@@ -105,6 +134,16 @@ static const int xpcs_xlgmii_features[] = {
 	__ETHTOOL_LINK_MODE_MASK_NBITS,
 };
 
+static const int xpcs_sgmii_features[] = {
+	ETHTOOL_LINK_MODE_10baseT_Half_BIT,
+	ETHTOOL_LINK_MODE_10baseT_Full_BIT,
+	ETHTOOL_LINK_MODE_100baseT_Half_BIT,
+	ETHTOOL_LINK_MODE_100baseT_Full_BIT,
+	ETHTOOL_LINK_MODE_1000baseT_Half_BIT,
+	ETHTOOL_LINK_MODE_1000baseT_Full_BIT,
+	__ETHTOOL_LINK_MODE_MASK_NBITS,
+};
+
 static const phy_interface_t xpcs_usxgmii_interfaces[] = {
 	PHY_INTERFACE_MODE_USXGMII,
 	PHY_INTERFACE_MODE_MAX,
@@ -120,6 +159,11 @@ static const phy_interface_t xpcs_xlgmii_interfaces[] = {
 	PHY_INTERFACE_MODE_MAX,
 };
 
+static const phy_interface_t xpcs_sgmii_interfaces[] = {
+	PHY_INTERFACE_MODE_SGMII,
+	PHY_INTERFACE_MODE_MAX,
+};
+
 static struct xpcs_id {
 	u32 id;
 	u32 mask;
@@ -145,6 +189,12 @@ static struct xpcs_id {
 		.supported = xpcs_xlgmii_features,
 		.interface = xpcs_xlgmii_interfaces,
 		.an_mode = DW_AN_C73,
+	}, {
+		.id = SYNOPSYS_XPCS_SGMII_ID,
+		.mask = SYNOPSYS_XPCS_MASK,
+		.supported = xpcs_sgmii_features,
+		.interface = xpcs_sgmii_interfaces,
+		.an_mode = DW_AN_C37_SGMII,
 	},
 };
 
@@ -207,6 +257,9 @@ static int xpcs_soft_reset(struct mdio_xpcs_args *xpcs)
 	case DW_AN_C73:
 		dev = MDIO_MMD_PCS;
 		break;
+	case DW_AN_C37_SGMII:
+		dev = MDIO_MMD_VEND2;
+		break;
 	default:
 		return -1;
 	}
@@ -597,6 +650,47 @@ static int xpcs_validate(struct mdio_xpcs_args *xpcs,
 	return 0;
 }
 
+static int xpcs_config_aneg_c37_sgmii(struct mdio_xpcs_args *xpcs)
+{
+	int ret;
+
+	/* For AN for C37 SGMII mode, the settings are :-
+	 * 1) VR_MII_AN_CTRL Bit(2:1)[PCS_MODE] = 10b (SGMII AN)
+	 * 2) VR_MII_AN_CTRL Bit(3) [TX_CONFIG] = 0b (MAC side SGMII)
+	 *    DW xPCS used with DW EQoS MAC is always MAC side SGMII.
+	 * 3) VR_MII_DIG_CTRL1 Bit(9) [MAC_AUTO_SW] = 1b (Automatic
+	 *    speed/duplex mode change by HW after SGMII AN complete)
+	 *
+	 * Note: Since it is MAC side SGMII, there is no need to set
+	 *	 SR_MII_AN_ADV. MAC side SGMII receives AN Tx Config from
+	 *	 PHY about the link state change after C28 AN is completed
+	 *	 between PHY and Link Partner. There is also no need to
+	 *	 trigger AN restart for MAC-side SGMII.
+	 */
+	ret = xpcs_read(xpcs, MDIO_MMD_VEND2, DW_VR_MII_AN_CTRL);
+	if (ret < 0)
+		return ret;
+
+	ret &= ~(DW_VR_MII_PCS_MODE_MASK | DW_VR_MII_TX_CONFIG_MASK);
+	ret |= (DW_VR_MII_PCS_MODE_C37_SGMII <<
+		DW_VR_MII_AN_CTRL_PCS_MODE_SHIFT &
+		DW_VR_MII_PCS_MODE_MASK);
+	ret |= (DW_VR_MII_TX_CONFIG_MAC_SIDE_SGMII <<
+		DW_VR_MII_AN_CTRL_TX_CONFIG_SHIFT &
+		DW_VR_MII_TX_CONFIG_MASK);
+	ret = xpcs_write(xpcs, MDIO_MMD_VEND2, DW_VR_MII_AN_CTRL, ret);
+	if (ret < 0)
+		return ret;
+
+	ret = xpcs_read(xpcs, MDIO_MMD_VEND2, DW_VR_MII_DIG_CTRL1);
+	if (ret < 0)
+		return ret;
+
+	ret |= DW_VR_MII_DIG_CTRL1_MAC_AUTO_SW;
+
+	return xpcs_write(xpcs, MDIO_MMD_VEND2, DW_VR_MII_DIG_CTRL1, ret);
+}
+
 static int xpcs_config(struct mdio_xpcs_args *xpcs,
 		       const struct phylink_link_state *state)
 {
@@ -610,6 +704,11 @@ static int xpcs_config(struct mdio_xpcs_args *xpcs,
 				return ret;
 		}
 		break;
+	case DW_AN_C37_SGMII:
+		ret = xpcs_config_aneg_c37_sgmii(xpcs);
+		if (ret)
+			return ret;
+		break;
 	default:
 		return -1;
 	}
@@ -650,6 +749,47 @@ static int xpcs_get_state_c73(struct mdio_xpcs_args *xpcs,
 	return 0;
 }
 
+static int xpcs_get_state_c37_sgmii(struct mdio_xpcs_args *xpcs,
+				    struct phylink_link_state *state)
+{
+	int ret;
+
+	/* Reset link_state */
+	state->link = false;
+	state->speed = SPEED_UNKNOWN;
+	state->duplex = DUPLEX_UNKNOWN;
+	state->pause = 0;
+
+	/* For C37 SGMII mode, we check DW_VR_MII_AN_INTR_STS for link
+	 * status, speed and duplex.
+	 */
+	ret = xpcs_read(xpcs, MDIO_MMD_VEND2, DW_VR_MII_AN_INTR_STS);
+	if (ret < 0)
+		return false;
+
+	if (ret & DW_VR_MII_C37_ANSGM_SP_LNKSTS) {
+		int speed_value;
+
+		state->link = true;
+
+		speed_value = (ret & DW_VR_MII_AN_STS_C37_ANSGM_SP) >>
+			      DW_VR_MII_AN_STS_C37_ANSGM_SP_SHIFT;
+		if (speed_value == DW_VR_MII_C37_ANSGM_SP_1000)
+			state->speed = SPEED_1000;
+		else if (speed_value == DW_VR_MII_C37_ANSGM_SP_100)
+			state->speed = SPEED_100;
+		else
+			state->speed = SPEED_10;
+
+		if (ret & DW_VR_MII_AN_STS_C37_ANSGM_FD)
+			state->duplex = DUPLEX_FULL;
+		else
+			state->duplex = DUPLEX_HALF;
+	}
+
+	return 0;
+}
+
 static int xpcs_get_state(struct mdio_xpcs_args *xpcs,
 			  struct phylink_link_state *state)
 {
@@ -661,6 +801,11 @@ static int xpcs_get_state(struct mdio_xpcs_args *xpcs,
 		if (ret)
 			return ret;
 		break;
+	case DW_AN_C37_SGMII:
+		ret = xpcs_get_state_c37_sgmii(xpcs, state);
+		if (ret)
+			return ret;
+		break;
 	default:
 		return -1;
 	}
@@ -682,6 +827,7 @@ static u32 xpcs_get_id(struct mdio_xpcs_args *xpcs)
 	int ret;
 	u32 id;
 
+	/* First, search C73 PCS using PCS MMD */
 	ret = xpcs_read(xpcs, MDIO_MMD_PCS, MII_PHYSID1);
 	if (ret < 0)
 		return 0xffffffff;
@@ -692,7 +838,26 @@ static u32 xpcs_get_id(struct mdio_xpcs_args *xpcs)
 	if (ret < 0)
 		return 0xffffffff;
 
-	return id | ret;
+	/* If Device IDs are not all zeros, we found C73 AN-type device */
+	if (id | ret)
+		return id | ret;
+
+	/* Next, search C37 PCS using Vendor-Specific MII MMD */
+	ret = xpcs_read(xpcs, MDIO_MMD_VEND2, MII_PHYSID1);
+	if (ret < 0)
+		return 0xffffffff;
+
+	id = ret << 16;
+
+	ret = xpcs_read(xpcs, MDIO_MMD_VEND2, MII_PHYSID2);
+	if (ret < 0)
+		return 0xffffffff;
+
+	/* If Device IDs are not all zeros, we found C37 AN-type device */
+	if (id | ret)
+		return id | ret;
+
+	return 0xffffffff;
 }
 
 static bool xpcs_check_features(struct mdio_xpcs_args *xpcs,
diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h
index a04e57c25fea..2cb5188a7ef1 100644
--- a/include/linux/pcs/pcs-xpcs.h
+++ b/include/linux/pcs/pcs-xpcs.h
@@ -12,6 +12,7 @@
 
 /* AN mode */
 #define DW_AN_C73			1
+#define DW_AN_C37_SGMII			2
 
 struct mdio_xpcs_args {
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(supported);
-- 
cgit v1.2.3


From ab39385021d1e0b4cd6cc521dc35c2fe659bbddf Mon Sep 17 00:00:00 2001
From: Ong Boon Leong <boon.leong.ong@intel.com>
Date: Mon, 15 Mar 2021 13:27:08 +0800
Subject: net: phylink: make phylink_parse_mode() support non-DT platform

Certain platforms do not support DT, so we make phylink_parse_mode() usable
on non-DT platforms to set up in-band AN advertising.

Signed-off-by: Ong Boon Leong <boon.leong.ong@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phylink.c | 5 +++--
 include/linux/phylink.h   | 2 ++
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 053c92e02cd8..12a047d47dec 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -271,8 +271,9 @@ static int phylink_parse_mode(struct phylink *pl, struct fwnode_handle *fwnode)
 		pl->cfg_link_an_mode = MLO_AN_FIXED;
 	fwnode_handle_put(dn);
 
-	if (fwnode_property_read_string(fwnode, "managed", &managed) == 0 &&
-	    strcmp(managed, "in-band-status") == 0) {
+	if ((fwnode_property_read_string(fwnode, "managed", &managed) == 0 &&
+	     strcmp(managed, "in-band-status") == 0) ||
+	    pl->config->ovr_an_inband) {
 		if (pl->cfg_link_an_mode == MLO_AN_FIXED) {
 			phylink_err(pl,
 				    "can't use both fixed-link and in-band-status\n");
diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index d81a714cfbbd..fd2acfd9b597 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -64,6 +64,7 @@ enum phylink_op_type {
  * @pcs_poll: MAC PCS cannot provide link change interrupt
  * @poll_fixed_state: if true, starts link_poll,
  *		      if MAC link is at %MLO_AN_FIXED mode.
+ * @ovr_an_inband: if true, override PCS to MLO_AN_INBAND
  * @get_fixed_state: callback to execute to determine the fixed link state,
  *		     if MAC link is at %MLO_AN_FIXED mode.
  */
@@ -72,6 +73,7 @@ struct phylink_config {
 	enum phylink_op_type type;
 	bool pcs_poll;
 	bool poll_fixed_state;
+	bool ovr_an_inband;
 	void (*get_fixed_state)(struct phylink_config *config,
 				struct phylink_link_state *state);
 };
-- 
cgit v1.2.3


From e5e5b771f684c22b25c67df85d2deb43901f7b95 Mon Sep 17 00:00:00 2001
From: Ong Boon Leong <boon.leong.ong@intel.com>
Date: Mon, 15 Mar 2021 13:27:09 +0800
Subject: net: stmmac: make in-band AN mode parsing is supported for non-DT

Not all platforms use DT, and on those that do not, phylink_parse_mode()
skips the in-band setup of pl->supported and pl->link_config.advertising
entirely. So set the ovr_an_inband flag to make it work on non-DT platforms.

Signed-off-by: Ong Boon Leong <boon.leong.ong@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 2 ++
 include/linux/stmmac.h                            | 1 +
 2 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 208cae344ffa..b64ee029d41f 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -1117,6 +1117,8 @@ static int stmmac_phy_setup(struct stmmac_priv *priv)
 	priv->phylink_config.dev = &priv->dev->dev;
 	priv->phylink_config.type = PHYLINK_NETDEV;
 	priv->phylink_config.pcs_poll = true;
+	priv->phylink_config.ovr_an_inband =
+		priv->plat->mdio_bus_data->xpcs_an_inband;
 
 	if (!fwnode)
 		fwnode = dev_fwnode(priv->device);
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index a302982de2d7..722dc167b5c9 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -81,6 +81,7 @@
 struct stmmac_mdio_bus_data {
 	unsigned int phy_mask;
 	unsigned int has_xpcs;
+	unsigned int xpcs_an_inband;
 	int *irqs;
 	int probed_phy_irq;
 	bool needs_reset;
-- 
cgit v1.2.3


From 76d00b494d7962e88d4bbd4135f34aba9019c67f Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Tue, 23 Feb 2021 01:10:00 +0100
Subject: rcu/nocb: Disable bypass when CPU isn't completely offloaded

Currently, the bypass is flushed at the very last moment in the
deoffloading procedure.  However, this approach leads to a larger state
space than would be preferred.  This commit therefore disables the
bypass as soon as the deoffloading procedure begins, then flushes it.
This guarantees that the bypass remains empty and thus out of the way
of the deoffloading procedure.

Symmetrically, this commit waits to enable the bypass until the offloading
procedure has completed.

Reported-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Neeraj Upadhyay <neeraju@codeaurora.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcu_segcblist.h |  7 ++++---
 kernel/rcu/tree_plugin.h      | 38 +++++++++++++++++++++++++++++---------
 2 files changed, 33 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h
index 8afe886e85f1..3db96c4f45fd 100644
--- a/include/linux/rcu_segcblist.h
+++ b/include/linux/rcu_segcblist.h
@@ -109,7 +109,7 @@ struct rcu_cblist {
  *  |                           SEGCBLIST_KTHREAD_GP                           |
  *  |                                                                          |
  *  |   Kthreads handle callbacks holding nocb_lock, local rcu_core() stops    |
- *  |   handling callbacks.                                                    |
+ *  |   handling callbacks. Enable bypass queueing.                            |
  *  ----------------------------------------------------------------------------
  */
 
@@ -125,7 +125,7 @@ struct rcu_cblist {
  *  |                           SEGCBLIST_KTHREAD_GP                           |
  *  |                                                                          |
  *  |   CB/GP kthreads handle callbacks holding nocb_lock, local rcu_core()    |
- *  |   ignores callbacks.                                                     |
+ *  |   ignores callbacks. Bypass enqueue is enabled.                          |
  *  ----------------------------------------------------------------------------
  *                                      |
  *                                      v
@@ -134,7 +134,8 @@ struct rcu_cblist {
  *  |                           SEGCBLIST_KTHREAD_GP                           |
  *  |                                                                          |
  *  |   CB/GP kthreads and local rcu_core() handle callbacks concurrently      |
- *  |   holding nocb_lock. Wake up CB and GP kthreads if necessary.            |
+ *  |   holding nocb_lock. Wake up CB and GP kthreads if necessary. Disable    |
+ *  |   bypass enqueue.                                                        |
  *  ----------------------------------------------------------------------------
  *                                      |
  *                                      v
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index e392bd129316..b08564b2bcf7 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1830,11 +1830,22 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 	unsigned long j = jiffies;
 	long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
 
+	lockdep_assert_irqs_disabled();
+
+	// Pure softirq/rcuc based processing: no bypassing, no
+	// locking.
 	if (!rcu_rdp_is_offloaded(rdp)) {
+		*was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+		return false;
+	}
+
+	// In the process of (de-)offloading: no bypassing, but
+	// locking.
+	if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) {
+		rcu_nocb_lock(rdp);
 		*was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
 		return false; /* Not offloaded, no bypassing. */
 	}
-	lockdep_assert_irqs_disabled();
 
 	// Don't use ->nocb_bypass during early boot.
 	if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) {
@@ -2416,7 +2427,16 @@ static long rcu_nocb_rdp_deoffload(void *arg)
 	pr_info("De-offloading %d\n", rdp->cpu);
 
 	rcu_nocb_lock_irqsave(rdp, flags);
-
+	/*
+	 * Flush once and for all now. This suffices because we are
+	 * running on the target CPU holding ->nocb_lock (thus having
+	 * interrupts disabled), and because rdp_offload_toggle()
+	 * invokes rcu_segcblist_offload(), which clears SEGCBLIST_OFFLOADED.
+	 * Thus future calls to rcu_segcblist_completely_offloaded() will
+	 * return false, which means that future calls to rcu_nocb_try_bypass()
+	 * will refuse to put anything into the bypass.
+	 */
+	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
 	ret = rdp_offload_toggle(rdp, false, flags);
 	swait_event_exclusive(rdp->nocb_state_wq,
 			      !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB |
@@ -2428,21 +2448,21 @@ static long rcu_nocb_rdp_deoffload(void *arg)
 	del_timer_sync(&rdp->nocb_timer);
 
 	/*
-	 * Flush bypass. While IRQs are disabled and once we set
-	 * SEGCBLIST_SOFTIRQ_ONLY, no callback is supposed to be
-	 * enqueued on bypass.
+	 * Theoretically we could set SEGCBLIST_SOFTIRQ_ONLY with CB unlocked
+	 * and IRQs disabled but let's be paranoid.
 	 */
 	rcu_nocb_lock_irqsave(rdp, flags);
-	rcu_nocb_flush_bypass(rdp, NULL, jiffies);
 	rcu_segcblist_set_flags(cblist, SEGCBLIST_SOFTIRQ_ONLY);
 	/*
 	 * With SEGCBLIST_SOFTIRQ_ONLY, we can't use
-	 * rcu_nocb_unlock_irqrestore() anymore. Theoretically we
-	 * could set SEGCBLIST_SOFTIRQ_ONLY with cb unlocked and IRQs
-	 * disabled now, but let's be paranoid.
+	 * rcu_nocb_unlock_irqrestore() anymore.
 	 */
 	raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
 
+	/* Sanity check */
+	WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+
+
 	return ret;
 }
 
-- 
cgit v1.2.3


From b4d45aee6635197d257f3469413837cd94fc11f4 Mon Sep 17 00:00:00 2001
From: Joakim Zhang <qiangqing.zhang@nxp.com>
Date: Mon, 15 Mar 2021 20:16:47 +0800
Subject: net: stmmac: add platform level clocks management

This patch adds platform-level clock management. Some platforms may have
their own special clocks, which also need to be managed dynamically. To
manage such clocks, implement the clks_config callback.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Joakim Zhang <qiangqing.zhang@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 10 ++++++++++
 include/linux/stmmac.h                            |  1 +
 2 files changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 3c50846f59cd..a10704d8e3c6 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -127,9 +127,19 @@ int stmmac_bus_clks_config(struct stmmac_priv *priv, bool enabled)
 			clk_disable_unprepare(priv->plat->stmmac_clk);
 			return ret;
 		}
+		if (priv->plat->clks_config) {
+			ret = priv->plat->clks_config(priv->plat->bsp_priv, enabled);
+			if (ret) {
+				clk_disable_unprepare(priv->plat->stmmac_clk);
+				clk_disable_unprepare(priv->plat->pclk);
+				return ret;
+			}
+		}
 	} else {
 		clk_disable_unprepare(priv->plat->stmmac_clk);
 		clk_disable_unprepare(priv->plat->pclk);
+		if (priv->plat->clks_config)
+			priv->plat->clks_config(priv->plat->bsp_priv, enabled);
 	}
 
 	return ret;
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 722dc167b5c9..51004ebd0540 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -184,6 +184,7 @@ struct plat_stmmacenet_data {
 	int (*init)(struct platform_device *pdev, void *priv);
 	void (*exit)(struct platform_device *pdev, void *priv);
 	struct mac_device_info *(*setup)(void *priv);
+	int (*clks_config)(void *priv, bool enabled);
 	void *bsp_priv;
 	struct clk *stmmac_clk;
 	struct clk *pclk;
-- 
cgit v1.2.3


From 6e3bac3eba448a438840ab8152cb8bbfcb8787b8 Mon Sep 17 00:00:00 2001
From: Ivan Bornyakov <i.bornyakov@metrotek.ru>
Date: Mon, 15 Mar 2021 17:19:26 +0300
Subject: net: phy: add Marvell 88X2222 transceiver support

Add basic support for the Marvell 88X2222 multi-speed ethernet
transceiver.

This PHY provides data transmission over fiber-optic as well as Twinax
copper links. The 88X2222 supports 2 ports of 10GBase-R and 1000Base-X
on the line-side interface. The host-side interface supports 4 ports of
10GBase-R, RXAUI, 1000Base-X and 2 ports of XAUI.

This driver, however, supports only XAUI on the host-side and
1000Base-X/10GBase-R on the line-side, for now. The SGMII is also
supported over 1000Base-X. Interrupts are not supported.

Internal register access is compliant with the Clause 45 specification.

Signed-off-by: Ivan Bornyakov <i.bornyakov@metrotek.ru>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/Kconfig           |   6 +
 drivers/net/phy/Makefile          |   1 +
 drivers/net/phy/marvell-88x2222.c | 519 ++++++++++++++++++++++++++++++++++++++
 include/linux/marvell_phy.h       |   1 +
 4 files changed, 527 insertions(+)
 create mode 100644 drivers/net/phy/marvell-88x2222.c

(limited to 'include/linux')

diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig
index 698bea312adc..a615b3660b05 100644
--- a/drivers/net/phy/Kconfig
+++ b/drivers/net/phy/Kconfig
@@ -201,6 +201,12 @@ config MARVELL_10G_PHY
 	help
 	  Support for the Marvell Alaska MV88X3310 and compatible PHYs.
 
+config MARVELL_88X2222_PHY
+	tristate "Marvell 88X2222 PHY"
+	help
+	  Support for the Marvell 88X2222 Dual-port Multi-speed Ethernet
+	  Transceiver.
+
 config MICREL_PHY
 	tristate "Micrel PHYs"
 	help
diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile
index a13e402074cf..de683e3abe63 100644
--- a/drivers/net/phy/Makefile
+++ b/drivers/net/phy/Makefile
@@ -63,6 +63,7 @@ obj-$(CONFIG_LSI_ET1011C_PHY)	+= et1011c.o
 obj-$(CONFIG_LXT_PHY)		+= lxt.o
 obj-$(CONFIG_MARVELL_10G_PHY)	+= marvell10g.o
 obj-$(CONFIG_MARVELL_PHY)	+= marvell.o
+obj-$(CONFIG_MARVELL_88X2222_PHY)	+= marvell-88x2222.o
 obj-$(CONFIG_MESON_GXL_PHY)	+= meson-gxl.o
 obj-$(CONFIG_MICREL_KS8995MA)	+= spi_ks8995.o
 obj-$(CONFIG_MICREL_PHY)	+= micrel.o
diff --git a/drivers/net/phy/marvell-88x2222.c b/drivers/net/phy/marvell-88x2222.c
new file mode 100644
index 000000000000..eca8c2f20684
--- /dev/null
+++ b/drivers/net/phy/marvell-88x2222.c
@@ -0,0 +1,519 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Marvell 88x2222 dual-port multi-speed ethernet transceiver.
+ *
+ * Supports:
+ *	XAUI on the host side.
+ *	1000Base-X or 10GBase-R on the line side.
+ *	SGMII over 1000Base-X.
+ */
+#include <linux/module.h>
+#include <linux/phy.h>
+#include <linux/gpio.h>
+#include <linux/delay.h>
+#include <linux/mdio.h>
+#include <linux/marvell_phy.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/of_gpio.h>
+#include <linux/sfp.h>
+#include <linux/netdevice.h>
+
+/* Port PCS Configuration */
+#define	MV_PCS_CONFIG		0xF002
+#define	MV_PCS_HOST_XAUI	0x73
+#define	MV_PCS_LINE_10GBR	(0x71 << 8)
+#define	MV_PCS_LINE_1GBX_AN	(0x7B << 8)
+#define	MV_PCS_LINE_SGMII_AN	(0x7F << 8)
+
+/* Port Reset and Power Down */
+#define	MV_PORT_RST	0xF003
+#define	MV_LINE_RST_SW	BIT(15)
+#define	MV_HOST_RST_SW	BIT(7)
+#define	MV_PORT_RST_SW	(MV_LINE_RST_SW | MV_HOST_RST_SW)
+
+/* 1000Base-X/SGMII Control Register */
+#define	MV_1GBX_CTRL		(0x2000 + MII_BMCR)
+
+/* 1000BASE-X/SGMII Status Register */
+#define	MV_1GBX_STAT		(0x2000 + MII_BMSR)
+
+/* 1000Base-X Auto-Negotiation Advertisement Register */
+#define	MV_1GBX_ADVERTISE	(0x2000 + MII_ADVERTISE)
+
+/* 1000Base-X PHY Specific Status Register */
+#define	MV_1GBX_PHY_STAT		0xA003
+#define	MV_1GBX_PHY_STAT_AN_RESOLVED	BIT(11)
+#define	MV_1GBX_PHY_STAT_DUPLEX		BIT(13)
+#define	MV_1GBX_PHY_STAT_SPEED100	BIT(14)
+#define	MV_1GBX_PHY_STAT_SPEED1000	BIT(15)
+
+struct mv2222_data {
+	phy_interface_t line_interface;
+	__ETHTOOL_DECLARE_LINK_MODE_MASK(supported);
+};
+
+/* SFI PMA transmit enable */
+static int mv2222_tx_enable(struct phy_device *phydev)
+{
+	return phy_clear_bits_mmd(phydev, MDIO_MMD_PMAPMD, MDIO_PMA_TXDIS,
+				  MDIO_PMD_TXDIS_GLOBAL);
+}
+
+/* SFI PMA transmit disable */
+static int mv2222_tx_disable(struct phy_device *phydev)
+{
+	return phy_set_bits_mmd(phydev, MDIO_MMD_PMAPMD, MDIO_PMA_TXDIS,
+				MDIO_PMD_TXDIS_GLOBAL);
+}
+
+static int mv2222_soft_reset(struct phy_device *phydev)
+{
+	int val, ret;
+
+	ret = phy_write_mmd(phydev, MDIO_MMD_VEND2, MV_PORT_RST,
+			    MV_PORT_RST_SW);
+	if (ret < 0)
+		return ret;
+
+	return phy_read_mmd_poll_timeout(phydev, MDIO_MMD_VEND2, MV_PORT_RST,
+					 val, !(val & MV_PORT_RST_SW),
+					 5000, 1000000, true);
+}
+
+/* Returns negative on error, 0 if link is down, 1 if link is up */
+static int mv2222_read_status_10g(struct phy_device *phydev)
+{
+	int val, link = 0;
+
+	val = phy_read_mmd(phydev, MDIO_MMD_PCS, MDIO_STAT1);
+	if (val < 0)
+		return val;
+
+	if (val & MDIO_STAT1_LSTATUS) {
+		link = 1;
+
+		/* 10GBASE-R does not support auto-negotiation */
+		phydev->autoneg = AUTONEG_DISABLE;
+		phydev->speed = SPEED_10000;
+		phydev->duplex = DUPLEX_FULL;
+	}
+
+	return link;
+}
+
+/* Returns negative on error, 0 if link is down, 1 if link is up */
+static int mv2222_read_status_1g(struct phy_device *phydev)
+{
+	int val, link = 0;
+
+	val = phy_read_mmd(phydev, MDIO_MMD_PCS, MV_1GBX_STAT);
+	if (val < 0)
+		return val;
+
+	if (!(val & BMSR_LSTATUS) ||
+	    (phydev->autoneg == AUTONEG_ENABLE &&
+	     !(val & BMSR_ANEGCOMPLETE)))
+		return 0;
+
+	link = 1;
+
+	val = phy_read_mmd(phydev, MDIO_MMD_PCS, MV_1GBX_PHY_STAT);
+	if (val < 0)
+		return val;
+
+	if (val & MV_1GBX_PHY_STAT_AN_RESOLVED) {
+		if (val & MV_1GBX_PHY_STAT_DUPLEX)
+			phydev->duplex = DUPLEX_FULL;
+		else
+			phydev->duplex = DUPLEX_HALF;
+
+		if (val & MV_1GBX_PHY_STAT_SPEED1000)
+			phydev->speed = SPEED_1000;
+		else if (val & MV_1GBX_PHY_STAT_SPEED100)
+			phydev->speed = SPEED_100;
+		else
+			phydev->speed = SPEED_10;
+	}
+
+	return link;
+}
+
+static int mv2222_read_status(struct phy_device *phydev)
+{
+	struct mv2222_data *priv = phydev->priv;
+	int link;
+
+	phydev->link = 0;
+	phydev->speed = SPEED_UNKNOWN;
+	phydev->duplex = DUPLEX_UNKNOWN;
+
+	if (priv->line_interface == PHY_INTERFACE_MODE_10GBASER)
+		link = mv2222_read_status_10g(phydev);
+	else
+		link = mv2222_read_status_1g(phydev);
+
+	if (link < 0)
+		return link;
+
+	phydev->link = link;
+
+	return 0;
+}
+
+static int mv2222_disable_aneg(struct phy_device *phydev)
+{
+	int ret = phy_clear_bits_mmd(phydev, MDIO_MMD_PCS, MV_1GBX_CTRL,
+				     BMCR_ANENABLE | BMCR_ANRESTART);
+	if (ret < 0)
+		return ret;
+
+	return mv2222_soft_reset(phydev);
+}
+
+static int mv2222_enable_aneg(struct phy_device *phydev)
+{
+	int ret = phy_set_bits_mmd(phydev, MDIO_MMD_PCS, MV_1GBX_CTRL,
+				   BMCR_ANENABLE | BMCR_RESET);
+	if (ret < 0)
+		return ret;
+
+	return mv2222_soft_reset(phydev);
+}
+
+static int mv2222_set_sgmii_speed(struct phy_device *phydev)
+{
+	struct mv2222_data *priv = phydev->priv;
+
+	switch (phydev->speed) {
+	default:
+	case SPEED_1000:
+		if ((linkmode_test_bit(ETHTOOL_LINK_MODE_1000baseT_Full_BIT,
+				       priv->supported) ||
+		     linkmode_test_bit(ETHTOOL_LINK_MODE_1000baseT_Half_BIT,
+				       priv->supported)))
+			return phy_modify_mmd(phydev, MDIO_MMD_PCS,
+					      MV_1GBX_CTRL,
+					      BMCR_SPEED1000 | BMCR_SPEED100,
+					      BMCR_SPEED1000);
+
+		fallthrough;
+	case SPEED_100:
+		if ((linkmode_test_bit(ETHTOOL_LINK_MODE_100baseT_Full_BIT,
+				       priv->supported) ||
+		     linkmode_test_bit(ETHTOOL_LINK_MODE_100baseT_Half_BIT,
+				       priv->supported)))
+			return phy_modify_mmd(phydev, MDIO_MMD_PCS,
+					      MV_1GBX_CTRL,
+					      BMCR_SPEED1000 | BMCR_SPEED100,
+					      BMCR_SPEED100);
+		fallthrough;
+	case SPEED_10:
+		if ((linkmode_test_bit(ETHTOOL_LINK_MODE_10baseT_Full_BIT,
+				       priv->supported) ||
+		     linkmode_test_bit(ETHTOOL_LINK_MODE_10baseT_Half_BIT,
+				       priv->supported)))
+			return phy_modify_mmd(phydev, MDIO_MMD_PCS,
+					      MV_1GBX_CTRL,
+					      BMCR_SPEED1000 | BMCR_SPEED100,
+					      BMCR_SPEED10);
+
+		return -EINVAL;
+	}
+}
+
+static bool mv2222_is_10g_capable(struct phy_device *phydev)
+{
+	struct mv2222_data *priv = phydev->priv;
+
+	return (linkmode_test_bit(ETHTOOL_LINK_MODE_10000baseT_Full_BIT,
+				  priv->supported) ||
+		linkmode_test_bit(ETHTOOL_LINK_MODE_10000baseCR_Full_BIT,
+				  priv->supported) ||
+		linkmode_test_bit(ETHTOOL_LINK_MODE_10000baseSR_Full_BIT,
+				  priv->supported) ||
+		linkmode_test_bit(ETHTOOL_LINK_MODE_10000baseLR_Full_BIT,
+				  priv->supported) ||
+		linkmode_test_bit(ETHTOOL_LINK_MODE_10000baseLRM_Full_BIT,
+				  priv->supported) ||
+		linkmode_test_bit(ETHTOOL_LINK_MODE_10000baseER_Full_BIT,
+				  priv->supported));
+}
+
+static bool mv2222_is_1gbx_capable(struct phy_device *phydev)
+{
+	struct mv2222_data *priv = phydev->priv;
+
+	return linkmode_test_bit(ETHTOOL_LINK_MODE_1000baseX_Full_BIT,
+				 priv->supported);
+}
+
+static int mv2222_config_line(struct phy_device *phydev)
+{
+	struct mv2222_data *priv = phydev->priv;
+
+	switch (priv->line_interface) {
+	case PHY_INTERFACE_MODE_10GBASER:
+		return phy_write_mmd(phydev, MDIO_MMD_VEND2, MV_PCS_CONFIG,
+				     MV_PCS_HOST_XAUI | MV_PCS_LINE_10GBR);
+	case PHY_INTERFACE_MODE_1000BASEX:
+		return phy_write_mmd(phydev, MDIO_MMD_VEND2, MV_PCS_CONFIG,
+				     MV_PCS_HOST_XAUI | MV_PCS_LINE_1GBX_AN);
+	case PHY_INTERFACE_MODE_SGMII:
+		return phy_write_mmd(phydev, MDIO_MMD_VEND2, MV_PCS_CONFIG,
+				     MV_PCS_HOST_XAUI | MV_PCS_LINE_SGMII_AN);
+	default:
+		return -EINVAL;
+	}
+}
+
+static int mv2222_setup_forced(struct phy_device *phydev)
+{
+	struct mv2222_data *priv = phydev->priv;
+	bool changed = false;
+	int ret;
+
+	switch (priv->line_interface) {
+	case PHY_INTERFACE_MODE_10GBASER:
+		if (phydev->speed == SPEED_1000 &&
+		    mv2222_is_1gbx_capable(phydev)) {
+			priv->line_interface = PHY_INTERFACE_MODE_1000BASEX;
+			changed = true;
+		}
+
+		break;
+	case PHY_INTERFACE_MODE_1000BASEX:
+		if (phydev->speed == SPEED_10000 &&
+		    mv2222_is_10g_capable(phydev)) {
+			priv->line_interface = PHY_INTERFACE_MODE_10GBASER;
+			changed = true;
+		}
+
+		break;
+	case PHY_INTERFACE_MODE_SGMII:
+		ret = mv2222_set_sgmii_speed(phydev);
+		if (ret < 0)
+			return ret;
+
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (changed) {
+		ret = mv2222_config_line(phydev);
+		if (ret < 0)
+			return ret;
+	}
+
+	return mv2222_disable_aneg(phydev);
+}
+
+static int mv2222_config_aneg(struct phy_device *phydev)
+{
+	struct mv2222_data *priv = phydev->priv;
+	int ret, adv;
+
+	/* SFP is not present, do nothing */
+	if (priv->line_interface == PHY_INTERFACE_MODE_NA)
+		return 0;
+
+	if (phydev->autoneg == AUTONEG_DISABLE ||
+	    phydev->speed == SPEED_10000)
+		return mv2222_setup_forced(phydev);
+
+	if (priv->line_interface == PHY_INTERFACE_MODE_10GBASER &&
+	    mv2222_is_1gbx_capable(phydev)) {
+		priv->line_interface = PHY_INTERFACE_MODE_1000BASEX;
+		ret = mv2222_config_line(phydev);
+		if (ret < 0)
+			return ret;
+	}
+
+	adv = linkmode_adv_to_mii_adv_x(priv->supported,
+					ETHTOOL_LINK_MODE_1000baseX_Full_BIT);
+
+	ret = phy_modify_mmd(phydev, MDIO_MMD_PCS, MV_1GBX_ADVERTISE,
+			     ADVERTISE_1000XFULL |
+			     ADVERTISE_1000XPAUSE | ADVERTISE_1000XPSE_ASYM,
+			     adv);
+	if (ret < 0)
+		return ret;
+
+	return mv2222_enable_aneg(phydev);
+}
+
+static int mv2222_aneg_done(struct phy_device *phydev)
+{
+	int ret;
+
+	if (mv2222_is_10g_capable(phydev)) {
+		ret = phy_read_mmd(phydev, MDIO_MMD_PCS, MDIO_STAT1);
+		if (ret < 0)
+			return ret;
+
+		if (ret & MDIO_STAT1_LSTATUS)
+			return 1;
+	}
+
+	ret = phy_read_mmd(phydev, MDIO_MMD_PCS, MV_1GBX_STAT);
+	if (ret < 0)
+		return ret;
+
+	return (ret & BMSR_ANEGCOMPLETE);
+}
+
+static int mv2222_resume(struct phy_device *phydev)
+{
+	return mv2222_tx_enable(phydev);
+}
+
+static int mv2222_suspend(struct phy_device *phydev)
+{
+	return mv2222_tx_disable(phydev);
+}
+
+static int mv2222_get_features(struct phy_device *phydev)
+{
+	/* All supported linkmodes are set at probe */
+
+	return 0;
+}
+
+static int mv2222_config_init(struct phy_device *phydev)
+{
+	if (phydev->interface != PHY_INTERFACE_MODE_XAUI)
+		return -EINVAL;
+
+	phydev->autoneg = AUTONEG_DISABLE;
+
+	return 0;
+}
+
+static int mv2222_sfp_insert(void *upstream, const struct sfp_eeprom_id *id)
+{
+	struct phy_device *phydev = upstream;
+	phy_interface_t sfp_interface;
+	struct mv2222_data *priv;
+	struct device *dev;
+	int ret;
+
+	__ETHTOOL_DECLARE_LINK_MODE_MASK(sfp_supported) = { 0, };
+
+	priv = (struct mv2222_data *)phydev->priv;
+	dev = &phydev->mdio.dev;
+
+	sfp_parse_support(phydev->sfp_bus, id, sfp_supported);
+	sfp_interface = sfp_select_interface(phydev->sfp_bus, sfp_supported);
+
+	dev_info(dev, "%s SFP module inserted\n", phy_modes(sfp_interface));
+
+	if (sfp_interface != PHY_INTERFACE_MODE_10GBASER &&
+	    sfp_interface != PHY_INTERFACE_MODE_1000BASEX &&
+	    sfp_interface != PHY_INTERFACE_MODE_SGMII) {
+		dev_err(dev, "Incompatible SFP module inserted\n");
+
+		return -EINVAL;
+	}
+
+	priv->line_interface = sfp_interface;
+	linkmode_and(priv->supported, phydev->supported, sfp_supported);
+
+	ret = mv2222_config_line(phydev);
+	if (ret < 0)
+		return ret;
+
+	if (mutex_trylock(&phydev->lock)) {
+		if (priv->line_interface == PHY_INTERFACE_MODE_10GBASER)
+			ret = mv2222_setup_forced(phydev);
+		else
+			ret = mv2222_config_aneg(phydev);
+
+		mutex_unlock(&phydev->lock);
+	}
+
+	return ret;
+}
+
+static void mv2222_sfp_remove(void *upstream)
+{
+	struct phy_device *phydev = upstream;
+	struct mv2222_data *priv;
+
+	priv = (struct mv2222_data *)phydev->priv;
+
+	priv->line_interface = PHY_INTERFACE_MODE_NA;
+	linkmode_zero(priv->supported);
+}
+
+static const struct sfp_upstream_ops sfp_phy_ops = {
+	.module_insert = mv2222_sfp_insert,
+	.module_remove = mv2222_sfp_remove,
+	.attach = phy_sfp_attach,
+	.detach = phy_sfp_detach,
+};
+
+static int mv2222_probe(struct phy_device *phydev)
+{
+	struct device *dev = &phydev->mdio.dev;
+	struct mv2222_data *priv = NULL;
+
+	__ETHTOOL_DECLARE_LINK_MODE_MASK(supported) = { 0, };
+
+	linkmode_set_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, supported);
+	linkmode_set_bit(ETHTOOL_LINK_MODE_Pause_BIT, supported);
+	linkmode_set_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, supported);
+	linkmode_set_bit(ETHTOOL_LINK_MODE_FIBRE_BIT, supported);
+	linkmode_set_bit(ETHTOOL_LINK_MODE_TP_BIT, supported);
+	linkmode_set_bit(ETHTOOL_LINK_MODE_10baseT_Half_BIT, supported);
+	linkmode_set_bit(ETHTOOL_LINK_MODE_10baseT_Full_BIT, supported);
+	linkmode_set_bit(ETHTOOL_LINK_MODE_100baseT_Half_BIT, supported);
+	linkmode_set_bit(ETHTOOL_LINK_MODE_100baseT_Full_BIT, supported);
+	linkmode_set_bit(ETHTOOL_LINK_MODE_1000baseT_Half_BIT, supported);
+	linkmode_set_bit(ETHTOOL_LINK_MODE_1000baseT_Full_BIT, supported);
+	linkmode_set_bit(ETHTOOL_LINK_MODE_1000baseX_Full_BIT, supported);
+	linkmode_set_bit(ETHTOOL_LINK_MODE_10000baseT_Full_BIT, supported);
+	linkmode_set_bit(ETHTOOL_LINK_MODE_10000baseCR_Full_BIT, supported);
+	linkmode_set_bit(ETHTOOL_LINK_MODE_10000baseSR_Full_BIT, supported);
+	linkmode_set_bit(ETHTOOL_LINK_MODE_10000baseLR_Full_BIT, supported);
+	linkmode_set_bit(ETHTOOL_LINK_MODE_10000baseLRM_Full_BIT, supported);
+	linkmode_set_bit(ETHTOOL_LINK_MODE_10000baseER_Full_BIT, supported);
+
+	linkmode_copy(phydev->supported, supported);
+
+	priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	priv->line_interface = PHY_INTERFACE_MODE_NA;
+	phydev->priv = priv;
+
+	return phy_sfp_probe(phydev, &sfp_phy_ops);
+}
+
+static struct phy_driver mv2222_drivers[] = {
+	{
+		.phy_id = MARVELL_PHY_ID_88X2222,
+		.phy_id_mask = MARVELL_PHY_ID_MASK,
+		.name = "Marvell 88X2222",
+		.get_features = mv2222_get_features,
+		.soft_reset = mv2222_soft_reset,
+		.config_init = mv2222_config_init,
+		.config_aneg = mv2222_config_aneg,
+		.aneg_done = mv2222_aneg_done,
+		.probe = mv2222_probe,
+		.suspend = mv2222_suspend,
+		.resume = mv2222_resume,
+		.read_status = mv2222_read_status,
+	},
+};
+module_phy_driver(mv2222_drivers);
+
+static struct mdio_device_id __maybe_unused mv2222_tbl[] = {
+	{ MARVELL_PHY_ID_88X2222, MARVELL_PHY_ID_MASK },
+	{ }
+};
+MODULE_DEVICE_TABLE(mdio, mv2222_tbl);
+
+MODULE_DESCRIPTION("Marvell 88x2222 ethernet transceiver driver");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/marvell_phy.h b/include/linux/marvell_phy.h
index 52b1610eae68..274abd5fbac3 100644
--- a/include/linux/marvell_phy.h
+++ b/include/linux/marvell_phy.h
@@ -24,6 +24,7 @@
 #define MARVELL_PHY_ID_88E3016		0x01410e60
 #define MARVELL_PHY_ID_88X3310		0x002b09a0
 #define MARVELL_PHY_ID_88E2110		0x002b09b0
+#define MARVELL_PHY_ID_88X2222		0x01410f10
 
 /* Marvel 88E1111 in Finisar SFP module with modified PHY ID */
 #define MARVELL_PHY_ID_88E1111_FINISAR	0x01ff0cc0
-- 
cgit v1.2.3


From 45f3a13c816656c9d3d311880d90286341644d9b Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@linaro.org>
Date: Mon, 15 Mar 2021 16:51:46 -0500
Subject: net: qualcomm: rmnet: mark trailer field endianness

The fields in the checksum trailer structure used for QMAP protocol
RX packets are all big-endian format, so define them that way.

It turns out these fields are never actually used by the RMNet code.
The start offset is always assumed to be zero, and the length is
taken from the other packet headers.  So making these fields
explicitly big endian has no effect on the behavior of the code.

Signed-off-by: Alex Elder <elder@linaro.org>
Reviewed-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_rmnet.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_rmnet.h b/include/linux/if_rmnet.h
index 9661416a9bb4..8c7845baf383 100644
--- a/include/linux/if_rmnet.h
+++ b/include/linux/if_rmnet.h
@@ -32,8 +32,8 @@ struct rmnet_map_dl_csum_trailer {
 #else
 #error	"Please fix <asm/byteorder.h>"
 #endif
-	u16 csum_start_offset;
-	u16 csum_length;
+	__be16 csum_start_offset;
+	__be16 csum_length;
 	__be16 csum_value;
 } __aligned(1);
 
-- 
cgit v1.2.3


From 16653c16d282e768763b2e8cc78f75df8fd53992 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@linaro.org>
Date: Mon, 15 Mar 2021 16:51:49 -0500
Subject: net: qualcomm: rmnet: use masks instead of C bit-fields

The actual layout of bits defined in C bit-fields (e.g. int foo : 3)
is implementation-defined.  Structures defined in <linux/if_rmnet.h>
address this by specifying all bit-fields twice, to cover two
possible layouts.

I think this pattern is repetitive and noisy, and I find the whole
notion of compiler "bitfield endianness" to be non-intuitive.

Stop using C bit-fields for the command/data flag and the pad length
fields in the rmnet_map structure, and define a single-byte flags
field instead.  Define a mask for the single-bit "command" flag,
and another mask for the encoded pad length.  The content of both
fields can be accessed using a simple bitwise AND operation.

Signed-off-by: Alex Elder <elder@linaro.org>
Reviewed-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/qualcomm/rmnet/rmnet_handlers.c   |  4 ++--
 .../net/ethernet/qualcomm/rmnet/rmnet_map_data.c   |  4 +++-
 include/linux/if_rmnet.h                           | 23 ++++++++++------------
 3 files changed, 15 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
index 2a6b2a609884..0be5ac7ab261 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
@@ -61,7 +61,7 @@ __rmnet_map_ingress_handler(struct sk_buff *skb,
 	u16 len, pad;
 	u8 mux_id;
 
-	if (map_header->cd_bit) {
+	if (map_header->flags & MAP_CMD_FLAG) {
 		/* Packet contains a MAP command (not data) */
 		if (port->data_format & RMNET_FLAGS_INGRESS_MAP_COMMANDS)
 			return rmnet_map_command(skb, port);
@@ -70,7 +70,7 @@ __rmnet_map_ingress_handler(struct sk_buff *skb,
 	}
 
 	mux_id = map_header->mux_id;
-	pad = map_header->pad_len;
+	pad = map_header->flags & MAP_PAD_LEN_MASK;
 	len = ntohs(map_header->pkt_len) - pad;
 
 	if (mux_id >= RMNET_MAX_LOGICAL_EP)
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
index 3af68368fc31..e7d0394cb297 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
@@ -280,6 +280,7 @@ struct rmnet_map_header *rmnet_map_add_map_header(struct sk_buff *skb,
 		return map_header;
 	}
 
+	BUILD_BUG_ON(MAP_PAD_LEN_MASK < 3);
 	padding = ALIGN(map_datalen, 4) - map_datalen;
 
 	if (padding == 0)
@@ -293,7 +294,8 @@ struct rmnet_map_header *rmnet_map_add_map_header(struct sk_buff *skb,
 
 done:
 	map_header->pkt_len = htons(map_datalen + padding);
-	map_header->pad_len = padding & 0x3F;
+	/* This is a data packet, so the CMD bit is 0 */
+	map_header->flags = padding & MAP_PAD_LEN_MASK;
 
 	return map_header;
 }
diff --git a/include/linux/if_rmnet.h b/include/linux/if_rmnet.h
index 8c7845baf383..a02f0a3df1d9 100644
--- a/include/linux/if_rmnet.h
+++ b/include/linux/if_rmnet.h
@@ -6,21 +6,18 @@
 #define _LINUX_IF_RMNET_H_
 
 struct rmnet_map_header {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
-	u8  pad_len:6;
-	u8  reserved_bit:1;
-	u8  cd_bit:1;
-#elif defined (__BIG_ENDIAN_BITFIELD)
-	u8  cd_bit:1;
-	u8  reserved_bit:1;
-	u8  pad_len:6;
-#else
-#error	"Please fix <asm/byteorder.h>"
-#endif
-	u8  mux_id;
-	__be16 pkt_len;
+	u8 flags;			/* MAP_CMD_FLAG, MAP_PAD_LEN_MASK */
+	u8 mux_id;
+	__be16 pkt_len;			/* Length of packet, including pad */
 }  __aligned(1);
 
+/* rmnet_map_header flags field:
+ *  PAD_LEN:	number of pad bytes following packet data
+ *  CMD:	1 = packet contains a MAP command; 0 = packet contains data
+ */
+#define MAP_PAD_LEN_MASK		GENMASK(5, 0)
+#define MAP_CMD_FLAG			BIT(7)
+
 struct rmnet_map_dl_csum_trailer {
 	u8  reserved1;
 #if defined(__LITTLE_ENDIAN_BITFIELD)
-- 
cgit v1.2.3


From cc1b21ba6251c8dd8e4e86018c9fdba85df0d219 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@linaro.org>
Date: Mon, 15 Mar 2021 16:51:50 -0500
Subject: net: qualcomm: rmnet: don't use C bit-fields in rmnet checksum
 trailer

Replace the use of C bit-fields in the rmnet_map_dl_csum_trailer
structure with a single one-byte field, using constant field masks
to encode or get at embedded values.

Signed-off-by: Alex Elder <elder@linaro.org>
Reviewed-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c |  2 +-
 include/linux/if_rmnet.h                             | 17 +++++++----------
 2 files changed, 8 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
index e7d0394cb297..c336c17e01fe 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
@@ -359,7 +359,7 @@ int rmnet_map_checksum_downlink_packet(struct sk_buff *skb, u16 len)
 
 	csum_trailer = (struct rmnet_map_dl_csum_trailer *)(skb->data + len);
 
-	if (!csum_trailer->valid) {
+	if (!(csum_trailer->flags & MAP_CSUM_DL_VALID_FLAG)) {
 		priv->stats.csum_valid_unset++;
 		return -EINVAL;
 	}
diff --git a/include/linux/if_rmnet.h b/include/linux/if_rmnet.h
index a02f0a3df1d9..941997df9e08 100644
--- a/include/linux/if_rmnet.h
+++ b/include/linux/if_rmnet.h
@@ -19,21 +19,18 @@ struct rmnet_map_header {
 #define MAP_CMD_FLAG			BIT(7)
 
 struct rmnet_map_dl_csum_trailer {
-	u8  reserved1;
-#if defined(__LITTLE_ENDIAN_BITFIELD)
-	u8  valid:1;
-	u8  reserved2:7;
-#elif defined (__BIG_ENDIAN_BITFIELD)
-	u8  reserved2:7;
-	u8  valid:1;
-#else
-#error	"Please fix <asm/byteorder.h>"
-#endif
+	u8 reserved1;
+	u8 flags;			/* MAP_CSUM_DL_VALID_FLAG */
 	__be16 csum_start_offset;
 	__be16 csum_length;
 	__be16 csum_value;
 } __aligned(1);
 
+/* rmnet_map_dl_csum_trailer flags field:
+ *  VALID:	1 = checksum and length valid; 0 = ignore them
+ */
+#define MAP_CSUM_DL_VALID_FLAG		BIT(0)
+
 struct rmnet_map_ul_csum_header {
 	__be16 csum_start_offset;
 #if defined(__LITTLE_ENDIAN_BITFIELD)
-- 
cgit v1.2.3


From 86ca860e12ec0feab7d721d3b05e60fb86613540 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@linaro.org>
Date: Mon, 15 Mar 2021 16:51:51 -0500
Subject: net: qualcomm: rmnet: don't use C bit-fields in rmnet checksum header

Replace the use of C bit-fields in the rmnet_map_ul_csum_header
structure with a single two-byte (big endian) structure member,
and use masks to encode or get values within it.  The content of
these fields can be accessed using simple bitwise AND and OR
operations on the (host byte order) value of the new structure
member.

Previously rmnet_map_ipv4_ul_csum_header() would update C bit-field
values in host byte order, then forcibly fix their byte order using
a combination of byte swap operations and types.

Instead, just compute the value that needs to go into the new
structure member and save it with a simple byte-order conversion.

Make similar simplifications in rmnet_map_ipv6_ul_csum_header().

Finally, in rmnet_map_checksum_uplink_packet() a set of assignments
zeroes every field in the upload checksum header.  Replace that with
a single memset() operation.

Signed-off-by: Alex Elder <elder@linaro.org>
Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/qualcomm/rmnet/rmnet_map_data.c   | 38 ++++++++--------------
 include/linux/if_rmnet.h                           | 21 ++++++------
 2 files changed, 23 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
index c336c17e01fe..0ac2ff828320 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
@@ -197,20 +197,16 @@ rmnet_map_ipv4_ul_csum_header(void *iphdr,
 			      struct rmnet_map_ul_csum_header *ul_header,
 			      struct sk_buff *skb)
 {
-	__be16 *hdr = (__be16 *)ul_header;
 	struct iphdr *ip4h = iphdr;
+	u16 val;
 
-	ul_header->csum_start_offset = htons(skb_network_header_len(skb));
-	ul_header->csum_insert_offset = skb->csum_offset;
-	ul_header->csum_enabled = 1;
+	val = MAP_CSUM_UL_ENABLED_FLAG;
 	if (ip4h->protocol == IPPROTO_UDP)
-		ul_header->udp_ind = 1;
-	else
-		ul_header->udp_ind = 0;
+		val |= MAP_CSUM_UL_UDP_FLAG;
+	val |= skb->csum_offset & MAP_CSUM_UL_OFFSET_MASK;
 
-	/* Changing remaining fields to network order */
-	hdr++;
-	*hdr = htons((__force u16)*hdr);
+	ul_header->csum_start_offset = htons(skb_network_header_len(skb));
+	ul_header->csum_info = htons(val);
 
 	skb->ip_summed = CHECKSUM_NONE;
 
@@ -237,21 +233,16 @@ rmnet_map_ipv6_ul_csum_header(void *ip6hdr,
 			      struct rmnet_map_ul_csum_header *ul_header,
 			      struct sk_buff *skb)
 {
-	__be16 *hdr = (__be16 *)ul_header;
 	struct ipv6hdr *ip6h = ip6hdr;
+	u16 val;
 
-	ul_header->csum_start_offset = htons(skb_network_header_len(skb));
-	ul_header->csum_insert_offset = skb->csum_offset;
-	ul_header->csum_enabled = 1;
-
+	val = MAP_CSUM_UL_ENABLED_FLAG;
 	if (ip6h->nexthdr == IPPROTO_UDP)
-		ul_header->udp_ind = 1;
-	else
-		ul_header->udp_ind = 0;
+		val |= MAP_CSUM_UL_UDP_FLAG;
+	val |= skb->csum_offset & MAP_CSUM_UL_OFFSET_MASK;
 
-	/* Changing remaining fields to network order */
-	hdr++;
-	*hdr = htons((__force u16)*hdr);
+	ul_header->csum_start_offset = htons(skb_network_header_len(skb));
+	ul_header->csum_info = htons(val);
 
 	skb->ip_summed = CHECKSUM_NONE;
 
@@ -419,10 +410,7 @@ void rmnet_map_checksum_uplink_packet(struct sk_buff *skb,
 	}
 
 sw_csum:
-	ul_header->csum_start_offset = 0;
-	ul_header->csum_insert_offset = 0;
-	ul_header->csum_enabled = 0;
-	ul_header->udp_ind = 0;
+	memset(ul_header, 0, sizeof(*ul_header));
 
 	priv->stats.csum_sw++;
 }
diff --git a/include/linux/if_rmnet.h b/include/linux/if_rmnet.h
index 941997df9e08..4efb537f57f3 100644
--- a/include/linux/if_rmnet.h
+++ b/include/linux/if_rmnet.h
@@ -33,17 +33,16 @@ struct rmnet_map_dl_csum_trailer {
 
 struct rmnet_map_ul_csum_header {
 	__be16 csum_start_offset;
-#if defined(__LITTLE_ENDIAN_BITFIELD)
-	u16 csum_insert_offset:14;
-	u16 udp_ind:1;
-	u16 csum_enabled:1;
-#elif defined (__BIG_ENDIAN_BITFIELD)
-	u16 csum_enabled:1;
-	u16 udp_ind:1;
-	u16 csum_insert_offset:14;
-#else
-#error	"Please fix <asm/byteorder.h>"
-#endif
+	__be16 csum_info;		/* MAP_CSUM_UL_* */
 } __aligned(1);
 
+/* csum_info field:
+ *  OFFSET:	where (offset in bytes) to insert computed checksum
+ *  UDP:	1 = UDP checksum (zero checkum means no checksum)
+ *  ENABLED:	1 = checksum computation requested
+ */
+#define MAP_CSUM_UL_OFFSET_MASK		GENMASK(13, 0)
+#define MAP_CSUM_UL_UDP_FLAG		BIT(14)
+#define MAP_CSUM_UL_ENABLED_FLAG	BIT(15)
+
 #endif /* !(_LINUX_IF_RMNET_H_) */
-- 
cgit v1.2.3


From c41c8a3485b0b36b7e308eeff8716eb77093596a Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <digetx@gmail.com>
Date: Sun, 14 Mar 2021 19:33:58 +0300
Subject: opp: Change return type of devm_pm_opp_register_set_opp_helper()

Make devm_pm_opp_register_set_opp_helper() return an error code instead
of an opp_table pointer, so that its return type is consistent with the
other resource-managed OPP helpers.

Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c     | 18 ++++++------------
 include/linux/pm_opp.h |  7 +++----
 2 files changed, 9 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 956ea22a21d6..89c3b0b219ce 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -2295,25 +2295,19 @@ static void devm_pm_opp_unregister_set_opp_helper(void *data)
  *
  * This is a resource-managed version of dev_pm_opp_register_set_opp_helper().
  *
- * Return: pointer to 'struct opp_table' on success and errorno otherwise.
+ * Return: 0 on success and errorno otherwise.
  */
-struct opp_table *
-devm_pm_opp_register_set_opp_helper(struct device *dev,
-				    int (*set_opp)(struct dev_pm_set_opp_data *data))
+int devm_pm_opp_register_set_opp_helper(struct device *dev,
+					int (*set_opp)(struct dev_pm_set_opp_data *data))
 {
 	struct opp_table *opp_table;
-	int err;
 
 	opp_table = dev_pm_opp_register_set_opp_helper(dev, set_opp);
 	if (IS_ERR(opp_table))
-		return opp_table;
-
-	err = devm_add_action_or_reset(dev, devm_pm_opp_unregister_set_opp_helper,
-				       opp_table);
-	if (err)
-		return ERR_PTR(err);
+		return PTR_ERR(opp_table);
 
-	return opp_table;
+	return devm_add_action_or_reset(dev, devm_pm_opp_unregister_set_opp_helper,
+					opp_table);
 }
 EXPORT_SYMBOL_GPL(devm_pm_opp_register_set_opp_helper);
 
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 3f8894012429..2cf9694908a2 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -155,7 +155,7 @@ void dev_pm_opp_put_clkname(struct opp_table *opp_table);
 int devm_pm_opp_set_clkname(struct device *dev, const char *name);
 struct opp_table *dev_pm_opp_register_set_opp_helper(struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data));
 void dev_pm_opp_unregister_set_opp_helper(struct opp_table *opp_table);
-struct opp_table *devm_pm_opp_register_set_opp_helper(struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data));
+int devm_pm_opp_register_set_opp_helper(struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data));
 struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, const char **names, struct device ***virt_devs);
 void dev_pm_opp_detach_genpd(struct opp_table *opp_table);
 struct opp_table *devm_pm_opp_attach_genpd(struct device *dev, const char **names, struct device ***virt_devs);
@@ -337,11 +337,10 @@ static inline struct opp_table *dev_pm_opp_register_set_opp_helper(struct device
 
 static inline void dev_pm_opp_unregister_set_opp_helper(struct opp_table *opp_table) {}
 
-static inline struct opp_table *
-devm_pm_opp_register_set_opp_helper(struct device *dev,
+static inline int devm_pm_opp_register_set_opp_helper(struct device *dev,
 				    int (*set_opp)(struct dev_pm_set_opp_data *data))
 {
-	return ERR_PTR(-EOPNOTSUPP);
+	return -EOPNOTSUPP;
 }
 
 static inline struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, const char *name)
-- 
cgit v1.2.3


From 9edf48a4bfb75456f7612972a4750a12d8a83702 Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <digetx@gmail.com>
Date: Sun, 14 Mar 2021 19:33:59 +0300
Subject: opp: Change return type of devm_pm_opp_attach_genpd()

Make devm_pm_opp_attach_genpd() return an error code instead of an
opp_table pointer, so that its return type is consistent with the
other resource-managed OPP helpers.

Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c     | 18 ++++++------------
 include/linux/pm_opp.h |  9 +++++----
 2 files changed, 11 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 89c3b0b219ce..e366218d6736 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -2460,25 +2460,19 @@ static void devm_pm_opp_detach_genpd(void *data)
  *
  * This is a resource-managed version of dev_pm_opp_attach_genpd().
  *
- * Return: pointer to 'struct opp_table' on success and errorno otherwise.
+ * Return: 0 on success and errorno otherwise.
  */
-struct opp_table *
-devm_pm_opp_attach_genpd(struct device *dev, const char **names,
-			 struct device ***virt_devs)
+int devm_pm_opp_attach_genpd(struct device *dev, const char **names,
+			     struct device ***virt_devs)
 {
 	struct opp_table *opp_table;
-	int err;
 
 	opp_table = dev_pm_opp_attach_genpd(dev, names, virt_devs);
 	if (IS_ERR(opp_table))
-		return opp_table;
-
-	err = devm_add_action_or_reset(dev, devm_pm_opp_detach_genpd,
-				       opp_table);
-	if (err)
-		return ERR_PTR(err);
+		return PTR_ERR(opp_table);
 
-	return opp_table;
+	return devm_add_action_or_reset(dev, devm_pm_opp_detach_genpd,
+					opp_table);
 }
 EXPORT_SYMBOL_GPL(devm_pm_opp_attach_genpd);
 
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 2cf9694908a2..84150a22fd7c 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -158,7 +158,7 @@ void dev_pm_opp_unregister_set_opp_helper(struct opp_table *opp_table);
 int devm_pm_opp_register_set_opp_helper(struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data));
 struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, const char **names, struct device ***virt_devs);
 void dev_pm_opp_detach_genpd(struct opp_table *opp_table);
-struct opp_table *devm_pm_opp_attach_genpd(struct device *dev, const char **names, struct device ***virt_devs);
+int devm_pm_opp_attach_genpd(struct device *dev, const char **names, struct device ***virt_devs);
 struct dev_pm_opp *dev_pm_opp_xlate_required_opp(struct opp_table *src_table, struct opp_table *dst_table, struct dev_pm_opp *src_opp);
 int dev_pm_opp_xlate_performance_state(struct opp_table *src_table, struct opp_table *dst_table, unsigned int pstate);
 int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq);
@@ -383,10 +383,11 @@ static inline struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, cons
 
 static inline void dev_pm_opp_detach_genpd(struct opp_table *opp_table) {}
 
-static inline struct opp_table *devm_pm_opp_attach_genpd(struct device *dev,
-				const char **names, struct device ***virt_devs)
+static inline int devm_pm_opp_attach_genpd(struct device *dev,
+					   const char **names,
+					   struct device ***virt_devs)
 {
-	return ERR_PTR(-EOPNOTSUPP);
+	return -EOPNOTSUPP;
 }
 
 static inline struct dev_pm_opp *dev_pm_opp_xlate_required_opp(struct opp_table *src_table,
-- 
cgit v1.2.3


From cfb12911ffb0ed38c7c420efb9d5dc8494e28109 Mon Sep 17 00:00:00 2001
From: Yangtao Li <tiny.windzz@gmail.com>
Date: Sun, 14 Mar 2021 19:34:01 +0300
Subject: spi: spi-geni-qcom: Convert to use resource-managed OPP API

Use resource-managed OPP API to simplify code.

Signed-off-by: Yangtao Li <tiny.windzz@gmail.com>
Acked-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/spi/spi-geni-qcom.c  | 16 ++++++----------
 include/linux/qcom-geni-se.h |  2 --
 2 files changed, 6 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/spi/spi-geni-qcom.c b/drivers/spi/spi-geni-qcom.c
index 881f645661cc..3d0d8ddd5772 100644
--- a/drivers/spi/spi-geni-qcom.c
+++ b/drivers/spi/spi-geni-qcom.c
@@ -691,14 +691,15 @@ static int spi_geni_probe(struct platform_device *pdev)
 	mas->se.wrapper = dev_get_drvdata(dev->parent);
 	mas->se.base = base;
 	mas->se.clk = clk;
-	mas->se.opp_table = dev_pm_opp_set_clkname(&pdev->dev, "se");
-	if (IS_ERR(mas->se.opp_table))
-		return PTR_ERR(mas->se.opp_table);
+
+	ret = devm_pm_opp_set_clkname(&pdev->dev, "se");
+	if (ret)
+		return ret;
 	/* OPP table is optional */
-	ret = dev_pm_opp_of_add_table(&pdev->dev);
+	ret = devm_pm_opp_of_add_table(&pdev->dev);
 	if (ret && ret != -ENODEV) {
 		dev_err(&pdev->dev, "invalid OPP table in device tree\n");
-		goto put_clkname;
+		return ret;
 	}
 
 	spi->bus_num = -1;
@@ -750,9 +751,6 @@ spi_geni_probe_free_irq:
 	free_irq(mas->irq, spi);
 spi_geni_probe_runtime_disable:
 	pm_runtime_disable(dev);
-	dev_pm_opp_of_remove_table(&pdev->dev);
-put_clkname:
-	dev_pm_opp_put_clkname(mas->se.opp_table);
 	return ret;
 }
 
@@ -766,8 +764,6 @@ static int spi_geni_remove(struct platform_device *pdev)
 
 	free_irq(mas->irq, spi);
 	pm_runtime_disable(&pdev->dev);
-	dev_pm_opp_of_remove_table(&pdev->dev);
-	dev_pm_opp_put_clkname(mas->se.opp_table);
 	return 0;
 }
 
diff --git a/include/linux/qcom-geni-se.h b/include/linux/qcom-geni-se.h
index ec2ad4b0fe14..cddef864a760 100644
--- a/include/linux/qcom-geni-se.h
+++ b/include/linux/qcom-geni-se.h
@@ -47,7 +47,6 @@ struct geni_icc_path {
  * @num_clk_levels:	Number of valid clock levels in clk_perf_tbl
  * @clk_perf_tbl:	Table of clock frequency input to serial engine clock
  * @icc_paths:		Array of ICC paths for SE
- * @opp_table:		Pointer to the OPP table
  */
 struct geni_se {
 	void __iomem *base;
@@ -57,7 +56,6 @@ struct geni_se {
 	unsigned int num_clk_levels;
 	unsigned long *clk_perf_tbl;
 	struct geni_icc_path icc_paths[3];
-	struct opp_table *opp_table;
 };
 
 /* Common SE registers */
-- 
cgit v1.2.3


From 1d49439c04792a4a3d8299a32b7673ab7ba13b77 Mon Sep 17 00:00:00 2001
From: Fabio Estevam <festevam@gmail.com>
Date: Mon, 15 Mar 2021 16:38:42 -0300
Subject: ASoC: mx27vis: Remove unused file

i.MX has been converted to a devicetree-only platform and
asoc-mx27vis.h is no longer used.

Get rid of this unused file.

Signed-off-by: Fabio Estevam <festevam@gmail.com>
Link: https://lore.kernel.org/r/20210315193842.183042-1-festevam@gmail.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/platform_data/asoc-mx27vis.h | 12 ------------
 1 file changed, 12 deletions(-)
 delete mode 100644 include/linux/platform_data/asoc-mx27vis.h

(limited to 'include/linux')

diff --git a/include/linux/platform_data/asoc-mx27vis.h b/include/linux/platform_data/asoc-mx27vis.h
deleted file mode 100644
index 2107d0d992dd..000000000000
--- a/include/linux/platform_data/asoc-mx27vis.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __PLATFORM_DATA_ASOC_MX27VIS_H
-#define __PLATFORM_DATA_ASOC_MX27VIS_H
-
-struct snd_mx27vis_platform_data {
-	int amp_gain0_gpio;
-	int amp_gain1_gpio;
-	int amp_mutel_gpio;
-	int amp_muter_gpio;
-};
-
-#endif /* __PLATFORM_DATA_ASOC_MX27VIS_H */
-- 
cgit v1.2.3


From 3a0ade0c521a542f8a25e96ce8ea0dfaa532ac75 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Sat, 6 Mar 2021 13:36:58 -0800
Subject: tasklet: Remove tasklet_kill_immediate

Ever since RCU was converted to softirq, it has no users.

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Paul E. McKenney <paulmck@kernel.org>
Link: https://lore.kernel.org/r/20210306213658.12862-1-dave@stgolabs.net
---
 include/linux/interrupt.h |  1 -
 kernel/softirq.c          | 32 --------------------------------
 2 files changed, 33 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 76f1161a441a..2b98156ec707 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -716,7 +716,6 @@ static inline void tasklet_enable(struct tasklet_struct *t)
 }
 
 extern void tasklet_kill(struct tasklet_struct *t);
-extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu);
 extern void tasklet_init(struct tasklet_struct *t,
 			 void (*func)(unsigned long), unsigned long data);
 extern void tasklet_setup(struct tasklet_struct *t,
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 9908ec4a9bfe..8b44ab9a2f69 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -658,38 +658,6 @@ static void run_ksoftirqd(unsigned int cpu)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-/*
- * tasklet_kill_immediate is called to remove a tasklet which can already be
- * scheduled for execution on @cpu.
- *
- * Unlike tasklet_kill, this function removes the tasklet
- * _immediately_, even if the tasklet is in TASKLET_STATE_SCHED state.
- *
- * When this function is called, @cpu must be in the CPU_DEAD state.
- */
-void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu)
-{
-	struct tasklet_struct **i;
-
-	BUG_ON(cpu_online(cpu));
-	BUG_ON(test_bit(TASKLET_STATE_RUN, &t->state));
-
-	if (!test_bit(TASKLET_STATE_SCHED, &t->state))
-		return;
-
-	/* CPU is dead, so no lock needed. */
-	for (i = &per_cpu(tasklet_vec, cpu).head; *i; i = &(*i)->next) {
-		if (*i == t) {
-			*i = t->next;
-			/* If this was the tail element, move the tail ptr */
-			if (*i == NULL)
-				per_cpu(tasklet_vec, cpu).tail = i;
-			return;
-		}
-	}
-	BUG();
-}
-
 static int takeover_tasklets(unsigned int cpu)
 {
 	/* CPU is dead, so no lock needed. */
-- 
cgit v1.2.3


From 6f73171e192366ff7c98af9fb50615ef9615f8a7 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Thu, 4 Mar 2021 12:48:22 +0200
Subject: fsnotify: allow fsnotify_{peek,remove}_first_event with empty queue

Current code has an assumption that fsnotify_notify_queue_is_empty() is
called to verify that the queue is not empty before trying to peek or
remove an event from the queue.

Remove this assumption by moving the fsnotify_notify_queue_is_empty()
check into the functions, allowing them to return NULL, and checking
the return value in all callers.

This is a prep patch for multi event queues.

Link: https://lore.kernel.org/r/20210304104826.3993892-2-amir73il@gmail.com
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/notify/fanotify/fanotify_user.c | 26 ++++++++++++++---------
 fs/notify/inotify/inotify_user.c   |  5 ++---
 fs/notify/notification.c           | 42 +++++++++++++++++---------------------
 include/linux/fsnotify_backend.h   |  8 +++++++-
 4 files changed, 44 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 9e0c1afac8bd..16162207e886 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -100,24 +100,30 @@ static struct fanotify_event *get_one_event(struct fsnotify_group *group,
 {
 	size_t event_size = FAN_EVENT_METADATA_LEN;
 	struct fanotify_event *event = NULL;
+	struct fsnotify_event *fsn_event;
 	unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
 
 	pr_debug("%s: group=%p count=%zd\n", __func__, group, count);
 
 	spin_lock(&group->notification_lock);
-	if (fsnotify_notify_queue_is_empty(group))
+	fsn_event = fsnotify_peek_first_event(group);
+	if (!fsn_event)
 		goto out;
 
-	if (fid_mode) {
-		event_size += fanotify_event_info_len(fid_mode,
-			FANOTIFY_E(fsnotify_peek_first_event(group)));
-	}
+	event = FANOTIFY_E(fsn_event);
+	if (fid_mode)
+		event_size += fanotify_event_info_len(fid_mode, event);
 
 	if (event_size > count) {
 		event = ERR_PTR(-EINVAL);
 		goto out;
 	}
-	event = FANOTIFY_E(fsnotify_remove_first_event(group));
+
+	/*
+	 * Held the notification_lock the whole time, so this is the
+	 * same event we peeked above.
+	 */
+	fsnotify_remove_first_event(group);
 	if (fanotify_is_perm_event(event->mask))
 		FANOTIFY_PERM(event)->state = FAN_EVENT_REPORTED;
 out:
@@ -573,6 +579,7 @@ static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t
 static int fanotify_release(struct inode *ignored, struct file *file)
 {
 	struct fsnotify_group *group = file->private_data;
+	struct fsnotify_event *fsn_event;
 
 	/*
 	 * Stop new events from arriving in the notification queue. since
@@ -601,13 +608,12 @@ static int fanotify_release(struct inode *ignored, struct file *file)
 	 * dequeue them and set the response. They will be freed once the
 	 * response is consumed and fanotify_get_response() returns.
 	 */
-	while (!fsnotify_notify_queue_is_empty(group)) {
-		struct fanotify_event *event;
+	while ((fsn_event = fsnotify_remove_first_event(group))) {
+		struct fanotify_event *event = FANOTIFY_E(fsn_event);
 
-		event = FANOTIFY_E(fsnotify_remove_first_event(group));
 		if (!(event->mask & FANOTIFY_PERM_EVENTS)) {
 			spin_unlock(&group->notification_lock);
-			fsnotify_destroy_event(group, &event->fse);
+			fsnotify_destroy_event(group, fsn_event);
 		} else {
 			finish_permission_event(group, FANOTIFY_PERM(event),
 						FAN_ALLOW);
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index c71be4fb7dc5..a6c95bd64618 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -146,10 +146,9 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
 	size_t event_size = sizeof(struct inotify_event);
 	struct fsnotify_event *event;
 
-	if (fsnotify_notify_queue_is_empty(group))
-		return NULL;
-
 	event = fsnotify_peek_first_event(group);
+	if (!event)
+		return NULL;
 
 	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
 
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 75d79d6d3ef0..001cfe7d2e4e 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -47,13 +47,6 @@ u32 fsnotify_get_cookie(void)
 }
 EXPORT_SYMBOL_GPL(fsnotify_get_cookie);
 
-/* return true if the notify queue is empty, false otherwise */
-bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group)
-{
-	assert_spin_locked(&group->notification_lock);
-	return list_empty(&group->notification_list) ? true : false;
-}
-
 void fsnotify_destroy_event(struct fsnotify_group *group,
 			    struct fsnotify_event *event)
 {
@@ -141,33 +134,36 @@ void fsnotify_remove_queued_event(struct fsnotify_group *group,
 }
 
 /*
- * Remove and return the first event from the notification list.  It is the
- * responsibility of the caller to destroy the obtained event
+ * Return the first event on the notification list without removing it.
+ * Returns NULL if the list is empty.
  */
-struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group)
+struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group)
 {
-	struct fsnotify_event *event;
-
 	assert_spin_locked(&group->notification_lock);
 
-	pr_debug("%s: group=%p\n", __func__, group);
+	if (fsnotify_notify_queue_is_empty(group))
+		return NULL;
 
-	event = list_first_entry(&group->notification_list,
-				 struct fsnotify_event, list);
-	fsnotify_remove_queued_event(group, event);
-	return event;
+	return list_first_entry(&group->notification_list,
+				struct fsnotify_event, list);
 }
 
 /*
- * This will not remove the event, that must be done with
- * fsnotify_remove_first_event()
+ * Remove and return the first event from the notification list.  It is the
+ * responsibility of the caller to destroy the obtained event
  */
-struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group)
+struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group)
 {
-	assert_spin_locked(&group->notification_lock);
+	struct fsnotify_event *event = fsnotify_peek_first_event(group);
 
-	return list_first_entry(&group->notification_list,
-				struct fsnotify_event, list);
+	if (!event)
+		return NULL;
+
+	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
+
+	fsnotify_remove_queued_event(group, event);
+
+	return event;
 }
 
 /*
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index e5409b83e731..7eb979bfc141 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -495,7 +495,13 @@ static inline void fsnotify_queue_overflow(struct fsnotify_group *group)
 	fsnotify_add_event(group, group->overflow_event, NULL);
 }
 
-/* true if the group notification queue is empty */
+static inline bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group)
+{
+	assert_spin_locked(&group->notification_lock);
+
+	return list_empty(&group->notification_list);
+}
+
 extern bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group);
 /* return, but do not dequeue the first event on the notification queue */
 extern struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group);
-- 
cgit v1.2.3


From 8988f11abb820bacfcc53d498370bfb30f792ec4 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Thu, 4 Mar 2021 12:48:23 +0200
Subject: fanotify: reduce event objectid to 29-bit hash

objectid is only used by fanotify backend and it is just an optimization
for event merge before comparing all fields in event.

Move the objectid member from common struct fsnotify_event into struct
fanotify_event and reduce it to 29-bit hash to cram it together with the
3-bit event type.

Events of different types are never merged, so the combination of event
type and hash forms a 32-bit key for fast comparison of events.

This reduces the size of events by one pointer and paves the way for
adding hashed queue support for fanotify.

Link: https://lore.kernel.org/r/20210304104826.3993892-3-amir73il@gmail.com
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/notify/fanotify/fanotify.c        | 25 ++++++++++++-------------
 fs/notify/fanotify/fanotify.h        | 16 +++++++++++++---
 fs/notify/inotify/inotify_fsnotify.c |  2 +-
 fs/notify/inotify/inotify_user.c     |  2 +-
 include/linux/fsnotify_backend.h     |  5 +----
 5 files changed, 28 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 1192c9953620..8a2bb6954e02 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -88,16 +88,12 @@ static bool fanotify_name_event_equal(struct fanotify_name_event *fne1,
 	return fanotify_info_equal(info1, info2);
 }
 
-static bool fanotify_should_merge(struct fsnotify_event *old_fsn,
-				  struct fsnotify_event *new_fsn)
+static bool fanotify_should_merge(struct fanotify_event *old,
+				  struct fanotify_event *new)
 {
-	struct fanotify_event *old, *new;
+	pr_debug("%s: old=%p new=%p\n", __func__, old, new);
 
-	pr_debug("%s: old=%p new=%p\n", __func__, old_fsn, new_fsn);
-	old = FANOTIFY_E(old_fsn);
-	new = FANOTIFY_E(new_fsn);
-
-	if (old_fsn->objectid != new_fsn->objectid ||
+	if (old->hash != new->hash ||
 	    old->type != new->type || old->pid != new->pid)
 		return false;
 
@@ -133,10 +129,9 @@ static bool fanotify_should_merge(struct fsnotify_event *old_fsn,
 static int fanotify_merge(struct list_head *list, struct fsnotify_event *event)
 {
 	struct fsnotify_event *test_event;
-	struct fanotify_event *new;
+	struct fanotify_event *old, *new = FANOTIFY_E(event);
 
 	pr_debug("%s: list=%p event=%p\n", __func__, list, event);
-	new = FANOTIFY_E(event);
 
 	/*
 	 * Don't merge a permission event with any other event so that we know
@@ -147,8 +142,9 @@ static int fanotify_merge(struct list_head *list, struct fsnotify_event *event)
 		return 0;
 
 	list_for_each_entry_reverse(test_event, list, list) {
-		if (fanotify_should_merge(test_event, event)) {
-			FANOTIFY_E(test_event)->mask |= new->mask;
+		old = FANOTIFY_E(test_event);
+		if (fanotify_should_merge(old, new)) {
+			old->mask |= new->mask;
 			return 1;
 		}
 	}
@@ -533,6 +529,7 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
 	struct mem_cgroup *old_memcg;
 	struct inode *child = NULL;
 	bool name_event = false;
+	unsigned int hash = 0;
 
 	if ((fid_mode & FAN_REPORT_DIR_FID) && dirid) {
 		/*
@@ -600,8 +597,10 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
 	 * Use the victim inode instead of the watching inode as the id for
 	 * event queue, so event reported on parent is merged with event
 	 * reported on child when both directory and child watches exist.
+	 * Hash object id for queue merge.
 	 */
-	fanotify_init_event(event, (unsigned long)id, mask);
+	hash = hash_ptr(id, FANOTIFY_EVENT_HASH_BITS);
+	fanotify_init_event(event, hash, mask);
 	if (FAN_GROUP_FLAG(group, FAN_REPORT_TID))
 		event->pid = get_pid(task_pid(current));
 	else
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index 896c819a1786..d531f0cfa46f 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -135,19 +135,29 @@ enum fanotify_event_type {
 	FANOTIFY_EVENT_TYPE_PATH,
 	FANOTIFY_EVENT_TYPE_PATH_PERM,
 	FANOTIFY_EVENT_TYPE_OVERFLOW, /* struct fanotify_event */
+	__FANOTIFY_EVENT_TYPE_NUM
 };
 
+#define FANOTIFY_EVENT_TYPE_BITS \
+	(ilog2(__FANOTIFY_EVENT_TYPE_NUM - 1) + 1)
+#define FANOTIFY_EVENT_HASH_BITS \
+	(32 - FANOTIFY_EVENT_TYPE_BITS)
+
 struct fanotify_event {
 	struct fsnotify_event fse;
 	u32 mask;
-	enum fanotify_event_type type;
+	struct {
+		unsigned int type : FANOTIFY_EVENT_TYPE_BITS;
+		unsigned int hash : FANOTIFY_EVENT_HASH_BITS;
+	};
 	struct pid *pid;
 };
 
 static inline void fanotify_init_event(struct fanotify_event *event,
-				       unsigned long id, u32 mask)
+				       unsigned int hash, u32 mask)
 {
-	fsnotify_init_event(&event->fse, id);
+	fsnotify_init_event(&event->fse);
+	event->hash = hash;
 	event->mask = mask;
 	event->pid = NULL;
 }
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 1901d799909b..0533bacbd584 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -107,7 +107,7 @@ int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask,
 		mask &= ~IN_ISDIR;
 
 	fsn_event = &event->fse;
-	fsnotify_init_event(fsn_event, 0);
+	fsnotify_init_event(fsn_event);
 	event->mask = mask;
 	event->wd = i_mark->wd;
 	event->sync_cookie = cookie;
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index a6c95bd64618..98f61b31745a 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -641,7 +641,7 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events)
 		return ERR_PTR(-ENOMEM);
 	}
 	group->overflow_event = &oevent->fse;
-	fsnotify_init_event(group->overflow_event, 0);
+	fsnotify_init_event(group->overflow_event);
 	oevent->mask = FS_Q_OVERFLOW;
 	oevent->wd = -1;
 	oevent->sync_cookie = 0;
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 7eb979bfc141..fc98f9f88d12 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -167,7 +167,6 @@ struct fsnotify_ops {
  */
 struct fsnotify_event {
 	struct list_head list;
-	unsigned long objectid;	/* identifier for queue merges */
 };
 
 /*
@@ -582,11 +581,9 @@ extern void fsnotify_put_mark(struct fsnotify_mark *mark);
 extern void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info);
 extern bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info);
 
-static inline void fsnotify_init_event(struct fsnotify_event *event,
-				       unsigned long objectid)
+static inline void fsnotify_init_event(struct fsnotify_event *event)
 {
 	INIT_LIST_HEAD(&event->list);
-	event->objectid = objectid;
 }
 
 #else
-- 
cgit v1.2.3


From 94e00d28a680dff18805ca472b191364347d2234 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Thu, 4 Mar 2021 12:48:25 +0200
Subject: fsnotify: use hash table for faster events merge

In order to improve event merge performance, hash events in a 128-bucket
hash table by the event merge key.

The fanotify_event size grows by two pointers, but we just reduced its
size by removing the objectid member, so overall its size is increased
by one pointer.

Permission events and the overflow event are not merged, so they are also
not hashed.

Link: https://lore.kernel.org/r/20210304104826.3993892-5-amir73il@gmail.com
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/notify/fanotify/fanotify.c        | 40 +++++++++++++++++++++++++++++-------
 fs/notify/fanotify/fanotify.h        | 25 ++++++++++++++++++++++
 fs/notify/fanotify/fanotify_user.c   | 39 +++++++++++++++++++++++++++++++++++
 fs/notify/inotify/inotify_fsnotify.c |  7 ++++---
 fs/notify/notification.c             | 22 +++++++++++++-------
 include/linux/fsnotify_backend.h     | 10 ++++++---
 6 files changed, 123 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 43a606f15370..50b3abc06215 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -149,12 +149,15 @@ static bool fanotify_should_merge(struct fanotify_event *old,
 }
 
 /* and the list better be locked by something too! */
-static int fanotify_merge(struct list_head *list, struct fsnotify_event *event)
+static int fanotify_merge(struct fsnotify_group *group,
+			  struct fsnotify_event *event)
 {
-	struct fsnotify_event *test_event;
 	struct fanotify_event *old, *new = FANOTIFY_E(event);
+	unsigned int bucket = fanotify_event_hash_bucket(group, new);
+	struct hlist_head *hlist = &group->fanotify_data.merge_hash[bucket];
 
-	pr_debug("%s: list=%p event=%p\n", __func__, list, event);
+	pr_debug("%s: group=%p event=%p bucket=%u\n", __func__,
+		 group, event, bucket);
 
 	/*
 	 * Don't merge a permission event with any other event so that we know
@@ -164,8 +167,7 @@ static int fanotify_merge(struct list_head *list, struct fsnotify_event *event)
 	if (fanotify_is_perm_event(new->mask))
 		return 0;
 
-	list_for_each_entry_reverse(test_event, list, list) {
-		old = FANOTIFY_E(test_event);
+	hlist_for_each_entry(old, hlist, merge_list) {
 		if (fanotify_should_merge(old, new)) {
 			old->mask |= new->mask;
 			return 1;
@@ -203,8 +205,11 @@ static int fanotify_get_response(struct fsnotify_group *group,
 			return ret;
 		}
 		/* Event not yet reported? Just remove it. */
-		if (event->state == FAN_EVENT_INIT)
+		if (event->state == FAN_EVENT_INIT) {
 			fsnotify_remove_queued_event(group, &event->fae.fse);
+			/* Permission events are not supposed to be hashed */
+			WARN_ON_ONCE(!hlist_unhashed(&event->fae.merge_list));
+		}
 		/*
 		 * Event may be also answered in case signal delivery raced
 		 * with wakeup. In that case we have nothing to do besides
@@ -679,6 +684,24 @@ static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info)
 	return fsid;
 }
 
+/*
+ * Add an event to hash table for faster merge.
+ */
+static void fanotify_insert_event(struct fsnotify_group *group,
+				  struct fsnotify_event *fsn_event)
+{
+	struct fanotify_event *event = FANOTIFY_E(fsn_event);
+	unsigned int bucket = fanotify_event_hash_bucket(group, event);
+	struct hlist_head *hlist = &group->fanotify_data.merge_hash[bucket];
+
+	assert_spin_locked(&group->notification_lock);
+
+	pr_debug("%s: group=%p event=%p bucket=%u\n", __func__,
+		 group, event, bucket);
+
+	hlist_add_head(&event->merge_list, hlist);
+}
+
 static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
 				 const void *data, int data_type,
 				 struct inode *dir,
@@ -749,7 +772,9 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
 	}
 
 	fsn_event = &event->fse;
-	ret = fsnotify_add_event(group, fsn_event, fanotify_merge);
+	ret = fsnotify_add_event(group, fsn_event, fanotify_merge,
+				 fanotify_is_hashed_event(mask) ?
+				 fanotify_insert_event : NULL);
 	if (ret) {
 		/* Permission events shouldn't be merged */
 		BUG_ON(ret == 1 && mask & FANOTIFY_PERM_EVENTS);
@@ -772,6 +797,7 @@ static void fanotify_free_group_priv(struct fsnotify_group *group)
 {
 	struct user_struct *user;
 
+	kfree(group->fanotify_data.merge_hash);
 	user = group->fanotify_data.user;
 	atomic_dec(&user->fanotify_listeners);
 	free_uid(user);
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index 9871f76cd9c2..4a5e555dc3d2 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -3,6 +3,7 @@
 #include <linux/path.h>
 #include <linux/slab.h>
 #include <linux/exportfs.h>
+#include <linux/hashtable.h>
 
 extern struct kmem_cache *fanotify_mark_cache;
 extern struct kmem_cache *fanotify_fid_event_cachep;
@@ -150,6 +151,7 @@ enum fanotify_event_type {
 
 struct fanotify_event {
 	struct fsnotify_event fse;
+	struct hlist_node merge_list;	/* List for hashed merge */
 	u32 mask;
 	struct {
 		unsigned int type : FANOTIFY_EVENT_TYPE_BITS;
@@ -162,6 +164,7 @@ static inline void fanotify_init_event(struct fanotify_event *event,
 				       unsigned int hash, u32 mask)
 {
 	fsnotify_init_event(&event->fse);
+	INIT_HLIST_NODE(&event->merge_list);
 	event->hash = hash;
 	event->mask = mask;
 	event->pid = NULL;
@@ -299,3 +302,25 @@ static inline struct path *fanotify_event_path(struct fanotify_event *event)
 	else
 		return NULL;
 }
+
+/*
+ * Use 128 size hash table to speed up events merge.
+ */
+#define FANOTIFY_HTABLE_BITS	(7)
+#define FANOTIFY_HTABLE_SIZE	(1 << FANOTIFY_HTABLE_BITS)
+#define FANOTIFY_HTABLE_MASK	(FANOTIFY_HTABLE_SIZE - 1)
+
+/*
+ * Permission events and overflow event do not get merged - don't hash them.
+ */
+static inline bool fanotify_is_hashed_event(u32 mask)
+{
+	return !fanotify_is_perm_event(mask) && !(mask & FS_Q_OVERFLOW);
+}
+
+static inline unsigned int fanotify_event_hash_bucket(
+						struct fsnotify_group *group,
+						struct fanotify_event *event)
+{
+	return event->hash & FANOTIFY_HTABLE_MASK;
+}
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 16162207e886..b89f332248bd 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -89,6 +89,23 @@ static int fanotify_event_info_len(unsigned int fid_mode,
 	return info_len;
 }
 
+/*
+ * Remove a hashed event from the merge hash table.
+ */
+static void fanotify_unhash_event(struct fsnotify_group *group,
+				  struct fanotify_event *event)
+{
+	assert_spin_locked(&group->notification_lock);
+
+	pr_debug("%s: group=%p event=%p bucket=%u\n", __func__,
+		 group, event, fanotify_event_hash_bucket(group, event));
+
+	if (WARN_ON_ONCE(hlist_unhashed(&event->merge_list)))
+		return;
+
+	hlist_del_init(&event->merge_list);
+}
+
 /*
  * Get an fanotify notification event if one exists and is small
  * enough to fit in "count". Return an error pointer if the count
@@ -126,6 +143,8 @@ static struct fanotify_event *get_one_event(struct fsnotify_group *group,
 	fsnotify_remove_first_event(group);
 	if (fanotify_is_perm_event(event->mask))
 		FANOTIFY_PERM(event)->state = FAN_EVENT_REPORTED;
+	if (fanotify_is_hashed_event(event->mask))
+		fanotify_unhash_event(group, event);
 out:
 	spin_unlock(&group->notification_lock);
 	return event;
@@ -925,6 +944,20 @@ static struct fsnotify_event *fanotify_alloc_overflow_event(void)
 	return &oevent->fse;
 }
 
+static struct hlist_head *fanotify_alloc_merge_hash(void)
+{
+	struct hlist_head *hash;
+
+	hash = kmalloc(sizeof(struct hlist_head) << FANOTIFY_HTABLE_BITS,
+		       GFP_KERNEL_ACCOUNT);
+	if (!hash)
+		return NULL;
+
+	__hash_init(hash, FANOTIFY_HTABLE_SIZE);
+
+	return hash;
+}
+
 /* fanotify syscalls */
 SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 {
@@ -993,6 +1026,12 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 	atomic_inc(&user->fanotify_listeners);
 	group->memcg = get_mem_cgroup_from_mm(current->mm);
 
+	group->fanotify_data.merge_hash = fanotify_alloc_merge_hash();
+	if (!group->fanotify_data.merge_hash) {
+		fd = -ENOMEM;
+		goto out_destroy_group;
+	}
+
 	group->overflow_event = fanotify_alloc_overflow_event();
 	if (unlikely(!group->overflow_event)) {
 		fd = -ENOMEM;
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 0533bacbd584..d1a64daa0171 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -46,9 +46,10 @@ static bool event_compare(struct fsnotify_event *old_fsn,
 	return false;
 }
 
-static int inotify_merge(struct list_head *list,
-			  struct fsnotify_event *event)
+static int inotify_merge(struct fsnotify_group *group,
+			 struct fsnotify_event *event)
 {
+	struct list_head *list = &group->notification_list;
 	struct fsnotify_event *last_event;
 
 	last_event = list_entry(list->prev, struct fsnotify_event, list);
@@ -115,7 +116,7 @@ int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask,
 	if (len)
 		strcpy(event->name, name->name);
 
-	ret = fsnotify_add_event(group, fsn_event, inotify_merge);
+	ret = fsnotify_add_event(group, fsn_event, inotify_merge, NULL);
 	if (ret) {
 		/* Our event wasn't used in the end. Free it. */
 		fsnotify_destroy_event(group, fsn_event);
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 001cfe7d2e4e..32f45543b9c6 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -68,16 +68,22 @@ void fsnotify_destroy_event(struct fsnotify_group *group,
 }
 
 /*
- * Add an event to the group notification queue.  The group can later pull this
- * event off the queue to deal with.  The function returns 0 if the event was
- * added to the queue, 1 if the event was merged with some other queued event,
+ * Try to add an event to the notification queue.
+ * The group can later pull this event off the queue to deal with.
+ * The group can use the @merge hook to merge the event with a queued event.
+ * The group can use the @insert hook to insert the event into hash table.
+ * The function returns:
+ * 0 if the event was added to a queue
+ * 1 if the event was merged with some other queued event
  * 2 if the event was not queued - either the queue of events has overflown
- * or the group is shutting down.
+ *   or the group is shutting down.
  */
 int fsnotify_add_event(struct fsnotify_group *group,
 		       struct fsnotify_event *event,
-		       int (*merge)(struct list_head *,
-				    struct fsnotify_event *))
+		       int (*merge)(struct fsnotify_group *,
+				    struct fsnotify_event *),
+		       void (*insert)(struct fsnotify_group *,
+				      struct fsnotify_event *))
 {
 	int ret = 0;
 	struct list_head *list = &group->notification_list;
@@ -104,7 +110,7 @@ int fsnotify_add_event(struct fsnotify_group *group,
 	}
 
 	if (!list_empty(list) && merge) {
-		ret = merge(list, event);
+		ret = merge(group, event);
 		if (ret) {
 			spin_unlock(&group->notification_lock);
 			return ret;
@@ -114,6 +120,8 @@ int fsnotify_add_event(struct fsnotify_group *group,
 queue:
 	group->q_len++;
 	list_add_tail(&event->list, list);
+	if (insert)
+		insert(group, event);
 	spin_unlock(&group->notification_lock);
 
 	wake_up(&group->notification_waitq);
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index fc98f9f88d12..63fb766f0f3e 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -233,6 +233,8 @@ struct fsnotify_group {
 #endif
 #ifdef CONFIG_FANOTIFY
 		struct fanotify_group_private_data {
+			/* Hash table of events for merge */
+			struct hlist_head *merge_hash;
 			/* allows a group to block waiting for a userspace response */
 			struct list_head access_list;
 			wait_queue_head_t access_waitq;
@@ -486,12 +488,14 @@ extern void fsnotify_destroy_event(struct fsnotify_group *group,
 /* attach the event to the group notification queue */
 extern int fsnotify_add_event(struct fsnotify_group *group,
 			      struct fsnotify_event *event,
-			      int (*merge)(struct list_head *,
-					   struct fsnotify_event *));
+			      int (*merge)(struct fsnotify_group *,
+					   struct fsnotify_event *),
+			      void (*insert)(struct fsnotify_group *,
+					     struct fsnotify_event *));
 /* Queue overflow event to a notification group */
 static inline void fsnotify_queue_overflow(struct fsnotify_group *group)
 {
-	fsnotify_add_event(group, group->overflow_event, NULL);
+	fsnotify_add_event(group, group->overflow_event, NULL, NULL);
 }
 
 static inline bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group)
-- 
cgit v1.2.3


From 5b8fea65d197f408bb00b251c70d842826d6b70b Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Thu, 4 Mar 2021 13:29:20 +0200
Subject: fanotify: configurable limits via sysfs

fanotify has some hardcoded limits. The only APIs to escape those limits
are FAN_UNLIMITED_QUEUE and FAN_UNLIMITED_MARKS.

Allow finer grained tuning of the system limits via sysctl tunables under
/proc/sys/fs/fanotify, similar to tunables under /proc/sys/fs/inotify,
with some minor differences.

- max_queued_events - global system tunable for group queue size limit.
  Like the inotify tunable with the same name, it defaults to 16384 and
  applies on initialization of a new group.

- max_user_marks - user ns tunable for marks limit per user.
  Like the inotify tunable named max_user_watches, on a machine with
  sufficient RAM it defaults to 1048576 in init userns and can be
  further limited per containing user ns.

- max_user_groups - user ns tunable for number of groups per user.
  Like the inotify tunable named max_user_instances, it defaults to 128
  in init userns and can be further limited per containing user ns.

The slightly different tunable names used for fanotify are derived from
the "group" and "mark" terminology used in the fanotify man pages and
throughout the code.

Considering the fact that the default value for max_user_watches was
increased in kernel v5.10 from 8192 to 1048576, leaving the legacy
fanotify limit of 8192 marks per group in addition to the max_user_marks
limit makes little sense, so the per group marks limit has been removed.

Note that when a group is initialized with FAN_UNLIMITED_MARKS, its own
marks are not accounted in the per user marks account, so in effect the
limit of max_user_marks is only for the collection of groups that are
not initialized with FAN_UNLIMITED_MARKS.

Link: https://lore.kernel.org/r/20210304112921.3996419-2-amir73il@gmail.com
Suggested-by: Jan Kara <jack@suse.cz>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/notify/fanotify/fanotify.c      |  16 +++--
 fs/notify/fanotify/fanotify_user.c | 123 +++++++++++++++++++++++++++++++------
 fs/notify/group.c                  |   1 -
 fs/notify/mark.c                   |   4 --
 include/linux/fanotify.h           |   3 +
 include/linux/fsnotify_backend.h   |   6 +-
 include/linux/sched/user.h         |   3 -
 include/linux/user_namespace.h     |   4 ++
 kernel/sysctl.c                    |  12 +++-
 kernel/ucount.c                    |   4 ++
 10 files changed, 137 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 754e27ead874..057abd2cf887 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -801,12 +801,10 @@ finish:
 
 static void fanotify_free_group_priv(struct fsnotify_group *group)
 {
-	struct user_struct *user;
-
 	kfree(group->fanotify_data.merge_hash);
-	user = group->fanotify_data.user;
-	atomic_dec(&user->fanotify_listeners);
-	free_uid(user);
+	if (group->fanotify_data.ucounts)
+		dec_ucount(group->fanotify_data.ucounts,
+			   UCOUNT_FANOTIFY_GROUPS);
 }
 
 static void fanotify_free_path_event(struct fanotify_event *event)
@@ -862,6 +860,13 @@ static void fanotify_free_event(struct fsnotify_event *fsn_event)
 	}
 }
 
+static void fanotify_freeing_mark(struct fsnotify_mark *mark,
+				  struct fsnotify_group *group)
+{
+	if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS))
+		dec_ucount(group->fanotify_data.ucounts, UCOUNT_FANOTIFY_MARKS);
+}
+
 static void fanotify_free_mark(struct fsnotify_mark *fsn_mark)
 {
 	kmem_cache_free(fanotify_mark_cache, fsn_mark);
@@ -871,5 +876,6 @@ const struct fsnotify_ops fanotify_fsnotify_ops = {
 	.handle_event = fanotify_handle_event,
 	.free_group_priv = fanotify_free_group_priv,
 	.free_event = fanotify_free_event,
+	.freeing_mark = fanotify_freeing_mark,
 	.free_mark = fanotify_free_mark,
 };
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index b89f332248bd..e81848e09646 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -27,8 +27,61 @@
 #include "fanotify.h"
 
 #define FANOTIFY_DEFAULT_MAX_EVENTS	16384
-#define FANOTIFY_DEFAULT_MAX_MARKS	8192
-#define FANOTIFY_DEFAULT_MAX_LISTENERS	128
+#define FANOTIFY_OLD_DEFAULT_MAX_MARKS	8192
+#define FANOTIFY_DEFAULT_MAX_GROUPS	128
+
+/*
+ * Legacy fanotify marks limits (8192) is per group and we introduced a tunable
+ * limit of marks per user, similar to inotify.  Effectively, the legacy limit
+ * of fanotify marks per user is <max marks per group> * <max groups per user>.
+ * This default limit (1M) also happens to match the increased limit of inotify
+ * max_user_watches since v5.10.
+ */
+#define FANOTIFY_DEFAULT_MAX_USER_MARKS	\
+	(FANOTIFY_OLD_DEFAULT_MAX_MARKS * FANOTIFY_DEFAULT_MAX_GROUPS)
+
+/*
+ * Most of the memory cost of adding an inode mark is pinning the marked inode.
+ * The size of the filesystem inode struct is not uniform across filesystems,
+ * so double the size of a VFS inode is used as a conservative approximation.
+ */
+#define INODE_MARK_COST	(2 * sizeof(struct inode))
+
+/* configurable via /proc/sys/fs/fanotify/ */
+static int fanotify_max_queued_events __read_mostly;
+
+#ifdef CONFIG_SYSCTL
+
+#include <linux/sysctl.h>
+
+struct ctl_table fanotify_table[] = {
+	{
+		.procname	= "max_user_groups",
+		.data	= &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS],
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+	},
+	{
+		.procname	= "max_user_marks",
+		.data	= &init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS],
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+	},
+	{
+		.procname	= "max_queued_events",
+		.data		= &fanotify_max_queued_events,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO
+	},
+	{ }
+};
+#endif /* CONFIG_SYSCTL */
 
 /*
  * All flags that may be specified in parameter event_f_flags of fanotify_init.
@@ -847,24 +900,38 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
 						   unsigned int type,
 						   __kernel_fsid_t *fsid)
 {
+	struct ucounts *ucounts = group->fanotify_data.ucounts;
 	struct fsnotify_mark *mark;
 	int ret;
 
-	if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
+	/*
+	 * Enforce the marks limit per user in all containing user ns.
+	 * A group with FAN_UNLIMITED_MARKS does not contribute to mark count
+	 * in the limited groups account.
+	 */
+	if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS) &&
+	    !inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS))
 		return ERR_PTR(-ENOSPC);
 
 	mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
-	if (!mark)
-		return ERR_PTR(-ENOMEM);
+	if (!mark) {
+		ret = -ENOMEM;
+		goto out_dec_ucounts;
+	}
 
 	fsnotify_init_mark(mark, group);
 	ret = fsnotify_add_mark_locked(mark, connp, type, 0, fsid);
 	if (ret) {
 		fsnotify_put_mark(mark);
-		return ERR_PTR(ret);
+		goto out_dec_ucounts;
 	}
 
 	return mark;
+
+out_dec_ucounts:
+	if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS))
+		dec_ucount(ucounts, UCOUNT_FANOTIFY_MARKS);
+	return ERR_PTR(ret);
 }
 
 
@@ -963,7 +1030,6 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 {
 	struct fsnotify_group *group;
 	int f_flags, fd;
-	struct user_struct *user;
 	unsigned int fid_mode = flags & FANOTIFY_FID_BITS;
 	unsigned int class = flags & FANOTIFY_CLASS_BITS;
 
@@ -1002,12 +1068,6 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 	if ((fid_mode & FAN_REPORT_NAME) && !(fid_mode & FAN_REPORT_DIR_FID))
 		return -EINVAL;
 
-	user = get_current_user();
-	if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) {
-		free_uid(user);
-		return -EMFILE;
-	}
-
 	f_flags = O_RDWR | FMODE_NONOTIFY;
 	if (flags & FAN_CLOEXEC)
 		f_flags |= O_CLOEXEC;
@@ -1017,13 +1077,19 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 	/* fsnotify_alloc_group takes a ref.  Dropped in fanotify_release */
 	group = fsnotify_alloc_user_group(&fanotify_fsnotify_ops);
 	if (IS_ERR(group)) {
-		free_uid(user);
 		return PTR_ERR(group);
 	}
 
-	group->fanotify_data.user = user;
+	/* Enforce groups limits per user in all containing user ns */
+	group->fanotify_data.ucounts = inc_ucount(current_user_ns(),
+						  current_euid(),
+						  UCOUNT_FANOTIFY_GROUPS);
+	if (!group->fanotify_data.ucounts) {
+		fd = -EMFILE;
+		goto out_destroy_group;
+	}
+
 	group->fanotify_data.flags = flags;
-	atomic_inc(&user->fanotify_listeners);
 	group->memcg = get_mem_cgroup_from_mm(current->mm);
 
 	group->fanotify_data.merge_hash = fanotify_alloc_merge_hash();
@@ -1064,16 +1130,13 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 			goto out_destroy_group;
 		group->max_events = UINT_MAX;
 	} else {
-		group->max_events = FANOTIFY_DEFAULT_MAX_EVENTS;
+		group->max_events = fanotify_max_queued_events;
 	}
 
 	if (flags & FAN_UNLIMITED_MARKS) {
 		fd = -EPERM;
 		if (!capable(CAP_SYS_ADMIN))
 			goto out_destroy_group;
-		group->fanotify_data.max_marks = UINT_MAX;
-	} else {
-		group->fanotify_data.max_marks = FANOTIFY_DEFAULT_MAX_MARKS;
 	}
 
 	if (flags & FAN_ENABLE_AUDIT) {
@@ -1357,6 +1420,21 @@ SYSCALL32_DEFINE6(fanotify_mark,
  */
 static int __init fanotify_user_setup(void)
 {
+	struct sysinfo si;
+	int max_marks;
+
+	si_meminfo(&si);
+	/*
+	 * Allow up to 1% of addressable memory to be accounted for per user
+	 * marks limited to the range [8192, 1048576]. mount and sb marks are
+	 * a lot cheaper than inode marks, but there is no reason for a user
+	 * to have many of those, so calculate by the cost of inode marks.
+	 */
+	max_marks = (((si.totalram - si.totalhigh) / 100) << PAGE_SHIFT) /
+		    INODE_MARK_COST;
+	max_marks = clamp(max_marks, FANOTIFY_OLD_DEFAULT_MAX_MARKS,
+				     FANOTIFY_DEFAULT_MAX_USER_MARKS);
+
 	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 10);
 	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);
 
@@ -1371,6 +1449,11 @@ static int __init fanotify_user_setup(void)
 			KMEM_CACHE(fanotify_perm_event, SLAB_PANIC);
 	}
 
+	fanotify_max_queued_events = FANOTIFY_DEFAULT_MAX_EVENTS;
+	init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] =
+					FANOTIFY_DEFAULT_MAX_GROUPS;
+	init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS] = max_marks;
+
 	return 0;
 }
 device_initcall(fanotify_user_setup);
diff --git a/fs/notify/group.c b/fs/notify/group.c
index ffd723ffe46d..fb89c351295d 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -122,7 +122,6 @@ static struct fsnotify_group *__fsnotify_alloc_group(
 
 	/* set to 0 when there a no external references to this group */
 	refcount_set(&group->refcnt, 1);
-	atomic_set(&group->num_marks, 0);
 	atomic_set(&group->user_waits, 0);
 
 	spin_lock_init(&group->notification_lock);
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 8387937b9d01..d32ab349db74 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -391,8 +391,6 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark)
 	list_del_init(&mark->g_list);
 	spin_unlock(&mark->lock);
 
-	atomic_dec(&group->num_marks);
-
 	/* Drop mark reference acquired in fsnotify_add_mark_locked() */
 	fsnotify_put_mark(mark);
 }
@@ -656,7 +654,6 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
 	mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED;
 
 	list_add(&mark->g_list, &group->marks_list);
-	atomic_inc(&group->num_marks);
 	fsnotify_get_mark(mark); /* for g_list */
 	spin_unlock(&mark->lock);
 
@@ -674,7 +671,6 @@ err:
 			 FSNOTIFY_MARK_FLAG_ATTACHED);
 	list_del_init(&mark->g_list);
 	spin_unlock(&mark->lock);
-	atomic_dec(&group->num_marks);
 
 	fsnotify_put_mark(mark);
 	return ret;
diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index 3e9c56ee651f..031a97d8369a 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -2,8 +2,11 @@
 #ifndef _LINUX_FANOTIFY_H
 #define _LINUX_FANOTIFY_H
 
+#include <linux/sysctl.h>
 #include <uapi/linux/fanotify.h>
 
+extern struct ctl_table fanotify_table[]; /* for sysctl */
+
 #define FAN_GROUP_FLAG(group, flag) \
 	((group)->fanotify_data.flags & (flag))
 
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 63fb766f0f3e..1ce66748a2d2 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -206,9 +206,6 @@ struct fsnotify_group {
 
 	/* stores all fastpath marks assoc with this group so they can be cleaned on unregister */
 	struct mutex mark_mutex;	/* protect marks_list */
-	atomic_t num_marks;		/* 1 for each mark and 1 for not being
-					 * past the point of no return when freeing
-					 * a group */
 	atomic_t user_waits;		/* Number of tasks waiting for user
 					 * response */
 	struct list_head marks_list;	/* all inode marks for this group */
@@ -240,8 +237,7 @@ struct fsnotify_group {
 			wait_queue_head_t access_waitq;
 			int flags;           /* flags from fanotify_init() */
 			int f_flags; /* event_f_flags from fanotify_init() */
-			unsigned int max_marks;
-			struct user_struct *user;
+			struct ucounts *ucounts;
 		} fanotify_data;
 #endif /* CONFIG_FANOTIFY */
 	};
diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h
index a8ec3b6093fc..3632c5d6ec55 100644
--- a/include/linux/sched/user.h
+++ b/include/linux/sched/user.h
@@ -14,9 +14,6 @@ struct user_struct {
 	refcount_t __count;	/* reference count */
 	atomic_t processes;	/* How many processes does this user have? */
 	atomic_t sigpending;	/* How many pending signals does this user have? */
-#ifdef CONFIG_FANOTIFY
-	atomic_t fanotify_listeners;
-#endif
 #ifdef CONFIG_EPOLL
 	atomic_long_t epoll_watches; /* The number of file descriptors currently watched */
 #endif
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 64cf8ebdc4ec..f0d961a15fba 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -49,6 +49,10 @@ enum ucount_type {
 #ifdef CONFIG_INOTIFY_USER
 	UCOUNT_INOTIFY_INSTANCES,
 	UCOUNT_INOTIFY_WATCHES,
+#endif
+#ifdef CONFIG_FANOTIFY
+	UCOUNT_FANOTIFY_GROUPS,
+	UCOUNT_FANOTIFY_MARKS,
 #endif
 	UCOUNT_COUNTS,
 };
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 62fbd09b5dc1..4b6b9de89da8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -148,6 +148,9 @@ static unsigned long hung_task_timeout_max = (LONG_MAX/HZ);
 #ifdef CONFIG_INOTIFY_USER
 #include <linux/inotify.h>
 #endif
+#ifdef CONFIG_FANOTIFY
+#include <linux/fanotify.h>
+#endif
 
 #ifdef CONFIG_PROC_SYSCTL
 
@@ -3258,7 +3261,14 @@ static struct ctl_table fs_table[] = {
 		.mode		= 0555,
 		.child		= inotify_table,
 	},
-#endif	
+#endif
+#ifdef CONFIG_FANOTIFY
+	{
+		.procname	= "fanotify",
+		.mode		= 0555,
+		.child		= fanotify_table,
+	},
+#endif
 #ifdef CONFIG_EPOLL
 	{
 		.procname	= "epoll",
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 11b1596e2542..8d8874f1c35e 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -73,6 +73,10 @@ static struct ctl_table user_table[] = {
 #ifdef CONFIG_INOTIFY_USER
 	UCOUNT_ENTRY("max_inotify_instances"),
 	UCOUNT_ENTRY("max_inotify_watches"),
+#endif
+#ifdef CONFIG_FANOTIFY
+	UCOUNT_ENTRY("max_fanotify_groups"),
+	UCOUNT_ENTRY("max_fanotify_marks"),
 #endif
 	{ }
 };
-- 
cgit v1.2.3


From 7cea2a3c505e87a9d6afc78be4a7f7be636a73a7 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Thu, 4 Mar 2021 13:29:21 +0200
Subject: fanotify: support limited functionality for unprivileged users

Add limited support for unprivileged fanotify groups.
An unprivileged user is not allowed to get an open file descriptor in
the event nor the process pid of another process.  An unprivileged user
cannot request permission events, cannot set mount/filesystem marks and
cannot request unlimited queue/marks.

This enables the limited functionality similar to inotify when watching a
set of files and directories for OPEN/ACCESS/MODIFY/CLOSE events, without
requiring CAP_SYS_ADMIN privileges.

The FAN_REPORT_DFID_NAME init flag provides a method for an unprivileged
listener watching a set of directories (with FAN_EVENT_ON_CHILD) to monitor
all changes inside those directories.

This typically requires that the listener keeps a map of watched directory
fid to dirfd (O_PATH), where fid is obtained with name_to_handle_at()
before starting to watch for changes.

When getting an event, the reported fid of the parent should be resolved
to dirfd and fstatat(2) with dirfd and name should be used to query the
state of the filesystem entry.

Link: https://lore.kernel.org/r/20210304112921.3996419-3-amir73il@gmail.com
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/notify/fanotify/fanotify_user.c | 29 +++++++++++++++++++++++++++--
 fs/notify/fdinfo.c                 |  3 ++-
 include/linux/fanotify.h           | 33 ++++++++++++++++++++++++++++-----
 3 files changed, 57 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index e81848e09646..65142b1fa823 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -419,6 +419,14 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
 	metadata.reserved = 0;
 	metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS;
 	metadata.pid = pid_vnr(event->pid);
+	/*
+	 * For an unprivileged listener, event->pid can be used to identify the
+	 * events generated by the listener process itself, without disclosing
+	 * the pids of other processes.
+	 */
+	if (!capable(CAP_SYS_ADMIN) &&
+	    task_tgid(current) != event->pid)
+		metadata.pid = 0;
 
 	if (path && path->mnt && path->dentry) {
 		fd = create_fd(group, path, &f);
@@ -1036,8 +1044,16 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 	pr_debug("%s: flags=%x event_f_flags=%x\n",
 		 __func__, flags, event_f_flags);
 
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
+	if (!capable(CAP_SYS_ADMIN)) {
+		/*
+		 * An unprivileged user can setup an fanotify group with
+		 * limited functionality - an unprivileged group is limited to
+		 * notification events with file handles and it cannot use
+		 * unlimited queue/marks.
+		 */
+		if ((flags & FANOTIFY_ADMIN_INIT_FLAGS) || !fid_mode)
+			return -EPERM;
+	}
 
 #ifdef CONFIG_AUDITSYSCALL
 	if (flags & ~(FANOTIFY_INIT_FLAGS | FAN_ENABLE_AUDIT))
@@ -1288,6 +1304,15 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
 		goto fput_and_out;
 	group = f.file->private_data;
 
+	/*
+	 * An unprivileged user is not allowed to watch a mount point nor
+	 * a filesystem.
+	 */
+	ret = -EPERM;
+	if (!capable(CAP_SYS_ADMIN) &&
+	    mark_type != FAN_MARK_INODE)
+		goto fput_and_out;
+
 	/*
 	 * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF.  These are not
 	 * allowed to set permissions events.
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index f0d6b54be412..a712b2aaa9ac 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -144,7 +144,8 @@ void fanotify_show_fdinfo(struct seq_file *m, struct file *f)
 	struct fsnotify_group *group = f->private_data;
 
 	seq_printf(m, "fanotify flags:%x event-flags:%x\n",
-		   group->fanotify_data.flags, group->fanotify_data.f_flags);
+		   group->fanotify_data.flags,
+		   group->fanotify_data.f_flags);
 
 	show_fdinfo(m, f, fanotify_fdinfo);
 }
diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index 031a97d8369a..bad41bcb25df 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -18,15 +18,38 @@ extern struct ctl_table fanotify_table[]; /* for sysctl */
  * these constant, the programs may break if re-compiled with new uapi headers
  * and then run on an old kernel.
  */
-#define FANOTIFY_CLASS_BITS	(FAN_CLASS_NOTIF | FAN_CLASS_CONTENT | \
+
+/* Group classes where permission events are allowed */
+#define FANOTIFY_PERM_CLASSES	(FAN_CLASS_CONTENT | \
 				 FAN_CLASS_PRE_CONTENT)
 
+#define FANOTIFY_CLASS_BITS	(FAN_CLASS_NOTIF | FANOTIFY_PERM_CLASSES)
+
 #define FANOTIFY_FID_BITS	(FAN_REPORT_FID | FAN_REPORT_DFID_NAME)
 
-#define FANOTIFY_INIT_FLAGS	(FANOTIFY_CLASS_BITS | FANOTIFY_FID_BITS | \
-				 FAN_REPORT_TID | \
-				 FAN_CLOEXEC | FAN_NONBLOCK | \
-				 FAN_UNLIMITED_QUEUE | FAN_UNLIMITED_MARKS)
+/*
+ * fanotify_init() flags that require CAP_SYS_ADMIN.
+ * We do not allow unprivileged groups to request permission events.
+ * We do not allow unprivileged groups to get other process pid in events.
+ * We do not allow unprivileged groups to use unlimited resources.
+ */
+#define FANOTIFY_ADMIN_INIT_FLAGS	(FANOTIFY_PERM_CLASSES | \
+					 FAN_REPORT_TID | \
+					 FAN_UNLIMITED_QUEUE | \
+					 FAN_UNLIMITED_MARKS)
+
+/*
+ * fanotify_init() flags that are allowed for user without CAP_SYS_ADMIN.
+ * FAN_CLASS_NOTIF is the only class we allow for unprivileged group.
+ * We do not allow unprivileged groups to get file descriptors in events,
+ * so one of the flags for reporting file handles is required.
+ */
+#define FANOTIFY_USER_INIT_FLAGS	(FAN_CLASS_NOTIF | \
+					 FANOTIFY_FID_BITS | \
+					 FAN_CLOEXEC | FAN_NONBLOCK)
+
+#define FANOTIFY_INIT_FLAGS	(FANOTIFY_ADMIN_INIT_FLAGS | \
+				 FANOTIFY_USER_INIT_FLAGS)
 
 #define FANOTIFY_MARK_TYPE_BITS	(FAN_MARK_INODE | FAN_MARK_MOUNT | \
 				 FAN_MARK_FILESYSTEM)
-- 
cgit v1.2.3


From 3b49dfb08c750d4745ad42ec042288aba932b9d5 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Tue, 23 Feb 2021 15:17:48 -0400
Subject: ARM: amba: Allow some ARM_AMBA users to compile with COMPILE_TEST

CONFIG_VFIO_AMBA has a light use of AMBA; adding some inline fallbacks
when AMBA is disabled will allow it to be compiled under COMPILE_TEST and
make VFIO easier to maintain.

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Message-Id: <3-v1-df057e0f92c3+91-vfio_arm_compile_test_jgg@nvidia.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/platform/Kconfig |  2 +-
 include/linux/amba/bus.h      | 11 +++++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/vfio/platform/Kconfig b/drivers/vfio/platform/Kconfig
index 233efde219cc..ab341108a0be 100644
--- a/drivers/vfio/platform/Kconfig
+++ b/drivers/vfio/platform/Kconfig
@@ -12,7 +12,7 @@ config VFIO_PLATFORM
 
 config VFIO_AMBA
 	tristate "VFIO support for AMBA devices"
-	depends on VFIO_PLATFORM && ARM_AMBA
+	depends on VFIO_PLATFORM && (ARM_AMBA || COMPILE_TEST)
 	help
 	  Support for ARM AMBA devices with VFIO. This is required to make
 	  use of ARM AMBA devices present on the system using the VFIO
diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h
index 6cc93ab5b809..c68d87b87283 100644
--- a/include/linux/amba/bus.h
+++ b/include/linux/amba/bus.h
@@ -105,8 +105,19 @@ extern struct bus_type amba_bustype;
 #define amba_get_drvdata(d)	dev_get_drvdata(&d->dev)
 #define amba_set_drvdata(d,p)	dev_set_drvdata(&d->dev, p)
 
+#ifdef CONFIG_ARM_AMBA
 int amba_driver_register(struct amba_driver *);
 void amba_driver_unregister(struct amba_driver *);
+#else
+static inline int amba_driver_register(struct amba_driver *drv)
+{
+	return -EINVAL;
+}
+static inline void amba_driver_unregister(struct amba_driver *drv)
+{
+}
+#endif
+
 struct amba_device *amba_device_alloc(const char *, resource_size_t, size_t);
 void amba_device_put(struct amba_device *);
 int amba_device_add(struct amba_device *, struct resource *);
-- 
cgit v1.2.3


From c124fd9a969acaa83f6dfa5e160a99a500af9e4b Mon Sep 17 00:00:00 2001
From: Gustavo Pimentel <Gustavo.Pimentel@synopsys.com>
Date: Thu, 18 Feb 2021 20:03:58 +0100
Subject: PCI: Add pci_find_vsec_capability() to find a specific VSEC

Add pci_find_vsec_capability() to locate a Vendor-Specific Extended
Capability with the specified VSEC ID.

The Vendor-Specific Extended Capability (VSEC) allows one or more
proprietary capabilities defined by the vendor which aren't standard
or shared between vendors.

Signed-off-by: Gustavo Pimentel <gustavo.pimentel@synopsys.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Link: https://lore.kernel.org/r/d89506834fb11c6fa0bd5d515c0dd55b13ac6958.1613674948.git.gustavo.pimentel@synopsys.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/pci/pci.c   | 30 ++++++++++++++++++++++++++++++
 include/linux/pci.h |  1 +
 2 files changed, 31 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 16a17215f633..97c0f2ec922b 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -692,6 +692,36 @@ u8 pci_find_ht_capability(struct pci_dev *dev, int ht_cap)
 }
 EXPORT_SYMBOL_GPL(pci_find_ht_capability);
 
+/**
+ * pci_find_vsec_capability - Find a vendor-specific extended capability
+ * @dev: PCI device to query
+ * @vendor: Vendor ID for which capability is defined
+ * @cap: Vendor-specific capability ID
+ *
+ * If @dev has Vendor ID @vendor, search for a VSEC capability with
+ * VSEC ID @cap. If found, return the capability offset in
+ * config space; otherwise return 0.
+ */
+u16 pci_find_vsec_capability(struct pci_dev *dev, u16 vendor, int cap)
+{
+	u16 vsec = 0;
+	u32 header;
+
+	if (vendor != dev->vendor)
+		return 0;
+
+	while ((vsec = pci_find_next_ext_capability(dev, vsec,
+						     PCI_EXT_CAP_ID_VNDR))) {
+		if (pci_read_config_dword(dev, vsec + PCI_VNDR_HEADER,
+					  &header) == PCIBIOS_SUCCESSFUL &&
+		    PCI_VNDR_HEADER_ID(header) == cap)
+			return vsec;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pci_find_vsec_capability);
+
 /**
  * pci_find_parent_resource - return resource region of parent bus of given
  *			      region
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 86c799c97b77..59e731a2d1f8 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1077,6 +1077,7 @@ u8 pci_find_next_ht_capability(struct pci_dev *dev, u8 pos, int ht_cap);
 u16 pci_find_ext_capability(struct pci_dev *dev, int cap);
 u16 pci_find_next_ext_capability(struct pci_dev *dev, u16 pos, int cap);
 struct pci_bus *pci_find_next_bus(const struct pci_bus *from);
+u16 pci_find_vsec_capability(struct pci_dev *dev, u16 vendor, int cap);
 
 u64 pci_get_dsn(struct pci_dev *dev);
 
-- 
cgit v1.2.3


From 5abbe51a526253b9f003e9a0a195638dc882d660 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 1 Feb 2021 18:46:41 +0100
Subject: kernel, fs: Introduce and use set_restart_fn() and
 arch_set_restart_data()

Preparation for fixing get_nr_restart_syscall() on X86 for COMPAT.

Add a new helper which sets restart_block->fn and calls a dummy
arch_set_restart_data() helper.

Fixes: 609c19a385c8 ("x86/ptrace: Stop setting TS_COMPAT in ptrace code")
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20210201174641.GA17871@redhat.com
---
 fs/select.c                    | 10 ++++------
 include/linux/thread_info.h    | 13 +++++++++++++
 kernel/futex.c                 |  3 +--
 kernel/time/alarmtimer.c       |  2 +-
 kernel/time/hrtimer.c          |  2 +-
 kernel/time/posix-cpu-timers.c |  2 +-
 6 files changed, 21 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/fs/select.c b/fs/select.c
index 37aaa8317f3a..945896d0ac9e 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -1055,10 +1055,9 @@ static long do_restart_poll(struct restart_block *restart_block)
 
 	ret = do_sys_poll(ufds, nfds, to);
 
-	if (ret == -ERESTARTNOHAND) {
-		restart_block->fn = do_restart_poll;
-		ret = -ERESTART_RESTARTBLOCK;
-	}
+	if (ret == -ERESTARTNOHAND)
+		ret = set_restart_fn(restart_block, do_restart_poll);
+
 	return ret;
 }
 
@@ -1080,7 +1079,6 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
 		struct restart_block *restart_block;
 
 		restart_block = &current->restart_block;
-		restart_block->fn = do_restart_poll;
 		restart_block->poll.ufds = ufds;
 		restart_block->poll.nfds = nfds;
 
@@ -1091,7 +1089,7 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
 		} else
 			restart_block->poll.has_timeout = 0;
 
-		ret = -ERESTART_RESTARTBLOCK;
+		ret = set_restart_fn(restart_block, do_restart_poll);
 	}
 	return ret;
 }
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 9b2158c69275..157762db9d4b 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -11,6 +11,7 @@
 #include <linux/types.h>
 #include <linux/bug.h>
 #include <linux/restart_block.h>
+#include <linux/errno.h>
 
 #ifdef CONFIG_THREAD_INFO_IN_TASK
 /*
@@ -59,6 +60,18 @@ enum syscall_work_bit {
 
 #ifdef __KERNEL__
 
+#ifndef arch_set_restart_data
+#define arch_set_restart_data(restart) do { } while (0)
+#endif
+
+static inline long set_restart_fn(struct restart_block *restart,
+					long (*fn)(struct restart_block *))
+{
+	restart->fn = fn;
+	arch_set_restart_data(restart);
+	return -ERESTART_RESTARTBLOCK;
+}
+
 #ifndef THREAD_ALIGN
 #define THREAD_ALIGN	THREAD_SIZE
 #endif
diff --git a/kernel/futex.c b/kernel/futex.c
index e68db7745039..00febd6dea9c 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2728,14 +2728,13 @@ retry:
 		goto out;
 
 	restart = &current->restart_block;
-	restart->fn = futex_wait_restart;
 	restart->futex.uaddr = uaddr;
 	restart->futex.val = val;
 	restart->futex.time = *abs_time;
 	restart->futex.bitset = bitset;
 	restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
 
-	ret = -ERESTART_RESTARTBLOCK;
+	ret = set_restart_fn(restart, futex_wait_restart);
 
 out:
 	if (to) {
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 98d7a15e8cf6..4d94e2b5499d 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -854,9 +854,9 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
 	if (flags == TIMER_ABSTIME)
 		return -ERESTARTNOHAND;
 
-	restart->fn = alarm_timer_nsleep_restart;
 	restart->nanosleep.clockid = type;
 	restart->nanosleep.expires = exp;
+	set_restart_fn(restart, alarm_timer_nsleep_restart);
 	return ret;
 }
 
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 788b9d137de4..5c9d968187ae 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1957,9 +1957,9 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
 	}
 
 	restart = &current->restart_block;
-	restart->fn = hrtimer_nanosleep_restart;
 	restart->nanosleep.clockid = t.timer.base->clockid;
 	restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
+	set_restart_fn(restart, hrtimer_nanosleep_restart);
 out:
 	destroy_hrtimer_on_stack(&t.timer);
 	return ret;
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index a71758e34e45..9abe15255bc4 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1480,8 +1480,8 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
 		if (flags & TIMER_ABSTIME)
 			return -ERESTARTNOHAND;
 
-		restart_block->fn = posix_cpu_nsleep_restart;
 		restart_block->nanosleep.clockid = which_clock;
+		set_restart_fn(restart_block, posix_cpu_nsleep_restart);
 	}
 	return error;
 }
-- 
cgit v1.2.3


From b2e9df850c58c2b36e915e7d3bed3f6107cccba6 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 1 Feb 2021 18:47:16 +0100
Subject: x86: Introduce restart_block->arch_data to remove TS_COMPAT_RESTART

Save the current_thread_info()->status of X86 in the new
restart_block->arch_data field so TS_COMPAT_RESTART can be removed again.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20210201174716.GA17898@redhat.com
---
 arch/x86/include/asm/thread_info.h | 12 ++----------
 arch/x86/kernel/signal.c           |  2 +-
 include/linux/restart_block.h      |  1 +
 3 files changed, 4 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 30d1d187019f..06b740bae431 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -217,18 +217,10 @@ static inline int arch_within_stack_frames(const void * const stack,
 #ifndef __ASSEMBLY__
 #ifdef CONFIG_COMPAT
 #define TS_I386_REGS_POKED	0x0004	/* regs poked by 32-bit ptracer */
-#define TS_COMPAT_RESTART	0x0008
 
-#define arch_set_restart_data	arch_set_restart_data
+#define arch_set_restart_data(restart)	\
+	do { restart->arch_data = current_thread_info()->status; } while (0)
 
-static inline void arch_set_restart_data(struct restart_block *restart)
-{
-	struct thread_info *ti = current_thread_info();
-	if (ti->status & TS_COMPAT)
-		ti->status |= TS_COMPAT_RESTART;
-	else
-		ti->status &= ~TS_COMPAT_RESTART;
-}
 #endif
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 6c26d2c3a2e4..f306e85a08a6 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -767,7 +767,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
 static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs)
 {
 #ifdef CONFIG_IA32_EMULATION
-	if (current_thread_info()->status & TS_COMPAT_RESTART)
+	if (current->restart_block.arch_data & TS_COMPAT)
 		return __NR_ia32_restart_syscall;
 #endif
 #ifdef CONFIG_X86_X32_ABI
diff --git a/include/linux/restart_block.h b/include/linux/restart_block.h
index bba2920e9c05..980a65594412 100644
--- a/include/linux/restart_block.h
+++ b/include/linux/restart_block.h
@@ -23,6 +23,7 @@ enum timespec_type {
  * System call restart block.
  */
 struct restart_block {
+	unsigned long arch_data;
 	long (*fn)(struct restart_block *);
 	union {
 		/* For futex_wait and futex_wait_requeue_pi */
-- 
cgit v1.2.3


From d29334c15d33a6a92d2043ca88f84cd5ad026c57 Mon Sep 17 00:00:00 2001
From: wenxu <wenxu@ucloud.cn>
Date: Tue, 16 Mar 2021 16:33:54 +0800
Subject: net/sched: act_api: fix miss set post_ct for ovs after do conntrack
 in act_ct

When openvswitch conntrack is offloaded with the act_ct action, the first
rule does conntrack in act_ct in the tc subsystem. If the next rule misses
in tc, the packet falls back to the ovs datapath, but the post_ct flag is
not set, which leads to a ct_state_key with the -trk flag.

Fixes: 7baf2429a1a9 ("net/sched: cls_flower add CT_FLAGS_INVALID flag support")
Signed-off-by: wenxu <wenxu@ucloud.cn>
Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h      | 1 +
 net/openvswitch/conntrack.c | 8 +++++---
 net/openvswitch/conntrack.h | 6 ++++--
 net/openvswitch/flow.c      | 4 +++-
 net/sched/cls_api.c         | 1 +
 5 files changed, 14 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 6d0a33d1c0db..f2c9ee71cb2c 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -285,6 +285,7 @@ struct nf_bridge_info {
 struct tc_skb_ext {
 	__u32 chain;
 	__u16 mru;
+	bool post_ct;
 };
 #endif
 
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 5eddfe7bd391..71cec03e8612 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -271,9 +271,11 @@ static void ovs_ct_update_key(const struct sk_buff *skb,
 /* This is called to initialize CT key fields possibly coming in from the local
  * stack.
  */
-void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key)
+void ovs_ct_fill_key(const struct sk_buff *skb,
+		     struct sw_flow_key *key,
+		     bool post_ct)
 {
-	ovs_ct_update_key(skb, NULL, key, false, false);
+	ovs_ct_update_key(skb, NULL, key, post_ct, false);
 }
 
 int ovs_ct_put_key(const struct sw_flow_key *swkey,
@@ -1332,7 +1334,7 @@ int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key)
 	if (skb_nfct(skb)) {
 		nf_conntrack_put(skb_nfct(skb));
 		nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
-		ovs_ct_fill_key(skb, key);
+		ovs_ct_fill_key(skb, key, false);
 	}
 
 	return 0;
diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h
index 59dc32761b91..317e525c8a11 100644
--- a/net/openvswitch/conntrack.h
+++ b/net/openvswitch/conntrack.h
@@ -25,7 +25,8 @@ int ovs_ct_execute(struct net *, struct sk_buff *, struct sw_flow_key *,
 		   const struct ovs_conntrack_info *);
 int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key);
 
-void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key);
+void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key,
+		     bool post_ct);
 int ovs_ct_put_key(const struct sw_flow_key *swkey,
 		   const struct sw_flow_key *output, struct sk_buff *skb);
 void ovs_ct_free_action(const struct nlattr *a);
@@ -74,7 +75,8 @@ static inline int ovs_ct_clear(struct sk_buff *skb,
 }
 
 static inline void ovs_ct_fill_key(const struct sk_buff *skb,
-				   struct sw_flow_key *key)
+				   struct sw_flow_key *key,
+				   bool post_ct)
 {
 	key->ct_state = 0;
 	key->ct_zone = 0;
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index c7f34d6a9934..e586424d8b04 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -857,6 +857,7 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
 #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
 	struct tc_skb_ext *tc_ext;
 #endif
+	bool post_ct = false;
 	int res, err;
 
 	/* Extract metadata from packet. */
@@ -895,6 +896,7 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
 		tc_ext = skb_ext_find(skb, TC_SKB_EXT);
 		key->recirc_id = tc_ext ? tc_ext->chain : 0;
 		OVS_CB(skb)->mru = tc_ext ? tc_ext->mru : 0;
+		post_ct = tc_ext ? tc_ext->post_ct : false;
 	} else {
 		key->recirc_id = 0;
 	}
@@ -904,7 +906,7 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
 
 	err = key_extract(skb, key);
 	if (!err)
-		ovs_ct_fill_key(skb, key);   /* Must be after key_extract(). */
+		ovs_ct_fill_key(skb, key, post_ct);   /* Must be after key_extract(). */
 	return err;
 }
 
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index e37556cc37ab..13341e7fb077 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -1629,6 +1629,7 @@ int tcf_classify_ingress(struct sk_buff *skb,
 			return TC_ACT_SHOT;
 		ext->chain = last_executed_chain;
 		ext->mru = qdisc_skb_cb(skb)->mru;
+		ext->post_ct = qdisc_skb_cb(skb)->post_ct;
 	}
 
 	return ret;
-- 
cgit v1.2.3


From 2ed2c5f0391106406ead3a74bfa571575eafe8b6 Mon Sep 17 00:00:00 2001
From: Horatiu Vultur <horatiu.vultur@microchip.com>
Date: Tue, 16 Mar 2021 21:10:19 +0100
Subject: net: ocelot: Remove ocelot_xfh_get_cpuq

Now when extracting frames from CPU the cpuq is not used anymore so
remove it.

Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mscc/ocelot.c | 3 +--
 include/linux/dsa/ocelot.h         | 5 -----
 net/dsa/tag_ocelot.c               | 2 --
 3 files changed, 1 insertion(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
index 9cc9378157e4..9f0c9bdd9f5d 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -687,7 +687,7 @@ static int ocelot_xtr_poll_xfh(struct ocelot *ocelot, int grp, u32 *xfh)
 int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **nskb)
 {
 	struct skb_shared_hwtstamps *shhwtstamps;
-	u64 tod_in_ns, full_ts_in_ns, cpuq;
+	u64 tod_in_ns, full_ts_in_ns;
 	u64 timestamp, src_port, len;
 	u32 xfh[OCELOT_TAG_LEN / 4];
 	struct net_device *dev;
@@ -704,7 +704,6 @@ int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **nskb)
 	ocelot_xfh_get_src_port(xfh, &src_port);
 	ocelot_xfh_get_len(xfh, &len);
 	ocelot_xfh_get_rew_val(xfh, &timestamp);
-	ocelot_xfh_get_cpuq(xfh, &cpuq);
 
 	if (WARN_ON(src_port >= ocelot->num_phys_ports))
 		return -EINVAL;
diff --git a/include/linux/dsa/ocelot.h b/include/linux/dsa/ocelot.h
index 4265f328681a..c6bc45ae5e03 100644
--- a/include/linux/dsa/ocelot.h
+++ b/include/linux/dsa/ocelot.h
@@ -160,11 +160,6 @@ static inline void ocelot_xfh_get_src_port(void *extraction, u64 *src_port)
 	packing(extraction, src_port, 46, 43, OCELOT_TAG_LEN, UNPACK, 0);
 }
 
-static inline void ocelot_xfh_get_cpuq(void *extraction, u64 *cpuq)
-{
-	packing(extraction, cpuq, 28, 20, OCELOT_TAG_LEN, UNPACK, 0);
-}
-
 static inline void ocelot_xfh_get_qos_class(void *extraction, u64 *qos_class)
 {
 	packing(extraction, qos_class, 19, 17, OCELOT_TAG_LEN, UNPACK, 0);
diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c
index 157f95689d8d..f9df9cac81c5 100644
--- a/net/dsa/tag_ocelot.c
+++ b/net/dsa/tag_ocelot.c
@@ -83,7 +83,6 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb,
 	struct dsa_port *dp;
 	u8 *extraction;
 	u16 vlan_tpid;
-	u64 cpuq;
 
 	/* Revert skb->data by the amount consumed by the DSA master,
 	 * so it points to the beginning of the frame.
@@ -113,7 +112,6 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb,
 	ocelot_xfh_get_qos_class(extraction, &qos_class);
 	ocelot_xfh_get_tag_type(extraction, &tag_type);
 	ocelot_xfh_get_vlan_tci(extraction, &vlan_tci);
-	ocelot_xfh_get_cpuq(extraction, &cpuq);
 
 	skb->dev = dsa_master_find_slave(netdev, 0, src_port);
 	if (!skb->dev)
-- 
cgit v1.2.3


From 01035bcc0f9195a19a76c8a006b3c520428acb61 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 16 Mar 2021 15:52:15 -0700
Subject: Revert "net: socket: use BIT() for MSG_*"

This reverts commit 0bb3262c0248d44aea3be31076f44beb82a7b120.

Breaks things on mips64/qemu

Reported-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/socket.h | 71 ++++++++++++++++++++++++--------------------------
 1 file changed, 34 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/socket.h b/include/linux/socket.h
index e88859f38cd0..385894b4a8bb 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -283,45 +283,42 @@ struct ucred {
    Added those for 1003.1g not all are supported yet
  */
 
-#define MSG_OOB		BIT(0)
-#define MSG_PEEK	BIT(1)
-#define MSG_DONTROUTE	BIT(2)
-#define MSG_TRYHARD	BIT(2)	/* Synonym for MSG_DONTROUTE for DECnet		*/
-#define MSG_CTRUNC	BIT(3)
-#define MSG_PROBE	BIT(4)	/* Do not send. Only probe path f.e. for MTU	*/
-#define MSG_TRUNC	BIT(5)
-#define MSG_DONTWAIT	BIT(6)	/* Nonblocking io		*/
-#define MSG_EOR		BIT(7)	/* End of record		*/
-#define MSG_WAITALL	BIT(8)	/* Wait for a full request	*/
-#define MSG_FIN		BIT(9)
-#define MSG_SYN		BIT(10)
-#define MSG_CONFIRM	BIT(11)	/* Confirm path validity	*/
-#define MSG_RST		BIT(12)
-#define MSG_ERRQUEUE	BIT(13)	/* Fetch message from error queue */
-#define MSG_NOSIGNAL	BIT(14)	/* Do not generate SIGPIPE	*/
-#define MSG_MORE	BIT(15)	/* Sender will send more	*/
-#define MSG_WAITFORONE	BIT(16)	/* recvmmsg(): block until 1+ packets avail */
-#define MSG_SENDPAGE_NOPOLICY	BIT(16)	/* sendpage() internal : do no apply policy */
-#define MSG_SENDPAGE_NOTLAST	BIT(17)	/* sendpage() internal : not the last page  */
-#define MSG_BATCH	BIT(18)		/* sendmmsg(): more messages coming */
-#define MSG_EOF		MSG_FIN
-#define MSG_NO_SHARED_FRAGS	BIT(19)	/* sendpage() internal : page frags
-					 * are not shared
-					 */
-#define MSG_SENDPAGE_DECRYPTED	BIT(20)	/* sendpage() internal : page may carry
-					 * plain text and require encryption
-					 */
-
-#define MSG_ZEROCOPY	BIT(26)		/* Use user data in kernel path */
-#define MSG_FASTOPEN	BIT(29)		/* Send data in TCP SYN */
-#define MSG_CMSG_CLOEXEC	BIT(30)	/* Set close_on_exec for file
-					 * descriptor received through
-					 * SCM_RIGHTS
-					 */
+#define MSG_OOB		1
+#define MSG_PEEK	2
+#define MSG_DONTROUTE	4
+#define MSG_TRYHARD     4       /* Synonym for MSG_DONTROUTE for DECnet */
+#define MSG_CTRUNC	8
+#define MSG_PROBE	0x10	/* Do not send. Only probe path f.e. for MTU */
+#define MSG_TRUNC	0x20
+#define MSG_DONTWAIT	0x40	/* Nonblocking io		 */
+#define MSG_EOR         0x80	/* End of record */
+#define MSG_WAITALL	0x100	/* Wait for a full request */
+#define MSG_FIN         0x200
+#define MSG_SYN		0x400
+#define MSG_CONFIRM	0x800	/* Confirm path validity */
+#define MSG_RST		0x1000
+#define MSG_ERRQUEUE	0x2000	/* Fetch message from error queue */
+#define MSG_NOSIGNAL	0x4000	/* Do not generate SIGPIPE */
+#define MSG_MORE	0x8000	/* Sender will send more */
+#define MSG_WAITFORONE	0x10000	/* recvmmsg(): block until 1+ packets avail */
+#define MSG_SENDPAGE_NOPOLICY 0x10000 /* sendpage() internal : do no apply policy */
+#define MSG_SENDPAGE_NOTLAST 0x20000 /* sendpage() internal : not the last page */
+#define MSG_BATCH	0x40000 /* sendmmsg(): more messages coming */
+#define MSG_EOF         MSG_FIN
+#define MSG_NO_SHARED_FRAGS 0x80000 /* sendpage() internal : page frags are not shared */
+#define MSG_SENDPAGE_DECRYPTED	0x100000 /* sendpage() internal : page may carry
+					  * plain text and require encryption
+					  */
+
+#define MSG_ZEROCOPY	0x4000000	/* Use user data in kernel path */
+#define MSG_FASTOPEN	0x20000000	/* Send data in TCP SYN */
+#define MSG_CMSG_CLOEXEC 0x40000000	/* Set close_on_exec for file
+					   descriptor received through
+					   SCM_RIGHTS */
 #if defined(CONFIG_COMPAT)
-#define MSG_CMSG_COMPAT	BIT(31)	/* This message needs 32 bit fixups */
+#define MSG_CMSG_COMPAT	0x80000000	/* This message needs 32 bit fixups */
 #else
-#define MSG_CMSG_COMPAT	0	/* We never have 32 bit fixups */
+#define MSG_CMSG_COMPAT	0		/* We never have 32 bit fixups */
 #endif
 
 
-- 
cgit v1.2.3


From 7a126a43a3dcf0fa6b9f7f2fe3ce82102517afe3 Mon Sep 17 00:00:00 2001
From: Roi Dayan <roid@nvidia.com>
Date: Wed, 16 Sep 2020 10:11:20 +0300
Subject: net: Change dev parameter to const in netif_device_present()

Not all ndos check the present bit before calling the ndo and the driver
may want to check it. Sometimes the dev parameter is passed as const so we
pass it to netif_device_present() as const.
Since netif_device_present() doesn't modify dev parameter anyway, declare
it as const.

Signed-off-by: Roi Dayan <roid@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/netdevice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b379d08a12ed..97254c089eb2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4175,7 +4175,7 @@ static inline bool netif_oper_up(const struct net_device *dev)
  *
  * Check if device has not been removed from system.
  */
-static inline bool netif_device_present(struct net_device *dev)
+static inline bool netif_device_present(const struct net_device *dev)
 {
 	return test_bit(__LINK_STATE_PRESENT, &dev->state);
 }
-- 
cgit v1.2.3


From c276aae8c19d65e21a43c2690c7c7dafea0e97fa Mon Sep 17 00:00:00 2001
From: Roi Dayan <roid@nvidia.com>
Date: Tue, 26 Jan 2021 11:51:04 +0200
Subject: net/mlx5: Move mlx5e hw resources into a sub object

This is to separate the hardware resource attributes from other
attributes we will want to use.

Signed-off-by: Roi Dayan <roid@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c   |  4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en/trap.c  |  6 ++---
 .../ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c |  2 +-
 .../net/ethernet/mellanox/mlx5/core/en_common.c    | 27 ++++++++++++----------
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 22 +++++++++---------
 .../net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c  |  2 +-
 .../net/ethernet/mellanox/mlx5/core/lib/crypto.c   |  2 +-
 include/linux/mlx5/driver.h                        | 10 ++++----
 8 files changed, 40 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
index d57b6f06382f..bb5d108f75d0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
@@ -174,7 +174,7 @@ static int mlx5e_ptp_alloc_txqsq(struct mlx5e_port_ptp *c, int txq_ix,
 	sq->mdev      = mdev;
 	sq->ch_ix     = c->ix;
 	sq->txq_ix    = txq_ix;
-	sq->uar_map   = mdev->mlx5e_res.bfreg.map;
+	sq->uar_map   = mdev->mlx5e_res.hw_objs.bfreg.map;
 	sq->min_inline_mode = params->tx_min_inline_mode;
 	sq->hw_mtu    = MLX5E_SW2HW_MTU(params, params->sw_mtu);
 	sq->stats     = &c->priv->port_ptp_stats.sq[tc];
@@ -475,7 +475,7 @@ int mlx5e_port_ptp_open(struct mlx5e_priv *priv, struct mlx5e_params *params,
 	c->ix       = 0;
 	c->pdev     = mlx5_core_dma_dev(priv->mdev);
 	c->netdev   = priv->netdev;
-	c->mkey_be  = cpu_to_be32(priv->mdev->mlx5e_res.mkey.key);
+	c->mkey_be  = cpu_to_be32(priv->mdev->mlx5e_res.hw_objs.mkey.key);
 	c->num_tc   = params->num_tc;
 	c->stats    = &priv->port_ptp_stats.ch;
 	c->lag_port = lag_port;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c b/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c
index 37fc1d77ded7..41db93883fea 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c
@@ -84,7 +84,7 @@ static int mlx5e_alloc_trap_rq(struct mlx5e_priv *priv, struct mlx5e_rq_param *r
 	if (err)
 		goto err_free_frags;
 
-	rq->mkey_be = cpu_to_be32(priv->mdev->mlx5e_res.mkey.key);
+	rq->mkey_be = cpu_to_be32(mdev->mlx5e_res.hw_objs.mkey.key);
 
 	mlx5e_rq_set_trap_handlers(rq, params);
 
@@ -213,7 +213,7 @@ static int mlx5e_create_trap_direct_rq_tir(struct mlx5_core_dev *mdev, struct ml
 		return -ENOMEM;
 
 	tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
-	MLX5_SET(tirc, tirc, transport_domain, mdev->mlx5e_res.td.tdn);
+	MLX5_SET(tirc, tirc, transport_domain, mdev->mlx5e_res.hw_objs.td.tdn);
 	MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_NONE);
 	MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_DIRECT);
 	MLX5_SET(tirc, tirc, inline_rqn, rqn);
@@ -266,7 +266,7 @@ static struct mlx5e_trap *mlx5e_open_trap(struct mlx5e_priv *priv)
 	t->tstamp   = &priv->tstamp;
 	t->pdev     = mlx5_core_dma_dev(priv->mdev);
 	t->netdev   = priv->netdev;
-	t->mkey_be  = cpu_to_be32(priv->mdev->mlx5e_res.mkey.key);
+	t->mkey_be  = cpu_to_be32(priv->mdev->mlx5e_res.hw_objs.mkey.key);
 	t->stats    = &priv->trap_stats.ch;
 
 	netif_napi_add(netdev, &t->napi, mlx5e_trap_napi_poll, 64);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c
index d06532d0baa4..f7c880edae37 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c
@@ -84,7 +84,7 @@ static int mlx5e_ktls_create_tir(struct mlx5_core_dev *mdev, u32 *tirn, u32 rqtn
 
 	tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
 
-	MLX5_SET(tirc, tirc, transport_domain, mdev->mlx5e_res.td.tdn);
+	MLX5_SET(tirc, tirc, transport_domain, mdev->mlx5e_res.hw_objs.td.tdn);
 	MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT);
 	MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_INVERTED_XOR8);
 	MLX5_SET(tirc, tirc, indirect_table, rqtn);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c
index a6cf008057b5..8c166ee56d8b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c
@@ -38,15 +38,16 @@
 
 int mlx5e_create_tir(struct mlx5_core_dev *mdev, struct mlx5e_tir *tir, u32 *in)
 {
+	struct mlx5e_hw_objs *res = &mdev->mlx5e_res.hw_objs;
 	int err;
 
 	err = mlx5_core_create_tir(mdev, in, &tir->tirn);
 	if (err)
 		return err;
 
-	mutex_lock(&mdev->mlx5e_res.td.list_lock);
-	list_add(&tir->list, &mdev->mlx5e_res.td.tirs_list);
-	mutex_unlock(&mdev->mlx5e_res.td.list_lock);
+	mutex_lock(&res->td.list_lock);
+	list_add(&tir->list, &res->td.tirs_list);
+	mutex_unlock(&res->td.list_lock);
 
 	return 0;
 }
@@ -54,10 +55,12 @@ int mlx5e_create_tir(struct mlx5_core_dev *mdev, struct mlx5e_tir *tir, u32 *in)
 void mlx5e_destroy_tir(struct mlx5_core_dev *mdev,
 		       struct mlx5e_tir *tir)
 {
-	mutex_lock(&mdev->mlx5e_res.td.list_lock);
+	struct mlx5e_hw_objs *res = &mdev->mlx5e_res.hw_objs;
+
+	mutex_lock(&res->td.list_lock);
 	mlx5_core_destroy_tir(mdev, tir->tirn);
 	list_del(&tir->list);
-	mutex_unlock(&mdev->mlx5e_res.td.list_lock);
+	mutex_unlock(&res->td.list_lock);
 }
 
 void mlx5e_mkey_set_relaxed_ordering(struct mlx5_core_dev *mdev, void *mkc)
@@ -99,7 +102,7 @@ static int mlx5e_create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
 
 int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev)
 {
-	struct mlx5e_resources *res = &mdev->mlx5e_res;
+	struct mlx5e_hw_objs *res = &mdev->mlx5e_res.hw_objs;
 	int err;
 
 	err = mlx5_core_alloc_pd(mdev, &res->pdn);
@@ -126,8 +129,8 @@ int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev)
 		goto err_destroy_mkey;
 	}
 
-	INIT_LIST_HEAD(&mdev->mlx5e_res.td.tirs_list);
-	mutex_init(&mdev->mlx5e_res.td.list_lock);
+	INIT_LIST_HEAD(&res->td.tirs_list);
+	mutex_init(&res->td.list_lock);
 
 	return 0;
 
@@ -142,7 +145,7 @@ err_dealloc_pd:
 
 void mlx5e_destroy_mdev_resources(struct mlx5_core_dev *mdev)
 {
-	struct mlx5e_resources *res = &mdev->mlx5e_res;
+	struct mlx5e_hw_objs *res = &mdev->mlx5e_res.hw_objs;
 
 	mlx5_free_bfreg(mdev, &res->bfreg);
 	mlx5_core_destroy_mkey(mdev, &res->mkey);
@@ -180,8 +183,8 @@ int mlx5e_refresh_tirs(struct mlx5e_priv *priv, bool enable_uc_lb,
 
 	MLX5_SET(modify_tir_in, in, bitmask.self_lb_en, 1);
 
-	mutex_lock(&mdev->mlx5e_res.td.list_lock);
-	list_for_each_entry(tir, &mdev->mlx5e_res.td.tirs_list, list) {
+	mutex_lock(&mdev->mlx5e_res.hw_objs.td.list_lock);
+	list_for_each_entry(tir, &mdev->mlx5e_res.hw_objs.td.tirs_list, list) {
 		tirn = tir->tirn;
 		err = mlx5_core_modify_tir(mdev, tirn, in);
 		if (err)
@@ -192,7 +195,7 @@ out:
 	kvfree(in);
 	if (err)
 		netdev_err(priv->netdev, "refresh tir(0x%x) failed, %d\n", tirn, err);
-	mutex_unlock(&mdev->mlx5e_res.td.list_lock);
+	mutex_unlock(&mdev->mlx5e_res.hw_objs.td.list_lock);
 
 	return err;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 3e8434dcc1df..2f961bd9e528 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -302,7 +302,7 @@ static int mlx5e_create_umr_mkey(struct mlx5_core_dev *mdev,
 	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
 	mlx5e_mkey_set_relaxed_ordering(mdev, mkc);
 	MLX5_SET(mkc, mkc, qpn, 0xffffff);
-	MLX5_SET(mkc, mkc, pd, mdev->mlx5e_res.pdn);
+	MLX5_SET(mkc, mkc, pd, mdev->mlx5e_res.hw_objs.pdn);
 	MLX5_SET64(mkc, mkc, len, npages << page_shift);
 	MLX5_SET(mkc, mkc, translations_octword_size,
 		 MLX5_MTT_OCTW(npages));
@@ -1019,7 +1019,7 @@ static int mlx5e_alloc_xdpsq(struct mlx5e_channel *c,
 	sq->pdev      = c->pdev;
 	sq->mkey_be   = c->mkey_be;
 	sq->channel   = c;
-	sq->uar_map   = mdev->mlx5e_res.bfreg.map;
+	sq->uar_map   = mdev->mlx5e_res.hw_objs.bfreg.map;
 	sq->min_inline_mode = params->tx_min_inline_mode;
 	sq->hw_mtu    = MLX5E_SW2HW_MTU(params, params->sw_mtu);
 	sq->xsk_pool  = xsk_pool;
@@ -1090,7 +1090,7 @@ static int mlx5e_alloc_icosq(struct mlx5e_channel *c,
 	int err;
 
 	sq->channel   = c;
-	sq->uar_map   = mdev->mlx5e_res.bfreg.map;
+	sq->uar_map   = mdev->mlx5e_res.hw_objs.bfreg.map;
 
 	param->wq.db_numa_node = cpu_to_node(c->cpu);
 	err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, wq, &sq->wq_ctrl);
@@ -1174,7 +1174,7 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c,
 	sq->priv      = c->priv;
 	sq->ch_ix     = c->ix;
 	sq->txq_ix    = txq_ix;
-	sq->uar_map   = mdev->mlx5e_res.bfreg.map;
+	sq->uar_map   = mdev->mlx5e_res.hw_objs.bfreg.map;
 	sq->min_inline_mode = params->tx_min_inline_mode;
 	sq->hw_mtu    = MLX5E_SW2HW_MTU(params, params->sw_mtu);
 	INIT_WORK(&sq->recover_work, mlx5e_tx_err_cqe_work);
@@ -1257,7 +1257,7 @@ static int mlx5e_create_sq(struct mlx5_core_dev *mdev,
 	MLX5_SET(sqc,  sqc, flush_in_error_en, 1);
 
 	MLX5_SET(wq,   wq, wq_type,       MLX5_WQ_TYPE_CYCLIC);
-	MLX5_SET(wq,   wq, uar_page,      mdev->mlx5e_res.bfreg.index);
+	MLX5_SET(wq,   wq, uar_page,      mdev->mlx5e_res.hw_objs.bfreg.index);
 	MLX5_SET(wq,   wq, log_wq_pg_sz,  csp->wq_ctrl->buf.page_shift -
 					  MLX5_ADAPTER_PAGE_SHIFT);
 	MLX5_SET64(wq, wq, dbr_addr,      csp->wq_ctrl->db.dma);
@@ -2032,7 +2032,7 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
 	c->cpu      = cpu;
 	c->pdev     = mlx5_core_dma_dev(priv->mdev);
 	c->netdev   = priv->netdev;
-	c->mkey_be  = cpu_to_be32(priv->mdev->mlx5e_res.mkey.key);
+	c->mkey_be  = cpu_to_be32(priv->mdev->mlx5e_res.hw_objs.mkey.key);
 	c->num_tc   = params->num_tc;
 	c->xdp      = !!params->xdp_prog;
 	c->stats    = &priv->channel_stats[ix].ch;
@@ -2217,7 +2217,7 @@ void mlx5e_build_rq_param(struct mlx5e_priv *priv,
 	MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN);
 	MLX5_SET(wq, wq, log_wq_stride,
 		 mlx5e_get_rqwq_log_stride(params->rq_wq_type, ndsegs));
-	MLX5_SET(wq, wq, pd,               mdev->mlx5e_res.pdn);
+	MLX5_SET(wq, wq, pd,               mdev->mlx5e_res.hw_objs.pdn);
 	MLX5_SET(rqc, rqc, counter_set_id, priv->q_counter);
 	MLX5_SET(rqc, rqc, vsd,            params->vlan_strip_disable);
 	MLX5_SET(rqc, rqc, scatter_fcs,    params->scatter_fcs_en);
@@ -2248,7 +2248,7 @@ void mlx5e_build_sq_param_common(struct mlx5e_priv *priv,
 	void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
 
 	MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB));
-	MLX5_SET(wq, wq, pd,            priv->mdev->mlx5e_res.pdn);
+	MLX5_SET(wq, wq, pd,            priv->mdev->mlx5e_res.hw_objs.pdn);
 
 	param->wq.buf_numa_node = dev_to_node(mlx5_core_dma_dev(priv->mdev));
 }
@@ -3421,10 +3421,10 @@ int mlx5e_create_tis(struct mlx5_core_dev *mdev, void *in, u32 *tisn)
 {
 	void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
 
-	MLX5_SET(tisc, tisc, transport_domain, mdev->mlx5e_res.td.tdn);
+	MLX5_SET(tisc, tisc, transport_domain, mdev->mlx5e_res.hw_objs.td.tdn);
 
 	if (MLX5_GET(tisc, tisc, tls_en))
-		MLX5_SET(tisc, tisc, pd, mdev->mlx5e_res.pdn);
+		MLX5_SET(tisc, tisc, pd, mdev->mlx5e_res.hw_objs.pdn);
 
 	if (mlx5_lag_is_lacp_owner(mdev))
 		MLX5_SET(tisc, tisc, strict_lag_tx_port_affinity, 1);
@@ -3494,7 +3494,7 @@ static void mlx5e_cleanup_nic_tx(struct mlx5e_priv *priv)
 static void mlx5e_build_indir_tir_ctx_common(struct mlx5e_priv *priv,
 					     u32 rqtn, u32 *tirc)
 {
-	MLX5_SET(tirc, tirc, transport_domain, priv->mdev->mlx5e_res.td.tdn);
+	MLX5_SET(tirc, tirc, transport_domain, priv->mdev->mlx5e_res.hw_objs.td.tdn);
 	MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT);
 	MLX5_SET(tirc, tirc, indirect_table, rqtn);
 	MLX5_SET(tirc, tirc, tunneled_offload_en,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
index 1eeca45cfcdf..0fc055cdf221 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
@@ -708,7 +708,7 @@ static void mlx5_rdma_netdev_free(struct net_device *netdev)
 
 static bool mlx5_is_sub_interface(struct mlx5_core_dev *mdev)
 {
-	return mdev->mlx5e_res.pdn != 0;
+	return mdev->mlx5e_res.hw_objs.pdn != 0;
 }
 
 static const struct mlx5e_profile *mlx5_get_profile(struct mlx5_core_dev *mdev)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/crypto.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/crypto.c
index 57eb91bcbca7..e995f8378df7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/crypto.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/crypto.c
@@ -46,7 +46,7 @@ int mlx5_create_encryption_key(struct mlx5_core_dev *mdev,
 		 MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
 	MLX5_SET(general_obj_in_cmd_hdr, in, obj_type,
 		 MLX5_GENERAL_OBJECT_TYPES_ENCRYPTION_KEY);
-	MLX5_SET(encryption_key_obj, obj, pd, mdev->mlx5e_res.pdn);
+	MLX5_SET(encryption_key_obj, obj, pd, mdev->mlx5e_res.hw_objs.pdn);
 
 	err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
 	if (!err)
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 53b89631a1d9..9887181dea5f 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -644,10 +644,12 @@ struct mlx5_td {
 };
 
 struct mlx5e_resources {
-	u32                        pdn;
-	struct mlx5_td             td;
-	struct mlx5_core_mkey      mkey;
-	struct mlx5_sq_bfreg       bfreg;
+	struct mlx5e_hw_objs {
+		u32                        pdn;
+		struct mlx5_td             td;
+		struct mlx5_core_mkey      mkey;
+		struct mlx5_sq_bfreg       bfreg;
+	} hw_objs;
 };
 
 enum mlx5_sw_icm_type {
-- 
cgit v1.2.3


From c27971d08abecc91f06214dacc66ce3ce2662a44 Mon Sep 17 00:00:00 2001
From: Roi Dayan <roid@nvidia.com>
Date: Wed, 28 Oct 2020 11:21:26 +0200
Subject: net/mlx5: Move devlink port from mlx5e priv to mlx5e resources

We re-use the native NIC port net device instance for the Uplink
representor, and the devlink port.
When changing profiles we reset the mlx5e priv, but we should still
use the devlink port, so move it to mlx5e resources.

Signed-off-by: Roi Dayan <roid@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h            |  1 -
 drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c    | 17 ++++++++++++-----
 drivers/net/ethernet/mellanox/mlx5/core/en/devlink.h    |  6 ++++++
 .../net/ethernet/mellanox/mlx5/core/en/reporter_rx.c    |  4 +++-
 .../net/ethernet/mellanox/mlx5/core/en/reporter_tx.c    |  4 +++-
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c         |  5 ++++-
 include/linux/mlx5/driver.h                             |  1 +
 7 files changed, 29 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 7435fe6829b6..4d621d142f76 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -880,7 +880,6 @@ struct mlx5e_priv {
 #endif
 	struct devlink_health_reporter *tx_reporter;
 	struct devlink_health_reporter *rx_reporter;
-	struct devlink_port            dl_port;
 	struct mlx5e_xsk           xsk;
 #if IS_ENABLED(CONFIG_PCI_HYPERV_INTERFACE)
 	struct mlx5e_hv_vhca_stats_agent stats_agent;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c
index 054bc2fc0520..765f3064689d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c
@@ -19,6 +19,7 @@ int mlx5e_devlink_port_register(struct mlx5e_priv *priv)
 	struct devlink *devlink = priv_to_devlink(priv->mdev);
 	struct devlink_port_attrs attrs = {};
 	struct netdev_phys_item_id ppid = {};
+	struct devlink_port *dl_port;
 	unsigned int dl_port_index;
 
 	if (mlx5_core_is_pf(priv->mdev)) {
@@ -36,24 +37,30 @@ int mlx5e_devlink_port_register(struct mlx5e_priv *priv)
 		dl_port_index = mlx5_esw_vport_to_devlink_port_index(priv->mdev, 0);
 	}
 
-	devlink_port_attrs_set(&priv->dl_port, &attrs);
+	dl_port = mlx5e_devlink_get_dl_port(priv);
+	memset(dl_port, 0, sizeof(*dl_port));
+	devlink_port_attrs_set(dl_port, &attrs);
 
-	return devlink_port_register(devlink, &priv->dl_port, dl_port_index);
+	return devlink_port_register(devlink, dl_port, dl_port_index);
 }
 
 void mlx5e_devlink_port_type_eth_set(struct mlx5e_priv *priv)
 {
-	devlink_port_type_eth_set(&priv->dl_port, priv->netdev);
+	struct devlink_port *dl_port = mlx5e_devlink_get_dl_port(priv);
+
+	devlink_port_type_eth_set(dl_port, priv->netdev);
 }
 
 void mlx5e_devlink_port_unregister(struct mlx5e_priv *priv)
 {
-	devlink_port_unregister(&priv->dl_port);
+	struct devlink_port *dl_port = mlx5e_devlink_get_dl_port(priv);
+
+	devlink_port_unregister(dl_port);
 }
 
 struct devlink_port *mlx5e_get_devlink_port(struct net_device *dev)
 {
 	struct mlx5e_priv *priv = netdev_priv(dev);
 
-	return &priv->dl_port;
+	return mlx5e_devlink_get_dl_port(priv);
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.h b/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.h
index 83123a801adc..10b50feb9883 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.h
@@ -12,4 +12,10 @@ void mlx5e_devlink_port_unregister(struct mlx5e_priv *priv);
 void mlx5e_devlink_port_type_eth_set(struct mlx5e_priv *priv);
 struct devlink_port *mlx5e_get_devlink_port(struct net_device *dev);
 
+static inline struct devlink_port *
+mlx5e_devlink_get_dl_port(struct mlx5e_priv *priv)
+{
+	return &priv->mdev->mlx5e_res.dl_port;
+}
+
 #endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c
index d80bbd17e5f8..f0a419fc4adf 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c
@@ -4,6 +4,7 @@
 #include "health.h"
 #include "params.h"
 #include "txrx.h"
+#include "devlink.h"
 
 static int mlx5e_query_rq_state(struct mlx5_core_dev *dev, u32 rqn, u8 *state)
 {
@@ -615,9 +616,10 @@ static const struct devlink_health_reporter_ops mlx5_rx_reporter_ops = {
 
 void mlx5e_reporter_rx_create(struct mlx5e_priv *priv)
 {
+	struct devlink_port *dl_port = mlx5e_devlink_get_dl_port(priv);
 	struct devlink_health_reporter *reporter;
 
-	reporter = devlink_port_health_reporter_create(&priv->dl_port, &mlx5_rx_reporter_ops,
+	reporter = devlink_port_health_reporter_create(dl_port, &mlx5_rx_reporter_ops,
 						       MLX5E_REPORTER_RX_GRACEFUL_PERIOD, priv);
 	if (IS_ERR(reporter)) {
 		netdev_warn(priv->netdev, "Failed to create rx reporter, err = %ld\n",
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
index d7275c84313e..db64fa2620c4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
@@ -3,6 +3,7 @@
 
 #include "health.h"
 #include "en/ptp.h"
+#include "en/devlink.h"
 
 static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq)
 {
@@ -572,9 +573,10 @@ static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = {
 
 void mlx5e_reporter_tx_create(struct mlx5e_priv *priv)
 {
+	struct devlink_port *dl_port = mlx5e_devlink_get_dl_port(priv);
 	struct devlink_health_reporter *reporter;
 
-	reporter = devlink_port_health_reporter_create(&priv->dl_port, &mlx5_tx_reporter_ops,
+	reporter = devlink_port_health_reporter_create(dl_port, &mlx5_tx_reporter_ops,
 						       MLX5_REPORTER_TX_GRACEFUL_PERIOD, priv);
 	if (IS_ERR(reporter)) {
 		netdev_warn(priv->netdev,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 1f15c6183dc1..b0604b113530 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -52,6 +52,7 @@
 #include "en/health.h"
 #include "en/params.h"
 #include "devlink.h"
+#include "en/devlink.h"
 
 static struct sk_buff *
 mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
@@ -1823,6 +1824,7 @@ static void mlx5e_trap_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe
 	struct mlx5e_priv *priv = netdev_priv(rq->netdev);
 	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
 	struct mlx5e_wqe_frag_info *wi;
+	struct devlink_port *dl_port;
 	struct sk_buff *skb;
 	u32 cqe_bcnt;
 	u16 trap_id;
@@ -1845,7 +1847,8 @@ static void mlx5e_trap_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe
 	mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
 	skb_push(skb, ETH_HLEN);
 
-	mlx5_devlink_trap_report(rq->mdev, trap_id, skb, &priv->dl_port);
+	dl_port = mlx5e_devlink_get_dl_port(priv);
+	mlx5_devlink_trap_report(rq->mdev, trap_id, skb, dl_port);
 	dev_kfree_skb_any(skb);
 
 free_wqe:
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 9887181dea5f..f1d0340e46a7 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -650,6 +650,7 @@ struct mlx5e_resources {
 		struct mlx5_core_mkey      mkey;
 		struct mlx5_sq_bfreg       bfreg;
 	} hw_objs;
+	struct devlink_port dl_port;
 };
 
 enum mlx5_sw_icm_type {
-- 
cgit v1.2.3


From 7a9fb35e8c3a67145fca262c304de65cb2f83abf Mon Sep 17 00:00:00 2001
From: Roi Dayan <roid@nvidia.com>
Date: Wed, 16 Sep 2020 10:11:33 +0300
Subject: net/mlx5e: Do not reload ethernet ports when changing eswitch mode

When switching modes between legacy and switchdev and back, do not
reload ethernet interfaces. Just change the profile from nic profile
to uplink rep profile in switchdev mode.

Signed-off-by: Roi Dayan <roid@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/dev.c      |   3 -
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |   1 +
 .../ethernet/mellanox/mlx5/core/en/reporter_rx.c   |   1 +
 .../ethernet/mellanox/mlx5/core/en/reporter_tx.c   |   1 +
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |   6 +
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c   | 148 ++++++++++++++-------
 drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h |   9 ++
 include/linux/mlx5/driver.h                        |   1 +
 8 files changed, 116 insertions(+), 54 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/dev.c b/drivers/net/ethernet/mellanox/mlx5/core/dev.c
index b051417ede67..4def64d0e669 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/dev.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/dev.c
@@ -58,9 +58,6 @@ static bool is_eth_supported(struct mlx5_core_dev *dev)
 	if (!IS_ENABLED(CONFIG_MLX5_CORE_EN))
 		return false;
 
-	if (is_eth_rep_supported(dev))
-		return false;
-
 	if (MLX5_CAP_GEN(dev, port_type) != MLX5_CAP_PORT_TYPE_ETH)
 		return false;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 4d621d142f76..1f5bc4d91060 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -1173,6 +1173,7 @@ void mlx5e_detach_netdev(struct mlx5e_priv *priv);
 void mlx5e_destroy_netdev(struct mlx5e_priv *priv);
 int mlx5e_netdev_change_profile(struct mlx5e_priv *priv,
 				const struct mlx5e_profile *new_profile, void *new_ppriv);
+void mlx5e_netdev_attach_nic_profile(struct mlx5e_priv *priv);
 void mlx5e_set_netdev_mtu_boundaries(struct mlx5e_priv *priv);
 void mlx5e_build_nic_params(struct mlx5e_priv *priv, struct mlx5e_xsk *xsk, u16 mtu);
 void mlx5e_build_rq_params(struct mlx5_core_dev *mdev,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c
index f0a419fc4adf..34b3b316b688 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c
@@ -635,4 +635,5 @@ void mlx5e_reporter_rx_destroy(struct mlx5e_priv *priv)
 		return;
 
 	devlink_port_health_reporter_destroy(priv->rx_reporter);
+	priv->rx_reporter = NULL;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
index db64fa2620c4..63ee3b9416de 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
@@ -593,4 +593,5 @@ void mlx5e_reporter_tx_destroy(struct mlx5e_priv *priv)
 		return;
 
 	devlink_port_health_reporter_destroy(priv->tx_reporter);
+	priv->tx_reporter = NULL;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 685cf071a9de..9c08f0bd1fcc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -5742,6 +5742,11 @@ rollback:
 	return err;
 }
 
+void mlx5e_netdev_attach_nic_profile(struct mlx5e_priv *priv)
+{
+	mlx5e_netdev_change_profile(priv, &mlx5e_nic_profile, NULL);
+}
+
 void mlx5e_destroy_netdev(struct mlx5e_priv *priv)
 {
 	struct net_device *netdev = priv->netdev;
@@ -5852,6 +5857,7 @@ static int mlx5e_probe(struct auxiliary_device *adev,
 	mlx5e_devlink_port_type_eth_set(priv);
 
 	mlx5e_dcbnl_init_app(priv);
+	mlx5_uplink_netdev_set(mdev, netdev);
 	return 0;
 
 err_resume:
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 9533085005c3..4cc902e0d71b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -44,6 +44,7 @@
 #include "en_tc.h"
 #include "en/rep/tc.h"
 #include "en/rep/neigh.h"
+#include "en/devlink.h"
 #include "fs_core.h"
 #include "lib/mlx5.h"
 #define CREATE_TRACE_POINTS
@@ -588,26 +589,15 @@ static void mlx5e_build_rep_params(struct net_device *netdev)
 }
 
 static void mlx5e_build_rep_netdev(struct net_device *netdev,
-				   struct mlx5_core_dev *mdev,
-				   struct mlx5_eswitch_rep *rep)
+				   struct mlx5_core_dev *mdev)
 {
 	SET_NETDEV_DEV(netdev, mdev->device);
-	if (rep->vport == MLX5_VPORT_UPLINK) {
-		netdev->netdev_ops = &mlx5e_netdev_ops;
-		/* we want a persistent mac for the uplink rep */
-		mlx5_query_mac_address(mdev, netdev->dev_addr);
-		netdev->ethtool_ops = &mlx5e_ethtool_ops;
-		mlx5e_dcbnl_build_rep_netdev(netdev);
-	} else {
-		netdev->netdev_ops = &mlx5e_netdev_ops_rep;
-		eth_hw_addr_random(netdev);
-		netdev->ethtool_ops = &mlx5e_rep_ethtool_ops;
-	}
+	netdev->netdev_ops = &mlx5e_netdev_ops_rep;
+	eth_hw_addr_random(netdev);
+	netdev->ethtool_ops = &mlx5e_rep_ethtool_ops;
 
 	netdev->watchdog_timeo    = 15 * HZ;
 
-	netdev->features       |= NETIF_F_NETNS_LOCAL;
-
 #if IS_ENABLED(CONFIG_MLX5_CLS_ACT)
 	netdev->hw_features    |= NETIF_F_HW_TC;
 #endif
@@ -619,12 +609,9 @@ static void mlx5e_build_rep_netdev(struct net_device *netdev,
 	netdev->hw_features    |= NETIF_F_TSO6;
 	netdev->hw_features    |= NETIF_F_RXCSUM;
 
-	if (rep->vport == MLX5_VPORT_UPLINK)
-		netdev->hw_features |= NETIF_F_HW_VLAN_CTAG_RX;
-	else
-		netdev->features |= NETIF_F_VLAN_CHALLENGED;
-
 	netdev->features |= netdev->hw_features;
+	netdev->features |= NETIF_F_VLAN_CHALLENGED;
+	netdev->features |= NETIF_F_NETNS_LOCAL;
 }
 
 static int mlx5e_init_rep(struct mlx5_core_dev *mdev,
@@ -990,6 +977,14 @@ static void mlx5e_uplink_rep_enable(struct mlx5e_priv *priv)
 	mlx5e_dcbnl_initialize(priv);
 	mlx5e_dcbnl_init_app(priv);
 	mlx5e_rep_neigh_init(rpriv);
+
+	netdev->wanted_features |= NETIF_F_HW_TC;
+
+	rtnl_lock();
+	if (netif_running(netdev))
+		mlx5e_open(netdev);
+	netif_device_attach(netdev);
+	rtnl_unlock();
 }
 
 static void mlx5e_uplink_rep_disable(struct mlx5e_priv *priv)
@@ -997,6 +992,12 @@ static void mlx5e_uplink_rep_disable(struct mlx5e_priv *priv)
 	struct mlx5e_rep_priv *rpriv = priv->ppriv;
 	struct mlx5_core_dev *mdev = priv->mdev;
 
+	rtnl_lock();
+	if (netif_running(priv->netdev))
+		mlx5e_close(priv->netdev);
+	netif_device_detach(priv->netdev);
+	rtnl_unlock();
+
 	mlx5e_rep_neigh_cleanup(rpriv);
 	mlx5e_dcbnl_delete_app(priv);
 	mlx5_notifier_unregister(mdev, &priv->events_nb);
@@ -1081,26 +1082,56 @@ static const struct mlx5e_profile mlx5e_uplink_rep_profile = {
 
 /* e-Switch vport representors */
 static int
-mlx5e_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
+mlx5e_vport_uplink_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
+{
+	struct mlx5e_priv *priv = netdev_priv(mlx5_uplink_netdev_get(dev));
+	struct mlx5e_rep_priv *rpriv = mlx5e_rep_to_rep_priv(rep);
+	struct devlink_port *dl_port;
+	int err;
+
+	rpriv->netdev = priv->netdev;
+
+	err = mlx5e_netdev_change_profile(priv, &mlx5e_uplink_rep_profile,
+					  rpriv);
+	if (err)
+		return err;
+
+	dl_port = mlx5_esw_offloads_devlink_port(dev->priv.eswitch, rpriv->rep->vport);
+	if (dl_port)
+		devlink_port_type_eth_set(dl_port, rpriv->netdev);
+
+	return 0;
+}
+
+static void
+mlx5e_vport_uplink_rep_unload(struct mlx5e_rep_priv *rpriv)
+{
+	struct net_device *netdev = rpriv->netdev;
+	struct devlink_port *dl_port;
+	struct mlx5_core_dev *dev;
+	struct mlx5e_priv *priv;
+
+	priv = netdev_priv(netdev);
+	dev = priv->mdev;
+
+	dl_port = mlx5_esw_offloads_devlink_port(dev->priv.eswitch, rpriv->rep->vport);
+	if (dl_port)
+		devlink_port_type_clear(dl_port);
+	mlx5e_netdev_attach_nic_profile(priv);
+}
+
+static int
+mlx5e_vport_vf_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
 {
+	struct mlx5e_rep_priv *rpriv = mlx5e_rep_to_rep_priv(rep);
 	const struct mlx5e_profile *profile;
-	struct mlx5e_rep_priv *rpriv;
 	struct devlink_port *dl_port;
 	struct net_device *netdev;
 	struct mlx5e_priv *priv;
 	unsigned int txqs, rxqs;
 	int nch, err;
 
-	rpriv = kzalloc(sizeof(*rpriv), GFP_KERNEL);
-	if (!rpriv)
-		return -ENOMEM;
-
-	/* rpriv->rep to be looked up when profile->init() is called */
-	rpriv->rep = rep;
-
-	profile = (rep->vport == MLX5_VPORT_UPLINK) ?
-		  &mlx5e_uplink_rep_profile : &mlx5e_rep_profile;
-
+	profile = &mlx5e_rep_profile;
 	nch = mlx5e_get_max_num_channels(dev);
 	txqs = nch * profile->max_tc;
 	rxqs = nch * profile->rq_groups;
@@ -1109,21 +1140,11 @@ mlx5e_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
 		mlx5_core_warn(dev,
 			       "Failed to create representor netdev for vport %d\n",
 			       rep->vport);
-		kfree(rpriv);
 		return -EINVAL;
 	}
 
-	mlx5e_build_rep_netdev(netdev, dev, rep);
-
+	mlx5e_build_rep_netdev(netdev, dev);
 	rpriv->netdev = netdev;
-	rep->rep_data[REP_ETH].priv = rpriv;
-	INIT_LIST_HEAD(&rpriv->vport_sqs_list);
-
-	if (rep->vport == MLX5_VPORT_UPLINK) {
-		err = mlx5e_create_mdev_resources(dev);
-		if (err)
-			goto err_destroy_netdev;
-	}
 
 	priv = netdev_priv(netdev);
 	priv->profile = profile;
@@ -1131,7 +1152,7 @@ mlx5e_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
 	err = profile->init(dev, netdev);
 	if (err) {
 		netdev_warn(netdev, "rep profile init failed, %d\n", err);
-		goto err_destroy_mdev_resources;
+		goto err_destroy_netdev;
 	}
 
 	err = mlx5e_attach_netdev(netdev_priv(netdev));
@@ -1161,13 +1182,34 @@ err_detach_netdev:
 err_cleanup_profile:
 	priv->profile->cleanup(priv);
 
-err_destroy_mdev_resources:
-	if (rep->vport == MLX5_VPORT_UPLINK)
-		mlx5e_destroy_mdev_resources(dev);
-
 err_destroy_netdev:
 	mlx5e_destroy_netdev(netdev_priv(netdev));
-	kfree(rpriv);
+	return err;
+}
+
+static int
+mlx5e_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
+{
+	struct mlx5e_rep_priv *rpriv;
+	int err;
+
+	rpriv = kzalloc(sizeof(*rpriv), GFP_KERNEL);
+	if (!rpriv)
+		return -ENOMEM;
+
+	/* rpriv->rep to be looked up when profile->init() is called */
+	rpriv->rep = rep;
+	rep->rep_data[REP_ETH].priv = rpriv;
+	INIT_LIST_HEAD(&rpriv->vport_sqs_list);
+
+	if (rep->vport == MLX5_VPORT_UPLINK)
+		err = mlx5e_vport_uplink_rep_load(dev, rep);
+	else
+		err = mlx5e_vport_vf_rep_load(dev, rep);
+
+	if (err)
+		kfree(rpriv);
+
 	return err;
 }
 
@@ -1181,15 +1223,19 @@ mlx5e_vport_rep_unload(struct mlx5_eswitch_rep *rep)
 	struct devlink_port *dl_port;
 	void *ppriv = priv->ppriv;
 
+	if (rep->vport == MLX5_VPORT_UPLINK) {
+		mlx5e_vport_uplink_rep_unload(rpriv);
+		goto free_ppriv;
+	}
+
 	dl_port = mlx5_esw_offloads_devlink_port(dev->priv.eswitch, rpriv->rep->vport);
 	if (dl_port)
 		devlink_port_type_clear(dl_port);
 	unregister_netdev(netdev);
 	mlx5e_detach_netdev(priv);
 	priv->profile->cleanup(priv);
-	if (rep->vport == MLX5_VPORT_UPLINK)
-		mlx5e_destroy_mdev_resources(priv->mdev);
 	mlx5e_destroy_netdev(priv);
+free_ppriv:
 	kfree(ppriv); /* mlx5e_rep_priv */
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
index d046db7bb047..2f536c5d30b1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
@@ -95,4 +95,13 @@ static inline struct net *mlx5_core_net(struct mlx5_core_dev *dev)
 	return devlink_net(priv_to_devlink(dev));
 }
 
+static inline void mlx5_uplink_netdev_set(struct mlx5_core_dev *mdev, struct net_device *netdev)
+{
+	mdev->mlx5e_res.uplink_netdev = netdev;
+}
+
+static inline struct net_device *mlx5_uplink_netdev_get(struct mlx5_core_dev *mdev)
+{
+	return mdev->mlx5e_res.uplink_netdev;
+}
 #endif
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index f1d0340e46a7..23bb01d7c9b9 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -651,6 +651,7 @@ struct mlx5e_resources {
 		struct mlx5_sq_bfreg       bfreg;
 	} hw_objs;
 	struct devlink_port dl_port;
+	struct net_device *uplink_netdev;
 };
 
 enum mlx5_sw_icm_type {
-- 
cgit v1.2.3


From 2973073a80b46daebc352c31d09d95d16cf6876e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 1 Mar 2021 08:44:24 +0100
Subject: swiotlb: remove the alloc_size parameter to swiotlb_tbl_unmap_single

Now that swiotlb remembers the allocation size there is no need to pass
it back to swiotlb_tbl_unmap_single.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/iommu/dma-iommu.c | 11 +++--------
 drivers/xen/swiotlb-xen.c |  4 ++--
 include/linux/swiotlb.h   |  1 -
 kernel/dma/direct.h       |  2 +-
 kernel/dma/swiotlb.c      | 45 +++++++++++++++++++++++----------------------
 5 files changed, 29 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index af765c813cc8..9149597410e2 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -499,8 +499,6 @@ static void __iommu_dma_unmap_swiotlb(struct device *dev, dma_addr_t dma_addr,
 		unsigned long attrs)
 {
 	struct iommu_domain *domain = iommu_get_dma_domain(dev);
-	struct iommu_dma_cookie *cookie = domain->iova_cookie;
-	struct iova_domain *iovad = &cookie->iovad;
 	phys_addr_t phys;
 
 	phys = iommu_iova_to_phys(domain, dma_addr);
@@ -510,8 +508,7 @@ static void __iommu_dma_unmap_swiotlb(struct device *dev, dma_addr_t dma_addr,
 	__iommu_dma_unmap(dev, dma_addr, size);
 
 	if (unlikely(is_swiotlb_buffer(phys)))
-		swiotlb_tbl_unmap_single(dev, phys, size,
-				iova_align(iovad, size), dir, attrs);
+		swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
 }
 
 static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys,
@@ -581,10 +578,8 @@ static dma_addr_t __iommu_dma_map_swiotlb(struct device *dev, phys_addr_t phys,
 	}
 
 	iova = __iommu_dma_map(dev, phys, aligned_size, prot, dma_mask);
-	if ((iova == DMA_MAPPING_ERROR) && is_swiotlb_buffer(phys))
-		swiotlb_tbl_unmap_single(dev, phys, org_size,
-				aligned_size, dir, attrs);
-
+	if (iova == DMA_MAPPING_ERROR && is_swiotlb_buffer(phys))
+		swiotlb_tbl_unmap_single(dev, phys, org_size, dir, attrs);
 	return iova;
 }
 
diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index 2b385c1b4a99..d47f1b311caa 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -406,7 +406,7 @@ static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page,
 	 * Ensure that the address returned is DMA'ble
 	 */
 	if (unlikely(!dma_capable(dev, dev_addr, size, true))) {
-		swiotlb_tbl_unmap_single(dev, map, size, size, dir,
+		swiotlb_tbl_unmap_single(dev, map, size, dir,
 				attrs | DMA_ATTR_SKIP_CPU_SYNC);
 		return DMA_MAPPING_ERROR;
 	}
@@ -445,7 +445,7 @@ static void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
 
 	/* NOTE: We use dev_addr here, not paddr! */
 	if (is_xen_swiotlb_buffer(hwdev, dev_addr))
-		swiotlb_tbl_unmap_single(hwdev, paddr, size, size, dir, attrs);
+		swiotlb_tbl_unmap_single(hwdev, paddr, size, dir, attrs);
 }
 
 static void
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 5857a937c637..59f421d041ed 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -57,7 +57,6 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys,
 extern void swiotlb_tbl_unmap_single(struct device *hwdev,
 				     phys_addr_t tlb_addr,
 				     size_t mapping_size,
-				     size_t alloc_size,
 				     enum dma_data_direction dir,
 				     unsigned long attrs);
 
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index b98615578737..e1bf721591c0 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -114,6 +114,6 @@ static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
 		dma_direct_sync_single_for_cpu(dev, addr, size, dir);
 
 	if (unlikely(is_swiotlb_buffer(phys)))
-		swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs);
+		swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
 }
 #endif /* _KERNEL_DMA_DIRECT_H */
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index c10e855a03bc..03aa614565e4 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -102,7 +102,7 @@ static phys_addr_t *io_tlb_orig_addr;
 /*
  * The mapped buffer's size should be validated during a sync operation.
  */
-static size_t *io_tlb_orig_size;
+static size_t *io_tlb_alloc_size;
 
 /*
  * Protect the above data structures in the map and unmap calls
@@ -253,15 +253,15 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
 		      __func__, alloc_size, PAGE_SIZE);
 
 	alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(size_t));
-	io_tlb_orig_size = memblock_alloc(alloc_size, PAGE_SIZE);
-	if (!io_tlb_orig_size)
+	io_tlb_alloc_size = memblock_alloc(alloc_size, PAGE_SIZE);
+	if (!io_tlb_alloc_size)
 		panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
 		      __func__, alloc_size, PAGE_SIZE);
 
 	for (i = 0; i < io_tlb_nslabs; i++) {
 		io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i);
 		io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;
-		io_tlb_orig_size[i] = 0;
+		io_tlb_alloc_size[i] = 0;
 	}
 	io_tlb_index = 0;
 	no_iotlb_memory = false;
@@ -393,18 +393,18 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
 	if (!io_tlb_orig_addr)
 		goto cleanup4;
 
-	io_tlb_orig_size = (size_t *)
+	io_tlb_alloc_size = (size_t *)
 		__get_free_pages(GFP_KERNEL,
 				 get_order(io_tlb_nslabs *
 					   sizeof(size_t)));
-	if (!io_tlb_orig_size)
+	if (!io_tlb_alloc_size)
 		goto cleanup5;
 
 
 	for (i = 0; i < io_tlb_nslabs; i++) {
 		io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i);
 		io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;
-		io_tlb_orig_size[i] = 0;
+		io_tlb_alloc_size[i] = 0;
 	}
 	io_tlb_index = 0;
 	no_iotlb_memory = false;
@@ -436,7 +436,7 @@ void __init swiotlb_exit(void)
 		return;
 
 	if (late_alloc) {
-		free_pages((unsigned long)io_tlb_orig_size,
+		free_pages((unsigned long)io_tlb_alloc_size,
 			   get_order(io_tlb_nslabs * sizeof(size_t)));
 		free_pages((unsigned long)io_tlb_orig_addr,
 			   get_order(io_tlb_nslabs * sizeof(phys_addr_t)));
@@ -447,7 +447,7 @@ void __init swiotlb_exit(void)
 	} else {
 		memblock_free_late(__pa(io_tlb_orig_addr),
 				   PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)));
-		memblock_free_late(__pa(io_tlb_orig_size),
+		memblock_free_late(__pa(io_tlb_alloc_size),
 				   PAGE_ALIGN(io_tlb_nslabs * sizeof(size_t)));
 		memblock_free_late(__pa(io_tlb_list),
 				   PAGE_ALIGN(io_tlb_nslabs * sizeof(int)));
@@ -639,7 +639,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
 	 */
 	for (i = 0; i < nr_slots(alloc_size + offset); i++) {
 		io_tlb_orig_addr[index + i] = slot_addr(orig_addr, i);
-		io_tlb_orig_size[index+i] = alloc_size - (i << IO_TLB_SHIFT);
+		io_tlb_alloc_size[index+i] = alloc_size - (i << IO_TLB_SHIFT);
 	}
 	tlb_addr = slot_addr(io_tlb_start, index) + offset;
 	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
@@ -648,14 +648,14 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
 	return tlb_addr;
 }
 
-static void validate_sync_size_and_truncate(struct device *hwdev, size_t orig_size, size_t *size)
+static void validate_sync_size_and_truncate(struct device *hwdev, size_t alloc_size, size_t *size)
 {
-	if (*size > orig_size) {
+	if (*size > alloc_size) {
 		/* Warn and truncate mapping_size */
 		dev_WARN_ONCE(hwdev, 1,
 			"Attempt for buffer overflow. Original size: %zu. Mapping size: %zu.\n",
-			orig_size, *size);
-		*size = orig_size;
+			alloc_size, *size);
+		*size = alloc_size;
 	}
 }
 
@@ -663,16 +663,17 @@ static void validate_sync_size_and_truncate(struct device *hwdev, size_t orig_si
  * tlb_addr is the physical address of the bounce buffer to unmap.
  */
 void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
-			      size_t mapping_size, size_t alloc_size,
-			      enum dma_data_direction dir, unsigned long attrs)
+			      size_t mapping_size, enum dma_data_direction dir,
+			      unsigned long attrs)
 {
 	unsigned long flags;
 	unsigned int offset = swiotlb_align_offset(hwdev, tlb_addr);
-	int i, count, nslots = nr_slots(alloc_size + offset);
 	int index = (tlb_addr - offset - io_tlb_start) >> IO_TLB_SHIFT;
 	phys_addr_t orig_addr = io_tlb_orig_addr[index];
+	size_t alloc_size = io_tlb_alloc_size[index];
+	int i, count, nslots = nr_slots(alloc_size + offset);
 
-	validate_sync_size_and_truncate(hwdev, io_tlb_orig_size[index], &mapping_size);
+	validate_sync_size_and_truncate(hwdev, alloc_size, &mapping_size);
 
 	/*
 	 * First, sync the memory before unmapping the entry
@@ -701,7 +702,7 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
 	for (i = index + nslots - 1; i >= index; i--) {
 		io_tlb_list[i] = ++count;
 		io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;
-		io_tlb_orig_size[i] = 0;
+		io_tlb_alloc_size[i] = 0;
 	}
 
 	/*
@@ -721,13 +722,13 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr,
 			     enum dma_sync_target target)
 {
 	int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT;
-	size_t orig_size = io_tlb_orig_size[index];
+	size_t alloc_size = io_tlb_alloc_size[index];
 	phys_addr_t orig_addr = io_tlb_orig_addr[index];
 
 	if (orig_addr == INVALID_PHYS_ADDR)
 		return;
 
-	validate_sync_size_and_truncate(hwdev, orig_size, &size);
+	validate_sync_size_and_truncate(hwdev, alloc_size, &size);
 
 	switch (target) {
 	case SYNC_FOR_CPU:
@@ -770,7 +771,7 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
 	/* Ensure that the address returned is DMA'ble */
 	dma_addr = phys_to_dma_unencrypted(dev, swiotlb_addr);
 	if (unlikely(!dma_capable(dev, dma_addr, size, true))) {
-		swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, size, dir,
+		swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, dir,
 			attrs | DMA_ATTR_SKIP_CPU_SYNC);
 		dev_WARN_ONCE(dev, 1,
 			"swiotlb addr %pad+%zu overflow (mask %llx, bus limit %llx).\n",
-- 
cgit v1.2.3


From 80808d273a3f075196d1a26463f65d4c9d2891c8 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 1 Mar 2021 08:44:26 +0100
Subject: swiotlb: split swiotlb_tbl_sync_single

Split swiotlb_tbl_sync_single into two separate functions for the to-device
and to-cpu synchronization.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/iommu/dma-iommu.c | 12 ++++++------
 drivers/xen/swiotlb-xen.c |  4 ++--
 include/linux/swiotlb.h   | 17 ++++-------------
 kernel/dma/direct.c       |  8 ++++----
 kernel/dma/direct.h       |  4 ++--
 kernel/dma/swiotlb.c      | 34 +++++++++++++++-------------------
 6 files changed, 33 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 9149597410e2..3087d9fa6065 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -750,7 +750,7 @@ static void iommu_dma_sync_single_for_cpu(struct device *dev,
 		arch_sync_dma_for_cpu(phys, size, dir);
 
 	if (is_swiotlb_buffer(phys))
-		swiotlb_tbl_sync_single(dev, phys, size, dir, SYNC_FOR_CPU);
+		swiotlb_sync_single_for_cpu(dev, phys, size, dir);
 }
 
 static void iommu_dma_sync_single_for_device(struct device *dev,
@@ -763,7 +763,7 @@ static void iommu_dma_sync_single_for_device(struct device *dev,
 
 	phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle);
 	if (is_swiotlb_buffer(phys))
-		swiotlb_tbl_sync_single(dev, phys, size, dir, SYNC_FOR_DEVICE);
+		swiotlb_sync_single_for_device(dev, phys, size, dir);
 
 	if (!dev_is_dma_coherent(dev))
 		arch_sync_dma_for_device(phys, size, dir);
@@ -784,8 +784,8 @@ static void iommu_dma_sync_sg_for_cpu(struct device *dev,
 			arch_sync_dma_for_cpu(sg_phys(sg), sg->length, dir);
 
 		if (is_swiotlb_buffer(sg_phys(sg)))
-			swiotlb_tbl_sync_single(dev, sg_phys(sg), sg->length,
-						dir, SYNC_FOR_CPU);
+			swiotlb_sync_single_for_cpu(dev, sg_phys(sg),
+						    sg->length, dir);
 	}
 }
 
@@ -801,8 +801,8 @@ static void iommu_dma_sync_sg_for_device(struct device *dev,
 
 	for_each_sg(sgl, sg, nelems, i) {
 		if (is_swiotlb_buffer(sg_phys(sg)))
-			swiotlb_tbl_sync_single(dev, sg_phys(sg), sg->length,
-						dir, SYNC_FOR_DEVICE);
+			swiotlb_sync_single_for_device(dev, sg_phys(sg),
+						       sg->length, dir);
 
 		if (!dev_is_dma_coherent(dev))
 			arch_sync_dma_for_device(sg_phys(sg), sg->length, dir);
diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index d47f1b311caa..4e8a4e14942a 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -462,7 +462,7 @@ xen_swiotlb_sync_single_for_cpu(struct device *dev, dma_addr_t dma_addr,
 	}
 
 	if (is_xen_swiotlb_buffer(dev, dma_addr))
-		swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU);
+		swiotlb_sync_single_for_cpu(dev, paddr, size, dir);
 }
 
 static void
@@ -472,7 +472,7 @@ xen_swiotlb_sync_single_for_device(struct device *dev, dma_addr_t dma_addr,
 	phys_addr_t paddr = xen_dma_to_phys(dev, dma_addr);
 
 	if (is_xen_swiotlb_buffer(dev, dma_addr))
-		swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE);
+		swiotlb_sync_single_for_device(dev, paddr, size, dir);
 
 	if (!dev_is_dma_coherent(dev)) {
 		if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dma_addr))))
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 59f421d041ed..0696bdc8072e 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -42,14 +42,6 @@ extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs);
 extern int swiotlb_late_init_with_default_size(size_t default_size);
 extern void __init swiotlb_update_mem_attributes(void);
 
-/*
- * Enumeration for sync targets
- */
-enum dma_sync_target {
-	SYNC_FOR_CPU = 0,
-	SYNC_FOR_DEVICE = 1,
-};
-
 phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys,
 		size_t mapping_size, size_t alloc_size,
 		enum dma_data_direction dir, unsigned long attrs);
@@ -60,11 +52,10 @@ extern void swiotlb_tbl_unmap_single(struct device *hwdev,
 				     enum dma_data_direction dir,
 				     unsigned long attrs);
 
-extern void swiotlb_tbl_sync_single(struct device *hwdev,
-				    phys_addr_t tlb_addr,
-				    size_t size, enum dma_data_direction dir,
-				    enum dma_sync_target target);
-
+void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr,
+		size_t size, enum dma_data_direction dir);
+void swiotlb_sync_single_for_cpu(struct device *dev, phys_addr_t tlb_addr,
+		size_t size, enum dma_data_direction dir);
 dma_addr_t swiotlb_map(struct device *dev, phys_addr_t phys,
 		size_t size, enum dma_data_direction dir, unsigned long attrs);
 
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 002268262c9a..f737e3347059 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -344,8 +344,8 @@ void dma_direct_sync_sg_for_device(struct device *dev,
 		phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg));
 
 		if (unlikely(is_swiotlb_buffer(paddr)))
-			swiotlb_tbl_sync_single(dev, paddr, sg->length,
-					dir, SYNC_FOR_DEVICE);
+			swiotlb_sync_single_for_device(dev, paddr, sg->length,
+						       dir);
 
 		if (!dev_is_dma_coherent(dev))
 			arch_sync_dma_for_device(paddr, sg->length,
@@ -370,8 +370,8 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
 			arch_sync_dma_for_cpu(paddr, sg->length, dir);
 
 		if (unlikely(is_swiotlb_buffer(paddr)))
-			swiotlb_tbl_sync_single(dev, paddr, sg->length, dir,
-					SYNC_FOR_CPU);
+			swiotlb_sync_single_for_cpu(dev, paddr, sg->length,
+						    dir);
 
 		if (dir == DMA_FROM_DEVICE)
 			arch_dma_mark_clean(paddr, sg->length);
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index e1bf721591c0..50afc05b6f1d 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -57,7 +57,7 @@ static inline void dma_direct_sync_single_for_device(struct device *dev,
 	phys_addr_t paddr = dma_to_phys(dev, addr);
 
 	if (unlikely(is_swiotlb_buffer(paddr)))
-		swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE);
+		swiotlb_sync_single_for_device(dev, paddr, size, dir);
 
 	if (!dev_is_dma_coherent(dev))
 		arch_sync_dma_for_device(paddr, size, dir);
@@ -74,7 +74,7 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev,
 	}
 
 	if (unlikely(is_swiotlb_buffer(paddr)))
-		swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU);
+		swiotlb_sync_single_for_cpu(dev, paddr, size, dir);
 
 	if (dir == DMA_FROM_DEVICE)
 		arch_dma_mark_clean(paddr, size);
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index a431c6b64e82..5fe8781be6f2 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -715,26 +715,22 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
 	spin_unlock_irqrestore(&io_tlb_lock, flags);
 }
 
-void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr,
-			     size_t size, enum dma_data_direction dir,
-			     enum dma_sync_target target)
+void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr,
+		size_t size, enum dma_data_direction dir)
 {
-	switch (target) {
-	case SYNC_FOR_CPU:
-		if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
-			swiotlb_bounce(hwdev, tlb_addr, size, DMA_FROM_DEVICE);
-		else
-			BUG_ON(dir != DMA_TO_DEVICE);
-		break;
-	case SYNC_FOR_DEVICE:
-		if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
-			swiotlb_bounce(hwdev, tlb_addr, size, DMA_TO_DEVICE);
-		else
-			BUG_ON(dir != DMA_FROM_DEVICE);
-		break;
-	default:
-		BUG();
-	}
+	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
+		swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE);
+	else
+		BUG_ON(dir != DMA_FROM_DEVICE);
+}
+
+void swiotlb_sync_single_for_cpu(struct device *dev, phys_addr_t tlb_addr,
+		size_t size, enum dma_data_direction dir)
+{
+	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
+		swiotlb_bounce(dev, tlb_addr, size, DMA_FROM_DEVICE);
+	else
+		BUG_ON(dir != DMA_TO_DEVICE);
 }
 
 /*
-- 
cgit v1.2.3


From 3d9c3dcc58e968403f29767726407bc680e087b5 Mon Sep 17 00:00:00 2001
From: Michael Kelley <mikelley@microsoft.com>
Date: Tue, 23 Feb 2021 14:44:54 -0800
Subject: scsi: storvsc: Enable scatterlist entry lengths > 4Kbytes

storvsc currently sets .dma_boundary to limit scatterlist entries to 4
Kbytes, which is less efficient with huge pages that offer large chunks of
contiguous physical memory. Improve the algorithm for creating the Hyper-V
guest physical address PFN array so that scatterlist entries with lengths >
4Kbytes are handled.  As a result, remove the .dma_boundary setting.

The improved algorithm also adds support for scatterlist entries with
offsets >= 4Kbytes, which is supported by many other SCSI low-level
drivers.  And it retains support for architectures where possibly PAGE_SIZE
!= HV_HYP_PAGE_SIZE (such as ARM64).

Link: https://lore.kernel.org/r/1614120294-1930-1-git-send-email-mikelley@microsoft.com
Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Michael Kelley <mikelley@microsoft.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/storvsc_drv.c | 66 ++++++++++++++++------------------------------
 include/linux/hyperv.h     |  1 +
 2 files changed, 24 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index dfe005c03734..e6718a74e5da 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -1692,9 +1692,8 @@ static int storvsc_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *scmnd)
 	struct storvsc_cmd_request *cmd_request = scsi_cmd_priv(scmnd);
 	int i;
 	struct scatterlist *sgl;
-	unsigned int sg_count = 0;
+	unsigned int sg_count;
 	struct vmscsi_request *vm_srb;
-	struct scatterlist *cur_sgl;
 	struct vmbus_packet_mpb_array  *payload;
 	u32 payload_sz;
 	u32 length;
@@ -1773,8 +1772,8 @@ static int storvsc_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *scmnd)
 	payload_sz = sizeof(cmd_request->mpb);
 
 	if (sg_count) {
-		unsigned int hvpgoff = 0;
-		unsigned long offset_in_hvpg = sgl->offset & ~HV_HYP_PAGE_MASK;
+		unsigned int hvpgoff, hvpfns_to_add;
+		unsigned long offset_in_hvpg = offset_in_hvpage(sgl->offset);
 		unsigned int hvpg_count = HVPFN_UP(offset_in_hvpg + length);
 		u64 hvpfn;
 
@@ -1787,51 +1786,34 @@ static int storvsc_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *scmnd)
 				return SCSI_MLQUEUE_DEVICE_BUSY;
 		}
 
-		/*
-		 * sgl is a list of PAGEs, and payload->range.pfn_array
-		 * expects the page number in the unit of HV_HYP_PAGE_SIZE (the
-		 * page size that Hyper-V uses, so here we need to divide PAGEs
-		 * into HV_HYP_PAGE in case that PAGE_SIZE > HV_HYP_PAGE_SIZE.
-		 * Besides, payload->range.offset should be the offset in one
-		 * HV_HYP_PAGE.
-		 */
 		payload->range.len = length;
 		payload->range.offset = offset_in_hvpg;
-		hvpgoff = sgl->offset >> HV_HYP_PAGE_SHIFT;
 
-		cur_sgl = sgl;
-		for (i = 0; i < hvpg_count; i++) {
+
+		for (i = 0; sgl != NULL; sgl = sg_next(sgl)) {
 			/*
-			 * 'i' is the index of hv pages in the payload and
-			 * 'hvpgoff' is the offset (in hv pages) of the first
-			 * hv page in the the first page. The relationship
-			 * between the sum of 'i' and 'hvpgoff' and the offset
-			 * (in hv pages) in a payload page ('hvpgoff_in_page')
-			 * is as follow:
-			 *
-			 * |------------------ PAGE -------------------|
-			 * |   NR_HV_HYP_PAGES_IN_PAGE hvpgs in total  |
-			 * |hvpg|hvpg| ...              |hvpg|... |hvpg|
-			 * ^         ^                                 ^                 ^
-			 * +-hvpgoff-+                                 +-hvpgoff_in_page-+
-			 *           ^                                                   |
-			 *           +--------------------- i ---------------------------+
+			 * Init values for the current sgl entry. hvpgoff
+			 * and hvpfns_to_add are in units of Hyper-V size
+			 * pages. Handling the PAGE_SIZE != HV_HYP_PAGE_SIZE
+			 * case also handles values of sgl->offset that are
+			 * larger than PAGE_SIZE. Such offsets are handled
+			 * even on other than the first sgl entry, provided
+			 * they are a multiple of PAGE_SIZE.
 			 */
-			unsigned int hvpgoff_in_page =
-				(i + hvpgoff) % NR_HV_HYP_PAGES_IN_PAGE;
+			hvpgoff = HVPFN_DOWN(sgl->offset);
+			hvpfn = page_to_hvpfn(sg_page(sgl)) + hvpgoff;
+			hvpfns_to_add =	HVPFN_UP(sgl->offset + sgl->length) -
+						hvpgoff;
 
 			/*
-			 * Two cases that we need to fetch a page:
-			 * 1) i == 0, the first step or
-			 * 2) hvpgoff_in_page == 0, when we reach the boundary
-			 *    of a page.
+			 * Fill the next portion of the PFN array with
+			 * sequential Hyper-V PFNs for the contiguous physical
+			 * memory described by the sgl entry. The end of the
+			 * last sgl should be reached at the same time that
+			 * the PFN array is filled.
 			 */
-			if (hvpgoff_in_page == 0 || i == 0) {
-				hvpfn = page_to_hvpfn(sg_page(cur_sgl));
-				cur_sgl = sg_next(cur_sgl);
-			}
-
-			payload->range.pfn_array[i] = hvpfn + hvpgoff_in_page;
+			while (hvpfns_to_add--)
+				payload->range.pfn_array[i++] =	hvpfn++;
 		}
 	}
 
@@ -1865,8 +1847,6 @@ static struct scsi_host_template scsi_driver = {
 	.slave_configure =	storvsc_device_configure,
 	.cmd_per_lun =		2048,
 	.this_id =		-1,
-	/* Make sure we dont get a sg segment crosses a page boundary */
-	.dma_boundary =		PAGE_SIZE-1,
 	/* Ensure there are no gaps in presented sgls */
 	.virt_boundary_mask =	PAGE_SIZE-1,
 	.no_write_same =	1,
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index f1d74dcf0353..7be8c5f7c5b2 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -1726,6 +1726,7 @@ static inline unsigned long virt_to_hvpfn(void *addr)
 #define NR_HV_HYP_PAGES_IN_PAGE	(PAGE_SIZE / HV_HYP_PAGE_SIZE)
 #define offset_in_hvpage(ptr)	((unsigned long)(ptr) & ~HV_HYP_PAGE_MASK)
 #define HVPFN_UP(x)	(((x) + HV_HYP_PAGE_SIZE-1) >> HV_HYP_PAGE_SHIFT)
+#define HVPFN_DOWN(x)	((x) >> HV_HYP_PAGE_SHIFT)
 #define page_to_hvpfn(page)	(page_to_pfn(page) * NR_HV_HYP_PAGES_IN_PAGE)
 
 #endif /* _HYPERV_H */
-- 
cgit v1.2.3


From 6c172e73690e59ba74ecf12139d841b8651693f8 Mon Sep 17 00:00:00 2001
From: Steen Hegelund <steen.hegelund@microchip.com>
Date: Thu, 18 Feb 2021 17:14:49 +0100
Subject: phy: Add media type and speed serdes configuration interfaces

Provide new phy configuration interfaces for media type and speed that
allow e.g. PHYs used for ethernet to be configured with this
information.

Signed-off-by: Lars Povlsen <lars.povlsen@microchip.com>
Signed-off-by: Steen Hegelund <steen.hegelund@microchip.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Acked-By: Kishon Vijay Abraham I <kishon@ti.com>
Link: https://lore.kernel.org/r/20210218161451.3489955-3-steen.hegelund@microchip.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/phy/phy-core.c  | 30 ++++++++++++++++++++++++++++++
 include/linux/phy/phy.h | 26 ++++++++++++++++++++++++++
 2 files changed, 56 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/phy/phy-core.c b/drivers/phy/phy-core.c
index 71cb10826326..ccb575b13777 100644
--- a/drivers/phy/phy-core.c
+++ b/drivers/phy/phy-core.c
@@ -373,6 +373,36 @@ int phy_set_mode_ext(struct phy *phy, enum phy_mode mode, int submode)
 }
 EXPORT_SYMBOL_GPL(phy_set_mode_ext);
 
+int phy_set_media(struct phy *phy, enum phy_media media)
+{
+	int ret;
+
+	if (!phy || !phy->ops->set_media)
+		return 0;
+
+	mutex_lock(&phy->mutex);
+	ret = phy->ops->set_media(phy, media);
+	mutex_unlock(&phy->mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(phy_set_media);
+
+int phy_set_speed(struct phy *phy, int speed)
+{
+	int ret;
+
+	if (!phy || !phy->ops->set_speed)
+		return 0;
+
+	mutex_lock(&phy->mutex);
+	ret = phy->ops->set_speed(phy, speed);
+	mutex_unlock(&phy->mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(phy_set_speed);
+
 int phy_reset(struct phy *phy)
 {
 	int ret;
diff --git a/include/linux/phy/phy.h b/include/linux/phy/phy.h
index e435bdb0bab3..0ed434d02196 100644
--- a/include/linux/phy/phy.h
+++ b/include/linux/phy/phy.h
@@ -44,6 +44,12 @@ enum phy_mode {
 	PHY_MODE_DP
 };
 
+enum phy_media {
+	PHY_MEDIA_DEFAULT,
+	PHY_MEDIA_SR,
+	PHY_MEDIA_DAC,
+};
+
 /**
  * union phy_configure_opts - Opaque generic phy configuration
  *
@@ -64,6 +70,8 @@ union phy_configure_opts {
  * @power_on: powering on the phy
  * @power_off: powering off the phy
  * @set_mode: set the mode of the phy
+ * @set_media: set the media type of the phy (optional)
+ * @set_speed: set the speed of the phy (optional)
  * @reset: resetting the phy
  * @calibrate: calibrate the phy
  * @release: ops to be performed while the consumer relinquishes the PHY
@@ -75,6 +83,8 @@ struct phy_ops {
 	int	(*power_on)(struct phy *phy);
 	int	(*power_off)(struct phy *phy);
 	int	(*set_mode)(struct phy *phy, enum phy_mode mode, int submode);
+	int	(*set_media)(struct phy *phy, enum phy_media media);
+	int	(*set_speed)(struct phy *phy, int speed);
 
 	/**
 	 * @configure:
@@ -215,6 +225,8 @@ int phy_power_off(struct phy *phy);
 int phy_set_mode_ext(struct phy *phy, enum phy_mode mode, int submode);
 #define phy_set_mode(phy, mode) \
 	phy_set_mode_ext(phy, mode, 0)
+int phy_set_media(struct phy *phy, enum phy_media media);
+int phy_set_speed(struct phy *phy, int speed);
 int phy_configure(struct phy *phy, union phy_configure_opts *opts);
 int phy_validate(struct phy *phy, enum phy_mode mode, int submode,
 		 union phy_configure_opts *opts);
@@ -344,6 +356,20 @@ static inline int phy_set_mode_ext(struct phy *phy, enum phy_mode mode,
 #define phy_set_mode(phy, mode) \
 	phy_set_mode_ext(phy, mode, 0)
 
+static inline int phy_set_media(struct phy *phy, enum phy_media media)
+{
+	if (!phy)
+		return 0;
+	return -ENODEV;
+}
+
+static inline int phy_set_speed(struct phy *phy, int speed)
+{
+	if (!phy)
+		return 0;
+	return -ENODEV;
+}
+
 static inline enum phy_mode phy_get_mode(struct phy *phy)
 {
 	return PHY_MODE_INVALID;
-- 
cgit v1.2.3


From bee645788e07eea63055d261d2884ea45c2ba857 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Tue, 16 Mar 2021 11:31:17 -0400
Subject: locking/ww_mutex: Fix acquire/release imbalance in
 ww_acquire_init()/ww_acquire_fini()

In ww_acquire_init(), mutex_acquire() is gated by CONFIG_DEBUG_LOCK_ALLOC.
The dep_map in the ww_acquire_ctx structure is also gated by the
same config. However mutex_release() in ww_acquire_fini() is gated by
CONFIG_DEBUG_MUTEXES. It is possible to set CONFIG_DEBUG_MUTEXES without
setting CONFIG_DEBUG_LOCK_ALLOC though it is an unlikely configuration.
That may cause a compilation error as dep_map isn't defined in this
case. Fix this potential problem by enclosing mutex_release() inside
CONFIG_DEBUG_LOCK_ALLOC.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20210316153119.13802-3-longman@redhat.com
---
 include/linux/ww_mutex.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h
index 850424e5d030..6ecf2a0220db 100644
--- a/include/linux/ww_mutex.h
+++ b/include/linux/ww_mutex.h
@@ -173,9 +173,10 @@ static inline void ww_acquire_done(struct ww_acquire_ctx *ctx)
  */
 static inline void ww_acquire_fini(struct ww_acquire_ctx *ctx)
 {
-#ifdef CONFIG_DEBUG_MUTEXES
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
 	mutex_release(&ctx->dep_map, _THIS_IP_);
-
+#endif
+#ifdef CONFIG_DEBUG_MUTEXES
 	DEBUG_LOCKS_WARN_ON(ctx->acquired);
 	if (!IS_ENABLED(CONFIG_PROVE_LOCKING))
 		/*
-- 
cgit v1.2.3


From fa8b90070a80bb1a3042b4b25af4b3ee2c4c27e1 Mon Sep 17 00:00:00 2001
From: Sascha Hauer <s.hauer@pengutronix.de>
Date: Thu, 4 Mar 2021 13:35:40 +0100
Subject: quota: wire up quotactl_path

Wire up the quotactl_path syscall added in the previous patch.

Link: https://lore.kernel.org/r/20210304123541.30749-3-s.hauer@pengutronix.de
Signed-off-by: Sascha Hauer <s.hauer@pengutronix.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 arch/alpha/kernel/syscalls/syscall.tbl      | 1 +
 arch/arm/tools/syscall.tbl                  | 1 +
 arch/arm64/include/asm/unistd.h             | 2 +-
 arch/arm64/include/asm/unistd32.h           | 2 ++
 arch/ia64/kernel/syscalls/syscall.tbl       | 1 +
 arch/m68k/kernel/syscalls/syscall.tbl       | 1 +
 arch/microblaze/kernel/syscalls/syscall.tbl | 1 +
 arch/mips/kernel/syscalls/syscall_n32.tbl   | 1 +
 arch/mips/kernel/syscalls/syscall_n64.tbl   | 1 +
 arch/mips/kernel/syscalls/syscall_o32.tbl   | 1 +
 arch/parisc/kernel/syscalls/syscall.tbl     | 1 +
 arch/powerpc/kernel/syscalls/syscall.tbl    | 1 +
 arch/s390/kernel/syscalls/syscall.tbl       | 1 +
 arch/sh/kernel/syscalls/syscall.tbl         | 1 +
 arch/sparc/kernel/syscalls/syscall.tbl      | 1 +
 arch/x86/entry/syscalls/syscall_32.tbl      | 1 +
 arch/x86/entry/syscalls/syscall_64.tbl      | 1 +
 arch/xtensa/kernel/syscalls/syscall.tbl     | 1 +
 include/linux/syscalls.h                    | 2 ++
 include/uapi/asm-generic/unistd.h           | 4 +++-
 kernel/sys_ni.c                             | 1 +
 21 files changed, 25 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index 02f0244e005c..c5f7e595adab 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -482,3 +482,4 @@
 550	common	process_madvise			sys_process_madvise
 551	common	epoll_pwait2			sys_epoll_pwait2
 552	common	mount_setattr			sys_mount_setattr
+553	common	quotactl_path			sys_quotactl_path
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index dcc1191291a2..90cbe207cf3e 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -456,3 +456,4 @@
 440	common	process_madvise			sys_process_madvise
 441	common	epoll_pwait2			sys_epoll_pwait2
 442	common	mount_setattr			sys_mount_setattr
+443	common	quotactl_path			sys_quotactl_path
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index 949788f5ba40..d1f7d35f986e 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -38,7 +38,7 @@
 #define __ARM_NR_compat_set_tls		(__ARM_NR_COMPAT_BASE + 5)
 #define __ARM_NR_COMPAT_END		(__ARM_NR_COMPAT_BASE + 0x800)
 
-#define __NR_compat_syscalls		443
+#define __NR_compat_syscalls		444
 #endif
 
 #define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index 3d874f624056..8361c5138e5f 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -893,6 +893,8 @@ __SYSCALL(__NR_process_madvise, sys_process_madvise)
 __SYSCALL(__NR_epoll_pwait2, compat_sys_epoll_pwait2)
 #define __NR_mount_setattr 442
 __SYSCALL(__NR_mount_setattr, sys_mount_setattr)
+#define __NR_quotactl_path 443
+__SYSCALL(__NR_quotactl_path, sys_quotactl_path)
 
 /*
  * Please add new compat syscalls above this comment and update
diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
index d89231166e19..c072cd459bb5 100644
--- a/arch/ia64/kernel/syscalls/syscall.tbl
+++ b/arch/ia64/kernel/syscalls/syscall.tbl
@@ -363,3 +363,4 @@
 440	common	process_madvise			sys_process_madvise
 441	common	epoll_pwait2			sys_epoll_pwait2
 442	common	mount_setattr			sys_mount_setattr
+443	common	quotactl_path			sys_quotactl_path
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index 72bde6707dd3..5e9f81073ff4 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -442,3 +442,4 @@
 440	common	process_madvise			sys_process_madvise
 441	common	epoll_pwait2			sys_epoll_pwait2
 442	common	mount_setattr			sys_mount_setattr
+443	common	quotactl_path			sys_quotactl_path
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index d603a5ec9338..8e74d690c64d 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -448,3 +448,4 @@
 440	common	process_madvise			sys_process_madvise
 441	common	epoll_pwait2			sys_epoll_pwait2
 442	common	mount_setattr			sys_mount_setattr
+443	common	quotactl_path			sys_quotactl_path
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index 8fd8c1790941..6f397e56926f 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -381,3 +381,4 @@
 440	n32	process_madvise			sys_process_madvise
 441	n32	epoll_pwait2			compat_sys_epoll_pwait2
 442	n32	mount_setattr			sys_mount_setattr
+443	n32	quotactl_path			sys_quotactl_path
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index 169f21438065..ab85a357c4fa 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -357,3 +357,4 @@
 440	n64	process_madvise			sys_process_madvise
 441	n64	epoll_pwait2			sys_epoll_pwait2
 442	n64	mount_setattr			sys_mount_setattr
+443	n64	quotactl_path			sys_quotactl_path
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 090d29ca80ff..9c4cd2b40b38 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -430,3 +430,4 @@
 440	o32	process_madvise			sys_process_madvise
 441	o32	epoll_pwait2			sys_epoll_pwait2		compat_sys_epoll_pwait2
 442	o32	mount_setattr			sys_mount_setattr
+443	o32	quotactl_path			sys_quotactl_path
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index 271a92519683..80fba3f7d47b 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -440,3 +440,4 @@
 440	common	process_madvise			sys_process_madvise
 441	common	epoll_pwait2			sys_epoll_pwait2		compat_sys_epoll_pwait2
 442	common	mount_setattr			sys_mount_setattr
+443	common	quotactl_path			sys_quotactl_path
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index 0b2480cf3e47..f66f9c9b9d6c 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -522,3 +522,4 @@
 440	common	process_madvise			sys_process_madvise
 441	common	epoll_pwait2			sys_epoll_pwait2		compat_sys_epoll_pwait2
 442	common	mount_setattr			sys_mount_setattr
+443	common	quotactl_path			sys_quotactl_path
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index 3abef2144dac..a421905c36e8 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -445,3 +445,4 @@
 440  common	process_madvise		sys_process_madvise		sys_process_madvise
 441  common	epoll_pwait2		sys_epoll_pwait2		compat_sys_epoll_pwait2
 442  common	mount_setattr		sys_mount_setattr		sys_mount_setattr
+443  common	quotactl_path		sys_quotactl_path		sys_quotactl_path
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index d08eebad6b7f..f68517aaa4f1 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -445,3 +445,4 @@
 440	common	process_madvise			sys_process_madvise
 441	common	epoll_pwait2			sys_epoll_pwait2
 442	common	mount_setattr			sys_mount_setattr
+443	common	quotactl_path			sys_quotactl_path
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index 84403a99039c..3ee82321504d 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -488,3 +488,4 @@
 440	common	process_madvise			sys_process_madvise
 441	common	epoll_pwait2			sys_epoll_pwait2		compat_sys_epoll_pwait2
 442	common	mount_setattr			sys_mount_setattr
+443	common	quotactl_path			sys_quotactl_path
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index a1c9f496fca6..f52a443eede0 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -447,3 +447,4 @@
 440	i386	process_madvise		sys_process_madvise
 441	i386	epoll_pwait2		sys_epoll_pwait2		compat_sys_epoll_pwait2
 442	i386	mount_setattr		sys_mount_setattr
+443	i386	quotactl_path		sys_quotactl_path
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 7bf01cbe582f..7eb007b8cab5 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -364,6 +364,7 @@
 440	common	process_madvise		sys_process_madvise
 441	common	epoll_pwait2		sys_epoll_pwait2
 442	common	mount_setattr		sys_mount_setattr
+443	common	quotactl_path		sys_quotactl_path
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index 365a9b849224..c71cc45633de 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -413,3 +413,4 @@
 440	common	process_madvise			sys_process_madvise
 441	common	epoll_pwait2			sys_epoll_pwait2
 442	common	mount_setattr			sys_mount_setattr
+443	common	quotactl_path			sys_quotactl_path
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 2839dc9a7c01..a672bbe28577 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -483,6 +483,8 @@ asmlinkage long sys_pipe2(int __user *fildes, int flags);
 /* fs/quota.c */
 asmlinkage long sys_quotactl(unsigned int cmd, const char __user *special,
 				qid_t id, void __user *addr);
+asmlinkage long sys_quotactl_path(unsigned int cmd, const char __user *mountpoint,
+				  qid_t id, void __user *addr);
 
 /* fs/readdir.c */
 asmlinkage long sys_getdents64(unsigned int fd,
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index ce58cff99b66..739c839d28fe 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -863,9 +863,11 @@ __SYSCALL(__NR_process_madvise, sys_process_madvise)
 __SC_COMP(__NR_epoll_pwait2, sys_epoll_pwait2, compat_sys_epoll_pwait2)
 #define __NR_mount_setattr 442
 __SYSCALL(__NR_mount_setattr, sys_mount_setattr)
+#define __NR_quotactl_path 443
+__SYSCALL(__NR_quotactl_path, sys_quotactl_path)
 
 #undef __NR_syscalls
-#define __NR_syscalls 443
+#define __NR_syscalls 444
 
 /*
  * 32 bit systems traditionally used different
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 19aa806890d5..d24431782414 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -99,6 +99,7 @@ COND_SYSCALL(flock);
 
 /* fs/quota.c */
 COND_SYSCALL(quotactl);
+COND_SYSCALL(quotactl_path);
 
 /* fs/readdir.c */
 
-- 
cgit v1.2.3


From d2da74d1278a1b51ef18beafa9da770f0db1c617 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 9 Mar 2021 09:42:04 +0100
Subject: tasklets: Replace barrier() with cpu_relax() in tasklet_unlock_wait()

A barrier() in a tight loop which waits for something to happen on a remote
CPU is a pointless exercise. Replace it with cpu_relax() which allows HT
siblings to make progress.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20210309084241.249343366@linutronix.de
---
 include/linux/interrupt.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 2b98156ec707..d689fd738152 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -672,7 +672,8 @@ static inline void tasklet_unlock(struct tasklet_struct *t)
 
 static inline void tasklet_unlock_wait(struct tasklet_struct *t)
 {
-	while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); }
+	while (test_bit(TASKLET_STATE_RUN, &t->state))
+		cpu_relax();
 }
 #else
 #define tasklet_trylock(t) 1
-- 
cgit v1.2.3


From 6951547a1399c8f56468ed93bea8f769b891aec3 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 9 Mar 2021 09:42:05 +0100
Subject: tasklets: Use static inlines for stub implementations

Inlines exist for a reason.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20210309084241.407702697@linutronix.de
---
 include/linux/interrupt.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index d689fd738152..0a4ce25c1464 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -676,9 +676,9 @@ static inline void tasklet_unlock_wait(struct tasklet_struct *t)
 		cpu_relax();
 }
 #else
-#define tasklet_trylock(t) 1
-#define tasklet_unlock_wait(t) do { } while (0)
-#define tasklet_unlock(t) do { } while (0)
+static inline int tasklet_trylock(struct tasklet_struct *t) { return 1; }
+static inline void tasklet_unlock(struct tasklet_struct *t) { }
+static inline void tasklet_unlock_wait(struct tasklet_struct *t) { }
 #endif
 
 extern void __tasklet_schedule(struct tasklet_struct *t);
-- 
cgit v1.2.3


From ca5f625118955fc544c3cb3dee7055d33ecadafb Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 9 Mar 2021 09:42:06 +0100
Subject: tasklets: Provide tasklet_disable_in_atomic()

Replacing the spin wait loops in tasklet_unlock_wait() with
wait_var_event() is not possible as a handful of tasklet_disable()
invocations are happening in atomic context. All other invocations are in
teardown paths which can sleep.

Provide tasklet_disable_in_atomic() and tasklet_unlock_spin_wait() to
convert the few atomic use cases over, which allows tasklet_disable() and
tasklet_unlock_wait() to be changed in a later step.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20210309084241.563164193@linutronix.de
---
 include/linux/interrupt.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 0a4ce25c1464..3c8a29176258 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -675,10 +675,21 @@ static inline void tasklet_unlock_wait(struct tasklet_struct *t)
 	while (test_bit(TASKLET_STATE_RUN, &t->state))
 		cpu_relax();
 }
+
+/*
+ * Do not use in new code. Waiting for tasklets from atomic contexts is
+ * error prone and should be avoided.
+ */
+static inline void tasklet_unlock_spin_wait(struct tasklet_struct *t)
+{
+	while (test_bit(TASKLET_STATE_RUN, &t->state))
+		cpu_relax();
+}
 #else
 static inline int tasklet_trylock(struct tasklet_struct *t) { return 1; }
 static inline void tasklet_unlock(struct tasklet_struct *t) { }
 static inline void tasklet_unlock_wait(struct tasklet_struct *t) { }
+static inline void tasklet_unlock_spin_wait(struct tasklet_struct *t) { }
 #endif
 
 extern void __tasklet_schedule(struct tasklet_struct *t);
@@ -703,6 +714,17 @@ static inline void tasklet_disable_nosync(struct tasklet_struct *t)
 	smp_mb__after_atomic();
 }
 
+/*
+ * Do not use in new code. Disabling tasklets from atomic contexts is
+ * error prone and should be avoided.
+ */
+static inline void tasklet_disable_in_atomic(struct tasklet_struct *t)
+{
+	tasklet_disable_nosync(t);
+	tasklet_unlock_spin_wait(t);
+	smp_mb();
+}
+
 static inline void tasklet_disable(struct tasklet_struct *t)
 {
 	tasklet_disable_nosync(t);
-- 
cgit v1.2.3


From b0cd02c2a9494dbf0a1cc7dc7a3b8b400c158d37 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 9 Mar 2021 09:42:07 +0100
Subject: tasklets: Use spin wait in tasklet_disable() temporarily

To ease the transition use spin waiting in tasklet_disable() until all
usage sites from atomic context have been cleaned up.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20210309084241.685352806@linutronix.de
---
 include/linux/interrupt.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 3c8a29176258..b7f00121f124 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -728,7 +728,8 @@ static inline void tasklet_disable_in_atomic(struct tasklet_struct *t)
 static inline void tasklet_disable(struct tasklet_struct *t)
 {
 	tasklet_disable_nosync(t);
-	tasklet_unlock_wait(t);
+	/* Spin wait until all atomic users are converted */
+	tasklet_unlock_spin_wait(t);
 	smp_mb();
 }
 
-- 
cgit v1.2.3


From da044747401fc16202e223c9da970ed4e84fd84d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 9 Mar 2021 09:42:08 +0100
Subject: tasklets: Replace spin wait in tasklet_unlock_wait()

tasklet_unlock_wait() spin waits for TASKLET_STATE_RUN to be cleared. This
is wasting CPU cycles in a tight loop which is especially painful in a
guest when the CPU running the tasklet is scheduled out.

tasklet_unlock_wait() is invoked from tasklet_kill() which is used in
teardown paths and not performance critical at all. Replace the spin wait
with wait_var_event().

There are no users of tasklet_unlock_wait() which are invoked from atomic
contexts. The usage in tasklet_disable() has been replaced temporarily with
the spin waiting variant until the atomic users are fixed up and will be
converted to the sleep wait variant later.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20210309084241.783936921@linutronix.de
---
 include/linux/interrupt.h | 13 ++-----------
 kernel/softirq.c          | 18 ++++++++++++++++++
 2 files changed, 20 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index b7f00121f124..b50be4fbbc98 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -664,17 +664,8 @@ static inline int tasklet_trylock(struct tasklet_struct *t)
 	return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state);
 }
 
-static inline void tasklet_unlock(struct tasklet_struct *t)
-{
-	smp_mb__before_atomic();
-	clear_bit(TASKLET_STATE_RUN, &(t)->state);
-}
-
-static inline void tasklet_unlock_wait(struct tasklet_struct *t)
-{
-	while (test_bit(TASKLET_STATE_RUN, &t->state))
-		cpu_relax();
-}
+void tasklet_unlock(struct tasklet_struct *t);
+void tasklet_unlock_wait(struct tasklet_struct *t);
 
 /*
  * Do not use in new code. Waiting for tasklets from atomic contexts is
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 8d56bbf852ea..ef6429a33883 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -25,6 +25,7 @@
 #include <linux/smpboot.h>
 #include <linux/tick.h>
 #include <linux/irq.h>
+#include <linux/wait_bit.h>
 
 #include <asm/softirq_stack.h>
 
@@ -632,6 +633,23 @@ void tasklet_kill(struct tasklet_struct *t)
 }
 EXPORT_SYMBOL(tasklet_kill);
 
+#ifdef CONFIG_SMP
+void tasklet_unlock(struct tasklet_struct *t)
+{
+	smp_mb__before_atomic();
+	clear_bit(TASKLET_STATE_RUN, &t->state);
+	smp_mb__after_atomic();
+	wake_up_var(&t->state);
+}
+EXPORT_SYMBOL_GPL(tasklet_unlock);
+
+void tasklet_unlock_wait(struct tasklet_struct *t)
+{
+	wait_var_event(&t->state, !test_bit(TASKLET_STATE_RUN, &t->state));
+}
+EXPORT_SYMBOL_GPL(tasklet_unlock_wait);
+#endif
+
 void __init softirq_init(void)
 {
 	int cpu;
-- 
cgit v1.2.3


From eb2dafbba8b824ee77f166629babd470dd0b1c0a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 9 Mar 2021 09:42:10 +0100
Subject: tasklets: Prevent tasklet_unlock_spin_wait() deadlock on RT

tasklet_unlock_spin_wait() spin waits for the TASKLET_STATE_RUN bit in
the tasklet state to be cleared. This works on !RT nicely because the
corresponding execution can only happen on a different CPU.

On RT softirq processing is preemptible, therefore a task preempting the
softirq processing thread can spin forever.

Prevent this by invoking local_bh_disable()/enable() inside the loop. In
case that the softirq processing thread was preempted by the current task,
current will block on the local lock which yields the CPU to the preempted
softirq processing thread. If the tasklet is processed on a different CPU
then the local_bh_disable()/enable() pair is just a waste of processor
cycles.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20210309084241.988908275@linutronix.de
---
 include/linux/interrupt.h | 12 ++----------
 kernel/softirq.c          | 28 +++++++++++++++++++++++++++-
 2 files changed, 29 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index b50be4fbbc98..352db93c2eed 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -658,7 +658,7 @@ enum
 	TASKLET_STATE_RUN	/* Tasklet is running (SMP only) */
 };
 
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
 static inline int tasklet_trylock(struct tasklet_struct *t)
 {
 	return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state);
@@ -666,16 +666,8 @@ static inline int tasklet_trylock(struct tasklet_struct *t)
 
 void tasklet_unlock(struct tasklet_struct *t);
 void tasklet_unlock_wait(struct tasklet_struct *t);
+void tasklet_unlock_spin_wait(struct tasklet_struct *t);
 
-/*
- * Do not use in new code. Waiting for tasklets from atomic contexts is
- * error prone and should be avoided.
- */
-static inline void tasklet_unlock_spin_wait(struct tasklet_struct *t)
-{
-	while (test_bit(TASKLET_STATE_RUN, &t->state))
-		cpu_relax();
-}
 #else
 static inline int tasklet_trylock(struct tasklet_struct *t) { return 1; }
 static inline void tasklet_unlock(struct tasklet_struct *t) { }
diff --git a/kernel/softirq.c b/kernel/softirq.c
index ba89ca77698a..f1eb83dc3d50 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -620,6 +620,32 @@ void tasklet_init(struct tasklet_struct *t,
 }
 EXPORT_SYMBOL(tasklet_init);
 
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
+/*
+ * Do not use in new code. Waiting for tasklets from atomic contexts is
+ * error prone and should be avoided.
+ */
+void tasklet_unlock_spin_wait(struct tasklet_struct *t)
+{
+	while (test_bit(TASKLET_STATE_RUN, &(t)->state)) {
+		if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+			/*
+			 * Prevent a live lock when current preempted soft
+			 * interrupt processing or prevents ksoftirqd from
+			 * running. If the tasklet runs on a different CPU
+			 * then this has no effect other than doing the BH
+			 * disable/enable dance for nothing.
+			 */
+			local_bh_disable();
+			local_bh_enable();
+		} else {
+			cpu_relax();
+		}
+	}
+}
+EXPORT_SYMBOL(tasklet_unlock_spin_wait);
+#endif
+
 void tasklet_kill(struct tasklet_struct *t)
 {
 	if (in_interrupt())
@@ -633,7 +659,7 @@ void tasklet_kill(struct tasklet_struct *t)
 }
 EXPORT_SYMBOL(tasklet_kill);
 
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
 void tasklet_unlock(struct tasklet_struct *t)
 {
 	smp_mb__before_atomic();
-- 
cgit v1.2.3


From 6fd4e861250b5c89ad460a9f265caeb1bbbfc323 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 9 Mar 2021 09:42:17 +0100
Subject: tasklets: Switch tasklet_disable() to the sleep wait variant

 -- NOT FOR IMMEDIATE MERGING --

Now that all users of tasklet_disable() are invoked from sleepable context,
convert it to use tasklet_unlock_wait() which might sleep.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20210309084242.726452321@linutronix.de
---
 include/linux/interrupt.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 352db93c2eed..4777850a6dc7 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -711,8 +711,7 @@ static inline void tasklet_disable_in_atomic(struct tasklet_struct *t)
 static inline void tasklet_disable(struct tasklet_struct *t)
 {
 	tasklet_disable_nosync(t);
-	/* Spin wait until all atomic users are converted */
-	tasklet_unlock_spin_wait(t);
+	tasklet_unlock_wait(t);
 	smp_mb();
 }
 
-- 
cgit v1.2.3


From 728b478d2d358480b333b42d0e10e0fecb20114c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 9 Mar 2021 09:55:53 +0100
Subject: softirq: Add RT specific softirq accounting

RT requires the softirq processing and local bottomhalf disabled regions to
be preemptible. Using the normal preempt count based serialization is
therefore not possible because this implicitly disables preemption.

RT kernels use a per CPU local lock to serialize bottomhalfs. As
local_bh_disable() can nest, the lock can only be acquired on the outermost
invocation of local_bh_disable() and released when the nest count becomes
zero. Tasks which hold the local lock can be preempted so it's required to
keep track of the nest count per task.

Add a RT only counter to task struct and adjust the relevant macros in
preempt.h.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Tested-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20210309085726.983627589@linutronix.de
---
 include/linux/hardirq.h | 1 +
 include/linux/preempt.h | 6 +++++-
 include/linux/sched.h   | 3 +++
 3 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 7c9d6a2d7e90..69bc86ea382c 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -6,6 +6,7 @@
 #include <linux/preempt.h>
 #include <linux/lockdep.h>
 #include <linux/ftrace_irq.h>
+#include <linux/sched.h>
 #include <linux/vtime.h>
 #include <asm/hardirq.h>
 
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 69cc8b64aa3a..9881eac0698f 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -79,7 +79,11 @@
 
 #define nmi_count()	(preempt_count() & NMI_MASK)
 #define hardirq_count()	(preempt_count() & HARDIRQ_MASK)
-#define softirq_count()	(preempt_count() & SOFTIRQ_MASK)
+#ifdef CONFIG_PREEMPT_RT
+# define softirq_count()	(current->softirq_disable_cnt & SOFTIRQ_MASK)
+#else
+# define softirq_count()	(preempt_count() & SOFTIRQ_MASK)
+#endif
 #define irq_count()	(nmi_count() | hardirq_count() | softirq_count())
 
 /*
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ef00bb22164c..743a613c9cf3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1044,6 +1044,9 @@ struct task_struct {
 	int				softirq_context;
 	int				irq_config;
 #endif
+#ifdef CONFIG_PREEMPT_RT
+	int				softirq_disable_cnt;
+#endif
 
 #ifdef CONFIG_LOCKDEP
 # define MAX_LOCK_DEPTH			48UL
-- 
cgit v1.2.3


From 8b1c04acad082dec76f3f8f7e1fa13493d6cbb79 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 9 Mar 2021 09:55:56 +0100
Subject: softirq: Make softirq control and processing RT aware

Provide a local lock based serialization for soft interrupts on RT which
allows the local_bh_disabled() sections and servicing soft interrupts to be
preemptible.

Provide the necessary inline helpers which allow the bulk of the softirq
processing code to be reused.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Tested-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20210309085727.426370483@linutronix.de
---
 include/linux/bottom_half.h |   2 +-
 kernel/softirq.c            | 188 ++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 182 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h
index a19519f4241d..e4dd613a070e 100644
--- a/include/linux/bottom_half.h
+++ b/include/linux/bottom_half.h
@@ -4,7 +4,7 @@
 
 #include <linux/preempt.h>
 
-#ifdef CONFIG_TRACE_IRQFLAGS
+#if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_TRACE_IRQFLAGS)
 extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
 #else
 static __always_inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
diff --git a/kernel/softirq.c b/kernel/softirq.c
index eaca3337dbda..1ed1c55aa2a7 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -13,6 +13,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/interrupt.h>
 #include <linux/init.h>
+#include <linux/local_lock.h>
 #include <linux/mm.h>
 #include <linux/notifier.h>
 #include <linux/percpu.h>
@@ -103,20 +104,189 @@ EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context);
 #endif
 
 /*
- * preempt_count and SOFTIRQ_OFFSET usage:
- * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
- *   softirq processing.
- * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET)
+ * SOFTIRQ_OFFSET usage:
+ *
+ * On !RT kernels 'count' is the preempt counter, on RT kernels this applies
+ * to a per CPU counter and to task::softirq_disable_cnt.
+ *
+ * - count is changed by SOFTIRQ_OFFSET on entering or leaving softirq
+ *   processing.
+ *
+ * - count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET)
  *   on local_bh_disable or local_bh_enable.
+ *
  * This lets us distinguish between whether we are currently processing
  * softirq and whether we just have bh disabled.
  */
+#ifdef CONFIG_PREEMPT_RT
 
-#ifdef CONFIG_TRACE_IRQFLAGS
 /*
- * This is for softirq.c-internal use, where hardirqs are disabled
+ * RT accounts for BH disabled sections in task::softirq_disable_cnt and
+ * also in per CPU softirq_ctrl::cnt. This is necessary to allow tasks in a
+ * softirq disabled section to be preempted.
+ *
+ * The per task counter is used for softirq_count(), in_softirq() and
+ * in_serving_softirq() because these counts are only valid when the task
+ * holding softirq_ctrl::lock is running.
+ *
+ * The per CPU counter prevents pointless wakeups of ksoftirqd in case that
+ * the task which is in a softirq disabled section is preempted or blocks.
+ */
+struct softirq_ctrl {
+	local_lock_t	lock;
+	int		cnt;
+};
+
+static DEFINE_PER_CPU(struct softirq_ctrl, softirq_ctrl) = {
+	.lock	= INIT_LOCAL_LOCK(softirq_ctrl.lock),
+};
+
+void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
+{
+	unsigned long flags;
+	int newcnt;
+
+	WARN_ON_ONCE(in_hardirq());
+
+	/* First entry of a task into a BH disabled section? */
+	if (!current->softirq_disable_cnt) {
+		if (preemptible()) {
+			local_lock(&softirq_ctrl.lock);
+			/* Required to meet the RCU bottomhalf requirements. */
+			rcu_read_lock();
+		} else {
+			DEBUG_LOCKS_WARN_ON(this_cpu_read(softirq_ctrl.cnt));
+		}
+	}
+
+	/*
+	 * Track the per CPU softirq disabled state. On RT this is per CPU
+	 * state to allow preemption of bottom half disabled sections.
+	 */
+	newcnt = __this_cpu_add_return(softirq_ctrl.cnt, cnt);
+	/*
+	 * Reflect the result in the task state to prevent recursion on the
+	 * local lock and to make softirq_count() & al work.
+	 */
+	current->softirq_disable_cnt = newcnt;
+
+	if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && newcnt == cnt) {
+		raw_local_irq_save(flags);
+		lockdep_softirqs_off(ip);
+		raw_local_irq_restore(flags);
+	}
+}
+EXPORT_SYMBOL(__local_bh_disable_ip);
+
+static void __local_bh_enable(unsigned int cnt, bool unlock)
+{
+	unsigned long flags;
+	int newcnt;
+
+	DEBUG_LOCKS_WARN_ON(current->softirq_disable_cnt !=
+			    this_cpu_read(softirq_ctrl.cnt));
+
+	if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && softirq_count() == cnt) {
+		raw_local_irq_save(flags);
+		lockdep_softirqs_on(_RET_IP_);
+		raw_local_irq_restore(flags);
+	}
+
+	newcnt = __this_cpu_sub_return(softirq_ctrl.cnt, cnt);
+	current->softirq_disable_cnt = newcnt;
+
+	if (!newcnt && unlock) {
+		rcu_read_unlock();
+		local_unlock(&softirq_ctrl.lock);
+	}
+}
+
+void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
+{
+	bool preempt_on = preemptible();
+	unsigned long flags;
+	u32 pending;
+	int curcnt;
+
+	WARN_ON_ONCE(in_irq());
+	lockdep_assert_irqs_enabled();
+
+	local_irq_save(flags);
+	curcnt = __this_cpu_read(softirq_ctrl.cnt);
+
+	/*
+	 * If this is not reenabling soft interrupts, no point in trying to
+	 * run pending ones.
+	 */
+	if (curcnt != cnt)
+		goto out;
+
+	pending = local_softirq_pending();
+	if (!pending || ksoftirqd_running(pending))
+		goto out;
+
+	/*
+	 * If this was called from non preemptible context, wake up the
+	 * softirq daemon.
+	 */
+	if (!preempt_on) {
+		wakeup_softirqd();
+		goto out;
+	}
+
+	/*
+	 * Adjust softirq count to SOFTIRQ_OFFSET which makes
+	 * in_serving_softirq() become true.
+	 */
+	cnt = SOFTIRQ_OFFSET;
+	__local_bh_enable(cnt, false);
+	__do_softirq();
+
+out:
+	__local_bh_enable(cnt, preempt_on);
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL(__local_bh_enable_ip);
+
+/*
+ * Invoked from ksoftirqd_run() outside of the interrupt disabled section
+ * to acquire the per CPU local lock for reentrancy protection.
+ */
+static inline void ksoftirqd_run_begin(void)
+{
+	__local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
+	local_irq_disable();
+}
+
+/* Counterpart to ksoftirqd_run_begin() */
+static inline void ksoftirqd_run_end(void)
+{
+	__local_bh_enable(SOFTIRQ_OFFSET, true);
+	WARN_ON_ONCE(in_interrupt());
+	local_irq_enable();
+}
+
+static inline void softirq_handle_begin(void) { }
+static inline void softirq_handle_end(void) { }
+
+static inline bool should_wake_ksoftirqd(void)
+{
+	return !this_cpu_read(softirq_ctrl.cnt);
+}
+
+static inline void invoke_softirq(void)
+{
+	if (should_wake_ksoftirqd())
+		wakeup_softirqd();
+}
+
+#else /* CONFIG_PREEMPT_RT */
+
+/*
+ * This one is for softirq.c-internal use, where hardirqs are disabled
  * legitimately:
  */
+#ifdef CONFIG_TRACE_IRQFLAGS
 void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
 {
 	unsigned long flags;
@@ -277,6 +447,8 @@ asmlinkage __visible void do_softirq(void)
 	local_irq_restore(flags);
 }
 
+#endif /* !CONFIG_PREEMPT_RT */
+
 /*
  * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times,
  * but break the loop if need_resched() is set or after 2 ms.
@@ -381,8 +553,10 @@ restart:
 		pending >>= softirq_bit;
 	}
 
-	if (__this_cpu_read(ksoftirqd) == current)
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT) &&
+	    __this_cpu_read(ksoftirqd) == current)
 		rcu_softirq_qs();
+
 	local_irq_disable();
 
 	pending = local_softirq_pending();
-- 
cgit v1.2.3


From 47c218dcae6587fb5bce30f1656b13e22391c8e3 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 9 Mar 2021 09:55:57 +0100
Subject: tick/sched: Prevent false positive softirq pending warnings on RT

On RT a task which has soft interrupts disabled can block on a lock and
schedule out to idle while soft interrupts are pending. This triggers the
warning in the NOHZ idle code which complains about going idle with pending
soft interrupts. But as the task is blocked soft interrupt processing is
temporarily blocked as well which means that such a warning is a false
positive.

To prevent that, check the per CPU state which indicates that a scheduled
out task has soft interrupts disabled.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Tested-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20210309085727.527563866@linutronix.de
---
 include/linux/bottom_half.h |  6 ++++++
 kernel/softirq.c            | 15 +++++++++++++++
 kernel/time/tick-sched.c    |  2 +-
 3 files changed, 22 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h
index e4dd613a070e..eed86eb0a1de 100644
--- a/include/linux/bottom_half.h
+++ b/include/linux/bottom_half.h
@@ -32,4 +32,10 @@ static inline void local_bh_enable(void)
 	__local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
 }
 
+#ifdef CONFIG_PREEMPT_RT
+extern bool local_bh_blocked(void);
+#else
+static inline bool local_bh_blocked(void) { return false; }
+#endif
+
 #endif /* _LINUX_BH_H */
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 1ed1c55aa2a7..5a99696da86a 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -141,6 +141,21 @@ static DEFINE_PER_CPU(struct softirq_ctrl, softirq_ctrl) = {
 	.lock	= INIT_LOCAL_LOCK(softirq_ctrl.lock),
 };
 
+/**
+ * local_bh_blocked() - Check for idle whether BH processing is blocked
+ *
+ * Returns false if the per CPU softirq_ctrl::cnt is 0, otherwise true.
+ *
+ * This is invoked from the idle task to guard against false positive
+ * softirq pending warnings, which would happen when the task which holds
+ * softirq_ctrl::lock was the only running task on the CPU and blocks on
+ * some other lock.
+ */
+bool local_bh_blocked(void)
+{
+	return __this_cpu_read(softirq_ctrl.cnt) != 0;
+}
+
 void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
 {
 	unsigned long flags;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index e10a4af88737..0cc55791b2b6 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -973,7 +973,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
 	if (unlikely(local_softirq_pending())) {
 		static int ratelimit;
 
-		if (ratelimit < 10 &&
+		if (ratelimit < 10 && !local_bh_blocked() &&
 		    (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
 			pr_warn("NOHZ tick-stop error: Non-RCU local softirq work is pending, handler #%02x!!!\n",
 				(unsigned int) local_softirq_pending());
-- 
cgit v1.2.3


From ba9e6cab49c1465c2c322dcb03d771d5cbecb692 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 9 Mar 2021 09:55:58 +0100
Subject: rcu: Prevent false positive softirq warning on RT

Soft interrupt disabled sections can legitimately be preempted or schedule
out when blocking on a lock on RT enabled kernels so the RCU preempt check
warning has to be disabled for RT kernels.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Tested-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20210309085727.626304079@linutronix.de
---
 include/linux/rcupdate.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index bd04f722714f..6d855ef091ba 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -334,7 +334,8 @@ static inline void rcu_preempt_sleep_check(void) { }
 #define rcu_sleep_check()						\
 	do {								\
 		rcu_preempt_sleep_check();				\
-		RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map),	\
+		if (!IS_ENABLED(CONFIG_PREEMPT_RT))			\
+		    RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map),	\
 				 "Illegal context switch in RCU-bh read-side critical section"); \
 		RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map),	\
 				 "Illegal context switch in RCU-sched read-side critical section"); \
-- 
cgit v1.2.3


From 7888fe53b7066c284e172d98d98d1865d6a9e5a0 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexanderduyck@fb.com>
Date: Tue, 16 Mar 2021 17:30:36 -0700
Subject: ethtool: Add common function for filling out strings

Add a function to handle the common pattern of printing a string into the
ethtool strings interface and incrementing the string pointer by the
ETH_GSTRING_LEN. Most of the drivers end up doing this and several have
implemented their own versions of this function so it would make sense to
consolidate on one implementation.

Signed-off-by: Alexander Duyck <alexanderduyck@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h |  9 +++++++++
 net/ethtool/ioctl.c     | 12 ++++++++++++
 2 files changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index ec4cd3921c67..3583f7fc075c 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -571,4 +571,13 @@ struct ethtool_phy_ops {
  */
 void ethtool_set_ethtool_phy_ops(const struct ethtool_phy_ops *ops);
 
+/**
+ * ethtool_sprintf - Write formatted string to ethtool string data
+ * @data: Pointer to start of string to update
+ * @fmt: Format of string to write
+ *
+ * Write formatted string to data. Update data to point at start of
+ * next string.
+ */
+extern __printf(2, 3) void ethtool_sprintf(u8 **data, const char *fmt, ...);
 #endif /* _LINUX_ETHTOOL_H */
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index 24783b71c584..0788cc3b3114 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -1844,6 +1844,18 @@ out:
 	return ret;
 }
 
+__printf(2, 3) void ethtool_sprintf(u8 **data, const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	vsnprintf(*data, ETH_GSTRING_LEN, fmt, args);
+	va_end(args);
+
+	*data += ETH_GSTRING_LEN;
+}
+EXPORT_SYMBOL(ethtool_sprintf);
+
 static int ethtool_phys_id(struct net_device *dev, void __user *useraddr)
 {
 	struct ethtool_value id;
-- 
cgit v1.2.3


From 3093c3c7c136458af692d5c3d309a66c3c12d9f4 Mon Sep 17 00:00:00 2001
From: Arnaud Pouliquen <arnaud.pouliquen@foss.st.com>
Date: Thu, 11 Mar 2021 15:04:09 +0100
Subject: rpmsg: Move RPMSG_ADDR_ANY in user API

As RPMSG_ADDR_ANY is a valid src or dst address that can be set by
user applications, move its definition to the user API.

Reviewed-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Reviewed-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Signed-off-by: Arnaud Pouliquen <arnaud.pouliquen@foss.st.com>
Link: https://lore.kernel.org/r/20210311140413.31725-3-arnaud.pouliquen@foss.st.com
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
---
 include/linux/rpmsg.h      | 3 +--
 include/uapi/linux/rpmsg.h | 2 ++
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rpmsg.h b/include/linux/rpmsg.h
index a5db828b2420..d97dcd049f18 100644
--- a/include/linux/rpmsg.h
+++ b/include/linux/rpmsg.h
@@ -18,8 +18,7 @@
 #include <linux/mutex.h>
 #include <linux/poll.h>
 #include <linux/rpmsg/byteorder.h>
-
-#define RPMSG_ADDR_ANY		0xFFFFFFFF
+#include <uapi/linux/rpmsg.h>
 
 struct rpmsg_device;
 struct rpmsg_endpoint;
diff --git a/include/uapi/linux/rpmsg.h b/include/uapi/linux/rpmsg.h
index e14c6dab4223..5e00748da319 100644
--- a/include/uapi/linux/rpmsg.h
+++ b/include/uapi/linux/rpmsg.h
@@ -9,6 +9,8 @@
 #include <linux/ioctl.h>
 #include <linux/types.h>
 
+#define RPMSG_ADDR_ANY		0xFFFFFFFF
+
 /**
  * struct rpmsg_endpoint_info - endpoint info representation
  * @name: name of service
-- 
cgit v1.2.3


From 6417f03132a6952cd17ddd8eaddbac92b61b17e0 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Wed, 17 Mar 2021 12:45:47 +0200
Subject: module: remove never implemented MODULE_SUPPORTED_DEVICE

MODULE_SUPPORTED_DEVICE was added in the pre-git era and was never
implemented. We can safely remove it, because the kernel has grown
to have many more reliable mechanisms to determine whether a device is
supported or not.

Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/platform/iris/iris.c                      |  1 -
 drivers/atm/fore200e.c                             |  2 --
 drivers/block/floppy.c                             |  1 -
 drivers/bluetooth/btrsi.c                          |  1 -
 drivers/char/applicom.c                            |  3 --
 drivers/char/toshiba.c                             |  1 -
 drivers/input/joydev.c                             |  1 -
 drivers/media/firewire/firedtv-fw.c                |  1 -
 drivers/media/pci/cx18/cx18-alsa-main.c            |  1 -
 drivers/media/pci/cx18/cx18-driver.c               |  1 -
 drivers/media/pci/cx25821/cx25821-alsa.c           |  1 -
 drivers/media/pci/cx88/cx88-alsa.c                 |  1 -
 drivers/media/pci/ivtv/ivtv-alsa-main.c            |  1 -
 drivers/media/pci/ivtv/ivtv-driver.c               |  3 --
 drivers/media/pci/sta2x11/sta2x11_vip.c            |  1 -
 drivers/media/platform/atmel/atmel-isi.c           |  1 -
 drivers/media/platform/atmel/atmel-sama5d2-isc.c   |  1 -
 drivers/media/platform/marvell-ccic/cafe-driver.c  |  4 ---
 drivers/media/platform/stm32/stm32-dcmi.c          |  1 -
 drivers/media/usb/cpia2/cpia2_v4l.c                |  1 -
 drivers/media/usb/tm6000/tm6000-alsa.c             |  1 -
 drivers/media/usb/tm6000/tm6000-dvb.c              |  2 --
 drivers/mtd/maps/sun_uflash.c                      |  1 -
 drivers/net/can/peak_canfd/peak_pciefd_main.c      |  1 -
 drivers/net/can/sja1000/ems_pci.c                  |  1 -
 drivers/net/can/sja1000/ems_pcmcia.c               |  1 -
 drivers/net/can/sja1000/kvaser_pci.c               |  1 -
 drivers/net/can/sja1000/peak_pci.c                 |  2 --
 drivers/net/can/sja1000/peak_pcmcia.c              |  1 -
 drivers/net/can/sja1000/plx_pci.c                  | 12 --------
 drivers/net/can/usb/peak_usb/pcan_usb.c            |  2 --
 drivers/net/can/usb/peak_usb/pcan_usb_fd.c         |  3 --
 drivers/net/can/usb/peak_usb/pcan_usb_pro.c        |  2 --
 drivers/net/hamradio/scc.c                         |  1 -
 drivers/net/wireless/admtek/adm8211.c              |  1 -
 drivers/net/wireless/ath/ath5k/base.c              |  1 -
 drivers/net/wireless/ath/ath9k/hw.c                |  1 -
 drivers/net/wireless/ath/ath9k/init.c              |  1 -
 drivers/net/wireless/atmel/atmel.c                 |  1 -
 drivers/net/wireless/atmel/atmel_cs.c              |  1 -
 drivers/net/wireless/atmel/atmel_pci.c             |  1 -
 .../broadcom/brcm80211/brcmsmac/mac80211_if.c      |  1 -
 .../wireless/broadcom/brcm80211/brcmutil/utils.c   |  1 -
 drivers/net/wireless/cisco/airo.c                  |  1 -
 drivers/net/wireless/cisco/airo_cs.c               |  1 -
 drivers/net/wireless/intersil/hostap/hostap_cs.c   |  1 -
 drivers/net/wireless/intersil/hostap/hostap_pci.c  |  1 -
 drivers/net/wireless/intersil/hostap/hostap_plx.c  |  1 -
 drivers/net/wireless/ralink/rt2x00/rt2400pci.c     |  1 -
 drivers/net/wireless/ralink/rt2x00/rt2500pci.c     |  1 -
 drivers/net/wireless/ralink/rt2x00/rt2500usb.c     |  1 -
 drivers/net/wireless/ralink/rt2x00/rt2800pci.c     |  1 -
 drivers/net/wireless/ralink/rt2x00/rt2800usb.c     |  1 -
 drivers/net/wireless/ralink/rt2x00/rt61pci.c       |  2 --
 drivers/net/wireless/ralink/rt2x00/rt73usb.c       |  1 -
 drivers/net/wireless/rsi/rsi_91x_main.c            |  1 -
 drivers/net/wireless/rsi/rsi_91x_sdio.c            |  1 -
 drivers/net/wireless/rsi/rsi_91x_usb.c             |  1 -
 drivers/parport/parport_amiga.c                    |  1 -
 drivers/parport/parport_atari.c                    |  1 -
 drivers/parport/parport_gsc.c                      |  1 -
 drivers/parport/parport_mfc3.c                     |  1 -
 drivers/parport/parport_sunbpp.c                   |  1 -
 drivers/s390/block/dasd.c                          |  1 -
 drivers/sbus/char/display7seg.c                    |  1 -
 drivers/scsi/hpsa.c                                |  1 -
 drivers/scsi/pcmcia/nsp_cs.c                       |  1 -
 drivers/scsi/smartpqi/smartpqi_init.c              |  1 -
 drivers/sh/maple/maple.c                           |  1 -
 drivers/staging/comedi/drivers/vmk80xx.c           |  1 -
 drivers/tee/optee/core.c                           |  1 -
 drivers/tty/serial/icom.c                          |  2 --
 drivers/tty/serial/jsm/jsm_driver.c                |  1 -
 drivers/usb/misc/ldusb.c                           |  1 -
 drivers/watchdog/cpu5wdt.c                         |  1 -
 drivers/watchdog/cpwd.c                            |  1 -
 drivers/watchdog/riowd.c                           |  1 -
 include/linux/module.h                             |  3 --
 net/batman-adv/main.c                              |  1 -
 sound/drivers/aloop.c                              |  1 -
 sound/drivers/dummy.c                              |  1 -
 sound/drivers/mtpav.c                              |  1 -
 sound/drivers/mts64.c                              |  1 -
 sound/drivers/pcsp/pcsp.c                          |  1 -
 sound/drivers/portman2x4.c                         |  1 -
 sound/drivers/serial-u16550.c                      |  1 -
 sound/drivers/virmidi.c                            |  1 -
 sound/isa/ad1816a/ad1816a.c                        |  7 -----
 sound/isa/ad1848/ad1848.c                          |  3 --
 sound/isa/als100.c                                 | 11 -------
 sound/isa/azt2320.c                                |  5 ----
 sound/isa/cmi8330.c                                |  1 -
 sound/isa/cs423x/cs4231.c                          |  1 -
 sound/isa/cs423x/cs4236.c                          | 34 ----------------------
 sound/isa/es1688/es1688.c                          |  5 ----
 sound/isa/es18xx.c                                 | 10 +------
 sound/isa/gus/gusclassic.c                         |  1 -
 sound/isa/gus/gusextreme.c                         |  1 -
 sound/isa/gus/gusmax.c                             |  1 -
 sound/isa/gus/interwave.c                          |  6 ----
 sound/isa/opl3sa2.c                                |  5 ----
 sound/isa/opti9xx/miro.c                           |  3 --
 sound/isa/opti9xx/opti92x-ad1848.c                 |  6 ----
 sound/isa/sb/jazz16.c                              |  3 --
 sound/isa/sb/sb16.c                                |  8 -----
 sound/isa/sb/sb8.c                                 |  1 -
 sound/isa/sc6000.c                                 |  3 --
 sound/isa/wavefront/wavefront.c                    |  1 -
 sound/mips/sgio2audio.c                            |  1 -
 sound/pci/ad1889.c                                 |  1 -
 sound/pci/ali5451/ali5451.c                        |  1 -
 sound/pci/als300.c                                 |  1 -
 sound/pci/als4000.c                                |  1 -
 sound/pci/atiixp.c                                 |  1 -
 sound/pci/atiixp_modem.c                           |  1 -
 sound/pci/au88x0/au88x0.c                          |  2 --
 sound/pci/azt3328.c                                |  1 -
 sound/pci/bt87x.c                                  |  2 --
 sound/pci/ca0106/ca0106_main.c                     |  1 -
 sound/pci/cmipci.c                                 |  4 ---
 sound/pci/cs4281.c                                 |  1 -
 sound/pci/cs46xx/cs46xx.c                          |  7 -----
 sound/pci/cs5535audio/cs5535audio.c                |  1 -
 sound/pci/ctxfi/xfi.c                              |  1 -
 sound/pci/echoaudio/echoaudio.c                    |  1 -
 sound/pci/emu10k1/emu10k1.c                        |  2 --
 sound/pci/emu10k1/emu10k1x.c                       |  1 -
 sound/pci/ens1370.c                                |  8 -----
 sound/pci/es1938.c                                 |  4 ---
 sound/pci/es1968.c                                 |  4 ---
 sound/pci/fm801.c                                  |  2 --
 sound/pci/hda/hda_intel.c                          | 34 ----------------------
 sound/pci/ice1712/ice1712.c                        |  6 ----
 sound/pci/ice1712/ice1724.c                        | 19 ------------
 sound/pci/intel8x0.c                               | 23 ---------------
 sound/pci/intel8x0m.c                              | 15 ----------
 sound/pci/korg1212/korg1212.c                      |  1 -
 sound/pci/lola/lola.c                              |  1 -
 sound/pci/lx6464es/lx6464es.c                      |  2 --
 sound/pci/maestro3.c                               |  5 ----
 sound/pci/mixart/mixart.c                          |  1 -
 sound/pci/nm256/nm256.c                            |  2 --
 sound/pci/oxygen/oxygen.c                          |  3 --
 sound/pci/oxygen/se6x.c                            |  1 -
 sound/pci/oxygen/virtuoso.c                        |  1 -
 sound/pci/pcxhr/pcxhr.c                            |  1 -
 sound/pci/riptide/riptide.c                        |  1 -
 sound/pci/rme32.c                                  |  1 -
 sound/pci/rme96.c                                  |  5 ----
 sound/pci/rme9652/hdsp.c                           |  3 --
 sound/pci/rme9652/hdspm.c                          |  1 -
 sound/pci/rme9652/rme9652.c                        |  2 --
 sound/pci/sis7019.c                                |  1 -
 sound/pci/sonicvibes.c                             |  1 -
 sound/pci/trident/trident.c                        | 12 --------
 sound/pci/via82xx.c                                |  1 -
 sound/pci/via82xx_modem.c                          |  1 -
 sound/pci/vx222/vx222.c                            |  1 -
 sound/pci/ymfpci/ymfpci.c                          |  6 ----
 sound/pcmcia/pdaudiocf/pdaudiocf.c                 |  1 -
 sound/pcmcia/vx/vxpocket.c                         |  4 ---
 sound/ppc/powermac.c                               |  1 -
 sound/sh/aica.c                                    |  1 -
 sound/sh/sh_dac_audio.c                            |  1 -
 sound/sparc/amd7930.c                              |  1 -
 sound/sparc/cs4231.c                               |  1 -
 sound/sparc/dbri.c                                 |  1 -
 sound/usb/6fire/chip.c                             |  1 -
 sound/usb/caiaq/device.c                           | 14 ---------
 sound/usb/card.c                                   |  2 --
 sound/usb/hiface/chip.c                            | 17 -----------
 sound/usb/misc/ua101.c                             |  1 -
 sound/usb/usx2y/usbusx2y.c                         |  1 -
 sound/x86/intel_hdmi_audio.c                       |  1 -
 sound/xen/xen_snd_front.c                          |  1 -
 175 files changed, 1 insertion(+), 480 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/platform/iris/iris.c b/arch/x86/platform/iris/iris.c
index 1ac8578258af..b42bfdab01a9 100644
--- a/arch/x86/platform/iris/iris.c
+++ b/arch/x86/platform/iris/iris.c
@@ -27,7 +27,6 @@
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Sébastien Hinderer <Sebastien.Hinderer@ens-lyon.org>");
 MODULE_DESCRIPTION("A power_off handler for Iris devices from EuroBraille");
-MODULE_SUPPORTED_DEVICE("Eurobraille/Iris");
 
 static bool force;
 
diff --git a/drivers/atm/fore200e.c b/drivers/atm/fore200e.c
index 9a70bee84125..495fd0a1f040 100644
--- a/drivers/atm/fore200e.c
+++ b/drivers/atm/fore200e.c
@@ -100,8 +100,6 @@ static LIST_HEAD(fore200e_boards);
 
 MODULE_AUTHOR("Christophe Lizzi - credits to Uwe Dannowski and Heikki Vatiainen");
 MODULE_DESCRIPTION("FORE Systems 200E-series ATM driver - version " FORE200E_VERSION);
-MODULE_SUPPORTED_DEVICE("PCA-200E, SBA-200E");
-
 
 static const int fore200e_rx_buf_nbr[ BUFFER_SCHEME_NBR ][ BUFFER_MAGN_NBR ] = {
     { BUFFER_S1_NBR, BUFFER_L1_NBR },
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 0b71292d9d5a..4aa9683ee0c1 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -5091,7 +5091,6 @@ module_param(floppy, charp, 0);
 module_param(FLOPPY_IRQ, int, 0);
 module_param(FLOPPY_DMA, int, 0);
 MODULE_AUTHOR("Alain L. Knaff");
-MODULE_SUPPORTED_DEVICE("fd");
 MODULE_LICENSE("GPL");
 
 /* This doesn't actually get used other than for module information */
diff --git a/drivers/bluetooth/btrsi.c b/drivers/bluetooth/btrsi.c
index 3951f7b23840..bea1595f6432 100644
--- a/drivers/bluetooth/btrsi.c
+++ b/drivers/bluetooth/btrsi.c
@@ -194,5 +194,4 @@ module_init(rsi_91x_bt_module_init);
 module_exit(rsi_91x_bt_module_exit);
 MODULE_AUTHOR("Redpine Signals Inc");
 MODULE_DESCRIPTION("RSI BT driver");
-MODULE_SUPPORTED_DEVICE("RSI-BT");
 MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/char/applicom.c b/drivers/char/applicom.c
index 14b2d8034c51..45ac7ab003ce 100644
--- a/drivers/char/applicom.c
+++ b/drivers/char/applicom.c
@@ -81,9 +81,6 @@ MODULE_DESCRIPTION("Driver for Applicom Profibus card");
 MODULE_LICENSE("GPL");
 MODULE_ALIAS_MISCDEV(AC_MINOR);
 
-MODULE_SUPPORTED_DEVICE("ac");
-
-
 static struct applicom_board {
 	unsigned long PhysIO;
 	void __iomem *RamIO;
diff --git a/drivers/char/toshiba.c b/drivers/char/toshiba.c
index aff0a8e44fff..776abbfd85d6 100644
--- a/drivers/char/toshiba.c
+++ b/drivers/char/toshiba.c
@@ -64,7 +64,6 @@
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Jonathan Buzzard <jonathan@buzzard.org.uk>");
 MODULE_DESCRIPTION("Toshiba laptop SMM driver");
-MODULE_SUPPORTED_DEVICE("toshiba");
 
 static DEFINE_MUTEX(tosh_mutex);
 static int tosh_fn;
diff --git a/drivers/input/joydev.c b/drivers/input/joydev.c
index 430dc6975004..da8963a9f044 100644
--- a/drivers/input/joydev.c
+++ b/drivers/input/joydev.c
@@ -26,7 +26,6 @@
 
 MODULE_AUTHOR("Vojtech Pavlik <vojtech@ucw.cz>");
 MODULE_DESCRIPTION("Joystick device interfaces");
-MODULE_SUPPORTED_DEVICE("input/js");
 MODULE_LICENSE("GPL");
 
 #define JOYDEV_MINOR_BASE	0
diff --git a/drivers/media/firewire/firedtv-fw.c b/drivers/media/firewire/firedtv-fw.c
index 8a8585261bb8..5f6e97a8d1c0 100644
--- a/drivers/media/firewire/firedtv-fw.c
+++ b/drivers/media/firewire/firedtv-fw.c
@@ -430,4 +430,3 @@ MODULE_AUTHOR("Andreas Monitzer <andy@monitzer.com>");
 MODULE_AUTHOR("Ben Backx <ben@bbackx.com>");
 MODULE_DESCRIPTION("FireDTV DVB Driver");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("FireDTV DVB");
diff --git a/drivers/media/pci/cx18/cx18-alsa-main.c b/drivers/media/pci/cx18/cx18-alsa-main.c
index 692b95a685d1..9a82e68303b6 100644
--- a/drivers/media/pci/cx18/cx18-alsa-main.c
+++ b/drivers/media/pci/cx18/cx18-alsa-main.c
@@ -41,7 +41,6 @@ MODULE_PARM_DESC(debug,
 
 MODULE_AUTHOR("Andy Walls");
 MODULE_DESCRIPTION("CX23418 ALSA Interface");
-MODULE_SUPPORTED_DEVICE("CX23418 MPEG2 encoder");
 MODULE_LICENSE("GPL");
 
 MODULE_VERSION(CX18_VERSION);
diff --git a/drivers/media/pci/cx18/cx18-driver.c b/drivers/media/pci/cx18/cx18-driver.c
index 95aed00f353b..f2440eb38820 100644
--- a/drivers/media/pci/cx18/cx18-driver.c
+++ b/drivers/media/pci/cx18/cx18-driver.c
@@ -232,7 +232,6 @@ MODULE_PARM_DESC(cx18_first_minor,
 
 MODULE_AUTHOR("Hans Verkuil");
 MODULE_DESCRIPTION("CX23418 driver");
-MODULE_SUPPORTED_DEVICE("CX23418 MPEG2 encoder");
 MODULE_LICENSE("GPL");
 
 MODULE_VERSION(CX18_VERSION);
diff --git a/drivers/media/pci/cx25821/cx25821-alsa.c b/drivers/media/pci/cx25821/cx25821-alsa.c
index 608fbaf0f659..8797d85a6b0a 100644
--- a/drivers/media/pci/cx25821/cx25821-alsa.c
+++ b/drivers/media/pci/cx25821/cx25821-alsa.c
@@ -104,7 +104,6 @@ MODULE_PARM_DESC(index, "Index value for cx25821 capture interface(s).");
 MODULE_DESCRIPTION("ALSA driver module for cx25821 based capture cards");
 MODULE_AUTHOR("Hiep Huynh");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Conexant,25821}");	/* "{{Conexant,23881}," */
 
 static unsigned int debug;
 module_param(debug, int, 0644);
diff --git a/drivers/media/pci/cx88/cx88-alsa.c b/drivers/media/pci/cx88/cx88-alsa.c
index 95e0cbb1277d..c83814c052d3 100644
--- a/drivers/media/pci/cx88/cx88-alsa.c
+++ b/drivers/media/pci/cx88/cx88-alsa.c
@@ -98,7 +98,6 @@ MODULE_AUTHOR("Mauro Carvalho Chehab <mchehab@kernel.org>");
 MODULE_LICENSE("GPL v2");
 MODULE_VERSION(CX88_VERSION);
 
-MODULE_SUPPORTED_DEVICE("{{Conexant,23881},{{Conexant,23882},{{Conexant,23883}");
 static unsigned int debug;
 module_param(debug, int, 0644);
 MODULE_PARM_DESC(debug, "enable debug messages");
diff --git a/drivers/media/pci/ivtv/ivtv-alsa-main.c b/drivers/media/pci/ivtv/ivtv-alsa-main.c
index 39029b8e12c9..4cefdb2e4d40 100644
--- a/drivers/media/pci/ivtv/ivtv-alsa-main.c
+++ b/drivers/media/pci/ivtv/ivtv-alsa-main.c
@@ -38,7 +38,6 @@ MODULE_PARM_DESC(index,
 
 MODULE_AUTHOR("Andy Walls");
 MODULE_DESCRIPTION("CX23415/CX23416 ALSA Interface");
-MODULE_SUPPORTED_DEVICE("CX23415/CX23416 MPEG2 encoder");
 MODULE_LICENSE("GPL");
 
 MODULE_VERSION(IVTV_VERSION);
diff --git a/drivers/media/pci/ivtv/ivtv-driver.c b/drivers/media/pci/ivtv/ivtv-driver.c
index 6e448cb3b51c..942b8c266f50 100644
--- a/drivers/media/pci/ivtv/ivtv-driver.c
+++ b/drivers/media/pci/ivtv/ivtv-driver.c
@@ -275,9 +275,6 @@ MODULE_PARM_DESC(ivtv_first_minor, "Set device node number assigned to first car
 
 MODULE_AUTHOR("Kevin Thayer, Chris Kennedy, Hans Verkuil");
 MODULE_DESCRIPTION("CX23415/CX23416 driver");
-MODULE_SUPPORTED_DEVICE
-    ("CX23415/CX23416 MPEG2 encoder (WinTV PVR-150/250/350/500,\n"
-		"\t\t\tYuan MPG series and similar)");
 MODULE_LICENSE("GPL");
 
 MODULE_VERSION(IVTV_VERSION);
diff --git a/drivers/media/pci/sta2x11/sta2x11_vip.c b/drivers/media/pci/sta2x11/sta2x11_vip.c
index 336df65c8af1..524912f20d9f 100644
--- a/drivers/media/pci/sta2x11/sta2x11_vip.c
+++ b/drivers/media/pci/sta2x11/sta2x11_vip.c
@@ -1269,6 +1269,5 @@ late_initcall_sync(sta2x11_vip_init_module);
 MODULE_DESCRIPTION("STA2X11 Video Input Port driver");
 MODULE_AUTHOR("Wind River");
 MODULE_LICENSE("GPL v2");
-MODULE_SUPPORTED_DEVICE("sta2x11 video input");
 MODULE_VERSION(DRV_VERSION);
 MODULE_DEVICE_TABLE(pci, sta2x11_vip_pci_tbl);
diff --git a/drivers/media/platform/atmel/atmel-isi.c b/drivers/media/platform/atmel/atmel-isi.c
index 0514be6153df..e392b3efe363 100644
--- a/drivers/media/platform/atmel/atmel-isi.c
+++ b/drivers/media/platform/atmel/atmel-isi.c
@@ -1363,4 +1363,3 @@ module_platform_driver(atmel_isi_driver);
 MODULE_AUTHOR("Josh Wu <josh.wu@atmel.com>");
 MODULE_DESCRIPTION("The V4L2 driver for Atmel Linux");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("video");
diff --git a/drivers/media/platform/atmel/atmel-sama5d2-isc.c b/drivers/media/platform/atmel/atmel-sama5d2-isc.c
index 0b78fecfd2a8..61d9885765f4 100644
--- a/drivers/media/platform/atmel/atmel-sama5d2-isc.c
+++ b/drivers/media/platform/atmel/atmel-sama5d2-isc.c
@@ -330,4 +330,3 @@ module_platform_driver(atmel_isc_driver);
 MODULE_AUTHOR("Songjun Wu");
 MODULE_DESCRIPTION("The V4L2 driver for Atmel-ISC");
 MODULE_LICENSE("GPL v2");
-MODULE_SUPPORTED_DEVICE("video");
diff --git a/drivers/media/platform/marvell-ccic/cafe-driver.c b/drivers/media/platform/marvell-ccic/cafe-driver.c
index 9c94a8b58b7c..baac86f3d153 100644
--- a/drivers/media/platform/marvell-ccic/cafe-driver.c
+++ b/drivers/media/platform/marvell-ccic/cafe-driver.c
@@ -44,10 +44,6 @@
 MODULE_AUTHOR("Jonathan Corbet <corbet@lwn.net>");
 MODULE_DESCRIPTION("Marvell 88ALP01 CMOS Camera Controller driver");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("Video");
-
-
-
 
 struct cafe_camera {
 	int registered;			/* Fully initialized? */
diff --git a/drivers/media/platform/stm32/stm32-dcmi.c b/drivers/media/platform/stm32/stm32-dcmi.c
index bbcc2254fa2e..d9b4ad0abf0c 100644
--- a/drivers/media/platform/stm32/stm32-dcmi.c
+++ b/drivers/media/platform/stm32/stm32-dcmi.c
@@ -2149,4 +2149,3 @@ MODULE_AUTHOR("Yannick Fertre <yannick.fertre@st.com>");
 MODULE_AUTHOR("Hugues Fruchet <hugues.fruchet@st.com>");
 MODULE_DESCRIPTION("STMicroelectronics STM32 Digital Camera Memory Interface driver");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("video");
diff --git a/drivers/media/usb/cpia2/cpia2_v4l.c b/drivers/media/usb/cpia2/cpia2_v4l.c
index e488e7870f42..69d5c628a797 100644
--- a/drivers/media/usb/cpia2/cpia2_v4l.c
+++ b/drivers/media/usb/cpia2/cpia2_v4l.c
@@ -56,7 +56,6 @@ MODULE_PARM_DESC(flicker_mode, "Flicker frequency (0 (disabled), " __stringify(5
 
 MODULE_AUTHOR("Steve Miller (STMicroelectronics) <steve.miller@st.com>");
 MODULE_DESCRIPTION("V4L-driver for STMicroelectronics CPiA2 based cameras");
-MODULE_SUPPORTED_DEVICE("video");
 MODULE_LICENSE("GPL");
 MODULE_VERSION(CPIA_VERSION);
 
diff --git a/drivers/media/usb/tm6000/tm6000-alsa.c b/drivers/media/usb/tm6000/tm6000-alsa.c
index 3a2df36ef1db..a19a46770c2b 100644
--- a/drivers/media/usb/tm6000/tm6000-alsa.c
+++ b/drivers/media/usb/tm6000/tm6000-alsa.c
@@ -51,7 +51,6 @@ MODULE_PARM_DESC(index, "Index value for tm6000x capture interface(s).");
 MODULE_DESCRIPTION("ALSA driver module for tm5600/tm6000/tm6010 based TV cards");
 MODULE_AUTHOR("Mauro Carvalho Chehab");
 MODULE_LICENSE("GPL v2");
-MODULE_SUPPORTED_DEVICE("{{Trident,tm5600},{{Trident,tm6000},{{Trident,tm6010}");
 static unsigned int debug;
 module_param(debug, int, 0644);
 MODULE_PARM_DESC(debug, "enable debug messages");
diff --git a/drivers/media/usb/tm6000/tm6000-dvb.c b/drivers/media/usb/tm6000/tm6000-dvb.c
index 293a460f4616..4990fa886d7a 100644
--- a/drivers/media/usb/tm6000/tm6000-dvb.c
+++ b/drivers/media/usb/tm6000/tm6000-dvb.c
@@ -23,8 +23,6 @@ MODULE_DESCRIPTION("DVB driver extension module for tm5600/6000/6010 based TV ca
 MODULE_AUTHOR("Mauro Carvalho Chehab");
 MODULE_LICENSE("GPL");
 
-MODULE_SUPPORTED_DEVICE("{{Trident, tm5600},{{Trident, tm6000},{{Trident, tm6010}");
-
 static int debug;
 
 module_param(debug, int, 0644);
diff --git a/drivers/mtd/maps/sun_uflash.c b/drivers/mtd/maps/sun_uflash.c
index eb7258293256..f9cfb084c029 100644
--- a/drivers/mtd/maps/sun_uflash.c
+++ b/drivers/mtd/maps/sun_uflash.c
@@ -32,7 +32,6 @@
 
 MODULE_AUTHOR("Eric Brower <ebrower@usa.net>");
 MODULE_DESCRIPTION("User-programmable flash device on Sun Microsystems boardsets");
-MODULE_SUPPORTED_DEVICE(DRIVER_NAME);
 MODULE_LICENSE("GPL");
 MODULE_VERSION("2.1");
 
diff --git a/drivers/net/can/peak_canfd/peak_pciefd_main.c b/drivers/net/can/peak_canfd/peak_pciefd_main.c
index 0df1cdfa6835..1df3c4b54f03 100644
--- a/drivers/net/can/peak_canfd/peak_pciefd_main.c
+++ b/drivers/net/can/peak_canfd/peak_pciefd_main.c
@@ -21,7 +21,6 @@
 
 MODULE_AUTHOR("Stephane Grosjean <s.grosjean@peak-system.com>");
 MODULE_DESCRIPTION("Socket-CAN driver for PEAK PCAN PCIe/M.2 FD family cards");
-MODULE_SUPPORTED_DEVICE("PEAK PCAN PCIe/M.2 FD CAN cards");
 MODULE_LICENSE("GPL v2");
 
 #define PCIEFD_DRV_NAME		"peak_pciefd"
diff --git a/drivers/net/can/sja1000/ems_pci.c b/drivers/net/can/sja1000/ems_pci.c
index 6f88c9932920..4ab91759a5c6 100644
--- a/drivers/net/can/sja1000/ems_pci.c
+++ b/drivers/net/can/sja1000/ems_pci.c
@@ -21,7 +21,6 @@
 
 MODULE_AUTHOR("Sebastian Haas <haas@ems-wuenche.com>");
 MODULE_DESCRIPTION("Socket-CAN driver for EMS CPC-PCI/PCIe/104P CAN cards");
-MODULE_SUPPORTED_DEVICE("EMS CPC-PCI/PCIe/104P CAN card");
 MODULE_LICENSE("GPL v2");
 
 #define EMS_PCI_V1_MAX_CHAN 2
diff --git a/drivers/net/can/sja1000/ems_pcmcia.c b/drivers/net/can/sja1000/ems_pcmcia.c
index 770304eaef95..e21b169c14c0 100644
--- a/drivers/net/can/sja1000/ems_pcmcia.c
+++ b/drivers/net/can/sja1000/ems_pcmcia.c
@@ -21,7 +21,6 @@
 
 MODULE_AUTHOR("Markus Plessing <plessing@ems-wuensche.com>");
 MODULE_DESCRIPTION("Socket-CAN driver for EMS CPC-CARD cards");
-MODULE_SUPPORTED_DEVICE("EMS CPC-CARD CAN card");
 MODULE_LICENSE("GPL v2");
 
 #define EMS_PCMCIA_MAX_CHAN 2
diff --git a/drivers/net/can/sja1000/kvaser_pci.c b/drivers/net/can/sja1000/kvaser_pci.c
index 0ea6b711c07b..95fe9ee1ce32 100644
--- a/drivers/net/can/sja1000/kvaser_pci.c
+++ b/drivers/net/can/sja1000/kvaser_pci.c
@@ -33,7 +33,6 @@
 
 MODULE_AUTHOR("Per Dalen <per.dalen@cnw.se>");
 MODULE_DESCRIPTION("Socket-CAN driver for KVASER PCAN PCI cards");
-MODULE_SUPPORTED_DEVICE("KVASER PCAN PCI CAN card");
 MODULE_LICENSE("GPL v2");
 
 #define MAX_NO_OF_CHANNELS        4 /* max no of channels on a single card */
diff --git a/drivers/net/can/sja1000/peak_pci.c b/drivers/net/can/sja1000/peak_pci.c
index 4713921bd511..84eac8cb8686 100644
--- a/drivers/net/can/sja1000/peak_pci.c
+++ b/drivers/net/can/sja1000/peak_pci.c
@@ -24,8 +24,6 @@
 
 MODULE_AUTHOR("Stephane Grosjean <s.grosjean@peak-system.com>");
 MODULE_DESCRIPTION("Socket-CAN driver for PEAK PCAN PCI family cards");
-MODULE_SUPPORTED_DEVICE("PEAK PCAN PCI/PCIe/PCIeC miniPCI CAN cards");
-MODULE_SUPPORTED_DEVICE("PEAK PCAN miniPCIe/cPCI PC/104+ PCI/104e CAN Cards");
 MODULE_LICENSE("GPL v2");
 
 #define DRV_NAME  "peak_pci"
diff --git a/drivers/net/can/sja1000/peak_pcmcia.c b/drivers/net/can/sja1000/peak_pcmcia.c
index cf951a783078..131a084c3535 100644
--- a/drivers/net/can/sja1000/peak_pcmcia.c
+++ b/drivers/net/can/sja1000/peak_pcmcia.c
@@ -22,7 +22,6 @@
 MODULE_AUTHOR("Stephane Grosjean <s.grosjean@peak-system.com>");
 MODULE_DESCRIPTION("CAN driver for PEAK-System PCAN-PC Cards");
 MODULE_LICENSE("GPL v2");
-MODULE_SUPPORTED_DEVICE("PEAK PCAN-PC Card");
 
 /* PEAK-System PCMCIA driver name */
 #define PCC_NAME		"peak_pcmcia"
diff --git a/drivers/net/can/sja1000/plx_pci.c b/drivers/net/can/sja1000/plx_pci.c
index 85679588ef73..5de1ebb0c6f0 100644
--- a/drivers/net/can/sja1000/plx_pci.c
+++ b/drivers/net/can/sja1000/plx_pci.c
@@ -25,18 +25,6 @@
 MODULE_AUTHOR("Pavel Cheblakov <P.B.Cheblakov@inp.nsk.su>");
 MODULE_DESCRIPTION("Socket-CAN driver for PLX90xx PCI-bridge cards with "
 		   "the SJA1000 chips");
-MODULE_SUPPORTED_DEVICE("Adlink PCI-7841/cPCI-7841, "
-			"Adlink PCI-7841/cPCI-7841 SE, "
-			"Marathon CAN-bus-PCI, "
-			"Marathon CAN-bus-PCIe, "
-			"TEWS TECHNOLOGIES TPMC810, "
-			"esd CAN-PCI/CPCI/PCI104/200, "
-			"esd CAN-PCI/PMC/266, "
-			"esd CAN-PCIe/2000, "
-			"Connect Tech Inc. CANpro/104-Plus Opto (CRG001), "
-			"IXXAT PC-I 04/PCI, "
-			"ELCUS CAN-200-PCI, "
-			"ASEM DUAL CAN-RAW")
 MODULE_LICENSE("GPL v2");
 
 #define PLX_PCI_MAX_CHAN 2
diff --git a/drivers/net/can/usb/peak_usb/pcan_usb.c b/drivers/net/can/usb/peak_usb/pcan_usb.c
index e6c1e5d33924..e393e8457d77 100644
--- a/drivers/net/can/usb/peak_usb/pcan_usb.c
+++ b/drivers/net/can/usb/peak_usb/pcan_usb.c
@@ -18,8 +18,6 @@
 
 #include "pcan_usb_core.h"
 
-MODULE_SUPPORTED_DEVICE("PEAK-System PCAN-USB adapter");
-
 /* PCAN-USB Endpoints */
 #define PCAN_USB_EP_CMDOUT		1
 #define PCAN_USB_EP_CMDIN		(PCAN_USB_EP_CMDOUT | USB_DIR_IN)
diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_fd.c b/drivers/net/can/usb/peak_usb/pcan_usb_fd.c
index f347ecc79aef..bae078579c0d 100644
--- a/drivers/net/can/usb/peak_usb/pcan_usb_fd.c
+++ b/drivers/net/can/usb/peak_usb/pcan_usb_fd.c
@@ -16,9 +16,6 @@
 #include "pcan_usb_core.h"
 #include "pcan_usb_pro.h"
 
-MODULE_SUPPORTED_DEVICE("PEAK-System PCAN-USB FD adapter");
-MODULE_SUPPORTED_DEVICE("PEAK-System PCAN-USB Pro FD adapter");
-
 #define PCAN_USBPROFD_CHANNEL_COUNT	2
 #define PCAN_USBFD_CHANNEL_COUNT	1
 
diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_pro.c b/drivers/net/can/usb/peak_usb/pcan_usb_pro.c
index 275087c39602..18fa180ecc81 100644
--- a/drivers/net/can/usb/peak_usb/pcan_usb_pro.c
+++ b/drivers/net/can/usb/peak_usb/pcan_usb_pro.c
@@ -17,8 +17,6 @@
 #include "pcan_usb_core.h"
 #include "pcan_usb_pro.h"
 
-MODULE_SUPPORTED_DEVICE("PEAK-System PCAN-USB Pro adapter");
-
 #define PCAN_USBPRO_CHANNEL_COUNT	2
 
 /* PCAN-USB Pro adapter internal clock (MHz) */
diff --git a/drivers/net/hamradio/scc.c b/drivers/net/hamradio/scc.c
index 36eeb80406f2..4690c6a59054 100644
--- a/drivers/net/hamradio/scc.c
+++ b/drivers/net/hamradio/scc.c
@@ -2167,7 +2167,6 @@ static void __exit scc_cleanup_driver(void)
 
 MODULE_AUTHOR("Joerg Reuter <jreuter@yaina.de>");
 MODULE_DESCRIPTION("AX.25 Device Driver for Z8530 based HDLC cards");
-MODULE_SUPPORTED_DEVICE("Z8530 based SCC cards for Amateur Radio");
 MODULE_LICENSE("GPL");
 module_init(scc_init_driver);
 module_exit(scc_cleanup_driver);
diff --git a/drivers/net/wireless/admtek/adm8211.c b/drivers/net/wireless/admtek/adm8211.c
index c41e72508d3d..2db9c948c0fc 100644
--- a/drivers/net/wireless/admtek/adm8211.c
+++ b/drivers/net/wireless/admtek/adm8211.c
@@ -28,7 +28,6 @@
 MODULE_AUTHOR("Michael Wu <flamingice@sourmilk.net>");
 MODULE_AUTHOR("Jouni Malinen <j@w1.fi>");
 MODULE_DESCRIPTION("Driver for IEEE 802.11b wireless cards based on ADMtek ADM8211");
-MODULE_SUPPORTED_DEVICE("ADM8211");
 MODULE_LICENSE("GPL");
 
 static unsigned int tx_ring_size __read_mostly = 16;
diff --git a/drivers/net/wireless/ath/ath5k/base.c b/drivers/net/wireless/ath/ath5k/base.c
index 4c6e57f9976d..cef17f33c69e 100644
--- a/drivers/net/wireless/ath/ath5k/base.c
+++ b/drivers/net/wireless/ath/ath5k/base.c
@@ -90,7 +90,6 @@ MODULE_PARM_DESC(no_hw_rfkill_switch, "Ignore the GPIO RFKill switch state");
 MODULE_AUTHOR("Jiri Slaby");
 MODULE_AUTHOR("Nick Kossifidis");
 MODULE_DESCRIPTION("Support for 5xxx series of Atheros 802.11 wireless LAN cards.");
-MODULE_SUPPORTED_DEVICE("Atheros 5xxx WLAN cards");
 MODULE_LICENSE("Dual BSD/GPL");
 
 static int ath5k_init(struct ieee80211_hw *hw);
diff --git a/drivers/net/wireless/ath/ath9k/hw.c b/drivers/net/wireless/ath/ath9k/hw.c
index b66eeb577272..5abc2a5526ec 100644
--- a/drivers/net/wireless/ath/ath9k/hw.c
+++ b/drivers/net/wireless/ath/ath9k/hw.c
@@ -34,7 +34,6 @@ static bool ath9k_hw_set_reset_reg(struct ath_hw *ah, u32 type);
 
 MODULE_AUTHOR("Atheros Communications");
 MODULE_DESCRIPTION("Support for Atheros 802.11n wireless LAN cards.");
-MODULE_SUPPORTED_DEVICE("Atheros 802.11n WLAN cards");
 MODULE_LICENSE("Dual BSD/GPL");
 
 static void ath9k_hw_set_clockrate(struct ath_hw *ah)
diff --git a/drivers/net/wireless/ath/ath9k/init.c b/drivers/net/wireless/ath/ath9k/init.c
index 42a208787f5a..01f9c26f9bf3 100644
--- a/drivers/net/wireless/ath/ath9k/init.c
+++ b/drivers/net/wireless/ath/ath9k/init.c
@@ -37,7 +37,6 @@ static char *dev_info = "ath9k";
 
 MODULE_AUTHOR("Atheros Communications");
 MODULE_DESCRIPTION("Support for Atheros 802.11n wireless LAN cards.");
-MODULE_SUPPORTED_DEVICE("Atheros 802.11n WLAN cards");
 MODULE_LICENSE("Dual BSD/GPL");
 
 static unsigned int ath9k_debug = ATH_DBG_DEFAULT;
diff --git a/drivers/net/wireless/atmel/atmel.c b/drivers/net/wireless/atmel/atmel.c
index 707fe66727f8..febce4e8b3dd 100644
--- a/drivers/net/wireless/atmel/atmel.c
+++ b/drivers/net/wireless/atmel/atmel.c
@@ -75,7 +75,6 @@
 MODULE_AUTHOR("Simon Kelley");
 MODULE_DESCRIPTION("Support for Atmel at76c50x 802.11 wireless ethernet cards.");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("Atmel at76c50x wireless cards");
 
 /* The name of the firmware file to be loaded
    over-rides any automatic selection */
diff --git a/drivers/net/wireless/atmel/atmel_cs.c b/drivers/net/wireless/atmel/atmel_cs.c
index 368eebefa741..453bb84cb338 100644
--- a/drivers/net/wireless/atmel/atmel_cs.c
+++ b/drivers/net/wireless/atmel/atmel_cs.c
@@ -57,7 +57,6 @@
 MODULE_AUTHOR("Simon Kelley");
 MODULE_DESCRIPTION("Support for Atmel at76c50x 802.11 wireless ethernet cards.");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("Atmel at76c50x PCMCIA cards");
 
 /*====================================================================*/
 
diff --git a/drivers/net/wireless/atmel/atmel_pci.c b/drivers/net/wireless/atmel/atmel_pci.c
index 47f7ccb32414..f428dc79d916 100644
--- a/drivers/net/wireless/atmel/atmel_pci.c
+++ b/drivers/net/wireless/atmel/atmel_pci.c
@@ -16,7 +16,6 @@
 MODULE_AUTHOR("Simon Kelley");
 MODULE_DESCRIPTION("Support for Atmel at76c50x 802.11 wireless ethernet cards.");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("Atmel at76c506 PCI wireless cards");
 
 static const struct pci_device_id card_ids[] = {
 	{ 0x1114, 0x0506, PCI_ANY_ID, PCI_ANY_ID },
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c
index 818e523f6025..39f3af2d0439 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c
@@ -87,7 +87,6 @@ static int n_adapters_found;
 
 MODULE_AUTHOR("Broadcom Corporation");
 MODULE_DESCRIPTION("Broadcom 802.11n wireless LAN driver.");
-MODULE_SUPPORTED_DEVICE("Broadcom 802.11n WLAN cards");
 MODULE_LICENSE("Dual BSD/GPL");
 /* This needs to be adjusted when brcms_firmwares changes */
 MODULE_FIRMWARE("brcm/bcm43xx-0.fw");
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmutil/utils.c b/drivers/net/wireless/broadcom/brcm80211/brcmutil/utils.c
index 4c84c3001c3f..e87e68cc46e2 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmutil/utils.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmutil/utils.c
@@ -12,7 +12,6 @@
 
 MODULE_AUTHOR("Broadcom Corporation");
 MODULE_DESCRIPTION("Broadcom 802.11n wireless LAN driver utilities.");
-MODULE_SUPPORTED_DEVICE("Broadcom 802.11n WLAN cards");
 MODULE_LICENSE("Dual BSD/GPL");
 
 struct sk_buff *brcmu_pkt_buf_get_skb(uint len)
diff --git a/drivers/net/wireless/cisco/airo.c b/drivers/net/wireless/cisco/airo.c
index e35e1380ae43..60db38c38960 100644
--- a/drivers/net/wireless/cisco/airo.c
+++ b/drivers/net/wireless/cisco/airo.c
@@ -251,7 +251,6 @@ MODULE_AUTHOR("Benjamin Reed");
 MODULE_DESCRIPTION("Support for Cisco/Aironet 802.11 wireless ethernet cards.  "
 		   "Direct support for ISA/PCI/MPI cards and support for PCMCIA when used with airo_cs.");
 MODULE_LICENSE("Dual BSD/GPL");
-MODULE_SUPPORTED_DEVICE("Aironet 4500, 4800 and Cisco 340/350");
 module_param_hw_array(io, int, ioport, NULL, 0);
 module_param_hw_array(irq, int, irq, NULL, 0);
 module_param_array(rates, int, NULL, 0);
diff --git a/drivers/net/wireless/cisco/airo_cs.c b/drivers/net/wireless/cisco/airo_cs.c
index 3718f958c0fc..fcfe4c6d62f0 100644
--- a/drivers/net/wireless/cisco/airo_cs.c
+++ b/drivers/net/wireless/cisco/airo_cs.c
@@ -47,7 +47,6 @@ MODULE_DESCRIPTION("Support for Cisco/Aironet 802.11 wireless ethernet "
 		   "cards.  This is the module that links the PCMCIA card "
 		   "with the airo module.");
 MODULE_LICENSE("Dual BSD/GPL");
-MODULE_SUPPORTED_DEVICE("Aironet 4500, 4800 and Cisco 340 PCMCIA cards");
 
 /*====================================================================*/
 
diff --git a/drivers/net/wireless/intersil/hostap/hostap_cs.c b/drivers/net/wireless/intersil/hostap/hostap_cs.c
index 1a748670835a..ec7db2badc40 100644
--- a/drivers/net/wireless/intersil/hostap/hostap_cs.c
+++ b/drivers/net/wireless/intersil/hostap/hostap_cs.c
@@ -26,7 +26,6 @@ static char *dev_info = "hostap_cs";
 MODULE_AUTHOR("Jouni Malinen");
 MODULE_DESCRIPTION("Support for Intersil Prism2-based 802.11 wireless LAN "
 		   "cards (PC Card).");
-MODULE_SUPPORTED_DEVICE("Intersil Prism2-based WLAN cards (PC Card)");
 MODULE_LICENSE("GPL");
 
 
diff --git a/drivers/net/wireless/intersil/hostap/hostap_pci.c b/drivers/net/wireless/intersil/hostap/hostap_pci.c
index 101887e6bd0f..52d77506effd 100644
--- a/drivers/net/wireless/intersil/hostap/hostap_pci.c
+++ b/drivers/net/wireless/intersil/hostap/hostap_pci.c
@@ -27,7 +27,6 @@ static char *dev_info = "hostap_pci";
 MODULE_AUTHOR("Jouni Malinen");
 MODULE_DESCRIPTION("Support for Intersil Prism2.5-based 802.11 wireless LAN "
 		   "PCI cards.");
-MODULE_SUPPORTED_DEVICE("Intersil Prism2.5-based WLAN PCI cards");
 MODULE_LICENSE("GPL");
 
 
diff --git a/drivers/net/wireless/intersil/hostap/hostap_plx.c b/drivers/net/wireless/intersil/hostap/hostap_plx.c
index 841cfc68ce84..58247290fcbc 100644
--- a/drivers/net/wireless/intersil/hostap/hostap_plx.c
+++ b/drivers/net/wireless/intersil/hostap/hostap_plx.c
@@ -30,7 +30,6 @@ static char *dev_info = "hostap_plx";
 MODULE_AUTHOR("Jouni Malinen");
 MODULE_DESCRIPTION("Support for Intersil Prism2-based 802.11 wireless LAN "
 		   "cards (PLX).");
-MODULE_SUPPORTED_DEVICE("Intersil Prism2-based WLAN cards (PLX)");
 MODULE_LICENSE("GPL");
 
 
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2400pci.c b/drivers/net/wireless/ralink/rt2x00/rt2400pci.c
index 8f860c14da58..dec6ffdf07c4 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2400pci.c
+++ b/drivers/net/wireless/ralink/rt2x00/rt2400pci.c
@@ -1821,7 +1821,6 @@ static const struct pci_device_id rt2400pci_device_table[] = {
 MODULE_AUTHOR(DRV_PROJECT);
 MODULE_VERSION(DRV_VERSION);
 MODULE_DESCRIPTION("Ralink RT2400 PCI & PCMCIA Wireless LAN driver.");
-MODULE_SUPPORTED_DEVICE("Ralink RT2460 PCI & PCMCIA chipset based cards");
 MODULE_DEVICE_TABLE(pci, rt2400pci_device_table);
 MODULE_LICENSE("GPL");
 
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2500pci.c b/drivers/net/wireless/ralink/rt2x00/rt2500pci.c
index e940443c52ad..8faa0a80e73a 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2500pci.c
+++ b/drivers/net/wireless/ralink/rt2x00/rt2500pci.c
@@ -2119,7 +2119,6 @@ static const struct pci_device_id rt2500pci_device_table[] = {
 MODULE_AUTHOR(DRV_PROJECT);
 MODULE_VERSION(DRV_VERSION);
 MODULE_DESCRIPTION("Ralink RT2500 PCI & PCMCIA Wireless LAN driver.");
-MODULE_SUPPORTED_DEVICE("Ralink RT2560 PCI & PCMCIA chipset based cards");
 MODULE_DEVICE_TABLE(pci, rt2500pci_device_table);
 MODULE_LICENSE("GPL");
 
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2500usb.c b/drivers/net/wireless/ralink/rt2x00/rt2500usb.c
index fce05fc88aaf..bb5ed6630645 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2500usb.c
+++ b/drivers/net/wireless/ralink/rt2x00/rt2500usb.c
@@ -1956,7 +1956,6 @@ static const struct usb_device_id rt2500usb_device_table[] = {
 MODULE_AUTHOR(DRV_PROJECT);
 MODULE_VERSION(DRV_VERSION);
 MODULE_DESCRIPTION("Ralink RT2500 USB Wireless LAN driver.");
-MODULE_SUPPORTED_DEVICE("Ralink RT2570 USB chipset based cards");
 MODULE_DEVICE_TABLE(usb, rt2500usb_device_table);
 MODULE_LICENSE("GPL");
 
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2800pci.c b/drivers/net/wireless/ralink/rt2x00/rt2800pci.c
index 9a33baaa6184..1fde0e767ce3 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2800pci.c
+++ b/drivers/net/wireless/ralink/rt2x00/rt2800pci.c
@@ -439,7 +439,6 @@ static const struct pci_device_id rt2800pci_device_table[] = {
 MODULE_AUTHOR(DRV_PROJECT);
 MODULE_VERSION(DRV_VERSION);
 MODULE_DESCRIPTION("Ralink RT2800 PCI & PCMCIA Wireless LAN driver.");
-MODULE_SUPPORTED_DEVICE("Ralink RT2860 PCI & PCMCIA chipset based cards");
 MODULE_FIRMWARE(FIRMWARE_RT2860);
 MODULE_DEVICE_TABLE(pci, rt2800pci_device_table);
 MODULE_LICENSE("GPL");
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2800usb.c b/drivers/net/wireless/ralink/rt2x00/rt2800usb.c
index 36ac18ca8082..b5c67f656cfd 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2800usb.c
+++ b/drivers/net/wireless/ralink/rt2x00/rt2800usb.c
@@ -1248,7 +1248,6 @@ static const struct usb_device_id rt2800usb_device_table[] = {
 MODULE_AUTHOR(DRV_PROJECT);
 MODULE_VERSION(DRV_VERSION);
 MODULE_DESCRIPTION("Ralink RT2800 USB Wireless LAN driver.");
-MODULE_SUPPORTED_DEVICE("Ralink RT2870 USB chipset based cards");
 MODULE_DEVICE_TABLE(usb, rt2800usb_device_table);
 MODULE_FIRMWARE(FIRMWARE_RT2870);
 MODULE_LICENSE("GPL");
diff --git a/drivers/net/wireless/ralink/rt2x00/rt61pci.c b/drivers/net/wireless/ralink/rt2x00/rt61pci.c
index 02da5dd37ddd..82cfc2aadc2b 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt61pci.c
+++ b/drivers/net/wireless/ralink/rt2x00/rt61pci.c
@@ -2993,8 +2993,6 @@ static const struct pci_device_id rt61pci_device_table[] = {
 MODULE_AUTHOR(DRV_PROJECT);
 MODULE_VERSION(DRV_VERSION);
 MODULE_DESCRIPTION("Ralink RT61 PCI & PCMCIA Wireless LAN driver.");
-MODULE_SUPPORTED_DEVICE("Ralink RT2561, RT2561s & RT2661 "
-			"PCI & PCMCIA chipset based cards");
 MODULE_DEVICE_TABLE(pci, rt61pci_device_table);
 MODULE_FIRMWARE(FIRMWARE_RT2561);
 MODULE_FIRMWARE(FIRMWARE_RT2561s);
diff --git a/drivers/net/wireless/ralink/rt2x00/rt73usb.c b/drivers/net/wireless/ralink/rt2x00/rt73usb.c
index e69793773d87..5ff2c740c3ea 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt73usb.c
+++ b/drivers/net/wireless/ralink/rt2x00/rt73usb.c
@@ -2513,7 +2513,6 @@ static const struct usb_device_id rt73usb_device_table[] = {
 MODULE_AUTHOR(DRV_PROJECT);
 MODULE_VERSION(DRV_VERSION);
 MODULE_DESCRIPTION("Ralink RT73 USB Wireless LAN driver.");
-MODULE_SUPPORTED_DEVICE("Ralink RT2571W & RT2671 USB chipset based cards");
 MODULE_DEVICE_TABLE(usb, rt73usb_device_table);
 MODULE_FIRMWARE(FIRMWARE_RT2571);
 MODULE_LICENSE("GPL");
diff --git a/drivers/net/wireless/rsi/rsi_91x_main.c b/drivers/net/wireless/rsi/rsi_91x_main.c
index 9a3d2439a8e7..d98483298555 100644
--- a/drivers/net/wireless/rsi/rsi_91x_main.c
+++ b/drivers/net/wireless/rsi/rsi_91x_main.c
@@ -441,6 +441,5 @@ module_init(rsi_91x_hal_module_init);
 module_exit(rsi_91x_hal_module_exit);
 MODULE_AUTHOR("Redpine Signals Inc");
 MODULE_DESCRIPTION("Station driver for RSI 91x devices");
-MODULE_SUPPORTED_DEVICE("RSI-91x");
 MODULE_VERSION("0.1");
 MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/net/wireless/rsi/rsi_91x_sdio.c b/drivers/net/wireless/rsi/rsi_91x_sdio.c
index 592e9dadcb55..fe0287b22a25 100644
--- a/drivers/net/wireless/rsi/rsi_91x_sdio.c
+++ b/drivers/net/wireless/rsi/rsi_91x_sdio.c
@@ -1571,7 +1571,6 @@ module_exit(rsi_module_exit);
 
 MODULE_AUTHOR("Redpine Signals Inc");
 MODULE_DESCRIPTION("Common SDIO layer for RSI drivers");
-MODULE_SUPPORTED_DEVICE("RSI-91x");
 MODULE_DEVICE_TABLE(sdio, rsi_dev_table);
 MODULE_FIRMWARE(FIRMWARE_RSI9113);
 MODULE_VERSION("0.1");
diff --git a/drivers/net/wireless/rsi/rsi_91x_usb.c b/drivers/net/wireless/rsi/rsi_91x_usb.c
index a4a533c2a783..3fbe2a3c1455 100644
--- a/drivers/net/wireless/rsi/rsi_91x_usb.c
+++ b/drivers/net/wireless/rsi/rsi_91x_usb.c
@@ -928,7 +928,6 @@ module_usb_driver(rsi_driver);
 
 MODULE_AUTHOR("Redpine Signals Inc");
 MODULE_DESCRIPTION("Common USB layer for RSI drivers");
-MODULE_SUPPORTED_DEVICE("RSI-91x");
 MODULE_DEVICE_TABLE(usb, rsi_dev_table);
 MODULE_FIRMWARE(FIRMWARE_RSI9113);
 MODULE_VERSION("0.1");
diff --git a/drivers/parport/parport_amiga.c b/drivers/parport/parport_amiga.c
index 1e88bcfe0d7b..84d5701d606c 100644
--- a/drivers/parport/parport_amiga.c
+++ b/drivers/parport/parport_amiga.c
@@ -241,6 +241,5 @@ module_platform_driver_probe(amiga_parallel_driver, amiga_parallel_probe);
 
 MODULE_AUTHOR("Joerg Dorchain <joerg@dorchain.net>");
 MODULE_DESCRIPTION("Parport Driver for Amiga builtin Port");
-MODULE_SUPPORTED_DEVICE("Amiga builtin Parallel Port");
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("platform:amiga-parallel");
diff --git a/drivers/parport/parport_atari.c b/drivers/parport/parport_atari.c
index 2ff0fe053e6e..1623f010cdcc 100644
--- a/drivers/parport/parport_atari.c
+++ b/drivers/parport/parport_atari.c
@@ -218,7 +218,6 @@ static void __exit parport_atari_exit(void)
 
 MODULE_AUTHOR("Andreas Schwab");
 MODULE_DESCRIPTION("Parport Driver for Atari builtin Port");
-MODULE_SUPPORTED_DEVICE("Atari builtin Parallel Port");
 MODULE_LICENSE("GPL");
 
 module_init(parport_atari_init)
diff --git a/drivers/parport/parport_gsc.c b/drivers/parport/parport_gsc.c
index 9228e8f90309..1e43b3f399a8 100644
--- a/drivers/parport/parport_gsc.c
+++ b/drivers/parport/parport_gsc.c
@@ -41,7 +41,6 @@
 
 MODULE_AUTHOR("Helge Deller <deller@gmx.de>");
 MODULE_DESCRIPTION("HP-PARISC PC-style parallel port driver");
-MODULE_SUPPORTED_DEVICE("integrated PC-style parallel port");
 MODULE_LICENSE("GPL");
 
 
diff --git a/drivers/parport/parport_mfc3.c b/drivers/parport/parport_mfc3.c
index d6bbe8446301..f4d0da741e85 100644
--- a/drivers/parport/parport_mfc3.c
+++ b/drivers/parport/parport_mfc3.c
@@ -359,7 +359,6 @@ static void __exit parport_mfc3_exit(void)
 
 MODULE_AUTHOR("Joerg Dorchain <joerg@dorchain.net>");
 MODULE_DESCRIPTION("Parport Driver for Multiface 3 expansion cards Parallel Port");
-MODULE_SUPPORTED_DEVICE("Multiface 3 Parallel Port");
 MODULE_LICENSE("GPL");
 
 module_init(parport_mfc3_init)
diff --git a/drivers/parport/parport_sunbpp.c b/drivers/parport/parport_sunbpp.c
index e840c1b5ab90..865fc41dbb6c 100644
--- a/drivers/parport/parport_sunbpp.c
+++ b/drivers/parport/parport_sunbpp.c
@@ -377,6 +377,5 @@ module_platform_driver(bpp_sbus_driver);
 
 MODULE_AUTHOR("Derrick J Brashear");
 MODULE_DESCRIPTION("Parport Driver for Sparc bidirectional Port");
-MODULE_SUPPORTED_DEVICE("Sparc Bidirectional Parallel Port");
 MODULE_VERSION("2.0");
 MODULE_LICENSE("GPL");
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index ba9ce4e0d30a..3a945abf268c 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -63,7 +63,6 @@ void dasd_int_handler(struct ccw_device *, unsigned long, struct irb *);
 MODULE_AUTHOR("Holger Smolinski <Holger.Smolinski@de.ibm.com>");
 MODULE_DESCRIPTION("Linux on S/390 DASD device driver,"
 		   " Copyright IBM Corp. 2000");
-MODULE_SUPPORTED_DEVICE("dasd");
 MODULE_LICENSE("GPL");
 
 /*
diff --git a/drivers/sbus/char/display7seg.c b/drivers/sbus/char/display7seg.c
index 00e72b97d0b6..d93595b39afa 100644
--- a/drivers/sbus/char/display7seg.c
+++ b/drivers/sbus/char/display7seg.c
@@ -50,7 +50,6 @@ MODULE_PARM_DESC(sol_compat,
 MODULE_AUTHOR("Eric Brower <ebrower@usa.net>");
 MODULE_DESCRIPTION("7-Segment Display driver for Sun Microsystems CP1400/1500");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("d7s");
 
 struct d7s {
 	void __iomem	*regs;
diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c
index 38369766511c..f135a10f582b 100644
--- a/drivers/scsi/hpsa.c
+++ b/drivers/scsi/hpsa.c
@@ -80,7 +80,6 @@
 MODULE_AUTHOR("Hewlett-Packard Company");
 MODULE_DESCRIPTION("Driver for HP Smart Array Controller version " \
 	HPSA_DRIVER_VERSION);
-MODULE_SUPPORTED_DEVICE("HP Smart Array Controllers");
 MODULE_VERSION(HPSA_DRIVER_VERSION);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("cciss");
diff --git a/drivers/scsi/pcmcia/nsp_cs.c b/drivers/scsi/pcmcia/nsp_cs.c
index 5d5f50d6a02d..ac89002646a3 100644
--- a/drivers/scsi/pcmcia/nsp_cs.c
+++ b/drivers/scsi/pcmcia/nsp_cs.c
@@ -55,7 +55,6 @@
 
 MODULE_AUTHOR("YOKOTA Hiroshi <yokota@netlab.is.tsukuba.ac.jp>");
 MODULE_DESCRIPTION("WorkBit NinjaSCSI-3 / NinjaSCSI-32Bi(16bit) PCMCIA SCSI host adapter module");
-MODULE_SUPPORTED_DEVICE("sd,sr,sg,st");
 MODULE_LICENSE("GPL");
 
 #include "nsp_io.h"
diff --git a/drivers/scsi/smartpqi/smartpqi_init.c b/drivers/scsi/smartpqi/smartpqi_init.c
index c53f456fbd09..a1dacb6e993e 100644
--- a/drivers/scsi/smartpqi/smartpqi_init.c
+++ b/drivers/scsi/smartpqi/smartpqi_init.c
@@ -48,7 +48,6 @@
 MODULE_AUTHOR("Microsemi");
 MODULE_DESCRIPTION("Driver for Microsemi Smart Family Controller version "
 	DRIVER_VERSION);
-MODULE_SUPPORTED_DEVICE("Microsemi Smart Family Controllers");
 MODULE_VERSION(DRIVER_VERSION);
 MODULE_LICENSE("GPL");
 
diff --git a/drivers/sh/maple/maple.c b/drivers/sh/maple/maple.c
index e5d7fb81ad66..bd0fbcdbdefe 100644
--- a/drivers/sh/maple/maple.c
+++ b/drivers/sh/maple/maple.c
@@ -30,7 +30,6 @@
 MODULE_AUTHOR("Adrian McMenamin <adrian@mcmen.demon.co.uk>");
 MODULE_DESCRIPTION("Maple bus driver for Dreamcast");
 MODULE_LICENSE("GPL v2");
-MODULE_SUPPORTED_DEVICE("{{SEGA, Dreamcast/Maple}}");
 
 static void maple_dma_handler(struct work_struct *work);
 static void maple_vblank_handler(struct work_struct *work);
diff --git a/drivers/staging/comedi/drivers/vmk80xx.c b/drivers/staging/comedi/drivers/vmk80xx.c
index 7956abcbae22..9f920819cd74 100644
--- a/drivers/staging/comedi/drivers/vmk80xx.c
+++ b/drivers/staging/comedi/drivers/vmk80xx.c
@@ -877,5 +877,4 @@ module_comedi_usb_driver(vmk80xx_driver, vmk80xx_usb_driver);
 
 MODULE_AUTHOR("Manuel Gebele <forensixs@gmx.de>");
 MODULE_DESCRIPTION("Velleman USB Board Low-Level Driver");
-MODULE_SUPPORTED_DEVICE("K8055/K8061 aka VM110/VM140");
 MODULE_LICENSE("GPL");
diff --git a/drivers/tee/optee/core.c b/drivers/tee/optee/core.c
index cf4718c6d35d..319a1e701163 100644
--- a/drivers/tee/optee/core.c
+++ b/drivers/tee/optee/core.c
@@ -747,7 +747,6 @@ module_platform_driver(optee_driver);
 
 MODULE_AUTHOR("Linaro");
 MODULE_DESCRIPTION("OP-TEE driver");
-MODULE_SUPPORTED_DEVICE("");
 MODULE_VERSION("1.0");
 MODULE_LICENSE("GPL v2");
 MODULE_ALIAS("platform:optee");
diff --git a/drivers/tty/serial/icom.c b/drivers/tty/serial/icom.c
index 9a872750581c..94af7a5ea497 100644
--- a/drivers/tty/serial/icom.c
+++ b/drivers/tty/serial/icom.c
@@ -1639,8 +1639,6 @@ module_exit(icom_exit);
 
 MODULE_AUTHOR("Michael Anderson <mjanders@us.ibm.com>");
 MODULE_DESCRIPTION("IBM iSeries Serial IOA driver");
-MODULE_SUPPORTED_DEVICE
-    ("IBM iSeries 2745, 2771, 2772, 2742, 2793 and 2805 Communications adapters");
 MODULE_LICENSE("GPL");
 MODULE_FIRMWARE("icom_call_setup.bin");
 MODULE_FIRMWARE("icom_res_dce.bin");
diff --git a/drivers/tty/serial/jsm/jsm_driver.c b/drivers/tty/serial/jsm/jsm_driver.c
index cd30da0ef083..0ea799bf8dbb 100644
--- a/drivers/tty/serial/jsm/jsm_driver.c
+++ b/drivers/tty/serial/jsm/jsm_driver.c
@@ -19,7 +19,6 @@
 MODULE_AUTHOR("Digi International, https://www.digi.com");
 MODULE_DESCRIPTION("Driver for the Digi International Neo and Classic PCI based product line");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("jsm");
 
 #define JSM_DRIVER_NAME "jsm"
 #define NR_PORTS	32
diff --git a/drivers/usb/misc/ldusb.c b/drivers/usb/misc/ldusb.c
index 670e4d91e9ca..dcc88df72df4 100644
--- a/drivers/usb/misc/ldusb.c
+++ b/drivers/usb/misc/ldusb.c
@@ -117,7 +117,6 @@ MODULE_DEVICE_TABLE(usb, ld_usb_table);
 MODULE_AUTHOR("Michael Hund <mhund@ld-didactic.de>");
 MODULE_DESCRIPTION("LD USB Driver");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("LD USB Devices");
 
 /* All interrupt in transfers are collected in a ring buffer to
  * avoid racing conditions and get better performance of the driver.
diff --git a/drivers/watchdog/cpu5wdt.c b/drivers/watchdog/cpu5wdt.c
index 9867a3a936df..688b112e712b 100644
--- a/drivers/watchdog/cpu5wdt.c
+++ b/drivers/watchdog/cpu5wdt.c
@@ -273,7 +273,6 @@ module_exit(cpu5wdt_exit_module);
 
 MODULE_AUTHOR("Heiko Ronsdorf <hero@ihg.uni-duisburg.de>");
 MODULE_DESCRIPTION("sma cpu5 watchdog driver");
-MODULE_SUPPORTED_DEVICE("sma cpu5 watchdog");
 MODULE_LICENSE("GPL");
 
 module_param_hw(port, int, ioport, 0);
diff --git a/drivers/watchdog/cpwd.c b/drivers/watchdog/cpwd.c
index 808eeb4779e4..1eafe0b4d71c 100644
--- a/drivers/watchdog/cpwd.c
+++ b/drivers/watchdog/cpwd.c
@@ -172,7 +172,6 @@ MODULE_PARM_DESC(wd2_timeout, "Default watchdog2 timeout in 1/10secs");
 MODULE_AUTHOR("Eric Brower <ebrower@usa.net>");
 MODULE_DESCRIPTION("Hardware watchdog driver for Sun Microsystems CP1400/1500");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("watchdog");
 
 static void cpwd_writew(u16 val, void __iomem *addr)
 {
diff --git a/drivers/watchdog/riowd.c b/drivers/watchdog/riowd.c
index 7008596a575f..747e346ed06c 100644
--- a/drivers/watchdog/riowd.c
+++ b/drivers/watchdog/riowd.c
@@ -46,7 +46,6 @@
 
 MODULE_AUTHOR("David S. Miller <davem@davemloft.net>");
 MODULE_DESCRIPTION("Hardware watchdog driver for Sun RIO");
-MODULE_SUPPORTED_DEVICE("watchdog");
 MODULE_LICENSE("GPL");
 
 #define DRIVER_NAME	"riowd"
diff --git a/include/linux/module.h b/include/linux/module.h
index 59f094fa6f74..da4b6fbe8ebe 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -30,9 +30,6 @@
 #include <linux/percpu.h>
 #include <asm/module.h>
 
-/* Not Yet Implemented */
-#define MODULE_SUPPORTED_DEVICE(name)
-
 #define MODULE_NAME_LEN MAX_PARAM_PREFIX_LEN
 
 struct modversion_info {
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index e48f7ac8a854..3ddd66e4c29e 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -702,7 +702,6 @@ MODULE_LICENSE("GPL");
 
 MODULE_AUTHOR(BATADV_DRIVER_AUTHOR);
 MODULE_DESCRIPTION(BATADV_DRIVER_DESC);
-MODULE_SUPPORTED_DEVICE(BATADV_DRIVER_DEVICE);
 MODULE_VERSION(BATADV_SOURCE_VERSION);
 MODULE_ALIAS_RTNL_LINK("batadv");
 MODULE_ALIAS_GENL_FAMILY(BATADV_NL_NAME);
diff --git a/sound/drivers/aloop.c b/sound/drivers/aloop.c
index 8a24e5ae7cef..52637180af33 100644
--- a/sound/drivers/aloop.c
+++ b/sound/drivers/aloop.c
@@ -33,7 +33,6 @@
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_DESCRIPTION("A loopback soundcard");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{ALSA,Loopback soundcard}}");
 
 #define MAX_PCM_SUBSTREAMS	8
 
diff --git a/sound/drivers/dummy.c b/sound/drivers/dummy.c
index 316c9afadefe..01a3eab50d7b 100644
--- a/sound/drivers/dummy.c
+++ b/sound/drivers/dummy.c
@@ -25,7 +25,6 @@
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_DESCRIPTION("Dummy soundcard (/dev/null)");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{ALSA,Dummy soundcard}}");
 
 #define MAX_PCM_DEVICES		4
 #define MAX_PCM_SUBSTREAMS	128
diff --git a/sound/drivers/mtpav.c b/sound/drivers/mtpav.c
index ce5fd17bd720..df4b7f9cd50f 100644
--- a/sound/drivers/mtpav.c
+++ b/sound/drivers/mtpav.c
@@ -53,7 +53,6 @@
 MODULE_AUTHOR("Michael T. Mayers");
 MODULE_DESCRIPTION("MOTU MidiTimePiece AV multiport MIDI");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{MOTU,MidiTimePiece AV multiport MIDI}}");
 
 // io resources
 #define MTPAV_IOBASE		0x378
diff --git a/sound/drivers/mts64.c b/sound/drivers/mts64.c
index 9c708b693cb3..322d530ab07b 100644
--- a/sound/drivers/mts64.c
+++ b/sound/drivers/mts64.c
@@ -37,7 +37,6 @@ MODULE_PARM_DESC(enable, "Enable " CARD_NAME " soundcard.");
 MODULE_AUTHOR("Matthias Koenig <mk@phasorlab.de>");
 MODULE_DESCRIPTION("ESI Miditerminal 4140");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{ESI,Miditerminal 4140}}");
 
 /*********************************************************************
  * Chip specific
diff --git a/sound/drivers/pcsp/pcsp.c b/sound/drivers/pcsp/pcsp.c
index fd79e57c85ca..7689fa2f9531 100644
--- a/sound/drivers/pcsp/pcsp.c
+++ b/sound/drivers/pcsp/pcsp.c
@@ -22,7 +22,6 @@
 MODULE_AUTHOR("Stas Sergeev <stsp@users.sourceforge.net>");
 MODULE_DESCRIPTION("PC-Speaker driver");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{PC-Speaker, pcsp}}");
 MODULE_ALIAS("platform:pcspkr");
 
 static int index = SNDRV_DEFAULT_IDX1;	/* Index 0-MAX */
diff --git a/sound/drivers/portman2x4.c b/sound/drivers/portman2x4.c
index c876cf9b5005..2f4514ed47c5 100644
--- a/sound/drivers/portman2x4.c
+++ b/sound/drivers/portman2x4.c
@@ -57,7 +57,6 @@ MODULE_PARM_DESC(enable, "Enable " CARD_NAME " soundcard.");
 MODULE_AUTHOR("Levent Guendogdu, Tobias Gehrig, Matthias Koenig");
 MODULE_DESCRIPTION("Midiman Portman2x4");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Midiman,Portman2x4}}");
 
 /*********************************************************************
  * Chip specific
diff --git a/sound/drivers/serial-u16550.c b/sound/drivers/serial-u16550.c
index 3947f084dd6b..6d5d1ca59ecf 100644
--- a/sound/drivers/serial-u16550.c
+++ b/sound/drivers/serial-u16550.c
@@ -34,7 +34,6 @@
 
 MODULE_DESCRIPTION("MIDI serial u16550");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{ALSA, MIDI serial u16550}}");
 
 #define SNDRV_SERIAL_SOUNDCANVAS 0 /* Roland Soundcanvas; F5 NN selects part */
 #define SNDRV_SERIAL_MS124T 1      /* Midiator MS-124T */
diff --git a/sound/drivers/virmidi.c b/sound/drivers/virmidi.c
index f1fb68b15498..4206d93ab47e 100644
--- a/sound/drivers/virmidi.c
+++ b/sound/drivers/virmidi.c
@@ -43,7 +43,6 @@
 MODULE_AUTHOR("Takashi Iwai <tiwai@suse.de>");
 MODULE_DESCRIPTION("Dummy soundcard for virtual rawmidi devices");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{ALSA,Virtual rawmidi device}}");
 
 #define MAX_MIDI_DEVICES	4
 
diff --git a/sound/isa/ad1816a/ad1816a.c b/sound/isa/ad1816a/ad1816a.c
index ca18fe3ff8a5..f11af983b3b6 100644
--- a/sound/isa/ad1816a/ad1816a.c
+++ b/sound/isa/ad1816a/ad1816a.c
@@ -22,13 +22,6 @@
 MODULE_AUTHOR("Massimo Piccioni <dafastidio@libero.it>");
 MODULE_DESCRIPTION("AD1816A, AD1815");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Highscreen,Sound-Boostar 16 3D},"
-		"{Analog Devices,AD1815},"
-		"{Analog Devices,AD1816A},"
-		"{TerraTec,Base 64},"
-		"{TerraTec,AudioSystem EWS64S},"
-		"{Aztech/Newcom SC-16 3D},"
-		"{Shark Predator ISA}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;	/* Index 1-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;	/* ID for this card */
diff --git a/sound/isa/ad1848/ad1848.c b/sound/isa/ad1848/ad1848.c
index 6f221eed44e2..edafb49797e7 100644
--- a/sound/isa/ad1848/ad1848.c
+++ b/sound/isa/ad1848/ad1848.c
@@ -22,9 +22,6 @@
 MODULE_DESCRIPTION(CRD_NAME);
 MODULE_AUTHOR("Tugrul Galatali <galatalt@stuy.edu>, Jaroslav Kysela <perex@perex.cz>");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Analog Devices,AD1848},"
-	        "{Analog Devices,AD1847},"
-		"{Crystal Semiconductors,CS4248}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;	/* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;	/* ID for this card */
diff --git a/sound/isa/als100.c b/sound/isa/als100.c
index 1085f5b01318..bacb7a1b930c 100644
--- a/sound/isa/als100.c
+++ b/sound/isa/als100.c
@@ -26,17 +26,6 @@
 #define PFX "als100: "
 
 MODULE_DESCRIPTION("Avance Logic ALS007/ALS1X0");
-MODULE_SUPPORTED_DEVICE("{{Diamond Technologies DT-019X},"
-		"{Avance Logic ALS-007}}"
-		"{{Avance Logic,ALS100 - PRO16PNP},"
-	        "{Avance Logic,ALS110},"
-	        "{Avance Logic,ALS120},"
-	        "{Avance Logic,ALS200},"
-	        "{3D Melody,MF1000},"
-	        "{Digimate,3D Sound},"
-	        "{Avance Logic,ALS120},"
-	        "{RTL,RTL3000}}");
-
 MODULE_AUTHOR("Massimo Piccioni <dafastidio@libero.it>");
 MODULE_LICENSE("GPL");
 
diff --git a/sound/isa/azt2320.c b/sound/isa/azt2320.c
index 4ed52094fc8d..867e9ae8f65a 100644
--- a/sound/isa/azt2320.c
+++ b/sound/isa/azt2320.c
@@ -35,11 +35,6 @@
 MODULE_AUTHOR("Massimo Piccioni <dafastidio@libero.it>");
 MODULE_DESCRIPTION("Aztech Systems AZT2320");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Aztech Systems,PRO16V},"
-		"{Aztech Systems,AZT2320},"
-		"{Aztech Systems,AZT3300},"
-		"{Aztech Systems,AZT2320},"
-		"{Aztech Systems,AZT3000}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;	/* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;	/* ID for this card */
diff --git a/sound/isa/cmi8330.c b/sound/isa/cmi8330.c
index 19e258527d69..bc112df10fc5 100644
--- a/sound/isa/cmi8330.c
+++ b/sound/isa/cmi8330.c
@@ -51,7 +51,6 @@
 MODULE_AUTHOR("George Talusan <gstalusan@uwaterloo.ca>");
 MODULE_DESCRIPTION("C-Media CMI8330/CMI8329");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{C-Media,CMI8330,isapnp:{CMI0001,@@@0001,@X@0001}}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;
diff --git a/sound/isa/cs423x/cs4231.c b/sound/isa/cs423x/cs4231.c
index c56cbc072918..ec054b929214 100644
--- a/sound/isa/cs423x/cs4231.c
+++ b/sound/isa/cs423x/cs4231.c
@@ -23,7 +23,6 @@
 MODULE_DESCRIPTION(CRD_NAME);
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Crystal Semiconductors,CS4231}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;	/* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;	/* ID for this card */
diff --git a/sound/isa/cs423x/cs4236.c b/sound/isa/cs423x/cs4236.c
index 63fb0cb754d0..186d7d4db45e 100644
--- a/sound/isa/cs423x/cs4236.c
+++ b/sound/isa/cs423x/cs4236.c
@@ -18,40 +18,6 @@
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("Cirrus Logic CS4232-9");
-MODULE_SUPPORTED_DEVICE("{{Turtle Beach,TBS-2000},"
-		"{Turtle Beach,Tropez Plus},"
-		"{SIC CrystalWave 32},"
-		"{Hewlett Packard,Omnibook 5500},"
-		"{TerraTec,Maestro 32/96},"
-		"{Philips,PCA70PS}},"
-		"{{Crystal Semiconductors,CS4235},"
-		"{Crystal Semiconductors,CS4236},"
-		"{Crystal Semiconductors,CS4237},"
-		"{Crystal Semiconductors,CS4238},"
-		"{Crystal Semiconductors,CS4239},"
-		"{Acer,AW37},"
-		"{Acer,AW35/Pro},"
-		"{Crystal,3D},"
-		"{Crystal Computer,TidalWave128},"
-		"{Dell,Optiplex GX1},"
-		"{Dell,Workstation 400 sound},"
-		"{EliteGroup,P5TX-LA sound},"
-		"{Gallant,SC-70P},"
-		"{Gateway,E1000 Onboard CS4236B},"
-		"{Genius,Sound Maker 3DJ},"
-		"{Hewlett Packard,HP6330 sound},"
-		"{IBM,PC 300PL sound},"
-		"{IBM,Aptiva 2137 E24},"
-		"{IBM,IntelliStation M Pro},"
-		"{Intel,Marlin Spike Mobo CS4235},"
-		"{Intel PR440FX Onboard},"
-		"{Guillemot,MaxiSound 16 PnP},"
-		"{NewClear,3D},"
-		"{TerraTec,AudioSystem EWS64L/XL},"
-		"{Typhoon Soundsystem,CS4236B},"
-		"{Turtle Beach,Malibu},"
-		"{Unknown,Digital PC 5000 Onboard}}");
-
 MODULE_ALIAS("snd_cs4232");
 
 #define IDENT "CS4232+"
diff --git a/sound/isa/es1688/es1688.c b/sound/isa/es1688/es1688.c
index 4a1f61f1a331..750d4995634f 100644
--- a/sound/isa/es1688/es1688.c
+++ b/sound/isa/es1688/es1688.c
@@ -26,11 +26,6 @@
 MODULE_DESCRIPTION(CRD_NAME);
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{ESS,ES688 PnP AudioDrive,pnp:ESS0100},"
-	        "{ESS,ES1688 PnP AudioDrive,pnp:ESS0102},"
-	        "{ESS,ES688 AudioDrive,pnp:ESS6881},"
-	        "{ESS,ES1688 AudioDrive,pnp:ESS1681}}");
-
 MODULE_ALIAS("snd_es968");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;	/* Index 0-MAX */
diff --git a/sound/isa/es18xx.c b/sound/isa/es18xx.c
index 9beef8079177..375a4a6a4717 100644
--- a/sound/isa/es18xx.c
+++ b/sound/isa/es18xx.c
@@ -1929,17 +1929,9 @@ static int snd_es18xx_mixer(struct snd_card *card)
 
 /* Card level */
 
-MODULE_AUTHOR("Christian Fischbach <fishbach@pool.informatik.rwth-aachen.de>, Abramo Bagnara <abramo@alsa-project.org>");  
+MODULE_AUTHOR("Christian Fischbach <fishbach@pool.informatik.rwth-aachen.de>, Abramo Bagnara <abramo@alsa-project.org>");
 MODULE_DESCRIPTION("ESS ES18xx AudioDrive");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{ESS,ES1868 PnP AudioDrive},"
-		"{ESS,ES1869 PnP AudioDrive},"
-		"{ESS,ES1878 PnP AudioDrive},"
-		"{ESS,ES1879 PnP AudioDrive},"
-		"{ESS,ES1887 PnP AudioDrive},"
-		"{ESS,ES1888 PnP AudioDrive},"
-		"{ESS,ES1887 AudioDrive},"
-		"{ESS,ES1888 AudioDrive}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;	/* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;	/* ID for this card */
diff --git a/sound/isa/gus/gusclassic.c b/sound/isa/gus/gusclassic.c
index 015f88a11352..0fba5d8fe84f 100644
--- a/sound/isa/gus/gusclassic.c
+++ b/sound/isa/gus/gusclassic.c
@@ -23,7 +23,6 @@
 MODULE_DESCRIPTION(CRD_NAME);
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Gravis,UltraSound Classic}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;	/* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;	/* ID for this card */
diff --git a/sound/isa/gus/gusextreme.c b/sound/isa/gus/gusextreme.c
index c9f31b4fb887..da2b2ca6b721 100644
--- a/sound/isa/gus/gusextreme.c
+++ b/sound/isa/gus/gusextreme.c
@@ -27,7 +27,6 @@
 MODULE_DESCRIPTION(CRD_NAME);
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Gravis,UltraSound Extreme}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;	/* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;	/* ID for this card */
diff --git a/sound/isa/gus/gusmax.c b/sound/isa/gus/gusmax.c
index dc09fbd6f88d..24b945f1768d 100644
--- a/sound/isa/gus/gusmax.c
+++ b/sound/isa/gus/gusmax.c
@@ -21,7 +21,6 @@
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_DESCRIPTION("Gravis UltraSound MAX");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Gravis,UltraSound MAX}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;	/* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;	/* ID for this card */
diff --git a/sound/isa/gus/interwave.c b/sound/isa/gus/interwave.c
index e4d412e72b75..99581fba4ca8 100644
--- a/sound/isa/gus/interwave.c
+++ b/sound/isa/gus/interwave.c
@@ -28,14 +28,8 @@ MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_LICENSE("GPL");
 #ifndef SNDRV_STB
 MODULE_DESCRIPTION("AMD InterWave");
-MODULE_SUPPORTED_DEVICE("{{Gravis,UltraSound Plug & Play},"
-		"{STB,SoundRage32},"
-		"{MED,MED3210},"
-		"{Dynasonix,Dynasonix Pro},"
-		"{Panasonic,PCA761AW}}");
 #else
 MODULE_DESCRIPTION("AMD InterWave STB with TEA6330T");
-MODULE_SUPPORTED_DEVICE("{{AMD,InterWave STB with TEA6330T}}");
 #endif
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;	/* Index 0-MAX */
diff --git a/sound/isa/opl3sa2.c b/sound/isa/opl3sa2.c
index 7649a8a4128d..9bde11d1cfe8 100644
--- a/sound/isa/opl3sa2.c
+++ b/sound/isa/opl3sa2.c
@@ -22,11 +22,6 @@
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_DESCRIPTION("Yamaha OPL3SA2+");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Yamaha,YMF719E-S},"
-		"{Genius,Sound Maker 3DX},"
-		"{Yamaha,OPL3SA3},"
-		"{Intel,AL440LX sound},"
-	        "{NeoMagic,MagicWave 3DX}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;	/* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;	/* ID for this card */
diff --git a/sound/isa/opti9xx/miro.c b/sound/isa/opti9xx/miro.c
index 20933342f5eb..a510b201143c 100644
--- a/sound/isa/opti9xx/miro.c
+++ b/sound/isa/opti9xx/miro.c
@@ -33,9 +33,6 @@
 MODULE_AUTHOR("Martin Langer <martin-langer@gmx.de>");
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("Miro miroSOUND PCM1 pro, PCM12, PCM20 Radio");
-MODULE_SUPPORTED_DEVICE("{{Miro,miroSOUND PCM1 pro}, "
-			"{Miro,miroSOUND PCM12}, "
-			"{Miro,miroSOUND PCM20 Radio}}");
 
 static int index = SNDRV_DEFAULT_IDX1;		/* Index 0-MAX */
 static char *id = SNDRV_DEFAULT_STR1;		/* ID for this card */
diff --git a/sound/isa/opti9xx/opti92x-ad1848.c b/sound/isa/opti9xx/opti92x-ad1848.c
index 758f5b579138..08e61d90057b 100644
--- a/sound/isa/opti9xx/opti92x-ad1848.c
+++ b/sound/isa/opti9xx/opti92x-ad1848.c
@@ -36,17 +36,11 @@ MODULE_AUTHOR("Massimo Piccioni <dafastidio@libero.it>");
 MODULE_LICENSE("GPL");
 #ifdef OPTi93X
 MODULE_DESCRIPTION("OPTi93X");
-MODULE_SUPPORTED_DEVICE("{{OPTi,82C931/3}}");
 #else	/* OPTi93X */
 #ifdef CS4231
 MODULE_DESCRIPTION("OPTi92X - CS4231");
-MODULE_SUPPORTED_DEVICE("{{OPTi,82C924 (CS4231)},"
-		"{OPTi,82C925 (CS4231)}}");
 #else	/* CS4231 */
 MODULE_DESCRIPTION("OPTi92X - AD1848");
-MODULE_SUPPORTED_DEVICE("{{OPTi,82C924 (AD1848)},"
-		"{OPTi,82C925 (AD1848)},"
-	        "{OAK,Mozart}}");
 #endif	/* CS4231 */
 #endif	/* OPTi93X */
 
diff --git a/sound/isa/sb/jazz16.c b/sound/isa/sb/jazz16.c
index 0e2e0ab3b9e4..7ba5dd1ec810 100644
--- a/sound/isa/sb/jazz16.c
+++ b/sound/isa/sb/jazz16.c
@@ -28,9 +28,6 @@
 #define PFX "jazz16: "
 
 MODULE_DESCRIPTION("Media Vision Jazz16");
-MODULE_SUPPORTED_DEVICE("{{Media Vision ??? },"
-		"{RTL,RTL3000}}");
-
 MODULE_AUTHOR("Krzysztof Helt <krzysztof.h1@wp.pl>");
 MODULE_LICENSE("GPL");
 
diff --git a/sound/isa/sb/sb16.c b/sound/isa/sb/sb16.c
index db284b7b88a7..63ef960abd25 100644
--- a/sound/isa/sb/sb16.c
+++ b/sound/isa/sb/sb16.c
@@ -31,16 +31,8 @@ MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_LICENSE("GPL");
 #ifndef SNDRV_SBAWE
 MODULE_DESCRIPTION("Sound Blaster 16");
-MODULE_SUPPORTED_DEVICE("{{Creative Labs,SB 16},"
-		"{Creative Labs,SB Vibra16S},"
-		"{Creative Labs,SB Vibra16C},"
-		"{Creative Labs,SB Vibra16CL},"
-		"{Creative Labs,SB Vibra16X}}");
 #else
 MODULE_DESCRIPTION("Sound Blaster AWE");
-MODULE_SUPPORTED_DEVICE("{{Creative Labs,SB AWE 32},"
-		"{Creative Labs,SB AWE 64},"
-		"{Creative Labs,SB AWE 64 Gold}}");
 #endif
 
 #if 0
diff --git a/sound/isa/sb/sb8.c b/sound/isa/sb/sb8.c
index 8e3e67b9a341..6c9d534ce8b6 100644
--- a/sound/isa/sb/sb8.c
+++ b/sound/isa/sb/sb8.c
@@ -17,7 +17,6 @@
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_DESCRIPTION("Sound Blaster 1.0/2.0/Pro");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Creative Labs,SB 1.0/SB 2.0/SB Pro}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;	/* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;	/* ID for this card */
diff --git a/sound/isa/sc6000.c b/sound/isa/sc6000.c
index def137579717..3462663050bb 100644
--- a/sound/isa/sc6000.c
+++ b/sound/isa/sc6000.c
@@ -29,9 +29,6 @@
 MODULE_AUTHOR("Krzysztof Helt");
 MODULE_DESCRIPTION("Gallant SC-6000");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Gallant, SC-6000},"
-			"{AudioExcel, Audio Excel DSP 16},"
-			"{Zoltrix, AV302}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;	/* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;	/* ID for this card */
diff --git a/sound/isa/wavefront/wavefront.c b/sound/isa/wavefront/wavefront.c
index b750a4fd40de..a4437971df2f 100644
--- a/sound/isa/wavefront/wavefront.c
+++ b/sound/isa/wavefront/wavefront.c
@@ -21,7 +21,6 @@
 MODULE_AUTHOR("Paul Barton-Davis <pbd@op.net>");
 MODULE_DESCRIPTION("Turtle Beach Wavefront");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Turtle Beach,Maui/Tropez/Tropez+}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;	    /* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;	    /* ID for this card */
diff --git a/sound/mips/sgio2audio.c b/sound/mips/sgio2audio.c
index 5bf1ea150f26..989f656e2de7 100644
--- a/sound/mips/sgio2audio.c
+++ b/sound/mips/sgio2audio.c
@@ -32,7 +32,6 @@
 MODULE_AUTHOR("Vivien Chappelier <vivien.chappelier@linux-mips.org>");
 MODULE_DESCRIPTION("SGI O2 Audio");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Silicon Graphics, O2 Audio}}");
 
 static int index = SNDRV_DEFAULT_IDX1;  /* Index 0-MAX */
 static char *id = SNDRV_DEFAULT_STR1;   /* ID for this card */
diff --git a/sound/pci/ad1889.c b/sound/pci/ad1889.c
index 5d835d2af054..4520022801d9 100644
--- a/sound/pci/ad1889.c
+++ b/sound/pci/ad1889.c
@@ -43,7 +43,6 @@
 MODULE_AUTHOR("Kyle McMartin <kyle@parisc-linux.org>, Thibaut Varene <t-bone@parisc-linux.org>");
 MODULE_DESCRIPTION("Analog Devices AD1889 ALSA sound driver");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Analog Devices,AD1889}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;
 module_param_array(index, int, NULL, 0444);
diff --git a/sound/pci/ali5451/ali5451.c b/sound/pci/ali5451/ali5451.c
index 51f24796f03f..0d66b92466d5 100644
--- a/sound/pci/ali5451/ali5451.c
+++ b/sound/pci/ali5451/ali5451.c
@@ -29,7 +29,6 @@
 MODULE_AUTHOR("Matt Wu <Matt_Wu@acersoftech.com.cn>");
 MODULE_DESCRIPTION("ALI M5451");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{ALI,M5451,pci},{ALI,M5451}}");
 
 static int index = SNDRV_DEFAULT_IDX1;	/* Index */
 static char *id = SNDRV_DEFAULT_STR1;	/* ID for this card */
diff --git a/sound/pci/als300.c b/sound/pci/als300.c
index 1dc8c4ed0592..bd4fd09e982b 100644
--- a/sound/pci/als300.c
+++ b/sound/pci/als300.c
@@ -86,7 +86,6 @@ enum {DEVICE_ALS300, DEVICE_ALS300_PLUS};
 MODULE_AUTHOR("Ash Willis <ashwillis@programmer.net>");
 MODULE_DESCRIPTION("Avance Logic ALS300");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Avance Logic,ALS300},{Avance Logic,ALS300+}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;
diff --git a/sound/pci/als4000.c b/sound/pci/als4000.c
index 2edc7455285a..139ac2a3a0ef 100644
--- a/sound/pci/als4000.c
+++ b/sound/pci/als4000.c
@@ -68,7 +68,6 @@
 MODULE_AUTHOR("Bart Hartgers <bart@etpmod.phys.tue.nl>, Andreas Mohr");
 MODULE_DESCRIPTION("Avance Logic ALS4000");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Avance Logic,ALS4000}}");
 
 #if IS_REACHABLE(CONFIG_GAMEPORT)
 #define SUPPORT_JOYSTICK 1
diff --git a/sound/pci/atiixp.c b/sound/pci/atiixp.c
index a25d75455802..579425ccbb6a 100644
--- a/sound/pci/atiixp.c
+++ b/sound/pci/atiixp.c
@@ -23,7 +23,6 @@
 MODULE_AUTHOR("Takashi Iwai <tiwai@suse.de>");
 MODULE_DESCRIPTION("ATI IXP AC97 controller");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{ATI,IXP150/200/250/300/400/600}}");
 
 static int index = SNDRV_DEFAULT_IDX1;	/* Index 0-MAX */
 static char *id = SNDRV_DEFAULT_STR1;	/* ID for this card */
diff --git a/sound/pci/atiixp_modem.c b/sound/pci/atiixp_modem.c
index ae88217d685a..45e75afec7a0 100644
--- a/sound/pci/atiixp_modem.c
+++ b/sound/pci/atiixp_modem.c
@@ -23,7 +23,6 @@
 MODULE_AUTHOR("Takashi Iwai <tiwai@suse.de>");
 MODULE_DESCRIPTION("ATI IXP MC97 controller");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{ATI,IXP150/200/250}}");
 
 static int index = -2; /* Exclude the first card */
 static char *id = SNDRV_DEFAULT_STR1;	/* ID for this card */
diff --git a/sound/pci/au88x0/au88x0.c b/sound/pci/au88x0/au88x0.c
index 5dd98e6ff34b..1b37b7225b1d 100644
--- a/sound/pci/au88x0/au88x0.c
+++ b/sound/pci/au88x0/au88x0.c
@@ -41,8 +41,6 @@ MODULE_PARM_DESC(pcifix, "Enable VIA-workaround for " CARD_NAME " soundcard.");
 
 MODULE_DESCRIPTION("Aureal vortex");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Aureal Semiconductor Inc., Aureal Vortex Sound Processor}}");
-
 MODULE_DEVICE_TABLE(pci, snd_vortex_ids);
 
 static void vortex_fix_latency(struct pci_dev *vortex)
diff --git a/sound/pci/azt3328.c b/sound/pci/azt3328.c
index 2ac594dcf21c..51dcf1bc4c0c 100644
--- a/sound/pci/azt3328.c
+++ b/sound/pci/azt3328.c
@@ -196,7 +196,6 @@
 MODULE_AUTHOR("Andreas Mohr <andi AT lisas.de>");
 MODULE_DESCRIPTION("Aztech AZF3328 (PCI168)");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Aztech,AZF3328}}");
 
 #if IS_REACHABLE(CONFIG_GAMEPORT)
 #define SUPPORT_GAMEPORT 1
diff --git a/sound/pci/bt87x.c b/sound/pci/bt87x.c
index cf9f8d80a0b6..91512b345d19 100644
--- a/sound/pci/bt87x.c
+++ b/sound/pci/bt87x.c
@@ -23,8 +23,6 @@
 MODULE_AUTHOR("Clemens Ladisch <clemens@ladisch.de>");
 MODULE_DESCRIPTION("Brooktree Bt87x audio driver");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Brooktree,Bt878},"
-		"{Brooktree,Bt879}}");
 
 static int index[SNDRV_CARDS] = {[0 ... (SNDRV_CARDS - 1)] = -2}; /* Exclude the first card */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;	/* ID for this card */
diff --git a/sound/pci/ca0106/ca0106_main.c b/sound/pci/ca0106/ca0106_main.c
index ee20f9a1aae9..bee4710916c4 100644
--- a/sound/pci/ca0106/ca0106_main.c
+++ b/sound/pci/ca0106/ca0106_main.c
@@ -137,7 +137,6 @@
 MODULE_AUTHOR("James Courtier-Dutton <James@superbug.demon.co.uk>");
 MODULE_DESCRIPTION("CA0106");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Creative,SB CA0106 chip}}");
 
 // module parameters (see "Module Parameters")
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;
diff --git a/sound/pci/cmipci.c b/sound/pci/cmipci.c
index 7363d61eaec2..598446348da6 100644
--- a/sound/pci/cmipci.c
+++ b/sound/pci/cmipci.c
@@ -30,10 +30,6 @@
 MODULE_AUTHOR("Takashi Iwai <tiwai@suse.de>");
 MODULE_DESCRIPTION("C-Media CMI8x38 PCI");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{C-Media,CMI8738},"
-		"{C-Media,CMI8738B},"
-		"{C-Media,CMI8338A},"
-		"{C-Media,CMI8338B}}");
 
 #if IS_REACHABLE(CONFIG_GAMEPORT)
 #define SUPPORT_JOYSTICK 1
diff --git a/sound/pci/cs4281.c b/sound/pci/cs4281.c
index 94d2a6a466a8..bf3bb70ffaf9 100644
--- a/sound/pci/cs4281.c
+++ b/sound/pci/cs4281.c
@@ -25,7 +25,6 @@
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_DESCRIPTION("Cirrus Logic CS4281");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Cirrus Logic,CS4281}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;	/* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;	/* ID for this card */
diff --git a/sound/pci/cs46xx/cs46xx.c b/sound/pci/cs46xx/cs46xx.c
index a6e0a4439332..1db7b4112840 100644
--- a/sound/pci/cs46xx/cs46xx.c
+++ b/sound/pci/cs46xx/cs46xx.c
@@ -21,13 +21,6 @@
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_DESCRIPTION("Cirrus Logic Sound Fusion CS46XX");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Cirrus Logic,Sound Fusion (CS4280)},"
-		"{Cirrus Logic,Sound Fusion (CS4610)},"
-		"{Cirrus Logic,Sound Fusion (CS4612)},"
-		"{Cirrus Logic,Sound Fusion (CS4615)},"
-		"{Cirrus Logic,Sound Fusion (CS4622)},"
-		"{Cirrus Logic,Sound Fusion (CS4624)},"
-		"{Cirrus Logic,Sound Fusion (CS4630)}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;	/* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;	/* ID for this card */
diff --git a/sound/pci/cs5535audio/cs5535audio.c b/sound/pci/cs5535audio/cs5535audio.c
index 359bc6af8670..9b716b56d739 100644
--- a/sound/pci/cs5535audio/cs5535audio.c
+++ b/sound/pci/cs5535audio/cs5535audio.c
@@ -393,4 +393,3 @@ module_pci_driver(cs5535audio_driver);
 MODULE_AUTHOR("Jaya Kumar");
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("CS5535 Audio");
-MODULE_SUPPORTED_DEVICE("CS5535 Audio");
diff --git a/sound/pci/ctxfi/xfi.c b/sound/pci/ctxfi/xfi.c
index 8c07c6463c24..713d36ea40cb 100644
--- a/sound/pci/ctxfi/xfi.c
+++ b/sound/pci/ctxfi/xfi.c
@@ -18,7 +18,6 @@
 MODULE_AUTHOR("Creative Technology Ltd");
 MODULE_DESCRIPTION("X-Fi driver version 1.03");
 MODULE_LICENSE("GPL v2");
-MODULE_SUPPORTED_DEVICE("{{Creative Labs, Sound Blaster X-Fi}");
 
 static unsigned int reference_rate = 48000;
 static unsigned int multiple = 2;
diff --git a/sound/pci/echoaudio/echoaudio.c b/sound/pci/echoaudio/echoaudio.c
index a20b2bb5c898..9bd67ac33657 100644
--- a/sound/pci/echoaudio/echoaudio.c
+++ b/sound/pci/echoaudio/echoaudio.c
@@ -10,7 +10,6 @@
 MODULE_AUTHOR("Giuliano Pochini <pochini@shiny.it>");
 MODULE_LICENSE("GPL v2");
 MODULE_DESCRIPTION("Echoaudio " ECHOCARD_NAME " soundcards driver");
-MODULE_SUPPORTED_DEVICE("{{Echoaudio," ECHOCARD_NAME "}}");
 MODULE_DEVICE_TABLE(pci, snd_echo_ids);
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;
diff --git a/sound/pci/emu10k1/emu10k1.c b/sound/pci/emu10k1/emu10k1.c
index 353934c88cbd..45833bc2a7e7 100644
--- a/sound/pci/emu10k1/emu10k1.c
+++ b/sound/pci/emu10k1/emu10k1.c
@@ -18,8 +18,6 @@
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_DESCRIPTION("EMU10K1");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Creative Labs,SB Live!/PCI512/E-mu APS},"
-	       "{Creative Labs,SB Audigy}}");
 
 #if IS_ENABLED(CONFIG_SND_SEQUENCER)
 #define ENABLE_SYNTH
diff --git a/sound/pci/emu10k1/emu10k1x.c b/sound/pci/emu10k1/emu10k1x.c
index 785ec0cf3933..d9a12cd01647 100644
--- a/sound/pci/emu10k1/emu10k1x.c
+++ b/sound/pci/emu10k1/emu10k1x.c
@@ -31,7 +31,6 @@
 MODULE_AUTHOR("Francisco Moraes <fmoraes@nc.rr.com>");
 MODULE_DESCRIPTION("EMU10K1X");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Dell Creative Labs,SB Live!}");
 
 // module parameters (see "Module Parameters")
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;
diff --git a/sound/pci/ens1370.c b/sound/pci/ens1370.c
index 93c4fd313311..3ccccdbc0029 100644
--- a/sound/pci/ens1370.c
+++ b/sound/pci/ens1370.c
@@ -52,17 +52,9 @@ MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>, Thomas Sailer <sailer@ife.ee.et
 MODULE_LICENSE("GPL");
 #ifdef CHIP1370
 MODULE_DESCRIPTION("Ensoniq AudioPCI ES1370");
-MODULE_SUPPORTED_DEVICE("{{Ensoniq,AudioPCI-97 ES1370},"
-	        "{Creative Labs,SB PCI64/128 (ES1370)}}");
 #endif
 #ifdef CHIP1371
 MODULE_DESCRIPTION("Ensoniq/Creative AudioPCI ES1371+");
-MODULE_SUPPORTED_DEVICE("{{Ensoniq,AudioPCI ES1371/73},"
-		"{Ensoniq,AudioPCI ES1373},"
-		"{Creative Labs,Ectiva EV1938},"
-		"{Creative Labs,SB PCI64/128 (ES1371/73)},"
-		"{Creative Labs,Vibra PCI128},"
-		"{Ectiva,EV1938}}");
 #endif
 
 #if IS_REACHABLE(CONFIG_GAMEPORT)
diff --git a/sound/pci/es1938.c b/sound/pci/es1938.c
index 3b5d68ce9dd5..afc66347d162 100644
--- a/sound/pci/es1938.c
+++ b/sound/pci/es1938.c
@@ -52,10 +52,6 @@
 MODULE_AUTHOR("Jaromir Koutek <miri@punknet.cz>");
 MODULE_DESCRIPTION("ESS Solo-1");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{ESS,ES1938},"
-                "{ESS,ES1946},"
-                "{ESS,ES1969},"
-		"{TerraTec,128i PCI}}");
 
 #if IS_REACHABLE(CONFIG_GAMEPORT)
 #define SUPPORT_JOYSTICK 1
diff --git a/sound/pci/es1968.c b/sound/pci/es1968.c
index 747fa69bb1c9..5fa1861236f5 100644
--- a/sound/pci/es1968.c
+++ b/sound/pci/es1968.c
@@ -107,10 +107,6 @@
 
 MODULE_DESCRIPTION("ESS Maestro");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{ESS,Maestro 2e},"
-		"{ESS,Maestro 2},"
-		"{ESS,Maestro 1},"
-		"{TerraTec,DMX}}");
 
 #if IS_REACHABLE(CONFIG_GAMEPORT)
 #define SUPPORT_JOYSTICK 1
diff --git a/sound/pci/fm801.c b/sound/pci/fm801.c
index c6ad6235a669..6279eb156e36 100644
--- a/sound/pci/fm801.c
+++ b/sound/pci/fm801.c
@@ -26,8 +26,6 @@
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_DESCRIPTION("ForteMedia FM801");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{ForteMedia,FM801},"
-		"{Genius,SoundMaker Live 5.1}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;	/* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;	/* ID for this card */
diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c
index 5eea130dcf0a..b59b0f323d4e 100644
--- a/sound/pci/hda/hda_intel.c
+++ b/sound/pci/hda/hda_intel.c
@@ -208,40 +208,6 @@ MODULE_PARM_DESC(snoop, "Enable/disable snooping");
 
 
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Intel, ICH6},"
-			 "{Intel, ICH6M},"
-			 "{Intel, ICH7},"
-			 "{Intel, ESB2},"
-			 "{Intel, ICH8},"
-			 "{Intel, ICH9},"
-			 "{Intel, ICH10},"
-			 "{Intel, PCH},"
-			 "{Intel, CPT},"
-			 "{Intel, PPT},"
-			 "{Intel, LPT},"
-			 "{Intel, LPT_LP},"
-			 "{Intel, WPT_LP},"
-			 "{Intel, SPT},"
-			 "{Intel, SPT_LP},"
-			 "{Intel, HPT},"
-			 "{Intel, PBG},"
-			 "{Intel, SCH},"
-			 "{ATI, SB450},"
-			 "{ATI, SB600},"
-			 "{ATI, RS600},"
-			 "{ATI, RS690},"
-			 "{ATI, RS780},"
-			 "{ATI, R600},"
-			 "{ATI, RV630},"
-			 "{ATI, RV610},"
-			 "{ATI, RV670},"
-			 "{ATI, RV635},"
-			 "{ATI, RV620},"
-			 "{ATI, RV770},"
-			 "{VIA, VT8251},"
-			 "{VIA, VT8237A},"
-			 "{SiS, SIS966},"
-			 "{ULI, M5461}}");
 MODULE_DESCRIPTION("Intel HDA driver");
 
 #if defined(CONFIG_PM) && defined(CONFIG_VGA_SWITCHEROO)
diff --git a/sound/pci/ice1712/ice1712.c b/sound/pci/ice1712/ice1712.c
index f814dbbec2a4..d54cd5143e9f 100644
--- a/sound/pci/ice1712/ice1712.c
+++ b/sound/pci/ice1712/ice1712.c
@@ -60,12 +60,6 @@
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_DESCRIPTION("ICEnsemble ICE1712 (Envy24)");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{"
-	       HOONTECH_DEVICE_DESC
-	       DELTA_DEVICE_DESC
-	       EWS_DEVICE_DESC
-	       "{ICEnsemble,Generic ICE1712},"
-	       "{ICEnsemble,Generic Envy24}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;	/* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;	/* ID for this card */
diff --git a/sound/pci/ice1712/ice1724.c b/sound/pci/ice1712/ice1724.c
index c0fca94c1dd2..ef2367d86148 100644
--- a/sound/pci/ice1712/ice1724.c
+++ b/sound/pci/ice1712/ice1724.c
@@ -44,25 +44,6 @@
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_DESCRIPTION("VIA ICEnsemble ICE1724/1720 (Envy24HT/PT)");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{"
-	       REVO_DEVICE_DESC
-	       AMP_AUDIO2000_DEVICE_DESC
-	       AUREON_DEVICE_DESC
-	       VT1720_MOBO_DEVICE_DESC
-	       PONTIS_DEVICE_DESC
-	       PRODIGY192_DEVICE_DESC
-	       PRODIGY_HIFI_DEVICE_DESC
-	       JULI_DEVICE_DESC
-	       MAYA44_DEVICE_DESC
-	       PHASE_DEVICE_DESC
-	       WTM_DEVICE_DESC
-	       SE_DEVICE_DESC
-	       QTET_DEVICE_DESC
-		"{VIA,VT1720},"
-		"{VIA,VT1724},"
-		"{ICEnsemble,Generic ICE1724},"
-		"{ICEnsemble,Generic Envy24HT}"
-		"{ICEnsemble,Generic Envy24PT}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;	/* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;	/* ID for this card */
diff --git a/sound/pci/intel8x0.c b/sound/pci/intel8x0.c
index 3349e455a871..35903d1a1cbd 100644
--- a/sound/pci/intel8x0.c
+++ b/sound/pci/intel8x0.c
@@ -27,29 +27,6 @@
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_DESCRIPTION("Intel 82801AA,82901AB,i810,i820,i830,i840,i845,MX440; SiS 7012; Ali 5455");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Intel,82801AA-ICH},"
-		"{Intel,82901AB-ICH0},"
-		"{Intel,82801BA-ICH2},"
-		"{Intel,82801CA-ICH3},"
-		"{Intel,82801DB-ICH4},"
-		"{Intel,ICH5},"
-		"{Intel,ICH6},"
-		"{Intel,ICH7},"
-		"{Intel,6300ESB},"
-		"{Intel,ESB2},"
-		"{Intel,MX440},"
-		"{SiS,SI7012},"
-		"{NVidia,nForce Audio},"
-		"{NVidia,nForce2 Audio},"
-		"{NVidia,nForce3 Audio},"
-		"{NVidia,MCP04},"
-		"{NVidia,MCP501},"
-		"{NVidia,CK804},"
-		"{NVidia,CK8},"
-		"{NVidia,CK8S},"
-		"{AMD,AMD768},"
-		"{AMD,AMD8111},"
-	        "{ALI,M5455}}");
 
 static int index = SNDRV_DEFAULT_IDX1;	/* Index 0-MAX */
 static char *id = SNDRV_DEFAULT_STR1;	/* ID for this card */
diff --git a/sound/pci/intel8x0m.c b/sound/pci/intel8x0m.c
index 19872cecc9d2..13ef838b26c1 100644
--- a/sound/pci/intel8x0m.c
+++ b/sound/pci/intel8x0m.c
@@ -25,21 +25,6 @@ MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_DESCRIPTION("Intel 82801AA,82901AB,i810,i820,i830,i840,i845,MX440; "
 		   "SiS 7013; NVidia MCP/2/2S/3 modems");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Intel,82801AA-ICH},"
-		"{Intel,82901AB-ICH0},"
-		"{Intel,82801BA-ICH2},"
-		"{Intel,82801CA-ICH3},"
-		"{Intel,82801DB-ICH4},"
-		"{Intel,ICH5},"
-		"{Intel,ICH6},"
-		"{Intel,ICH7},"
-	        "{Intel,MX440},"
-		"{SiS,7013},"
-		"{NVidia,NForce Modem},"
-		"{NVidia,NForce2 Modem},"
-		"{NVidia,NForce2s Modem},"
-		"{NVidia,NForce3 Modem},"
-		"{AMD,AMD768}}");
 
 static int index = -2; /* Exclude the first card */
 static char *id = SNDRV_DEFAULT_STR1;	/* ID for this card */
diff --git a/sound/pci/korg1212/korg1212.c b/sound/pci/korg1212/korg1212.c
index 2eddd9de9e6d..80ac3c6152ad 100644
--- a/sound/pci/korg1212/korg1212.c
+++ b/sound/pci/korg1212/korg1212.c
@@ -388,7 +388,6 @@ struct snd_korg1212 {
 
 MODULE_DESCRIPTION("korg1212");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{KORG,korg1212}}");
 MODULE_FIRMWARE("korg/k1212.dsp");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;     /* Index 0-MAX */
diff --git a/sound/pci/lola/lola.c b/sound/pci/lola/lola.c
index 491c90f83fbc..03b4be44bb26 100644
--- a/sound/pci/lola/lola.c
+++ b/sound/pci/lola/lola.c
@@ -54,7 +54,6 @@ MODULE_PARM_DESC(sample_rate_min, "Minimal sample rate");
  */
 
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Digigram, Lola}}");
 MODULE_DESCRIPTION("Digigram Lola driver");
 MODULE_AUTHOR("Takashi Iwai <tiwai@suse.de>");
 
diff --git a/sound/pci/lx6464es/lx6464es.c b/sound/pci/lx6464es/lx6464es.c
index b92ea074ff2a..1be97c38bc71 100644
--- a/sound/pci/lx6464es/lx6464es.c
+++ b/sound/pci/lx6464es/lx6464es.c
@@ -21,8 +21,6 @@
 MODULE_AUTHOR("Tim Blechmann");
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("digigram lx6464es");
-MODULE_SUPPORTED_DEVICE("{digigram lx6464es{}}");
-
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;
diff --git a/sound/pci/maestro3.c b/sound/pci/maestro3.c
index d2c2cd6006f0..cdc4b6106252 100644
--- a/sound/pci/maestro3.c
+++ b/sound/pci/maestro3.c
@@ -39,11 +39,6 @@
 MODULE_AUTHOR("Zach Brown <zab@zabbo.net>, Takashi Iwai <tiwai@suse.de>");
 MODULE_DESCRIPTION("ESS Maestro3 PCI");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{ESS,Maestro3 PCI},"
-		"{ESS,ES1988},"
-		"{ESS,Allegro PCI},"
-		"{ESS,Allegro-1 PCI},"
-	        "{ESS,Canyon3D-2/LE PCI}}");
 MODULE_FIRMWARE("ess/maestro3_assp_kernel.fw");
 MODULE_FIRMWARE("ess/maestro3_assp_minisrc.fw");
 
diff --git a/sound/pci/mixart/mixart.c b/sound/pci/mixart/mixart.c
index efff220b26ea..a0bbb386dc25 100644
--- a/sound/pci/mixart/mixart.c
+++ b/sound/pci/mixart/mixart.c
@@ -32,7 +32,6 @@
 MODULE_AUTHOR("Digigram <alsa@digigram.com>");
 MODULE_DESCRIPTION("Digigram " CARD_NAME);
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Digigram," CARD_NAME "}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;             /* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;              /* ID for this card */
diff --git a/sound/pci/nm256/nm256.c b/sound/pci/nm256/nm256.c
index 975994623c2c..6cb689aa28c2 100644
--- a/sound/pci/nm256/nm256.c
+++ b/sound/pci/nm256/nm256.c
@@ -32,8 +32,6 @@
 MODULE_AUTHOR("Takashi Iwai <tiwai@suse.de>");
 MODULE_DESCRIPTION("NeoMagic NM256AV/ZX");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{NeoMagic,NM256AV},"
-		"{NeoMagic,NM256ZX}}");
 
 /*
  * some compile conditions.
diff --git a/sound/pci/oxygen/oxygen.c b/sound/pci/oxygen/oxygen.c
index a751fcce7c8e..e335c4b5b381 100644
--- a/sound/pci/oxygen/oxygen.c
+++ b/sound/pci/oxygen/oxygen.c
@@ -56,9 +56,6 @@
 MODULE_AUTHOR("Clemens Ladisch <clemens@ladisch.de>");
 MODULE_DESCRIPTION("C-Media CMI8788 driver");
 MODULE_LICENSE("GPL v2");
-MODULE_SUPPORTED_DEVICE("{{C-Media,CMI8786}"
-			",{C-Media,CMI8787}"
-			",{C-Media,CMI8788}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;
diff --git a/sound/pci/oxygen/se6x.c b/sound/pci/oxygen/se6x.c
index 78c35a0a5477..434f885f6f91 100644
--- a/sound/pci/oxygen/se6x.c
+++ b/sound/pci/oxygen/se6x.c
@@ -29,7 +29,6 @@
 MODULE_AUTHOR("Clemens Ladisch <clemens@ladisch.de>");
 MODULE_DESCRIPTION("Studio Evolution SE6X driver");
 MODULE_LICENSE("GPL v2");
-MODULE_SUPPORTED_DEVICE("{{Studio Evolution,SE6X}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;
diff --git a/sound/pci/oxygen/virtuoso.c b/sound/pci/oxygen/virtuoso.c
index 98ab16329827..baa3244d4dab 100644
--- a/sound/pci/oxygen/virtuoso.c
+++ b/sound/pci/oxygen/virtuoso.c
@@ -16,7 +16,6 @@
 MODULE_AUTHOR("Clemens Ladisch <clemens@ladisch.de>");
 MODULE_DESCRIPTION("Asus Virtuoso driver");
 MODULE_LICENSE("GPL v2");
-MODULE_SUPPORTED_DEVICE("{{Asus,AV66},{Asus,AV100},{Asus,AV200}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;
diff --git a/sound/pci/pcxhr/pcxhr.c b/sound/pci/pcxhr/pcxhr.c
index c2e4831c3a13..751f9744b089 100644
--- a/sound/pci/pcxhr/pcxhr.c
+++ b/sound/pci/pcxhr/pcxhr.c
@@ -35,7 +35,6 @@ MODULE_AUTHOR("Markus Bollinger <bollinger@digigram.com>, "
 	      "Marc Titinger <titinger@digigram.com>");
 MODULE_DESCRIPTION("Digigram " DRIVER_NAME " " PCXHR_DRIVER_VERSION_STRING);
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Digigram," DRIVER_NAME "}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;	/* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;	/* ID for this card */
diff --git a/sound/pci/riptide/riptide.c b/sound/pci/riptide/riptide.c
index fcc2073c5025..56827db97239 100644
--- a/sound/pci/riptide/riptide.c
+++ b/sound/pci/riptide/riptide.c
@@ -103,7 +103,6 @@
 MODULE_AUTHOR("Peter Gruber <nokos@gmx.net>");
 MODULE_DESCRIPTION("riptide");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Conexant,Riptide}}");
 MODULE_FIRMWARE("riptide.hex");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;
diff --git a/sound/pci/rme32.c b/sound/pci/rme32.c
index 4eabece4dcba..54f3e39f97f5 100644
--- a/sound/pci/rme32.c
+++ b/sound/pci/rme32.c
@@ -88,7 +88,6 @@ MODULE_PARM_DESC(fullduplex, "Support full-duplex mode.");
 MODULE_AUTHOR("Martin Langer <martin-langer@gmx.de>, Pilo Chambert <pilo.c@wanadoo.fr>");
 MODULE_DESCRIPTION("RME Digi32, Digi32/8, Digi32 PRO");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{RME,Digi32}," "{RME,Digi32/8}," "{RME,Digi32 PRO}}");
 
 /* Defines for RME Digi32 series */
 #define RME32_SPDIF_NCHANNELS 2
diff --git a/sound/pci/rme96.c b/sound/pci/rme96.c
index 84eef6a3739f..66082e9f526d 100644
--- a/sound/pci/rme96.c
+++ b/sound/pci/rme96.c
@@ -31,11 +31,6 @@ MODULE_AUTHOR("Anders Torger <torger@ludd.luth.se>");
 MODULE_DESCRIPTION("RME Digi96, Digi96/8, Digi96/8 PRO, Digi96/8 PST, "
 		   "Digi96/8 PAD");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{RME,Digi96},"
-		"{RME,Digi96/8},"
-		"{RME,Digi96/8 PRO},"
-		"{RME,Digi96/8 PST},"
-		"{RME,Digi96/8 PAD}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;	/* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;	/* ID for this card */
diff --git a/sound/pci/rme9652/hdsp.c b/sound/pci/rme9652/hdsp.c
index 6d9029333a12..4cf879c42dc4 100644
--- a/sound/pci/rme9652/hdsp.c
+++ b/sound/pci/rme9652/hdsp.c
@@ -44,9 +44,6 @@ MODULE_PARM_DESC(enable, "Enable/disable specific Hammerfall DSP soundcards.");
 MODULE_AUTHOR("Paul Davis <paul@linuxaudiosystems.com>, Marcus Andersson, Thomas Charbonnel <thomas@undata.org>");
 MODULE_DESCRIPTION("RME Hammerfall DSP");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{RME Hammerfall-DSP},"
-	        "{RME HDSP-9652},"
-		"{RME HDSP-9632}}");
 MODULE_FIRMWARE("rpm_firmware.bin");
 MODULE_FIRMWARE("multiface_firmware.bin");
 MODULE_FIRMWARE("multiface_firmware_rev11.bin");
diff --git a/sound/pci/rme9652/hdspm.c b/sound/pci/rme9652/hdspm.c
index b66711574b1a..8d900c132f0f 100644
--- a/sound/pci/rme9652/hdspm.c
+++ b/sound/pci/rme9652/hdspm.c
@@ -165,7 +165,6 @@ MODULE_AUTHOR
 );
 MODULE_DESCRIPTION("RME HDSPM");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{RME HDSPM-MADI}}");
 
 /* --- Write registers. ---
   These are defined as byte-offsets from the iobase value.  */
diff --git a/sound/pci/rme9652/rme9652.c b/sound/pci/rme9652/rme9652.c
index 012fbec5e6a7..4df992e846f2 100644
--- a/sound/pci/rme9652/rme9652.c
+++ b/sound/pci/rme9652/rme9652.c
@@ -39,8 +39,6 @@ MODULE_PARM_DESC(precise_ptr, "Enable precise pointer (doesn't work reliably).")
 MODULE_AUTHOR("Paul Davis <pbd@op.net>, Winfried Ritsch");
 MODULE_DESCRIPTION("RME Digi9652/Digi9636");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{RME,Hammerfall},"
-		"{RME,Hammerfall-Light}}");
 
 /* The Hammerfall has two sets of 24 ADAT + 2 S/PDIF channels, one for
    capture, one for playback. Both the ADAT and S/PDIF channels appear
diff --git a/sound/pci/sis7019.c b/sound/pci/sis7019.c
index 8ffa2f53c0b5..00ab51c889da 100644
--- a/sound/pci/sis7019.c
+++ b/sound/pci/sis7019.c
@@ -24,7 +24,6 @@
 MODULE_AUTHOR("David Dillow <dave@thedillows.org>");
 MODULE_DESCRIPTION("SiS7019");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{SiS,SiS7019 Audio Accelerator}}");
 
 static int index = SNDRV_DEFAULT_IDX1;	/* Index 0-MAX */
 static char *id = SNDRV_DEFAULT_STR1;	/* ID for this card */
diff --git a/sound/pci/sonicvibes.c b/sound/pci/sonicvibes.c
index 26fd1d08c179..7de10997775f 100644
--- a/sound/pci/sonicvibes.c
+++ b/sound/pci/sonicvibes.c
@@ -29,7 +29,6 @@
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_DESCRIPTION("S3 SonicVibes PCI");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{S3,SonicVibes PCI}}");
 
 #if IS_REACHABLE(CONFIG_GAMEPORT)
 #define SUPPORT_JOYSTICK 1
diff --git a/sound/pci/trident/trident.c b/sound/pci/trident/trident.c
index 5bc79da6e35e..a51041205f7c 100644
--- a/sound/pci/trident/trident.c
+++ b/sound/pci/trident/trident.c
@@ -17,18 +17,6 @@
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>, <audio@tridentmicro.com>");
 MODULE_DESCRIPTION("Trident 4D-WaveDX/NX & SiS SI7018");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Trident,4DWave DX},"
-		"{Trident,4DWave NX},"
-		"{SiS,SI7018 PCI Audio},"
-		"{Best Union,Miss Melody 4DWave PCI},"
-		"{HIS,4DWave PCI},"
-		"{Warpspeed,ONSpeed 4DWave PCI},"
-		"{Aztech Systems,PCI 64-Q3D},"
-		"{Addonics,SV 750},"
-		"{CHIC,True Sound 4Dwave},"
-		"{Shark,Predator4D-PCI},"
-		"{Jaton,SonicWave 4D},"
-		"{Hoontech,SoundTrack Digital 4DWave NX}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;	/* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;	/* ID for this card */
diff --git a/sound/pci/via82xx.c b/sound/pci/via82xx.c
index 154d88ce8813..fd1f2f9cfbc3 100644
--- a/sound/pci/via82xx.c
+++ b/sound/pci/via82xx.c
@@ -56,7 +56,6 @@
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_DESCRIPTION("VIA VT82xx audio");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{VIA,VT82C686A/B/C,pci},{VIA,VT8233A/C,8235}}");
 
 #if IS_REACHABLE(CONFIG_GAMEPORT)
 #define SUPPORT_JOYSTICK 1
diff --git a/sound/pci/via82xx_modem.c b/sound/pci/via82xx_modem.c
index addfa196df21..30253306f67c 100644
--- a/sound/pci/via82xx_modem.c
+++ b/sound/pci/via82xx_modem.c
@@ -38,7 +38,6 @@
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_DESCRIPTION("VIA VT82xx modem");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{VIA,VT82C686A/B/C modem,pci}}");
 
 static int index = -2; /* Exclude the first card */
 static char *id = SNDRV_DEFAULT_STR1;	/* ID for this card */
diff --git a/sound/pci/vx222/vx222.c b/sound/pci/vx222/vx222.c
index f7800ed1b67e..2a9e1a77a81a 100644
--- a/sound/pci/vx222/vx222.c
+++ b/sound/pci/vx222/vx222.c
@@ -20,7 +20,6 @@
 MODULE_AUTHOR("Takashi Iwai <tiwai@suse.de>");
 MODULE_DESCRIPTION("Digigram VX222 V2/Mic");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Digigram," CARD_NAME "}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;	/* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;	/* ID for this card */
diff --git a/sound/pci/ymfpci/ymfpci.c b/sound/pci/ymfpci/ymfpci.c
index 9b0d18a7bf35..99be1490ef0e 100644
--- a/sound/pci/ymfpci/ymfpci.c
+++ b/sound/pci/ymfpci/ymfpci.c
@@ -17,12 +17,6 @@
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_DESCRIPTION("Yamaha DS-1 PCI");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Yamaha,YMF724},"
-		"{Yamaha,YMF724F},"
-		"{Yamaha,YMF740},"
-		"{Yamaha,YMF740C},"
-		"{Yamaha,YMF744},"
-		"{Yamaha,YMF754}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;	/* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;	/* ID for this card */
diff --git a/sound/pcmcia/pdaudiocf/pdaudiocf.c b/sound/pcmcia/pdaudiocf/pdaudiocf.c
index 27d9da6d61e8..144582350a05 100644
--- a/sound/pcmcia/pdaudiocf/pdaudiocf.c
+++ b/sound/pcmcia/pdaudiocf/pdaudiocf.c
@@ -22,7 +22,6 @@
 MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>");
 MODULE_DESCRIPTION("Sound Core " CARD_NAME);
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Sound Core," CARD_NAME "}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;	/* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;	/* ID for this card */
diff --git a/sound/pcmcia/vx/vxpocket.c b/sound/pcmcia/vx/vxpocket.c
index afd30a90c807..636320489805 100644
--- a/sound/pcmcia/vx/vxpocket.c
+++ b/sound/pcmcia/vx/vxpocket.c
@@ -17,13 +17,9 @@
 #include <sound/initval.h>
 #include <sound/tlv.h>
 
-/*
- */
-
 MODULE_AUTHOR("Takashi Iwai <tiwai@suse.de>");
 MODULE_DESCRIPTION("Digigram VXPocket");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Digigram,VXPocket},{Digigram,VXPocket440}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;	/* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;	/* ID for this card */
diff --git a/sound/ppc/powermac.c b/sound/ppc/powermac.c
index 96ef55082bf9..9fb51ebafde1 100644
--- a/sound/ppc/powermac.c
+++ b/sound/ppc/powermac.c
@@ -18,7 +18,6 @@
 #define CHIP_NAME "PMac"
 
 MODULE_DESCRIPTION("PowerMac");
-MODULE_SUPPORTED_DEVICE("{{Apple,PowerMac}}");
 MODULE_LICENSE("GPL");
 
 static int index = SNDRV_DEFAULT_IDX1;		/* Index 0-MAX */
diff --git a/sound/sh/aica.c b/sound/sh/aica.c
index 8fa68432d3c1..6e9d6bd67369 100644
--- a/sound/sh/aica.c
+++ b/sound/sh/aica.c
@@ -32,7 +32,6 @@
 MODULE_AUTHOR("Adrian McMenamin <adrian@mcmen.demon.co.uk>");
 MODULE_DESCRIPTION("Dreamcast AICA sound (pcm) driver");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Yamaha/SEGA, AICA}}");
 MODULE_FIRMWARE("aica_firmware.bin");
 
 /* module parameters */
diff --git a/sound/sh/sh_dac_audio.c b/sound/sh/sh_dac_audio.c
index feb28502940f..8ebd972846ac 100644
--- a/sound/sh/sh_dac_audio.c
+++ b/sound/sh/sh_dac_audio.c
@@ -25,7 +25,6 @@
 MODULE_AUTHOR("Rafael Ignacio Zurita <rizurita@yahoo.com>");
 MODULE_DESCRIPTION("SuperH DAC audio driver");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{SuperH DAC audio support}}");
 
 /* Module Parameters */
 static int index = SNDRV_DEFAULT_IDX1;
diff --git a/sound/sparc/amd7930.c b/sound/sparc/amd7930.c
index 9d0da5fa1c70..d24ae00878f5 100644
--- a/sound/sparc/amd7930.c
+++ b/sound/sparc/amd7930.c
@@ -62,7 +62,6 @@ MODULE_PARM_DESC(enable, "Enable Sun AMD7930 soundcard.");
 MODULE_AUTHOR("Thomas K. Dyas and David S. Miller");
 MODULE_DESCRIPTION("Sun AMD7930");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Sun,AMD7930}}");
 
 /* Device register layout.  */
 
diff --git a/sound/sparc/cs4231.c b/sound/sparc/cs4231.c
index 0eed5f79a2bf..35c17803a430 100644
--- a/sound/sparc/cs4231.c
+++ b/sound/sparc/cs4231.c
@@ -52,7 +52,6 @@ MODULE_PARM_DESC(enable, "Enable Sun CS4231 soundcard.");
 MODULE_AUTHOR("Jaroslav Kysela, Derrick J. Brashear and David S. Miller");
 MODULE_DESCRIPTION("Sun CS4231");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Sun,CS4231}}");
 
 #ifdef SBUS_SUPPORT
 struct sbus_dma_info {
diff --git a/sound/sparc/dbri.c b/sound/sparc/dbri.c
index 5a6fb66dd118..b055f5839578 100644
--- a/sound/sparc/dbri.c
+++ b/sound/sparc/dbri.c
@@ -76,7 +76,6 @@
 MODULE_AUTHOR("Rudolf Koenig, Brent Baccala and Martin Habets");
 MODULE_DESCRIPTION("Sun DBRI");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Sun,DBRI}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;	/* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;	/* ID for this card */
diff --git a/sound/usb/6fire/chip.c b/sound/usb/6fire/chip.c
index 08c6e6a52eb9..33e962178c93 100644
--- a/sound/usb/6fire/chip.c
+++ b/sound/usb/6fire/chip.c
@@ -26,7 +26,6 @@
 MODULE_AUTHOR("Torsten Schenk <torsten.schenk@zoho.com>");
 MODULE_DESCRIPTION("TerraTec DMX 6Fire USB audio driver");
 MODULE_LICENSE("GPL v2");
-MODULE_SUPPORTED_DEVICE("{{TerraTec,DMX 6Fire USB}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX; /* Index 0-max */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR; /* Id for card */
diff --git a/sound/usb/caiaq/device.c b/sound/usb/caiaq/device.c
index e03481caf7f6..49f63f878e6f 100644
--- a/sound/usb/caiaq/device.c
+++ b/sound/usb/caiaq/device.c
@@ -26,20 +26,6 @@
 MODULE_AUTHOR("Daniel Mack <daniel@caiaq.de>");
 MODULE_DESCRIPTION("caiaq USB audio");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Native Instruments,RigKontrol2},"
-			 "{Native Instruments,RigKontrol3},"
-			 "{Native Instruments,Kore Controller},"
-			 "{Native Instruments,Kore Controller 2},"
-			 "{Native Instruments,Audio Kontrol 1},"
-			 "{Native Instruments,Audio 2 DJ},"
-			 "{Native Instruments,Audio 4 DJ},"
-			 "{Native Instruments,Audio 8 DJ},"
-			 "{Native Instruments,Traktor Audio 2},"
-			 "{Native Instruments,Session I/O},"
-			 "{Native Instruments,GuitarRig mobile},"
-			 "{Native Instruments,Traktor Kontrol X1},"
-			 "{Native Instruments,Traktor Kontrol S4},"
-			 "{Native Instruments,Maschine Controller}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX; /* Index 0-max */
 static char* id[SNDRV_CARDS] = SNDRV_DEFAULT_STR; /* Id for this card */
diff --git a/sound/usb/card.c b/sound/usb/card.c
index b6f4c0848e66..0826a437f8fc 100644
--- a/sound/usb/card.c
+++ b/sound/usb/card.c
@@ -58,8 +58,6 @@
 MODULE_AUTHOR("Takashi Iwai <tiwai@suse.de>");
 MODULE_DESCRIPTION("USB Audio");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{Generic,USB Audio}}");
-
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX;	/* Index 0-MAX */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR;	/* ID for this card */
diff --git a/sound/usb/hiface/chip.c b/sound/usb/hiface/chip.c
index c2824188d142..95385e90882c 100644
--- a/sound/usb/hiface/chip.c
+++ b/sound/usb/hiface/chip.c
@@ -21,23 +21,6 @@ MODULE_AUTHOR("Michael Trimarchi <michael@amarulasolutions.com>");
 MODULE_AUTHOR("Antonio Ospite <ao2@amarulasolutions.com>");
 MODULE_DESCRIPTION("M2Tech hiFace USB-SPDIF audio driver");
 MODULE_LICENSE("GPL v2");
-MODULE_SUPPORTED_DEVICE("{{M2Tech,Young},"
-			 "{M2Tech,hiFace},"
-			 "{M2Tech,North Star},"
-			 "{M2Tech,W4S Young},"
-			 "{M2Tech,Corrson},"
-			 "{M2Tech,AUDIA},"
-			 "{M2Tech,SL Audio},"
-			 "{M2Tech,Empirical},"
-			 "{M2Tech,Rockna},"
-			 "{M2Tech,Pathos},"
-			 "{M2Tech,Metronome},"
-			 "{M2Tech,CAD},"
-			 "{M2Tech,Audio Esclusive},"
-			 "{M2Tech,Rotel},"
-			 "{M2Tech,Eeaudio},"
-			 "{The Chord Company,CHORD},"
-			 "{AVA Group A/S,Vitus}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX; /* Index 0-max */
 static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR; /* Id for card */
diff --git a/sound/usb/misc/ua101.c b/sound/usb/misc/ua101.c
index 6b30155964ec..5834d1dc317e 100644
--- a/sound/usb/misc/ua101.c
+++ b/sound/usb/misc/ua101.c
@@ -19,7 +19,6 @@
 MODULE_DESCRIPTION("Edirol UA-101/1000 driver");
 MODULE_AUTHOR("Clemens Ladisch <clemens@ladisch.de>");
 MODULE_LICENSE("GPL v2");
-MODULE_SUPPORTED_DEVICE("{{Edirol,UA-101},{Edirol,UA-1000}}");
 
 /*
  * Should not be lower than the minimum scheduling delay of the host
diff --git a/sound/usb/usx2y/usbusx2y.c b/sound/usb/usx2y/usbusx2y.c
index c54158146917..3cd28d24f0a7 100644
--- a/sound/usb/usx2y/usbusx2y.c
+++ b/sound/usb/usx2y/usbusx2y.c
@@ -137,7 +137,6 @@
 MODULE_AUTHOR("Karsten Wiese <annabellesgarden@yahoo.de>");
 MODULE_DESCRIPTION("TASCAM "NAME_ALLCAPS" Version 0.8.7.2");
 MODULE_LICENSE("GPL");
-MODULE_SUPPORTED_DEVICE("{{TASCAM(0x1604),"NAME_ALLCAPS"(0x8001)(0x8005)(0x8007)}}");
 
 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX; /* Index 0-max */
 static char* id[SNDRV_CARDS] = SNDRV_DEFAULT_STR; /* Id for this card */
diff --git a/sound/x86/intel_hdmi_audio.c b/sound/x86/intel_hdmi_audio.c
index 1d66c3a4fb10..33b12aa67cf5 100644
--- a/sound/x86/intel_hdmi_audio.c
+++ b/sound/x86/intel_hdmi_audio.c
@@ -1887,4 +1887,3 @@ MODULE_AUTHOR("Vaibhav Agarwal <vaibhav.agarwal@intel.com>");
 MODULE_AUTHOR("Jerome Anand <jerome.anand@intel.com>");
 MODULE_DESCRIPTION("Intel HDMI Audio driver");
 MODULE_LICENSE("GPL v2");
-MODULE_SUPPORTED_DEVICE("{Intel,Intel_HAD}");
diff --git a/sound/xen/xen_snd_front.c b/sound/xen/xen_snd_front.c
index 228d82031297..2cb0a19be2b8 100644
--- a/sound/xen/xen_snd_front.c
+++ b/sound/xen/xen_snd_front.c
@@ -391,4 +391,3 @@ module_exit(xen_drv_fini);
 MODULE_DESCRIPTION("Xen virtual sound device frontend");
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("xen:" XENSND_DRIVER_NAME);
-MODULE_SUPPORTED_DEVICE("{{ALSA,Virtual soundcard}}");
-- 
cgit v1.2.3


From 546aa0e4ea6ed81b6c51baeebc4364542fa3f3a7 Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Wed, 17 Mar 2021 15:06:54 -0400
Subject: usb-storage: Add quirk to defeat Kindle's automatic unload

Matthias reports that the Amazon Kindle automatically removes its
emulated media if it doesn't receive another SCSI command within about
one second after a SYNCHRONIZE CACHE.  It does so even when the host
has sent a PREVENT MEDIUM REMOVAL command.  The reason for this
behavior isn't clear, although it's not hard to make some guesses.

At any rate, the results can be unexpected for anyone who tries to
access the Kindle in an unusual fashion, and in theory they can lead
to data loss (for example, if one file is closed and synchronized
while other files are still in the middle of being written).

To avoid such problems, this patch creates a new usb-storage quirks
flag telling the driver always to issue a REQUEST SENSE following a
SYNCHRONIZE CACHE command, and adds an unusual_devs entry for the
Kindle with the flag set.  This is sufficient to prevent the Kindle
from doing its automatic unload, without interfering with proper
operation.

Another possible way to deal with this would be to increase the
frequency of TEST UNIT READY polling that the kernel normally carries
out for removable-media storage devices.  However, that would increase
the overall load on the system and it is not as reliable, because the
user can override the polling interval.  Changing the driver's
behavior is safer and has minimal overhead.

CC: <stable@vger.kernel.org>
Reported-and-tested-by: Matthias Schwarzott <zzam@gentoo.org>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Link: https://lore.kernel.org/r/20210317190654.GA497856@rowland.harvard.edu
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/storage/transport.c    |  7 +++++++
 drivers/usb/storage/unusual_devs.h | 12 ++++++++++++
 include/linux/usb_usual.h          |  2 ++
 3 files changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/usb/storage/transport.c b/drivers/usb/storage/transport.c
index 5eb895b19c55..f4304ce69350 100644
--- a/drivers/usb/storage/transport.c
+++ b/drivers/usb/storage/transport.c
@@ -656,6 +656,13 @@ void usb_stor_invoke_transport(struct scsi_cmnd *srb, struct us_data *us)
 		need_auto_sense = 1;
 	}
 
+	/* Some devices (Kindle) require another command after SYNC CACHE */
+	if ((us->fflags & US_FL_SENSE_AFTER_SYNC) &&
+			srb->cmnd[0] == SYNCHRONIZE_CACHE) {
+		usb_stor_dbg(us, "-- sense after SYNC CACHE\n");
+		need_auto_sense = 1;
+	}
+
 	/*
 	 * If we have a failure, we're going to do a REQUEST_SENSE 
 	 * automatically.  Note that we differentiate between a command
diff --git a/drivers/usb/storage/unusual_devs.h b/drivers/usb/storage/unusual_devs.h
index 5732e9691f08..efa972be2ee3 100644
--- a/drivers/usb/storage/unusual_devs.h
+++ b/drivers/usb/storage/unusual_devs.h
@@ -2211,6 +2211,18 @@ UNUSUAL_DEV( 0x1908, 0x3335, 0x0200, 0x0200,
 		USB_SC_DEVICE, USB_PR_DEVICE, NULL,
 		US_FL_NO_READ_DISC_INFO ),
 
+/*
+ * Reported by Matthias Schwarzott <zzam@gentoo.org>
+ * The Amazon Kindle treats SYNCHRONIZE CACHE as an indication that
+ * the host may be finished with it, and automatically ejects its
+ * emulated media unless it receives another command within one second.
+ */
+UNUSUAL_DEV( 0x1949, 0x0004, 0x0000, 0x9999,
+		"Amazon",
+		"Kindle",
+		USB_SC_DEVICE, USB_PR_DEVICE, NULL,
+		US_FL_SENSE_AFTER_SYNC ),
+
 /*
  * Reported by Oliver Neukum <oneukum@suse.com>
  * This device morphes spontaneously into another device if the access
diff --git a/include/linux/usb_usual.h b/include/linux/usb_usual.h
index 6b03fdd69d27..712363c7a2e8 100644
--- a/include/linux/usb_usual.h
+++ b/include/linux/usb_usual.h
@@ -86,6 +86,8 @@
 		/* lies about caching, so always sync */	\
 	US_FLAG(NO_SAME, 0x40000000)				\
 		/* Cannot handle WRITE_SAME */			\
+	US_FLAG(SENSE_AFTER_SYNC, 0x80000000)			\
+		/* Do REQUEST_SENSE after SYNCHRONIZE_CACHE */	\
 
 #define US_FLAG(name, value)	US_FL_##name = value ,
 enum { US_DO_ALL_FLAGS };
-- 
cgit v1.2.3


From cb038357937ee4f589aab2469ec3896dce90f317 Mon Sep 17 00:00:00 2001
From: Wei Wang <weiwan@google.com>
Date: Tue, 16 Mar 2021 15:36:47 -0700
Subject: net: fix race between napi kthread mode and busy poll

Currently, napi_thread_wait() checks for the NAPI_STATE_SCHED bit to
determine if the kthread owns this napi and could call napi->poll() on
it. However, if socket busy poll is enabled, it is possible that the
busy poll thread grabs this SCHED bit (after the previous napi->poll()
invokes napi_complete_done() and clears the SCHED bit) and tries to poll
on the same napi. napi_disable() could grab the SCHED bit as well.
This patch tries to fix this race by adding a new bit
NAPI_STATE_SCHED_THREADED in napi->state. This bit gets set in
____napi_schedule() if the threaded mode is enabled, and gets cleared
in napi_complete_done(), and we only poll the napi in the kthread if
this bit is set. This helps distinguish the ownership of the napi between
the kthread and other scenarios and fixes the race issue.

Fixes: 29863d41bb6e ("net: implement threaded-able napi poll loop support")
Reported-by: Martin Zaharinov <micron10@gmail.com>
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Wei Wang <weiwan@google.com>
Cc: Alexander Duyck <alexanderduyck@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  2 ++
 net/core/dev.c            | 19 ++++++++++++++++++-
 2 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5b67ea89d5f2..87a5d186faff 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -360,6 +360,7 @@ enum {
 	NAPI_STATE_IN_BUSY_POLL,	/* sk_busy_loop() owns this NAPI */
 	NAPI_STATE_PREFER_BUSY_POLL,	/* prefer busy-polling over softirq processing*/
 	NAPI_STATE_THREADED,		/* The poll is performed inside its own thread*/
+	NAPI_STATE_SCHED_THREADED,	/* Napi is currently scheduled in threaded mode */
 };
 
 enum {
@@ -372,6 +373,7 @@ enum {
 	NAPIF_STATE_IN_BUSY_POLL	= BIT(NAPI_STATE_IN_BUSY_POLL),
 	NAPIF_STATE_PREFER_BUSY_POLL	= BIT(NAPI_STATE_PREFER_BUSY_POLL),
 	NAPIF_STATE_THREADED		= BIT(NAPI_STATE_THREADED),
+	NAPIF_STATE_SCHED_THREADED	= BIT(NAPI_STATE_SCHED_THREADED),
 };
 
 enum gro_result {
diff --git a/net/core/dev.c b/net/core/dev.c
index a142a207fc1d..bb568f7cb81b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4294,6 +4294,13 @@ static inline void ____napi_schedule(struct softnet_data *sd,
 		 */
 		thread = READ_ONCE(napi->thread);
 		if (thread) {
+			/* Avoid doing set_bit() if the thread is in
+			 * INTERRUPTIBLE state, cause napi_thread_wait()
+			 * makes sure to proceed with napi polling
+			 * if the thread is explicitly woken from here.
+			 */
+			if (READ_ONCE(thread->state) != TASK_INTERRUPTIBLE)
+				set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
 			wake_up_process(thread);
 			return;
 		}
@@ -6486,6 +6493,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
 		WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
 
 		new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
+			      NAPIF_STATE_SCHED_THREADED |
 			      NAPIF_STATE_PREFER_BUSY_POLL);
 
 		/* If STATE_MISSED was set, leave STATE_SCHED set,
@@ -6968,16 +6976,25 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
 
 static int napi_thread_wait(struct napi_struct *napi)
 {
+	bool woken = false;
+
 	set_current_state(TASK_INTERRUPTIBLE);
 
 	while (!kthread_should_stop() && !napi_disable_pending(napi)) {
-		if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
+		/* Testing SCHED_THREADED bit here to make sure the current
+		 * kthread owns this napi and could poll on this napi.
+		 * Testing SCHED bit is not enough because SCHED bit might be
+		 * set by some other busy poll thread or by napi_disable().
+		 */
+		if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) {
 			WARN_ON(!list_empty(&napi->poll_list));
 			__set_current_state(TASK_RUNNING);
 			return 0;
 		}
 
 		schedule();
+		/* woken being true indicates this thread owns this napi. */
+		woken = true;
 		set_current_state(TASK_INTERRUPTIBLE);
 	}
 	__set_current_state(TASK_RUNNING);
-- 
cgit v1.2.3


From e21aa341785c679dd409c8cb71f864c00fe6c463 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Tue, 16 Mar 2021 14:00:07 -0700
Subject: bpf: Fix fexit trampoline.

The fexit/fmod_ret programs can be attached to kernel functions that can sleep.
synchronize_rcu_tasks() will not wait for such tasks to complete.
In that case the trampoline image will be freed, and when the task
wakes up the return IP will point to freed memory, causing a crash.
Solve this by adding percpu_ref_get/put for the duration of the trampoline
and by separating the lifetime of the trampoline from that of its image.
The "half page" optimization has to be removed, since
first_half->second_half->first_half transition cannot be guaranteed to
complete in deterministic time. Every trampoline update becomes a new image.
The image with fmod_ret or fexit progs will be freed via percpu_ref_kill and
call_rcu_tasks. Together they will wait for the original function and
trampoline asm to complete. The trampoline is patched from nop to jmp to skip
fexit progs. They are freed independently from the trampoline. The image with
fentry progs only will be freed via call_rcu_tasks_trace+call_rcu_tasks which
will wait for both sleepable and non-sleepable progs to complete.

Fixes: fec56f5890d9 ("bpf: Introduce BPF trampoline")
Reported-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Paul E. McKenney <paulmck@kernel.org>  # for RCU
Link: https://lore.kernel.org/bpf/20210316210007.38949-1-alexei.starovoitov@gmail.com
---
 arch/x86/net/bpf_jit_comp.c |  26 +++++-
 include/linux/bpf.h         |  24 ++++-
 kernel/bpf/bpf_struct_ops.c |   2 +-
 kernel/bpf/core.c           |   4 +-
 kernel/bpf/trampoline.c     | 218 ++++++++++++++++++++++++++++++++++----------
 5 files changed, 213 insertions(+), 61 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 747bba0a584a..72b5a57e9e31 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1936,7 +1936,7 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog,
  * add rsp, 8                      // skip eth_type_trans's frame
  * ret                             // return to its caller
  */
-int arch_prepare_bpf_trampoline(void *image, void *image_end,
+int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
 				const struct btf_func_model *m, u32 flags,
 				struct bpf_tramp_progs *tprogs,
 				void *orig_call)
@@ -1975,6 +1975,15 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end,
 
 	save_regs(m, &prog, nr_args, stack_size);
 
+	if (flags & BPF_TRAMP_F_CALL_ORIG) {
+		/* arg1: mov rdi, im */
+		emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im);
+		if (emit_call(&prog, __bpf_tramp_enter, prog)) {
+			ret = -EINVAL;
+			goto cleanup;
+		}
+	}
+
 	if (fentry->nr_progs)
 		if (invoke_bpf(m, &prog, fentry, stack_size))
 			return -EINVAL;
@@ -1993,8 +2002,7 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end,
 	}
 
 	if (flags & BPF_TRAMP_F_CALL_ORIG) {
-		if (fentry->nr_progs || fmod_ret->nr_progs)
-			restore_regs(m, &prog, nr_args, stack_size);
+		restore_regs(m, &prog, nr_args, stack_size);
 
 		/* call original function */
 		if (emit_call(&prog, orig_call, prog)) {
@@ -2003,6 +2011,8 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end,
 		}
 		/* remember return value in a stack for bpf prog to access */
 		emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
+		im->ip_after_call = prog;
+		emit_nops(&prog, 5);
 	}
 
 	if (fmod_ret->nr_progs) {
@@ -2033,9 +2043,17 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end,
 	 * the return value is only updated on the stack and still needs to be
 	 * restored to R0.
 	 */
-	if (flags & BPF_TRAMP_F_CALL_ORIG)
+	if (flags & BPF_TRAMP_F_CALL_ORIG) {
+		im->ip_epilogue = prog;
+		/* arg1: mov rdi, im */
+		emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im);
+		if (emit_call(&prog, __bpf_tramp_exit, prog)) {
+			ret = -EINVAL;
+			goto cleanup;
+		}
 		/* restore original return value back into RAX */
 		emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8);
+	}
 
 	EMIT1(0x5B); /* pop rbx */
 	EMIT1(0xC9); /* leave */
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d7e0f479a5b0..3625f019767d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -21,6 +21,7 @@
 #include <linux/capability.h>
 #include <linux/sched/mm.h>
 #include <linux/slab.h>
+#include <linux/percpu-refcount.h>
 
 struct bpf_verifier_env;
 struct bpf_verifier_log;
@@ -556,7 +557,8 @@ struct bpf_tramp_progs {
  *      fentry = a set of program to run before calling original function
  *      fexit = a set of program to run after original function
  */
-int arch_prepare_bpf_trampoline(void *image, void *image_end,
+struct bpf_tramp_image;
+int arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end,
 				const struct btf_func_model *m, u32 flags,
 				struct bpf_tramp_progs *tprogs,
 				void *orig_call);
@@ -565,6 +567,8 @@ u64 notrace __bpf_prog_enter(struct bpf_prog *prog);
 void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start);
 u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog);
 void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start);
+void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr);
+void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr);
 
 struct bpf_ksym {
 	unsigned long		 start;
@@ -583,6 +587,18 @@ enum bpf_tramp_prog_type {
 	BPF_TRAMP_REPLACE, /* more than MAX */
 };
 
+struct bpf_tramp_image {
+	void *image;
+	struct bpf_ksym ksym;
+	struct percpu_ref pcref;
+	void *ip_after_call;
+	void *ip_epilogue;
+	union {
+		struct rcu_head rcu;
+		struct work_struct work;
+	};
+};
+
 struct bpf_trampoline {
 	/* hlist for trampoline_table */
 	struct hlist_node hlist;
@@ -605,9 +621,8 @@ struct bpf_trampoline {
 	/* Number of attached programs. A counter per kind. */
 	int progs_cnt[BPF_TRAMP_MAX];
 	/* Executable image of trampoline */
-	void *image;
+	struct bpf_tramp_image *cur_image;
 	u64 selector;
-	struct bpf_ksym ksym;
 };
 
 struct bpf_attach_target_info {
@@ -691,6 +706,8 @@ void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym);
 void bpf_image_ksym_del(struct bpf_ksym *ksym);
 void bpf_ksym_add(struct bpf_ksym *ksym);
 void bpf_ksym_del(struct bpf_ksym *ksym);
+int bpf_jit_charge_modmem(u32 pages);
+void bpf_jit_uncharge_modmem(u32 pages);
 #else
 static inline int bpf_trampoline_link_prog(struct bpf_prog *prog,
 					   struct bpf_trampoline *tr)
@@ -787,7 +804,6 @@ struct bpf_prog_aux {
 	bool func_proto_unreliable;
 	bool sleepable;
 	bool tail_call_reachable;
-	enum bpf_tramp_prog_type trampoline_prog_type;
 	struct hlist_node tramp_hlist;
 	/* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
 	const struct btf_type *attach_func_proto;
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 1a666a975416..70f6fd4fa305 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -430,7 +430,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 
 		tprogs[BPF_TRAMP_FENTRY].progs[0] = prog;
 		tprogs[BPF_TRAMP_FENTRY].nr_progs = 1;
-		err = arch_prepare_bpf_trampoline(image,
+		err = arch_prepare_bpf_trampoline(NULL, image,
 						  st_map->image + PAGE_SIZE,
 						  &st_ops->func_models[i], 0,
 						  tprogs, NULL);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 3a283bf97f2f..75244ecb2389 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -827,7 +827,7 @@ static int __init bpf_jit_charge_init(void)
 }
 pure_initcall(bpf_jit_charge_init);
 
-static int bpf_jit_charge_modmem(u32 pages)
+int bpf_jit_charge_modmem(u32 pages)
 {
 	if (atomic_long_add_return(pages, &bpf_jit_current) >
 	    (bpf_jit_limit >> PAGE_SHIFT)) {
@@ -840,7 +840,7 @@ static int bpf_jit_charge_modmem(u32 pages)
 	return 0;
 }
 
-static void bpf_jit_uncharge_modmem(u32 pages)
+void bpf_jit_uncharge_modmem(u32 pages)
 {
 	atomic_long_sub(pages, &bpf_jit_current);
 }
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 7bc3b3209224..1f3a4be4b175 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -57,19 +57,10 @@ void bpf_image_ksym_del(struct bpf_ksym *ksym)
 			   PAGE_SIZE, true, ksym->name);
 }
 
-static void bpf_trampoline_ksym_add(struct bpf_trampoline *tr)
-{
-	struct bpf_ksym *ksym = &tr->ksym;
-
-	snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", tr->key);
-	bpf_image_ksym_add(tr->image, ksym);
-}
-
 static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
 {
 	struct bpf_trampoline *tr;
 	struct hlist_head *head;
-	void *image;
 	int i;
 
 	mutex_lock(&trampoline_mutex);
@@ -84,14 +75,6 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
 	if (!tr)
 		goto out;
 
-	/* is_root was checked earlier. No need for bpf_jit_charge_modmem() */
-	image = bpf_jit_alloc_exec_page();
-	if (!image) {
-		kfree(tr);
-		tr = NULL;
-		goto out;
-	}
-
 	tr->key = key;
 	INIT_HLIST_NODE(&tr->hlist);
 	hlist_add_head(&tr->hlist, head);
@@ -99,9 +82,6 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
 	mutex_init(&tr->mutex);
 	for (i = 0; i < BPF_TRAMP_MAX; i++)
 		INIT_HLIST_HEAD(&tr->progs_hlist[i]);
-	tr->image = image;
-	INIT_LIST_HEAD_RCU(&tr->ksym.lnode);
-	bpf_trampoline_ksym_add(tr);
 out:
 	mutex_unlock(&trampoline_mutex);
 	return tr;
@@ -185,10 +165,142 @@ bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total)
 	return tprogs;
 }
 
+static void __bpf_tramp_image_put_deferred(struct work_struct *work)
+{
+	struct bpf_tramp_image *im;
+
+	im = container_of(work, struct bpf_tramp_image, work);
+	bpf_image_ksym_del(&im->ksym);
+	bpf_jit_free_exec(im->image);
+	bpf_jit_uncharge_modmem(1);
+	percpu_ref_exit(&im->pcref);
+	kfree_rcu(im, rcu);
+}
+
+/* callback, fexit step 3 or fentry step 2 */
+static void __bpf_tramp_image_put_rcu(struct rcu_head *rcu)
+{
+	struct bpf_tramp_image *im;
+
+	im = container_of(rcu, struct bpf_tramp_image, rcu);
+	INIT_WORK(&im->work, __bpf_tramp_image_put_deferred);
+	schedule_work(&im->work);
+}
+
+/* callback, fexit step 2. Called after percpu_ref_kill confirms. */
+static void __bpf_tramp_image_release(struct percpu_ref *pcref)
+{
+	struct bpf_tramp_image *im;
+
+	im = container_of(pcref, struct bpf_tramp_image, pcref);
+	call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu);
+}
+
+/* callback, fexit or fentry step 1 */
+static void __bpf_tramp_image_put_rcu_tasks(struct rcu_head *rcu)
+{
+	struct bpf_tramp_image *im;
+
+	im = container_of(rcu, struct bpf_tramp_image, rcu);
+	if (im->ip_after_call)
+		/* the case of fmod_ret/fexit trampoline and CONFIG_PREEMPTION=y */
+		percpu_ref_kill(&im->pcref);
+	else
+		/* the case of fentry trampoline */
+		call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu);
+}
+
+static void bpf_tramp_image_put(struct bpf_tramp_image *im)
+{
+	/* The trampoline image that calls original function is using:
+	 * rcu_read_lock_trace to protect sleepable bpf progs
+	 * rcu_read_lock to protect normal bpf progs
+	 * percpu_ref to protect trampoline itself
+	 * rcu tasks to protect trampoline asm not covered by percpu_ref
+	 * (which are few asm insns before __bpf_tramp_enter and
+	 *  after __bpf_tramp_exit)
+	 *
+	 * The trampoline is unreachable before bpf_tramp_image_put().
+	 *
+	 * First, patch the trampoline to avoid calling into fexit progs.
+	 * The progs will be freed even if the original function is still
+	 * executing or sleeping.
+	 * In case of CONFIG_PREEMPT=y use call_rcu_tasks() to wait on
+	 * first few asm instructions to execute and call into
+	 * __bpf_tramp_enter->percpu_ref_get.
+	 * Then use percpu_ref_kill to wait for the trampoline and the original
+	 * function to finish.
+	 * Then use call_rcu_tasks() to make sure few asm insns in
+	 * the trampoline epilogue are done as well.
+	 *
+	 * In !PREEMPT case the task that got interrupted in the first asm
+	 * insns won't go through an RCU quiescent state which the
+	 * percpu_ref_kill will be waiting for. Hence the first
+	 * call_rcu_tasks() is not necessary.
+	 */
+	if (im->ip_after_call) {
+		int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_JUMP,
+					     NULL, im->ip_epilogue);
+		WARN_ON(err);
+		if (IS_ENABLED(CONFIG_PREEMPTION))
+			call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
+		else
+			percpu_ref_kill(&im->pcref);
+		return;
+	}
+
+	/* The trampoline without fexit and fmod_ret progs doesn't call original
+	 * function and doesn't use percpu_ref.
+	 * Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
+	 * Then use call_rcu_tasks() to wait for the rest of trampoline asm
+	 * and normal progs.
+	 */
+	call_rcu_tasks_trace(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
+}
+
+static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
+{
+	struct bpf_tramp_image *im;
+	struct bpf_ksym *ksym;
+	void *image;
+	int err = -ENOMEM;
+
+	im = kzalloc(sizeof(*im), GFP_KERNEL);
+	if (!im)
+		goto out;
+
+	err = bpf_jit_charge_modmem(1);
+	if (err)
+		goto out_free_im;
+
+	err = -ENOMEM;
+	im->image = image = bpf_jit_alloc_exec_page();
+	if (!image)
+		goto out_uncharge;
+
+	err = percpu_ref_init(&im->pcref, __bpf_tramp_image_release, 0, GFP_KERNEL);
+	if (err)
+		goto out_free_image;
+
+	ksym = &im->ksym;
+	INIT_LIST_HEAD_RCU(&ksym->lnode);
+	snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu_%u", key, idx);
+	bpf_image_ksym_add(image, ksym);
+	return im;
+
+out_free_image:
+	bpf_jit_free_exec(im->image);
+out_uncharge:
+	bpf_jit_uncharge_modmem(1);
+out_free_im:
+	kfree(im);
+out:
+	return ERR_PTR(err);
+}
+
 static int bpf_trampoline_update(struct bpf_trampoline *tr)
 {
-	void *old_image = tr->image + ((tr->selector + 1) & 1) * PAGE_SIZE/2;
-	void *new_image = tr->image + (tr->selector & 1) * PAGE_SIZE/2;
+	struct bpf_tramp_image *im;
 	struct bpf_tramp_progs *tprogs;
 	u32 flags = BPF_TRAMP_F_RESTORE_REGS;
 	int err, total;
@@ -198,41 +310,42 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
 		return PTR_ERR(tprogs);
 
 	if (total == 0) {
-		err = unregister_fentry(tr, old_image);
+		err = unregister_fentry(tr, tr->cur_image->image);
+		bpf_tramp_image_put(tr->cur_image);
+		tr->cur_image = NULL;
 		tr->selector = 0;
 		goto out;
 	}
 
+	im = bpf_tramp_image_alloc(tr->key, tr->selector);
+	if (IS_ERR(im)) {
+		err = PTR_ERR(im);
+		goto out;
+	}
+
 	if (tprogs[BPF_TRAMP_FEXIT].nr_progs ||
 	    tprogs[BPF_TRAMP_MODIFY_RETURN].nr_progs)
 		flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;
 
-	/* Though the second half of trampoline page is unused a task could be
-	 * preempted in the middle of the first half of trampoline and two
-	 * updates to trampoline would change the code from underneath the
-	 * preempted task. Hence wait for tasks to voluntarily schedule or go
-	 * to userspace.
-	 * The same trampoline can hold both sleepable and non-sleepable progs.
-	 * synchronize_rcu_tasks_trace() is needed to make sure all sleepable
-	 * programs finish executing.
-	 * Wait for these two grace periods together.
-	 */
-	synchronize_rcu_mult(call_rcu_tasks, call_rcu_tasks_trace);
-
-	err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2,
+	err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE,
 					  &tr->func.model, flags, tprogs,
 					  tr->func.addr);
 	if (err < 0)
 		goto out;
 
-	if (tr->selector)
+	WARN_ON(tr->cur_image && tr->selector == 0);
+	WARN_ON(!tr->cur_image && tr->selector);
+	if (tr->cur_image)
 		/* progs already running at this address */
-		err = modify_fentry(tr, old_image, new_image);
+		err = modify_fentry(tr, tr->cur_image->image, im->image);
 	else
 		/* first time registering */
-		err = register_fentry(tr, new_image);
+		err = register_fentry(tr, im->image);
 	if (err)
 		goto out;
+	if (tr->cur_image)
+		bpf_tramp_image_put(tr->cur_image);
+	tr->cur_image = im;
 	tr->selector++;
 out:
 	kfree(tprogs);
@@ -364,17 +477,12 @@ void bpf_trampoline_put(struct bpf_trampoline *tr)
 		goto out;
 	if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT])))
 		goto out;
-	bpf_image_ksym_del(&tr->ksym);
-	/* This code will be executed when all bpf progs (both sleepable and
-	 * non-sleepable) went through
-	 * bpf_prog_put()->call_rcu[_tasks_trace]()->bpf_prog_free_deferred().
-	 * Hence no need for another synchronize_rcu_tasks_trace() here,
-	 * but synchronize_rcu_tasks() is still needed, since trampoline
-	 * may not have had any sleepable programs and we need to wait
-	 * for tasks to get out of trampoline code before freeing it.
+	/* This code will be executed even when the last bpf_tramp_image
+	 * is alive. All progs are detached from the trampoline and the
+	 * trampoline image is patched with jmp into epilogue to skip
+	 * fexit progs. The fentry-only trampoline will be freed via
+	 * multiple rcu callbacks.
 	 */
-	synchronize_rcu_tasks();
-	bpf_jit_free_exec(tr->image);
 	hlist_del(&tr->hlist);
 	kfree(tr);
 out:
@@ -478,8 +586,18 @@ void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start)
 	rcu_read_unlock_trace();
 }
 
+void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr)
+{
+	percpu_ref_get(&tr->pcref);
+}
+
+void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr)
+{
+	percpu_ref_put(&tr->pcref);
+}
+
 int __weak
-arch_prepare_bpf_trampoline(void *image, void *image_end,
+arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end,
 			    const struct btf_func_model *m, u32 flags,
 			    struct bpf_tramp_progs *tprogs,
 			    void *orig_call)
-- 
cgit v1.2.3


From 97a19caf1b1f6a9d4f620a9d51405a1973bd4641 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Wed, 17 Mar 2021 10:41:32 -0700
Subject: bpf: net: Emit anonymous enum with BPF_TCP_CLOSE value explicitly

The selftest failed to compile with clang-built bpf-next.
Adding LLVM=1 to your vmlinux and selftest build will use clang.
The error message is:
  progs/test_sk_storage_tracing.c:38:18: error: use of undeclared identifier 'BPF_TCP_CLOSE'
          if (newstate == BPF_TCP_CLOSE)
                          ^
  1 error generated.
  make: *** [Makefile:423: /bpf-next/tools/testing/selftests/bpf/test_sk_storage_tracing.o] Error 1

The reason for the failure is that BPF_TCP_CLOSE, a value of
an anonymous enum defined in uapi bpf.h, is not defined in
vmlinux.h. gcc does not have this problem. Since vmlinux.h
is derived from BTF which is derived from vmlinux DWARF,
that means gcc-produced vmlinux DWARF has BPF_TCP_CLOSE
while llvm-produced vmlinux DWARF does not.

BPF_TCP_CLOSE is referenced in net/ipv4/tcp.c as
  BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE);
The following test mimics the above BUILD_BUG_ON, preprocessed
with clang compiler, and shows gcc DWARF contains BPF_TCP_CLOSE while
llvm DWARF does not.

  $ cat t.c
  enum {
    BPF_TCP_ESTABLISHED = 1,
    BPF_TCP_CLOSE = 7,
  };
  enum {
    TCP_ESTABLISHED = 1,
    TCP_CLOSE = 7,
  };

  int test() {
    do {
      extern void __compiletime_assert_767(void) ;
      if ((int)BPF_TCP_CLOSE != (int)TCP_CLOSE) __compiletime_assert_767();
    } while (0);
    return 0;
  }
  $ clang t.c -O2 -c -g && llvm-dwarfdump t.o | grep BPF_TCP_CLOSE
  $ gcc t.c -O2 -c -g && llvm-dwarfdump t.o | grep BPF_TCP_CLOSE
                    DW_AT_name    ("BPF_TCP_CLOSE")

Further checking of the clang code finds that clang actually tries to
evaluate the condition at compile time. If it is definitely
true/false, clang will perform the optimization and the whole if condition
will be removed before generating IR/debuginfo.

This patch explicitly adds an expression after the
above-mentioned BUILD_BUG_ON in net/ipv4/tcp.c like
  (void)BPF_TCP_ESTABLISHED
to enable generation of debuginfo for the anonymous
enum which also includes BPF_TCP_CLOSE.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210317174132.589276-1-yhs@fb.com
---
 include/linux/btf.h |  1 +
 net/ipv4/tcp.c      | 12 ++++++++++++
 2 files changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/btf.h b/include/linux/btf.h
index 7fabf1428093..9c1b52738bbe 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -9,6 +9,7 @@
 #include <uapi/linux/bpf.h>
 
 #define BTF_TYPE_EMIT(type) ((void)(type *)0)
+#define BTF_TYPE_EMIT_ENUM(enum_val) ((void)enum_val)
 
 struct btf;
 struct btf_member;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index de7cc8445ac0..e14fd0c50c10 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -267,6 +267,7 @@
 #include <linux/slab.h>
 #include <linux/errqueue.h>
 #include <linux/static_key.h>
+#include <linux/btf.h>
 
 #include <net/icmp.h>
 #include <net/inet_common.h>
@@ -2587,6 +2588,17 @@ void tcp_set_state(struct sock *sk, int state)
 	BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV);
 	BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES);
 
+	/* bpf uapi header bpf.h defines an anonymous enum with values
+	 * BPF_TCP_* used by bpf programs. Currently gcc built vmlinux
+	 * is able to emit this enum in DWARF due to the above BUILD_BUG_ON.
+	 * But clang built vmlinux does not have this enum in DWARF
+	 * since clang removes the above code before generating IR/debuginfo.
+	 * Let us explicitly emit the type debuginfo to ensure the
+	 * above-mentioned anonymous enum in the vmlinux DWARF and hence BTF
+	 * regardless of which compiler is used.
+	 */
+	BTF_TYPE_EMIT_ENUM(BPF_TCP_ESTABLISHED);
+
 	if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG))
 		tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state);
 
-- 
cgit v1.2.3


From 3542dcb15cef66c0b9e6c3b33168eb657e0d9520 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Fri, 5 Mar 2021 16:32:34 +0000
Subject: iommu/dma: Resurrect the "forcedac" option

In converting intel-iommu over to the common IOMMU DMA ops, it quietly
lost the functionality of its "forcedac" option. Since this is a handy
thing both for testing and for performance optimisation on certain
platforms, reimplement it under the common IOMMU parameter namespace.

For the sake of fixing the inadvertent breakage of the Intel-specific
parameter, remove the dmar_forcedac remnants and hook it up as an alias
while documenting the transition to the new common parameter.

Fixes: c588072bba6b ("iommu/vt-d: Convert intel iommu driver to the iommu ops")
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Acked-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: John Garry <john.garry@huawei.com>
Link: https://lore.kernel.org/r/7eece8e0ea7bfbe2cd0e30789e0d46df573af9b0.1614961776.git.robin.murphy@arm.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 Documentation/admin-guide/kernel-parameters.txt | 15 ++++++++-------
 drivers/iommu/dma-iommu.c                       | 13 ++++++++++++-
 drivers/iommu/intel/iommu.c                     |  5 ++---
 include/linux/dma-iommu.h                       |  2 ++
 4 files changed, 24 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 04545725f187..835f810f2f26 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1869,13 +1869,6 @@
 			bypassed by not enabling DMAR with this option. In
 			this case, gfx device will use physical address for
 			DMA.
-		forcedac [X86-64]
-			With this option iommu will not optimize to look
-			for io virtual address below 32-bit forcing dual
-			address cycle on pci bus for cards supporting greater
-			than 32-bit addressing. The default is to look
-			for translation below 32-bit and if not available
-			then look in the higher range.
 		strict [Default Off]
 			With this option on every unmap_single operation will
 			result in a hardware IOTLB flush operation as opposed
@@ -1964,6 +1957,14 @@
 		nobypass	[PPC/POWERNV]
 			Disable IOMMU bypass, using IOMMU for PCI devices.
 
+	iommu.forcedac=	[ARM64, X86] Control IOVA allocation for PCI devices.
+			Format: { "0" | "1" }
+			0 - Try to allocate a 32-bit DMA address first, before
+			  falling back to the full range if needed.
+			1 - Allocate directly from the full usable range,
+			  forcing Dual Address Cycle for PCI cards supporting
+			  greater than 32-bit addressing.
+
 	iommu.strict=	[ARM64] Configure TLB invalidation behaviour
 			Format: { "0" | "1" }
 			0 - Lazy mode.
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index af765c813cc8..fdd095e1fa52 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -52,6 +52,17 @@ struct iommu_dma_cookie {
 };
 
 static DEFINE_STATIC_KEY_FALSE(iommu_deferred_attach_enabled);
+bool iommu_dma_forcedac __read_mostly;
+
+static int __init iommu_dma_forcedac_setup(char *str)
+{
+	int ret = kstrtobool(str, &iommu_dma_forcedac);
+
+	if (!ret && iommu_dma_forcedac)
+		pr_info("Forcing DAC for PCI devices\n");
+	return ret;
+}
+early_param("iommu.forcedac", iommu_dma_forcedac_setup);
 
 void iommu_dma_free_cpu_cached_iovas(unsigned int cpu,
 		struct iommu_domain *domain)
@@ -444,7 +455,7 @@ static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain,
 		dma_limit = min(dma_limit, (u64)domain->geometry.aperture_end);
 
 	/* Try to get PCI devices a SAC address */
-	if (dma_limit > DMA_BIT_MASK(32) && dev_is_pci(dev))
+	if (dma_limit > DMA_BIT_MASK(32) && !iommu_dma_forcedac && dev_is_pci(dev))
 		iova = alloc_iova_fast(iovad, iova_len,
 				       DMA_BIT_MASK(32) >> shift, false);
 
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index ee0932307d64..1c32522220bc 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -360,7 +360,6 @@ int intel_iommu_enabled = 0;
 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
 
 static int dmar_map_gfx = 1;
-static int dmar_forcedac;
 static int intel_iommu_strict;
 static int intel_iommu_superpage = 1;
 static int iommu_identity_mapping;
@@ -451,8 +450,8 @@ static int __init intel_iommu_setup(char *str)
 			dmar_map_gfx = 0;
 			pr_info("Disable GFX device mapping\n");
 		} else if (!strncmp(str, "forcedac", 8)) {
-			pr_info("Forcing DAC for PCI devices\n");
-			dmar_forcedac = 1;
+			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
+			iommu_dma_forcedac = true;
 		} else if (!strncmp(str, "strict", 6)) {
 			pr_info("Disable batched IOTLB flush\n");
 			intel_iommu_strict = 1;
diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h
index 706b68d1359b..13d1f4c14d7b 100644
--- a/include/linux/dma-iommu.h
+++ b/include/linux/dma-iommu.h
@@ -40,6 +40,8 @@ void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list);
 void iommu_dma_free_cpu_cached_iovas(unsigned int cpu,
 		struct iommu_domain *domain);
 
+extern bool iommu_dma_forcedac;
+
 #else /* CONFIG_IOMMU_DMA */
 
 struct iommu_domain;
-- 
cgit v1.2.3


From 6ca69e5841f01ccbfa45e56577e1b33e14e53504 Mon Sep 17 00:00:00 2001
From: Lu Baolu <baolu.lu@linux.intel.com>
Date: Thu, 18 Mar 2021 08:53:40 +0800
Subject: iommu/vt-d: Report more information about invalidation errors

When the invalidation queue errors are encountered, dump the information
logged by the VT-d hardware together with the pending queue invalidation
descriptors.

Signed-off-by: Ashok Raj <ashok.raj@intel.com>
Tested-by: Guo Kaijie <Kaijie.Guo@intel.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Link: https://lore.kernel.org/r/20210318005340.187311-1-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/intel/dmar.c  | 68 ++++++++++++++++++++++++++++++++++++++++++---
 include/linux/intel-iommu.h |  6 ++++
 2 files changed, 70 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
index d5c51b5c20af..6971397805f3 100644
--- a/drivers/iommu/intel/dmar.c
+++ b/drivers/iommu/intel/dmar.c
@@ -1205,6 +1205,63 @@ static inline void reclaim_free_desc(struct q_inval *qi)
 	}
 }
 
+static const char *qi_type_string(u8 type)
+{
+	switch (type) {
+	case QI_CC_TYPE:
+		return "Context-cache Invalidation";
+	case QI_IOTLB_TYPE:
+		return "IOTLB Invalidation";
+	case QI_DIOTLB_TYPE:
+		return "Device-TLB Invalidation";
+	case QI_IEC_TYPE:
+		return "Interrupt Entry Cache Invalidation";
+	case QI_IWD_TYPE:
+		return "Invalidation Wait";
+	case QI_EIOTLB_TYPE:
+		return "PASID-based IOTLB Invalidation";
+	case QI_PC_TYPE:
+		return "PASID-cache Invalidation";
+	case QI_DEIOTLB_TYPE:
+		return "PASID-based Device-TLB Invalidation";
+	case QI_PGRP_RESP_TYPE:
+		return "Page Group Response";
+	default:
+		return "UNKNOWN";
+	}
+}
+
+static void qi_dump_fault(struct intel_iommu *iommu, u32 fault)
+{
+	unsigned int head = dmar_readl(iommu->reg + DMAR_IQH_REG);
+	u64 iqe_err = dmar_readq(iommu->reg + DMAR_IQER_REG);
+	struct qi_desc *desc = iommu->qi->desc + head;
+
+	if (fault & DMA_FSTS_IQE)
+		pr_err("VT-d detected Invalidation Queue Error: Reason %llx",
+		       DMAR_IQER_REG_IQEI(iqe_err));
+	if (fault & DMA_FSTS_ITE)
+		pr_err("VT-d detected Invalidation Time-out Error: SID %llx",
+		       DMAR_IQER_REG_ITESID(iqe_err));
+	if (fault & DMA_FSTS_ICE)
+		pr_err("VT-d detected Invalidation Completion Error: SID %llx",
+		       DMAR_IQER_REG_ICESID(iqe_err));
+
+	pr_err("QI HEAD: %s qw0 = 0x%llx, qw1 = 0x%llx\n",
+	       qi_type_string(desc->qw0 & 0xf),
+	       (unsigned long long)desc->qw0,
+	       (unsigned long long)desc->qw1);
+
+	head = ((head >> qi_shift(iommu)) + QI_LENGTH - 1) % QI_LENGTH;
+	head <<= qi_shift(iommu);
+	desc = iommu->qi->desc + head;
+
+	pr_err("QI PRIOR: %s qw0 = 0x%llx, qw1 = 0x%llx\n",
+	       qi_type_string(desc->qw0 & 0xf),
+	       (unsigned long long)desc->qw0,
+	       (unsigned long long)desc->qw1);
+}
+
 static int qi_check_fault(struct intel_iommu *iommu, int index, int wait_index)
 {
 	u32 fault;
@@ -1216,6 +1273,8 @@ static int qi_check_fault(struct intel_iommu *iommu, int index, int wait_index)
 		return -EAGAIN;
 
 	fault = readl(iommu->reg + DMAR_FSTS_REG);
+	if (fault & (DMA_FSTS_IQE | DMA_FSTS_ITE | DMA_FSTS_ICE))
+		qi_dump_fault(iommu, fault);
 
 	/*
 	 * If IQE happens, the head points to the descriptor associated
@@ -1232,12 +1291,10 @@ static int qi_check_fault(struct intel_iommu *iommu, int index, int wait_index)
 			 * used by software as private data. We won't print
 			 * out these two qw's for security consideration.
 			 */
-			pr_err("VT-d detected invalid descriptor: qw0 = %llx, qw1 = %llx\n",
-			       (unsigned long long)desc->qw0,
-			       (unsigned long long)desc->qw1);
 			memcpy(desc, qi->desc + (wait_index << shift),
 			       1 << shift);
 			writel(DMA_FSTS_IQE, iommu->reg + DMAR_FSTS_REG);
+			pr_info("Invalidation Queue Error (IQE) cleared\n");
 			return -EINVAL;
 		}
 	}
@@ -1254,6 +1311,7 @@ static int qi_check_fault(struct intel_iommu *iommu, int index, int wait_index)
 		tail = ((tail >> shift) - 1 + QI_LENGTH) % QI_LENGTH;
 
 		writel(DMA_FSTS_ITE, iommu->reg + DMAR_FSTS_REG);
+		pr_info("Invalidation Time-out Error (ITE) cleared\n");
 
 		do {
 			if (qi->desc_status[head] == QI_IN_USE)
@@ -1265,8 +1323,10 @@ static int qi_check_fault(struct intel_iommu *iommu, int index, int wait_index)
 			return -EAGAIN;
 	}
 
-	if (fault & DMA_FSTS_ICE)
+	if (fault & DMA_FSTS_ICE) {
 		writel(DMA_FSTS_ICE, iommu->reg + DMAR_FSTS_REG);
+		pr_info("Invalidation Completion Error (ICE) cleared\n");
+	}
 
 	return 0;
 }
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 1bc46b88711a..1732298ce888 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -20,6 +20,7 @@
 #include <linux/io-64-nonatomic-lo-hi.h>
 #include <linux/dmar.h>
 #include <linux/ioasid.h>
+#include <linux/bitfield.h>
 
 #include <asm/cacheflush.h>
 #include <asm/iommu.h>
@@ -80,6 +81,7 @@
 #define DMAR_IQ_SHIFT	4	/* Invalidation queue head/tail shift */
 #define DMAR_IQA_REG	0x90	/* Invalidation queue addr register */
 #define DMAR_ICS_REG	0x9c	/* Invalidation complete status register */
+#define DMAR_IQER_REG	0xb0	/* Invalidation queue error record register */
 #define DMAR_IRTA_REG	0xb8    /* Interrupt remapping table addr register */
 #define DMAR_PQH_REG	0xc0	/* Page request queue head register */
 #define DMAR_PQT_REG	0xc8	/* Page request queue tail register */
@@ -126,6 +128,10 @@
 #define DMAR_VCMD_REG		0xe10 /* Virtual command register */
 #define DMAR_VCRSP_REG		0xe20 /* Virtual command response register */
 
+#define DMAR_IQER_REG_IQEI(reg)		FIELD_GET(GENMASK_ULL(3, 0), reg)
+#define DMAR_IQER_REG_ITESID(reg)	FIELD_GET(GENMASK_ULL(47, 32), reg)
+#define DMAR_IQER_REG_ICESID(reg)	FIELD_GET(GENMASK_ULL(63, 48), reg)
+
 #define OFFSET_STRIDE		(9)
 
 #define dmar_readq(a) readq(a)
-- 
cgit v1.2.3


From 4196d18903f94090f0a223d65de25e3bf50a3d13 Mon Sep 17 00:00:00 2001
From: Mathieu Poirier <mathieu.poirier@linaro.org>
Date: Fri, 12 Mar 2021 09:24:39 -0700
Subject: remoteproc: Add new RPROC_ATTACHED state

Add a new RPROC_ATTACHED state to take into account scenarios
where the remoteproc core needs to attach to a remote processor
that is booted by another entity.

Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Reviewed-by: Peng Fan <peng.fan@nxp.com>
Reviewed-by: Arnaud Pouliquen <arnaud.pouliquen@st.com>
Link: https://lore.kernel.org/r/20210312162453.1234145-4-mathieu.poirier@linaro.org
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
---
 drivers/remoteproc/remoteproc_sysfs.c | 1 +
 include/linux/remoteproc.h            | 7 +++++--
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/remoteproc/remoteproc_sysfs.c b/drivers/remoteproc/remoteproc_sysfs.c
index 6840dad931d5..26fd9ceecdb5 100644
--- a/drivers/remoteproc/remoteproc_sysfs.c
+++ b/drivers/remoteproc/remoteproc_sysfs.c
@@ -172,6 +172,7 @@ static const char * const rproc_state_string[] = {
 	[RPROC_RUNNING]		= "running",
 	[RPROC_CRASHED]		= "crashed",
 	[RPROC_DELETED]		= "deleted",
+	[RPROC_ATTACHED]	= "attached",
 	[RPROC_DETACHED]	= "detached",
 	[RPROC_LAST]		= "invalid",
 };
diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
index 1b7d56c7a453..9193a8fb5b68 100644
--- a/include/linux/remoteproc.h
+++ b/include/linux/remoteproc.h
@@ -407,6 +407,8 @@ struct rproc_ops {
  * @RPROC_RUNNING:	device is up and running
  * @RPROC_CRASHED:	device has crashed; need to start recovery
  * @RPROC_DELETED:	device is deleted
+ * @RPROC_ATTACHED:	device has been booted by another entity and the core
+ *			has attached to it
  * @RPROC_DETACHED:	device has been booted by another entity and waiting
  *			for the core to attach to it
  * @RPROC_LAST:		just keep this one at the end
@@ -423,8 +425,9 @@ enum rproc_state {
 	RPROC_RUNNING	= 2,
 	RPROC_CRASHED	= 3,
 	RPROC_DELETED	= 4,
-	RPROC_DETACHED	= 5,
-	RPROC_LAST	= 6,
+	RPROC_ATTACHED	= 5,
+	RPROC_DETACHED	= 6,
+	RPROC_LAST	= 7,
 };
 
 /**
-- 
cgit v1.2.3


From 76f4c87587e2ff41e9b9867ffde2137f27ba39b9 Mon Sep 17 00:00:00 2001
From: Mathieu Poirier <mathieu.poirier@linaro.org>
Date: Fri, 12 Mar 2021 09:24:40 -0700
Subject: remoteproc: Properly represent the attached state

There is a need to know when a remote processor has been attached
to rather than booted by the remoteproc core.  In order to avoid
manipulating two variables, i.e. rproc::autonomous and
rproc::state, get rid of the former and simply use the newly
introduced RPROC_ATTACHED state.

Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Reviewed-by: Peng Fan <peng.fan@nxp.com>
Reviewed-by: Arnaud Pouliquen <arnaud.pouliquen@st.com>
Link: https://lore.kernel.org/r/20210312162453.1234145-5-mathieu.poirier@linaro.org
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
---
 drivers/remoteproc/remoteproc_core.c  | 20 +-------------------
 drivers/remoteproc/remoteproc_sysfs.c |  5 +----
 include/linux/remoteproc.h            |  2 --
 3 files changed, 2 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c
index 9127b2aa5424..d828c01f4f8a 100644
--- a/drivers/remoteproc/remoteproc_core.c
+++ b/drivers/remoteproc/remoteproc_core.c
@@ -1450,7 +1450,7 @@ static int __rproc_attach(struct rproc *rproc)
 		goto stop_rproc;
 	}
 
-	rproc->state = RPROC_RUNNING;
+	rproc->state = RPROC_ATTACHED;
 
 	dev_info(dev, "remote processor %s is now attached\n", rproc->name);
 
@@ -1665,14 +1665,6 @@ static int rproc_stop(struct rproc *rproc, bool crashed)
 
 	rproc->state = RPROC_OFFLINE;
 
-	/*
-	 * The remote processor has been stopped and is now offline, which means
-	 * that the next time it is brought back online the remoteproc core will
-	 * be responsible to load its firmware.  As such it is no longer
-	 * autonomous.
-	 */
-	rproc->autonomous = false;
-
 	dev_info(dev, "stopped remote processor %s\n", rproc->name);
 
 	return 0;
@@ -2083,16 +2075,6 @@ int rproc_add(struct rproc *rproc)
 	if (ret < 0)
 		return ret;
 
-	/*
-	 * Remind ourselves the remote processor has been attached to rather
-	 * than booted by the remoteproc core.  This is important because the
-	 * RPROC_DETACHED state will be lost as soon as the remote processor
-	 * has been attached to.  Used in firmware_show() and reset in
-	 * rproc_stop().
-	 */
-	if (rproc->state == RPROC_DETACHED)
-		rproc->autonomous = true;
-
 	/* if rproc is marked always-on, request it to boot */
 	if (rproc->auto_boot) {
 		ret = rproc_trigger_auto_boot(rproc);
diff --git a/drivers/remoteproc/remoteproc_sysfs.c b/drivers/remoteproc/remoteproc_sysfs.c
index 26fd9ceecdb5..4f58be1e13c1 100644
--- a/drivers/remoteproc/remoteproc_sysfs.c
+++ b/drivers/remoteproc/remoteproc_sysfs.c
@@ -138,11 +138,8 @@ static ssize_t firmware_show(struct device *dev, struct device_attribute *attr,
 	 * If the remote processor has been started by an external
 	 * entity we have no idea of what image it is running.  As such
 	 * simply display a generic string rather then rproc->firmware.
-	 *
-	 * Here we rely on the autonomous flag because a remote processor
-	 * may have been attached to and currently in a running state.
 	 */
-	if (rproc->autonomous)
+	if (rproc->state == RPROC_ATTACHED)
 		firmware = "unknown";
 
 	return sprintf(buf, "%s\n", firmware);
diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
index 9193a8fb5b68..9e42e90cd9da 100644
--- a/include/linux/remoteproc.h
+++ b/include/linux/remoteproc.h
@@ -514,7 +514,6 @@ struct rproc_dump_segment {
  * @table_sz: size of @cached_table
  * @has_iommu: flag to indicate if remote processor is behind an MMU
  * @auto_boot: flag to indicate if remote processor should be auto-started
- * @autonomous: true if an external entity has booted the remote processor
  * @dump_segments: list of segments in the firmware
  * @nb_vdev: number of vdev currently handled by rproc
  * @char_dev: character device of the rproc
@@ -551,7 +550,6 @@ struct rproc {
 	size_t table_sz;
 	bool has_iommu;
 	bool auto_boot;
-	bool autonomous;
 	struct list_head dump_segments;
 	int nb_vdev;
 	u8 elf_class;
-- 
cgit v1.2.3


From 1a631382be1d22ddab0582dae3498b3d28e2e44a Mon Sep 17 00:00:00 2001
From: Mathieu Poirier <mathieu.poirier@linaro.org>
Date: Fri, 12 Mar 2021 09:24:41 -0700
Subject: remoteproc: Add new get_loaded_rsc_table() to rproc_ops

Add a new get_loaded_rsc_table() operation in order to support
scenarios where the remoteproc core has booted a remote processor
and detaches from it.  When re-attaching to the remote processor,
the core needs to know where the resource table has been placed
in memory.

Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Reviewed-by: Arnaud Pouliquen <arnaud.pouliquen@st.com>
Link: https://lore.kernel.org/r/20210312162453.1234145-6-mathieu.poirier@linaro.org
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
---
 drivers/remoteproc/remoteproc_core.c     | 32 ++++++++++++++++++++++++++++++++
 drivers/remoteproc/remoteproc_internal.h | 10 ++++++++++
 include/linux/remoteproc.h               |  6 +++++-
 3 files changed, 47 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c
index d828c01f4f8a..c7e1cf1b6f66 100644
--- a/drivers/remoteproc/remoteproc_core.c
+++ b/drivers/remoteproc/remoteproc_core.c
@@ -1543,6 +1543,32 @@ disable_iommu:
 	return ret;
 }
 
+static int rproc_set_rsc_table(struct rproc *rproc)
+{
+	struct resource_table *table_ptr;
+	struct device *dev = &rproc->dev;
+	size_t table_sz;
+	int ret;
+
+	table_ptr = rproc_get_loaded_rsc_table(rproc, &table_sz);
+	if (!table_ptr) {
+		/* Not having a resource table is acceptable */
+		return 0;
+	}
+
+	if (IS_ERR(table_ptr)) {
+		ret = PTR_ERR(table_ptr);
+		dev_err(dev, "can't load resource table: %d\n", ret);
+		return ret;
+	}
+
+	rproc->cached_table = NULL;
+	rproc->table_ptr = table_ptr;
+	rproc->table_sz = table_sz;
+
+	return 0;
+}
+
 /*
  * Attach to remote processor - similar to rproc_fw_boot() but without
  * the steps that deal with the firmware image.
@@ -1562,6 +1588,12 @@ static int rproc_attach(struct rproc *rproc)
 		return ret;
 	}
 
+	ret = rproc_set_rsc_table(rproc);
+	if (ret) {
+		dev_err(dev, "can't load resource table: %d\n", ret);
+		goto disable_iommu;
+	}
+
 	/* reset max_notifyid */
 	rproc->max_notifyid = -1;
 
diff --git a/drivers/remoteproc/remoteproc_internal.h b/drivers/remoteproc/remoteproc_internal.h
index 9ea37aa687d2..a328e634b1de 100644
--- a/drivers/remoteproc/remoteproc_internal.h
+++ b/drivers/remoteproc/remoteproc_internal.h
@@ -177,6 +177,16 @@ struct resource_table *rproc_find_loaded_rsc_table(struct rproc *rproc,
 	return NULL;
 }
 
+static inline
+struct resource_table *rproc_get_loaded_rsc_table(struct rproc *rproc,
+						  size_t *size)
+{
+	if (rproc->ops->get_loaded_rsc_table)
+		return rproc->ops->get_loaded_rsc_table(rproc, size);
+
+	return NULL;
+}
+
 static inline
 bool rproc_u64_fit_in_size_t(u64 val)
 {
diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
index 9e42e90cd9da..eee338177a3d 100644
--- a/include/linux/remoteproc.h
+++ b/include/linux/remoteproc.h
@@ -370,7 +370,9 @@ enum rsc_handling_status {
  * RSC_HANDLED if resource was handled, RSC_IGNORED if not handled and a
  * negative value on error
  * @load_rsc_table:	load resource table from firmware image
- * @find_loaded_rsc_table: find the loaded resouce table
+ * @find_loaded_rsc_table: find the loaded resource table from firmware image
+ * @get_loaded_rsc_table: get resource table installed in memory
+ *			  by external entity
  * @load:		load firmware to memory, where the remote processor
  *			expects to find it
  * @sanity_check:	sanity check the fw image
@@ -392,6 +394,8 @@ struct rproc_ops {
 			  int offset, int avail);
 	struct resource_table *(*find_loaded_rsc_table)(
 				struct rproc *rproc, const struct firmware *fw);
+	struct resource_table *(*get_loaded_rsc_table)(
+				struct rproc *rproc, size_t *size);
 	int (*load)(struct rproc *rproc, const struct firmware *fw);
 	int (*sanity_check)(struct rproc *rproc, const struct firmware *fw);
 	u64 (*get_boot_addr)(struct rproc *rproc, const struct firmware *fw);
-- 
cgit v1.2.3


From 7f3bd0c019cb813448d867c17c9b9dad205a13eb Mon Sep 17 00:00:00 2001
From: Mathieu Poirier <mathieu.poirier@linaro.org>
Date: Fri, 12 Mar 2021 09:24:44 -0700
Subject: remoteproc: Add new detach() remoteproc operation

Add a new detach() operation in order to support scenarios where
the remoteproc core is going away but the remote processor is
kept operating.  This could be the case when the system is
rebooted or when the platform driver is removed.

Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Reviewed-by: Peng Fan <peng.fan@nxp.com>
Reviewed-by: Arnaud Pouliquen <arnaud.pouliquen@st.com>
Link: https://lore.kernel.org/r/20210312162453.1234145-9-mathieu.poirier@linaro.org
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
---
 include/linux/remoteproc.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
index eee338177a3d..2f1f0fbc3994 100644
--- a/include/linux/remoteproc.h
+++ b/include/linux/remoteproc.h
@@ -363,6 +363,7 @@ enum rsc_handling_status {
  * @start:	power on the device and boot it
  * @stop:	power off the device
  * @attach:	attach to a device that his already powered up
+ * @detach:	detach from a device, leaving it powered up
  * @kick:	kick a virtqueue (virtqueue id given as a parameter)
  * @da_to_va:	optional platform hook to perform address translations
  * @parse_fw:	parse firmware to extract information (e.g. resource table)
@@ -387,6 +388,7 @@ struct rproc_ops {
 	int (*start)(struct rproc *rproc);
 	int (*stop)(struct rproc *rproc);
 	int (*attach)(struct rproc *rproc);
+	int (*detach)(struct rproc *rproc);
 	void (*kick)(struct rproc *rproc, int vqid);
 	void * (*da_to_va)(struct rproc *rproc, u64 da, size_t len, bool *is_iomem);
 	int (*parse_fw)(struct rproc *rproc, const struct firmware *fw);
-- 
cgit v1.2.3


From d3962a397885518a85d2dc6b0c51e6594f71c30f Mon Sep 17 00:00:00 2001
From: Mathieu Poirier <mathieu.poirier@linaro.org>
Date: Fri, 12 Mar 2021 09:24:46 -0700
Subject: remoteproc: Introduce function rproc_detach()

Introduce function rproc_detach() to enable the remoteproc
core to release the resources associated with a remote processor
without stopping its operation.

Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Reviewed-by: Arnaud Pouliquen <arnaud.pouliquen@st.com>
Link: https://lore.kernel.org/r/20210312162453.1234145-11-mathieu.poirier@linaro.org
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
---
 drivers/remoteproc/remoteproc_core.c | 58 +++++++++++++++++++++++++++++++++++-
 include/linux/remoteproc.h           |  1 +
 2 files changed, 58 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c
index 23e9040386c2..78a36a3723ec 100644
--- a/drivers/remoteproc/remoteproc_core.c
+++ b/drivers/remoteproc/remoteproc_core.c
@@ -1715,7 +1715,7 @@ static int rproc_stop(struct rproc *rproc, bool crashed)
 /*
  * __rproc_detach(): Does the opposite of __rproc_attach()
  */
-static int __maybe_unused __rproc_detach(struct rproc *rproc)
+static int __rproc_detach(struct rproc *rproc)
 {
 	struct device *dev = &rproc->dev;
 	int ret;
@@ -1954,6 +1954,62 @@ out:
 }
 EXPORT_SYMBOL(rproc_shutdown);
 
+/**
+ * rproc_detach() - Detach the remote processor from the
+ * remoteproc core
+ *
+ * @rproc: the remote processor
+ *
+ * Detach a remote processor (previously attached to with rproc_attach()).
+ *
+ * In case @rproc is still being used by an additional user(s), then
+ * this function will just decrement the power refcount and exit,
+ * without disconnecting the device.
+ *
+ * Function rproc_detach() calls __rproc_detach() in order to let a remote
+ * processor know that services provided by the application processor are
+ * no longer available.  From there it should be possible to remove the
+ * platform driver and even power cycle the application processor (if the HW
+ * supports it) without needing to switch off the remote processor.
+ */
+int rproc_detach(struct rproc *rproc)
+{
+	struct device *dev = &rproc->dev;
+	int ret;
+
+	ret = mutex_lock_interruptible(&rproc->lock);
+	if (ret) {
+		dev_err(dev, "can't lock rproc %s: %d\n", rproc->name, ret);
+		return ret;
+	}
+
+	/* if the remote proc is still needed, bail out */
+	if (!atomic_dec_and_test(&rproc->power)) {
+		ret = 0;
+		goto out;
+	}
+
+	ret = __rproc_detach(rproc);
+	if (ret) {
+		atomic_inc(&rproc->power);
+		goto out;
+	}
+
+	/* clean up all acquired resources */
+	rproc_resource_cleanup(rproc);
+
+	/* release HW resources if needed */
+	rproc_unprepare_device(rproc);
+
+	rproc_disable_iommu(rproc);
+
+	rproc->table_ptr = NULL;
+out:
+	mutex_unlock(&rproc->lock);
+	return ret;
+}
+EXPORT_SYMBOL(rproc_detach);
+
 /**
  * rproc_get_by_phandle() - find a remote processor by phandle
  * @phandle: phandle to the rproc
diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
index 2f1f0fbc3994..fc2cca600423 100644
--- a/include/linux/remoteproc.h
+++ b/include/linux/remoteproc.h
@@ -664,6 +664,7 @@ rproc_of_resm_mem_entry_init(struct device *dev, u32 of_resm_idx, size_t len,
 
 int rproc_boot(struct rproc *rproc);
 void rproc_shutdown(struct rproc *rproc);
+int rproc_detach(struct rproc *rproc);
 int rproc_set_firmware(struct rproc *rproc, const char *fw_name);
 void rproc_report_crash(struct rproc *rproc, enum rproc_crash_type type);
 void rproc_coredump_using_sections(struct rproc *rproc);
-- 
cgit v1.2.3


From 9dc9507f1880fb6225e3e058cb5219b152cbf198 Mon Sep 17 00:00:00 2001
From: Mathieu Poirier <mathieu.poirier@linaro.org>
Date: Fri, 12 Mar 2021 09:24:47 -0700
Subject: remoteproc: Properly deal with the resource table when detaching

If it is possible to detach the remote processor, keep an untouched
copy of the resource table.  That way we can start from the same
resource table without having to worry about original values or what
elements the startup code has changed when re-attaching to the remote
processor.

Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Reviewed-by: Arnaud Pouliquen <arnaud.pouliquen@st.com>
Link: https://lore.kernel.org/r/20210312162453.1234145-12-mathieu.poirier@linaro.org
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
---
 drivers/remoteproc/remoteproc_core.c | 81 ++++++++++++++++++++++++++++++++++++
 include/linux/remoteproc.h           |  3 ++
 2 files changed, 84 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c
index 78a36a3723ec..57c851d22ba6 100644
--- a/drivers/remoteproc/remoteproc_core.c
+++ b/drivers/remoteproc/remoteproc_core.c
@@ -1562,6 +1562,24 @@ static int rproc_set_rsc_table(struct rproc *rproc)
 		return ret;
 	}
 
+	/*
+	 * If it is possible to detach the remote processor, keep an untouched
+	 * copy of the resource table.  That way we can start fresh again when
+	 * the remote processor is re-attached, that is:
+	 *
+	 *      DETACHED -> ATTACHED -> DETACHED -> ATTACHED
+	 *
+	 * Free'd in rproc_reset_rsc_table_on_detach() and
+	 * rproc_reset_rsc_table_on_stop().
+	 */
+	if (rproc->ops->detach) {
+		rproc->clean_table = kmemdup(table_ptr, table_sz, GFP_KERNEL);
+		if (!rproc->clean_table)
+			return -ENOMEM;
+	} else {
+		rproc->clean_table = NULL;
+	}
+
 	rproc->cached_table = NULL;
 	rproc->table_ptr = table_ptr;
 	rproc->table_sz = table_sz;
@@ -1569,6 +1587,59 @@ static int rproc_set_rsc_table(struct rproc *rproc)
 	return 0;
 }
 
+/* On detach: restore the installed resource table, keep a private copy. */
+static int rproc_reset_rsc_table_on_detach(struct rproc *rproc)
+{
+	struct resource_table *table_ptr;
+
+	/* A resource table was never retrieved, nothing to do here */
+	if (!rproc->table_ptr)
+		return 0;
+
+	/*
+	 * If we made it to this point a clean_table _must_ have been
+	 * allocated in rproc_set_rsc_table().  If one isn't present
+	 * something went really wrong and we must complain.
+	 */
+	if (WARN_ON(!rproc->clean_table))
+		return -EINVAL;
+
+	/* Remember where the external entity installed the resource table */
+	table_ptr = rproc->table_ptr;
+
+	/*
+	 * If we made it here the remote processor was started by another
+	 * entity and a cache table doesn't exist.  As such make a copy of
+	 * the resource table currently used by the remote processor and
+	 * use that for the rest of the shutdown process.  The memory
+	 * allocated here is free'd in rproc_detach().
+	 */
+	rproc->cached_table = kmemdup(rproc->table_ptr,
+				      rproc->table_sz, GFP_KERNEL);
+	if (!rproc->cached_table)
+		return -ENOMEM;
+
+	/* Use the copy for the remainder of the shutdown process. */
+	rproc->table_ptr = rproc->cached_table;
+
+	/*
+	 * Reset the memory area where the firmware loaded the resource table
+	 * to its original value.  That way when we re-attach the remote
+	 * processor the resource table is clean and ready to be used again.
+	 */
+	memcpy(table_ptr, rproc->clean_table, rproc->table_sz);
+
+	/*
+	 * The clean table (allocated in rproc_set_rsc_table()) is no longer
+	 * needed.  Clear the pointer so that a repeated call hits the
+	 * WARN_ON() above instead of reusing or double-freeing the memory.
+	 */
+	kfree(rproc->clean_table);
+	rproc->clean_table = NULL;
+
+	return 0;
+}
+
 /*
  * Attach to remote processor - similar to rproc_fw_boot() but without
  * the steps that deal with the firmware image.
@@ -1727,6 +1798,13 @@ static int __rproc_detach(struct rproc *rproc)
 	/* Stop any subdevices for the remote processor */
 	rproc_stop_subdevices(rproc, false);
 
+	/* the installed resource table is no longer accessible */
+	ret = rproc_reset_rsc_table_on_detach(rproc);
+	if (ret) {
+		dev_err(dev, "can't reset resource table: %d\n", ret);
+		return ret;
+	}
+
 	/* Tell the remote processor the core isn't available anymore */
 	ret = rproc->ops->detach(rproc);
 	if (ret) {
@@ -2003,6 +2081,9 @@ int rproc_detach(struct rproc *rproc)
 
 	rproc_disable_iommu(rproc);
 
+	/* Free the copy of the resource table */
+	kfree(rproc->cached_table);
+	rproc->cached_table = NULL;
 	rproc->table_ptr = NULL;
 out:
 	mutex_unlock(&rproc->lock);
diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
index fc2cca600423..8b795b544f75 100644
--- a/include/linux/remoteproc.h
+++ b/include/linux/remoteproc.h
@@ -516,6 +516,8 @@ struct rproc_dump_segment {
  * @recovery_disabled: flag that state if recovery was disabled
  * @max_notifyid: largest allocated notify id.
  * @table_ptr: pointer to the resource table in effect
+ * @clean_table: copy of the resource table without modifications.  Used
+ *		 when a remote processor is attached or detached from the core
  * @cached_table: copy of the resource table
  * @table_sz: size of @cached_table
  * @has_iommu: flag to indicate if remote processor is behind an MMU
@@ -552,6 +554,7 @@ struct rproc {
 	bool recovery_disabled;
 	int max_notifyid;
 	struct resource_table *table_ptr;
+	struct resource_table *clean_table;
 	struct resource_table *cached_table;
 	size_t table_sz;
 	bool has_iommu;
-- 
cgit v1.2.3


From 48d71395896d54eec989179dd265e569fcecb15a Mon Sep 17 00:00:00 2001
From: Philipp Zabel <p.zabel@pengutronix.de>
Date: Sun, 14 Mar 2021 18:44:46 +0300
Subject: reset: Add reset_control_bulk API

Follow the clock and regulator subsystems' lead and add a bulk API
for reset controls.

Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
Tested-by: Dmitry Osipenko <digetx@gmail.com>
Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
Link: https://lore.kernel.org/r/20210314154459.15375-5-digetx@gmail.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/reset/core.c  | 215 ++++++++++++++++++++++++++++++++++
 include/linux/reset.h | 315 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 530 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/reset/core.c b/drivers/reset/core.c
index dbf881b586d9..71c1c8264b2d 100644
--- a/drivers/reset/core.c
+++ b/drivers/reset/core.c
@@ -358,6 +358,30 @@ int reset_control_reset(struct reset_control *rstc)
 }
 EXPORT_SYMBOL_GPL(reset_control_reset);
 
+/**
+ * reset_control_bulk_reset - reset the controlled devices in order
+ * @num_rstcs: number of entries in rstcs array
+ * @rstcs: array of struct reset_control_bulk_data with reset controls set
+ *
+ * Issue a reset on all provided reset controls, in order.
+ *
+ * See also: reset_control_reset()
+ */
+int reset_control_bulk_reset(int num_rstcs,
+			     struct reset_control_bulk_data *rstcs)
+{
+	int ret, i;
+
+	for (i = 0; i < num_rstcs; i++) {
+		ret = reset_control_reset(rstcs[i].rstc);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(reset_control_bulk_reset);
+
 /**
  * reset_control_rearm - allow shared reset line to be re-triggered"
  * @rstc: reset controller
@@ -461,6 +485,36 @@ int reset_control_assert(struct reset_control *rstc)
 }
 EXPORT_SYMBOL_GPL(reset_control_assert);
 
+/**
+ * reset_control_bulk_assert - asserts the reset lines in order
+ * @num_rstcs: number of entries in rstcs array
+ * @rstcs: array of struct reset_control_bulk_data with reset controls set
+ *
+ * Assert the reset lines for all provided reset controls, in order.
+ * If an assertion fails, already asserted resets are deasserted again.
+ *
+ * See also: reset_control_assert()
+ */
+int reset_control_bulk_assert(int num_rstcs,
+			      struct reset_control_bulk_data *rstcs)
+{
+	int ret, i;
+
+	for (i = 0; i < num_rstcs; i++) {
+		ret = reset_control_assert(rstcs[i].rstc);
+		if (ret)
+			goto err;
+	}
+
+	return 0;
+
+err:
+	while (i--)
+		reset_control_deassert(rstcs[i].rstc);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(reset_control_bulk_assert);
+
 /**
  * reset_control_deassert - deasserts the reset line
  * @rstc: reset controller
@@ -511,6 +565,36 @@ int reset_control_deassert(struct reset_control *rstc)
 }
 EXPORT_SYMBOL_GPL(reset_control_deassert);
 
+/**
+ * reset_control_bulk_deassert - deasserts the reset lines in reverse order
+ * @num_rstcs: number of entries in rstcs array
+ * @rstcs: array of struct reset_control_bulk_data with reset controls set
+ *
+ * Deassert the reset lines for all provided reset controls, in reverse order.
+ * If a deassertion fails, already deasserted resets are asserted again.
+ *
+ * See also: reset_control_deassert()
+ */
+int reset_control_bulk_deassert(int num_rstcs,
+				struct reset_control_bulk_data *rstcs)
+{
+	int ret, i;
+
+	for (i = num_rstcs - 1; i >= 0; i--) {
+		ret = reset_control_deassert(rstcs[i].rstc);
+		if (ret)
+			goto err;
+	}
+
+	return 0;
+
+err:
+	while (i < num_rstcs)
+		reset_control_assert(rstcs[i++].rstc);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(reset_control_bulk_deassert);
+
 /**
  * reset_control_status - returns a negative errno if not supported, a
  * positive value if the reset line is asserted, or zero if the reset
@@ -588,6 +672,36 @@ int reset_control_acquire(struct reset_control *rstc)
 }
 EXPORT_SYMBOL_GPL(reset_control_acquire);
 
+/**
+ * reset_control_bulk_acquire - acquires reset controls for exclusive use
+ * @num_rstcs: number of entries in rstcs array
+ * @rstcs: array of struct reset_control_bulk_data with reset controls set
+ *
+ * This is used to explicitly acquire reset controls requested with
+ * reset_control_bulk_get_exclusive_release() for temporary exclusive use.
+ *
+ * See also: reset_control_acquire(), reset_control_bulk_release()
+ */
+int reset_control_bulk_acquire(int num_rstcs,
+			       struct reset_control_bulk_data *rstcs)
+{
+	int ret, i;
+
+	for (i = 0; i < num_rstcs; i++) {
+		ret = reset_control_acquire(rstcs[i].rstc);
+		if (ret)
+			goto err;
+	}
+
+	return 0;
+
+err:
+	while (i--)
+		reset_control_release(rstcs[i].rstc);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(reset_control_bulk_acquire);
+
 /**
  * reset_control_release() - releases exclusive access to a reset control
  * @rstc: reset control
@@ -610,6 +724,26 @@ void reset_control_release(struct reset_control *rstc)
 }
 EXPORT_SYMBOL_GPL(reset_control_release);
 
+/**
+ * reset_control_bulk_release() - releases exclusive access to reset controls
+ * @num_rstcs: number of entries in rstcs array
+ * @rstcs: array of struct reset_control_bulk_data with reset controls set
+ *
+ * Releases exclusive access right to reset controls previously obtained by a
+ * call to reset_control_bulk_acquire().
+ *
+ * See also: reset_control_release(), reset_control_bulk_acquire()
+ */
+void reset_control_bulk_release(int num_rstcs,
+				struct reset_control_bulk_data *rstcs)
+{
+	int i;
+
+	for (i = 0; i < num_rstcs; i++)
+		reset_control_release(rstcs[i].rstc);
+}
+EXPORT_SYMBOL_GPL(reset_control_bulk_release);
+
 static struct reset_control *__reset_control_get_internal(
 				struct reset_controller_dev *rcdev,
 				unsigned int index, bool shared, bool acquired)
@@ -814,6 +948,32 @@ struct reset_control *__reset_control_get(struct device *dev, const char *id,
 }
 EXPORT_SYMBOL_GPL(__reset_control_get);
 
+int __reset_control_bulk_get(struct device *dev, int num_rstcs,
+			     struct reset_control_bulk_data *rstcs,
+			     bool shared, bool optional, bool acquired)
+{
+	int ret, i;
+
+	for (i = 0; i < num_rstcs; i++) {
+		rstcs[i].rstc = __reset_control_get(dev, rstcs[i].id, 0,
+						    shared, optional, acquired);
+		ret = PTR_ERR_OR_ZERO(rstcs[i].rstc);
+		if (ret)
+			goto err;
+	}
+
+	return 0;
+
+err:
+	mutex_lock(&reset_list_mutex);
+	while (i--)
+		if (rstcs[i].rstc)	/* may be NULL (optional) */
+			__reset_control_put_internal(rstcs[i].rstc);
+	mutex_unlock(&reset_list_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(__reset_control_bulk_get);
+
 static void reset_control_array_put(struct reset_control_array *resets)
 {
 	int i;
@@ -845,6 +1005,23 @@ void reset_control_put(struct reset_control *rstc)
 }
 EXPORT_SYMBOL_GPL(reset_control_put);
 
+/**
+ * reset_control_bulk_put - free the reset controllers
+ * @num_rstcs: number of entries in rstcs array
+ * @rstcs: array of struct reset_control_bulk_data with reset controls set
+ */
+void reset_control_bulk_put(int num_rstcs, struct reset_control_bulk_data *rstcs)
+{
+	mutex_lock(&reset_list_mutex);
+	while (num_rstcs--) {
+		if (IS_ERR_OR_NULL(rstcs[num_rstcs].rstc))
+			continue;
+		__reset_control_put_internal(rstcs[num_rstcs].rstc);
+	}
+	mutex_unlock(&reset_list_mutex);
+}
+EXPORT_SYMBOL_GPL(reset_control_bulk_put);
+
 static void devm_reset_control_release(struct device *dev, void *res)
 {
 	reset_control_put(*(struct reset_control **)res);
@@ -874,6 +1051,44 @@ struct reset_control *__devm_reset_control_get(struct device *dev,
 }
 EXPORT_SYMBOL_GPL(__devm_reset_control_get);
 
+struct reset_control_bulk_devres {
+	int num_rstcs;
+	struct reset_control_bulk_data *rstcs;
+};
+
+static void devm_reset_control_bulk_release(struct device *dev, void *res)
+{
+	struct reset_control_bulk_devres *devres = res;
+
+	reset_control_bulk_put(devres->num_rstcs, devres->rstcs);
+}
+
+int __devm_reset_control_bulk_get(struct device *dev, int num_rstcs,
+				  struct reset_control_bulk_data *rstcs,
+				  bool shared, bool optional, bool acquired)
+{
+	struct reset_control_bulk_devres *ptr;
+	int ret;
+
+	ptr = devres_alloc(devm_reset_control_bulk_release, sizeof(*ptr),
+			   GFP_KERNEL);
+	if (!ptr)
+		return -ENOMEM;
+
+	ret = __reset_control_bulk_get(dev, num_rstcs, rstcs, shared, optional, acquired);
+	if (ret < 0) {
+		devres_free(ptr);
+		return ret;
+	}
+
+	ptr->num_rstcs = num_rstcs;
+	ptr->rstcs = rstcs;
+	devres_add(dev, ptr);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(__devm_reset_control_bulk_get);
+
 /**
  * __device_reset - find reset controller associated with the device
  *                  and perform reset
diff --git a/include/linux/reset.h b/include/linux/reset.h
index b9109efa2a5c..46e6372cb431 100644
--- a/include/linux/reset.h
+++ b/include/linux/reset.h
@@ -10,6 +10,21 @@ struct device;
 struct device_node;
 struct reset_control;
 
+/**
+ * struct reset_control_bulk_data - Data used for bulk reset control operations.
+ *
+ * @id: reset control consumer ID
+ * @rstc: struct reset_control * to store the associated reset control
+ *
+ * The reset APIs provide a series of reset_control_bulk_*() API calls as
+ * a convenience to consumers which require multiple reset controls.
+ * This structure is used to manage data for these calls.
+ */
+struct reset_control_bulk_data {
+	const char			*id;
+	struct reset_control		*rstc;
+};
+
 #ifdef CONFIG_RESET_CONTROLLER
 
 int reset_control_reset(struct reset_control *rstc);
@@ -20,6 +35,12 @@ int reset_control_status(struct reset_control *rstc);
 int reset_control_acquire(struct reset_control *rstc);
 void reset_control_release(struct reset_control *rstc);
 
+int reset_control_bulk_reset(int num_rstcs, struct reset_control_bulk_data *rstcs);
+int reset_control_bulk_assert(int num_rstcs, struct reset_control_bulk_data *rstcs);
+int reset_control_bulk_deassert(int num_rstcs, struct reset_control_bulk_data *rstcs);
+int reset_control_bulk_acquire(int num_rstcs, struct reset_control_bulk_data *rstcs);
+void reset_control_bulk_release(int num_rstcs, struct reset_control_bulk_data *rstcs);
+
 struct reset_control *__of_reset_control_get(struct device_node *node,
 				     const char *id, int index, bool shared,
 				     bool optional, bool acquired);
@@ -27,10 +48,18 @@ struct reset_control *__reset_control_get(struct device *dev, const char *id,
 					  int index, bool shared,
 					  bool optional, bool acquired);
 void reset_control_put(struct reset_control *rstc);
+int __reset_control_bulk_get(struct device *dev, int num_rstcs,
+			     struct reset_control_bulk_data *rstcs,
+			     bool shared, bool optional, bool acquired);
+void reset_control_bulk_put(int num_rstcs, struct reset_control_bulk_data *rstcs);
+
 int __device_reset(struct device *dev, bool optional);
 struct reset_control *__devm_reset_control_get(struct device *dev,
 				     const char *id, int index, bool shared,
 				     bool optional, bool acquired);
+int __devm_reset_control_bulk_get(struct device *dev, int num_rstcs,
+				  struct reset_control_bulk_data *rstcs,
+				  bool shared, bool optional, bool acquired);
 
 struct reset_control *devm_reset_control_array_get(struct device *dev,
 						   bool shared, bool optional);
@@ -96,6 +125,48 @@ static inline struct reset_control *__reset_control_get(
 	return optional ? NULL : ERR_PTR(-ENOTSUPP);
 }
 
+static inline int
+reset_control_bulk_reset(int num_rstcs, struct reset_control_bulk_data *rstcs)
+{
+	return 0;
+}
+
+static inline int
+reset_control_bulk_assert(int num_rstcs, struct reset_control_bulk_data *rstcs)
+{
+	return 0;
+}
+
+static inline int
+reset_control_bulk_deassert(int num_rstcs, struct reset_control_bulk_data *rstcs)
+{
+	return 0;
+}
+
+static inline int
+reset_control_bulk_acquire(int num_rstcs, struct reset_control_bulk_data *rstcs)
+{
+	return 0;
+}
+
+static inline void
+reset_control_bulk_release(int num_rstcs, struct reset_control_bulk_data *rstcs)
+{
+}
+
+static inline int
+__reset_control_bulk_get(struct device *dev, int num_rstcs,
+			 struct reset_control_bulk_data *rstcs,
+			 bool shared, bool optional, bool acquired)
+{
+	return optional ? 0 : -EOPNOTSUPP;
+}
+
+static inline void
+reset_control_bulk_put(int num_rstcs, struct reset_control_bulk_data *rstcs)
+{
+}
+
 static inline struct reset_control *__devm_reset_control_get(
 					struct device *dev, const char *id,
 					int index, bool shared, bool optional,
@@ -104,6 +175,14 @@ static inline struct reset_control *__devm_reset_control_get(
 	return optional ? NULL : ERR_PTR(-ENOTSUPP);
 }
 
+static inline int
+__devm_reset_control_bulk_get(struct device *dev, int num_rstcs,
+			      struct reset_control_bulk_data *rstcs,
+			      bool shared, bool optional, bool acquired)
+{
+	return optional ? 0 : -EOPNOTSUPP;
+}
+
 static inline struct reset_control *
 devm_reset_control_array_get(struct device *dev, bool shared, bool optional)
 {
@@ -155,6 +234,23 @@ __must_check reset_control_get_exclusive(struct device *dev, const char *id)
 	return __reset_control_get(dev, id, 0, false, false, true);
 }
 
+/**
+ * reset_control_bulk_get_exclusive - Lookup and obtain exclusive references to
+ *                                    multiple reset controllers.
+ * @dev: device to be reset by the controller
+ * @num_rstcs: number of entries in rstcs array
+ * @rstcs: array of struct reset_control_bulk_data with reset line names set
+ *
+ * Fills the rstcs array with pointers to exclusive reset controls and
+ * returns 0, or a negative errno on failure.
+ */
+static inline int __must_check
+reset_control_bulk_get_exclusive(struct device *dev, int num_rstcs,
+				 struct reset_control_bulk_data *rstcs)
+{
+	return __reset_control_bulk_get(dev, num_rstcs, rstcs, false, false, true);
+}
+
 /**
  * reset_control_get_exclusive_released - Lookup and obtain a temoprarily
  *                                        exclusive reference to a reset
@@ -176,6 +272,48 @@ __must_check reset_control_get_exclusive_released(struct device *dev,
 	return __reset_control_get(dev, id, 0, false, false, false);
 }
 
+/**
+ * reset_control_bulk_get_exclusive_released - Lookup and obtain temporarily
+ *                                    exclusive references to multiple reset
+ *                                    controllers.
+ * @dev: device to be reset by the controller
+ * @num_rstcs: number of entries in rstcs array
+ * @rstcs: array of struct reset_control_bulk_data with reset line names set
+ *
+ * Fills the rstcs array with pointers to exclusive reset controls and
+ * returns 0, or a negative errno on failure.
+ * reset-controls returned by this function must be acquired via
+ * reset_control_bulk_acquire() before they can be used and should be released
+ * via reset_control_bulk_release() afterwards.
+ */
+static inline int __must_check
+reset_control_bulk_get_exclusive_released(struct device *dev, int num_rstcs,
+					  struct reset_control_bulk_data *rstcs)
+{
+	return __reset_control_bulk_get(dev, num_rstcs, rstcs, false, false, false);
+}
+
+/**
+ * reset_control_bulk_get_optional_exclusive_released - Lookup and obtain optional
+ *                                    temporarily exclusive references to multiple
+ *                                    reset controllers.
+ * @dev: device to be reset by the controller
+ * @num_rstcs: number of entries in rstcs array
+ * @rstcs: array of struct reset_control_bulk_data with reset line names set
+ *
+ * Optional variant of reset_control_bulk_get_exclusive_released(). If the
+ * requested reset is not specified in the device tree, this function returns 0
+ * instead of an error and the missing rstc is set to NULL.
+ *
+ * See reset_control_bulk_get_exclusive_released() for more information.
+ */
+static inline int __must_check
+reset_control_bulk_get_optional_exclusive_released(struct device *dev, int num_rstcs,
+						   struct reset_control_bulk_data *rstcs)
+{
+	return __reset_control_bulk_get(dev, num_rstcs, rstcs, false, true, false);
+}
+
 /**
  * reset_control_get_shared - Lookup and obtain a shared reference to a
  *                            reset controller.
@@ -204,6 +342,23 @@ static inline struct reset_control *reset_control_get_shared(
 	return __reset_control_get(dev, id, 0, true, false, false);
 }
 
+/**
+ * reset_control_bulk_get_shared - Lookup and obtain shared references to
+ *                                 multiple reset controllers.
+ * @dev: device to be reset by the controller
+ * @num_rstcs: number of entries in rstcs array
+ * @rstcs: array of struct reset_control_bulk_data with reset line names set
+ *
+ * Fills the rstcs array with pointers to shared reset controls and
+ * returns 0, or a negative errno on failure.
+ */
+static inline int __must_check
+reset_control_bulk_get_shared(struct device *dev, int num_rstcs,
+			      struct reset_control_bulk_data *rstcs)
+{
+	return __reset_control_bulk_get(dev, num_rstcs, rstcs, true, false, false);
+}
+
 /**
  * reset_control_get_optional_exclusive - optional reset_control_get_exclusive()
  * @dev: device to be reset by the controller
@@ -221,6 +376,26 @@ static inline struct reset_control *reset_control_get_optional_exclusive(
 	return __reset_control_get(dev, id, 0, false, true, true);
 }
 
+/**
+ * reset_control_bulk_get_optional_exclusive - optional
+ *                                             reset_control_bulk_get_exclusive()
+ * @dev: device to be reset by the controller
+ * @num_rstcs: number of entries in rstcs array
+ * @rstcs: array of struct reset_control_bulk_data with reset line names set
+ *
+ * Optional variant of reset_control_bulk_get_exclusive(). If any of the
+ * requested resets are not specified in the device tree, this function sets
+ * them to NULL instead of returning an error.
+ *
+ * See reset_control_bulk_get_exclusive() for more information.
+ */
+static inline int __must_check
+reset_control_bulk_get_optional_exclusive(struct device *dev, int num_rstcs,
+					  struct reset_control_bulk_data *rstcs)
+{
+	return __reset_control_bulk_get(dev, num_rstcs, rstcs, false, true, true);
+}
+
 /**
  * reset_control_get_optional_shared - optional reset_control_get_shared()
  * @dev: device to be reset by the controller
@@ -238,6 +413,26 @@ static inline struct reset_control *reset_control_get_optional_shared(
 	return __reset_control_get(dev, id, 0, true, true, false);
 }
 
+/**
+ * reset_control_bulk_get_optional_shared - optional
+ *                                             reset_control_bulk_get_shared()
+ * @dev: device to be reset by the controller
+ * @num_rstcs: number of entries in rstcs array
+ * @rstcs: array of struct reset_control_bulk_data with reset line names set
+ *
+ * Optional variant of reset_control_bulk_get_shared(). If the requested resets
+ * are not specified in the device tree, this function sets them to NULL
+ * instead of returning an error.
+ *
+ * See reset_control_bulk_get_shared() for more information.
+ */
+static inline int __must_check
+reset_control_bulk_get_optional_shared(struct device *dev, int num_rstcs,
+				       struct reset_control_bulk_data *rstcs)
+{
+	return __reset_control_bulk_get(dev, num_rstcs, rstcs, true, true, false);
+}
+
 /**
  * of_reset_control_get_exclusive - Lookup and obtain an exclusive reference
  *                                  to a reset controller.
@@ -343,6 +538,26 @@ __must_check devm_reset_control_get_exclusive(struct device *dev,
 	return __devm_reset_control_get(dev, id, 0, false, false, true);
 }
 
+/**
+ * devm_reset_control_bulk_get_exclusive - resource managed
+ *                                         reset_control_bulk_get_exclusive()
+ * @dev: device to be reset by the controller
+ * @num_rstcs: number of entries in rstcs array
+ * @rstcs: array of struct reset_control_bulk_data with reset line names set
+ *
+ * Managed reset_control_bulk_get_exclusive(). For reset controllers returned
+ * from this function, reset_control_put() is called automatically on driver
+ * detach.
+ *
+ * See reset_control_bulk_get_exclusive() for more information.
+ */
+static inline int __must_check
+devm_reset_control_bulk_get_exclusive(struct device *dev, int num_rstcs,
+				      struct reset_control_bulk_data *rstcs)
+{
+	return __devm_reset_control_bulk_get(dev, num_rstcs, rstcs, false, false, true);
+}
+
 /**
  * devm_reset_control_get_exclusive_released - resource managed
  *                                             reset_control_get_exclusive_released()
@@ -362,6 +577,26 @@ __must_check devm_reset_control_get_exclusive_released(struct device *dev,
 	return __devm_reset_control_get(dev, id, 0, false, false, false);
 }
 
+/**
+ * devm_reset_control_bulk_get_exclusive_released - resource managed
+ *                                                  reset_control_bulk_get_exclusive_released()
+ * @dev: device to be reset by the controller
+ * @num_rstcs: number of entries in rstcs array
+ * @rstcs: array of struct reset_control_bulk_data with reset line names set
+ *
+ * Managed reset_control_bulk_get_exclusive_released(). For reset controllers
+ * returned from this function, reset_control_put() is called automatically on
+ * driver detach.
+ *
+ * See reset_control_bulk_get_exclusive_released() for more information.
+ */
+static inline int __must_check
+devm_reset_control_bulk_get_exclusive_released(struct device *dev, int num_rstcs,
+					       struct reset_control_bulk_data *rstcs)
+{
+	return __devm_reset_control_bulk_get(dev, num_rstcs, rstcs, false, false, false);
+}
+
 /**
  * devm_reset_control_get_optional_exclusive_released - resource managed
  *                                                      reset_control_get_optional_exclusive_released()
@@ -381,6 +616,26 @@ __must_check devm_reset_control_get_optional_exclusive_released(struct device *d
 	return __devm_reset_control_get(dev, id, 0, false, true, false);
 }
 
+/**
+ * devm_reset_control_bulk_get_optional_exclusive_released - resource managed
+ *                                                           reset_control_bulk_get_optional_exclusive_released()
+ * @dev: device to be reset by the controller
+ * @num_rstcs: number of entries in rstcs array
+ * @rstcs: array of struct reset_control_bulk_data with reset line names set
+ *
+ * Managed reset_control_bulk_get_optional_exclusive_released(). For reset
+ * controllers returned from this function, reset_control_put() is called
+ * automatically on driver detach.
+ *
+ * See reset_control_bulk_get_optional_exclusive_released() for more information.
+ */
+static inline int __must_check
+devm_reset_control_bulk_get_optional_exclusive_released(struct device *dev, int num_rstcs,
+							struct reset_control_bulk_data *rstcs)
+{
+	return __devm_reset_control_bulk_get(dev, num_rstcs, rstcs, false, true, false);
+}
+
 /**
  * devm_reset_control_get_shared - resource managed reset_control_get_shared()
  * @dev: device to be reset by the controller
@@ -396,6 +651,26 @@ static inline struct reset_control *devm_reset_control_get_shared(
 	return __devm_reset_control_get(dev, id, 0, true, false, false);
 }
 
+/**
+ * devm_reset_control_bulk_get_shared - resource managed
+ *                                      reset_control_bulk_get_shared()
+ * @dev: device to be reset by the controller
+ * @num_rstcs: number of entries in rstcs array
+ * @rstcs: array of struct reset_control_bulk_data with reset line names set
+ *
+ * Managed reset_control_bulk_get_shared(). For reset controllers returned
+ * from this function, reset_control_put() is called automatically on driver
+ * detach.
+ *
+ * See reset_control_bulk_get_shared() for more information.
+ */
+static inline int __must_check
+devm_reset_control_bulk_get_shared(struct device *dev, int num_rstcs,
+				   struct reset_control_bulk_data *rstcs)
+{
+	return __devm_reset_control_bulk_get(dev, num_rstcs, rstcs, true, false, false);
+}
+
 /**
  * devm_reset_control_get_optional_exclusive - resource managed
  *                                             reset_control_get_optional_exclusive()
@@ -414,6 +689,26 @@ static inline struct reset_control *devm_reset_control_get_optional_exclusive(
 	return __devm_reset_control_get(dev, id, 0, false, true, true);
 }
 
+/**
+ * devm_reset_control_bulk_get_optional_exclusive - resource managed
+ *                                                  reset_control_bulk_get_optional_exclusive()
+ * @dev: device to be reset by the controller
+ * @num_rstcs: number of entries in rstcs array
+ * @rstcs: array of struct reset_control_bulk_data with reset line names set
+ *
+ * Managed reset_control_bulk_get_optional_exclusive(). For reset controllers
+ * returned from this function, reset_control_put() is called automatically on
+ * driver detach.
+ *
+ * See reset_control_bulk_get_optional_exclusive() for more information.
+ */
+static inline int __must_check
+devm_reset_control_bulk_get_optional_exclusive(struct device *dev, int num_rstcs,
+					       struct reset_control_bulk_data *rstcs)
+{
+	return __devm_reset_control_bulk_get(dev, num_rstcs, rstcs, false, true, true);
+}
+
 /**
  * devm_reset_control_get_optional_shared - resource managed
  *                                          reset_control_get_optional_shared()
@@ -432,6 +727,26 @@ static inline struct reset_control *devm_reset_control_get_optional_shared(
 	return __devm_reset_control_get(dev, id, 0, true, true, false);
 }
 
+/**
+ * devm_reset_control_bulk_get_optional_shared - resource managed
+ *                                               reset_control_bulk_get_optional_shared()
+ * @dev: device to be reset by the controller
+ * @num_rstcs: number of entries in rstcs array
+ * @rstcs: array of struct reset_control_bulk_data with reset line names set
+ *
+ * Managed reset_control_bulk_get_optional_shared(). For reset controllers
+ * returned from this function, reset_control_put() is called automatically on
+ * driver detach.
+ *
+ * See reset_control_bulk_get_optional_shared() for more information.
+ */
+static inline int __must_check
+devm_reset_control_bulk_get_optional_shared(struct device *dev, int num_rstcs,
+					    struct reset_control_bulk_data *rstcs)
+{
+	return __devm_reset_control_bulk_get(dev, num_rstcs, rstcs, true, true, false);
+}
+
 /**
  * devm_reset_control_get_exclusive_by_index - resource managed
  *                                             reset_control_get_exclusive()
-- 
cgit v1.2.3


From 1066cfbdfa3f5c401870fad577fe63d1171a5bcd Mon Sep 17 00:00:00 2001
From: Guru Das Srinagesh <gurus@codeaurora.org>
Date: Wed, 10 Mar 2021 16:39:52 -0800
Subject: regmap-irq: Extend sub-irq to support non-fixed reg strides

Qualcomm's MFD chips have a top level interrupt status register and
sub-irqs (peripherals).  When a bit in the main status register goes
high, it means that the peripheral corresponding to that bit has an
unserviced interrupt. If the bit is not set, this means that the
corresponding peripheral does not.

Commit a2d21848d9211d ("regmap: regmap-irq: Add main status register
support") introduced the sub-irq logic that is currently applied only
when reading status registers, but not for any other functions like acking
or masking. Extend the use of sub-irq to all other functions, with two
caveats regarding the specification of offsets:

- Each member of the sub_reg_offsets array should be of length 1
- The specified offsets should be the unequal strides for each sub-irq
  device.

In QCOM's case, all the *_base registers are to be configured to the
base addresses of the first sub-irq group, with offsets of each
subsequent group calculated as a difference from these addresses.

Continuing from the example mentioned in the cover letter:

	/*
	 * Address of MISC_INT_MASK		= 0x1011
	 * Address of TEMP_ALARM_INT_MASK	= 0x2011
	 * Address of GPIO01_INT_MASK		= 0x3011
	 *
	 * Calculate offsets as:
	 * offset_0 = 0x1011 - 0x1011 = 0       (to access MISC's
	 * 					 registers)
	 * offset_1 = 0x2011 - 0x1011 = 0x1000
	 * offset_2 = 0x3011 - 0x1011 = 0x2000
	 */

	static unsigned int sub_unit0_offsets[] = {0};
	static unsigned int sub_unit1_offsets[] = {0x1000};
	static unsigned int sub_unit2_offsets[] = {0x2000};

	static struct regmap_irq_sub_irq_map chip_sub_irq_offsets[] = {
		REGMAP_IRQ_MAIN_REG_OFFSET(sub_unit0_offsets),
		REGMAP_IRQ_MAIN_REG_OFFSET(sub_unit1_offsets),
		REGMAP_IRQ_MAIN_REG_OFFSET(sub_unit2_offsets),
	};

	static struct regmap_irq_chip chip_irq_chip = {
	--------8<--------
	.not_fixed_stride = true,
	.mask_base	  = MISC_INT_MASK,
	.type_base	  = MISC_INT_TYPE,
	.ack_base	  = MISC_INT_ACK,
	.sub_reg_offsets  = chip_sub_irq_offsets,
	--------8<--------
	};

Signed-off-by: Guru Das Srinagesh <gurus@codeaurora.org>
Link: https://lore.kernel.org/r/526562423eaa58b4075362083f561841f1d6956c.1615423027.git.gurus@codeaurora.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/regmap/regmap-irq.c | 81 ++++++++++++++++++++++++++--------------
 include/linux/regmap.h           |  7 ++++
 2 files changed, 60 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/regmap/regmap-irq.c b/drivers/base/regmap/regmap-irq.c
index 19db764ffa4a..e1d8fc9ef040 100644
--- a/drivers/base/regmap/regmap-irq.c
+++ b/drivers/base/regmap/regmap-irq.c
@@ -45,6 +45,27 @@ struct regmap_irq_chip_data {
 	bool clear_status:1;
 };
 
+static int sub_irq_reg(struct regmap_irq_chip_data *data,
+		       unsigned int base_reg, int i)
+{
+	const struct regmap_irq_chip *chip = data->chip;
+	struct regmap *map = data->map;
+	struct regmap_irq_sub_irq_map *subreg;
+	unsigned int offset;
+	int reg = 0;
+
+	if (!chip->sub_reg_offsets || !chip->not_fixed_stride) {
+		/* Assume linear mapping */
+		reg = base_reg + (i * map->reg_stride * data->irq_reg_stride);
+	} else {
+		subreg = &chip->sub_reg_offsets[i];
+		offset = subreg->offset[0];
+		reg = base_reg + offset;
+	}
+
+	return reg;
+}
+
 static inline const
 struct regmap_irq *irq_to_regmap_irq(struct regmap_irq_chip_data *data,
 				     int irq)
@@ -87,8 +108,7 @@ static void regmap_irq_sync_unlock(struct irq_data *data)
 
 	if (d->clear_status) {
 		for (i = 0; i < d->chip->num_regs; i++) {
-			reg = d->chip->status_base +
-				(i * map->reg_stride * d->irq_reg_stride);
+			reg = sub_irq_reg(d, d->chip->status_base, i);
 
 			ret = regmap_read(map, reg, &val);
 			if (ret)
@@ -108,8 +128,7 @@ static void regmap_irq_sync_unlock(struct irq_data *data)
 		if (!d->chip->mask_base)
 			continue;
 
-		reg = d->chip->mask_base +
-			(i * map->reg_stride * d->irq_reg_stride);
+		reg = sub_irq_reg(d, d->chip->mask_base, i);
 		if (d->chip->mask_invert) {
 			ret = regmap_irq_update_bits(d, reg,
 					 d->mask_buf_def[i], ~d->mask_buf[i]);
@@ -136,8 +155,7 @@ static void regmap_irq_sync_unlock(struct irq_data *data)
 			dev_err(d->map->dev, "Failed to sync masks in %x\n",
 				reg);
 
-		reg = d->chip->wake_base +
-			(i * map->reg_stride * d->irq_reg_stride);
+		reg = sub_irq_reg(d, d->chip->wake_base, i);
 		if (d->wake_buf) {
 			if (d->chip->wake_invert)
 				ret = regmap_irq_update_bits(d, reg,
@@ -161,8 +179,8 @@ static void regmap_irq_sync_unlock(struct irq_data *data)
 		 * it'll be ignored in irq handler, then may introduce irq storm
 		 */
 		if (d->mask_buf[i] && (d->chip->ack_base || d->chip->use_ack)) {
-			reg = d->chip->ack_base +
-				(i * map->reg_stride * d->irq_reg_stride);
+			reg = sub_irq_reg(d, d->chip->ack_base, i);
+
 			/* some chips ack by write 0 */
 			if (d->chip->ack_invert)
 				ret = regmap_write(map, reg, ~d->mask_buf[i]);
@@ -187,8 +205,7 @@ static void regmap_irq_sync_unlock(struct irq_data *data)
 		for (i = 0; i < d->chip->num_type_reg; i++) {
 			if (!d->type_buf_def[i])
 				continue;
-			reg = d->chip->type_base +
-				(i * map->reg_stride * d->type_reg_stride);
+			reg = sub_irq_reg(d, d->chip->type_base, i);
 			if (d->chip->type_invert)
 				ret = regmap_irq_update_bits(d, reg,
 					d->type_buf_def[i], ~d->type_buf[i]);
@@ -352,8 +369,15 @@ static inline int read_sub_irq_data(struct regmap_irq_chip_data *data,
 		for (i = 0; i < subreg->num_regs; i++) {
 			unsigned int offset = subreg->offset[i];
 
-			ret = regmap_read(map, chip->status_base + offset,
-					  &data->status_buf[offset]);
+			if (chip->not_fixed_stride)
+				ret = regmap_read(map,
+						chip->status_base + offset,
+						&data->status_buf[b]);
+			else
+				ret = regmap_read(map,
+						chip->status_base + offset,
+						&data->status_buf[offset]);
+
 			if (ret)
 				break;
 		}
@@ -474,10 +498,9 @@ static irqreturn_t regmap_irq_thread(int irq, void *d)
 
 	} else {
 		for (i = 0; i < data->chip->num_regs; i++) {
-			ret = regmap_read(map, chip->status_base +
-					  (i * map->reg_stride
-					   * data->irq_reg_stride),
-					  &data->status_buf[i]);
+			unsigned int reg = sub_irq_reg(data,
+					data->chip->status_base, i);
+			ret = regmap_read(map, reg, &data->status_buf[i]);
 
 			if (ret != 0) {
 				dev_err(map->dev,
@@ -499,8 +522,8 @@ static irqreturn_t regmap_irq_thread(int irq, void *d)
 		data->status_buf[i] &= ~data->mask_buf[i];
 
 		if (data->status_buf[i] && (chip->ack_base || chip->use_ack)) {
-			reg = chip->ack_base +
-				(i * map->reg_stride * data->irq_reg_stride);
+			reg = sub_irq_reg(data, data->chip->ack_base, i);
+
 			if (chip->ack_invert)
 				ret = regmap_write(map, reg,
 						~data->status_buf[i]);
@@ -605,6 +628,12 @@ int regmap_add_irq_chip_fwnode(struct fwnode_handle *fwnode,
 			return -EINVAL;
 	}
 
+	if (chip->not_fixed_stride) {
+		for (i = 0; i < chip->num_regs; i++)
+			if (chip->sub_reg_offsets[i].num_regs != 1)
+				return -EINVAL;
+	}
+
 	if (irq_base) {
 		irq_base = irq_alloc_descs(irq_base, 0, chip->num_irqs, 0);
 		if (irq_base < 0) {
@@ -700,8 +729,8 @@ int regmap_add_irq_chip_fwnode(struct fwnode_handle *fwnode,
 		if (!chip->mask_base)
 			continue;
 
-		reg = chip->mask_base +
-			(i * map->reg_stride * d->irq_reg_stride);
+		reg = sub_irq_reg(d, d->chip->mask_base, i);
+
 		if (chip->mask_invert)
 			ret = regmap_irq_update_bits(d, reg,
 					 d->mask_buf[i], ~d->mask_buf[i]);
@@ -725,8 +754,7 @@ int regmap_add_irq_chip_fwnode(struct fwnode_handle *fwnode,
 			continue;
 
 		/* Ack masked but set interrupts */
-		reg = chip->status_base +
-			(i * map->reg_stride * d->irq_reg_stride);
+		reg = sub_irq_reg(d, d->chip->status_base, i);
 		ret = regmap_read(map, reg, &d->status_buf[i]);
 		if (ret != 0) {
 			dev_err(map->dev, "Failed to read IRQ status: %d\n",
@@ -735,8 +763,7 @@ int regmap_add_irq_chip_fwnode(struct fwnode_handle *fwnode,
 		}
 
 		if (d->status_buf[i] && (chip->ack_base || chip->use_ack)) {
-			reg = chip->ack_base +
-				(i * map->reg_stride * d->irq_reg_stride);
+			reg = sub_irq_reg(d, d->chip->ack_base, i);
 			if (chip->ack_invert)
 				ret = regmap_write(map, reg,
 					~(d->status_buf[i] & d->mask_buf[i]));
@@ -765,8 +792,7 @@ int regmap_add_irq_chip_fwnode(struct fwnode_handle *fwnode,
 	if (d->wake_buf) {
 		for (i = 0; i < chip->num_regs; i++) {
 			d->wake_buf[i] = d->mask_buf_def[i];
-			reg = chip->wake_base +
-				(i * map->reg_stride * d->irq_reg_stride);
+			reg = sub_irq_reg(d, d->chip->wake_base, i);
 
 			if (chip->wake_invert)
 				ret = regmap_irq_update_bits(d, reg,
@@ -786,8 +812,7 @@ int regmap_add_irq_chip_fwnode(struct fwnode_handle *fwnode,
 
 	if (chip->num_type_reg && !chip->type_in_mask) {
 		for (i = 0; i < chip->num_type_reg; ++i) {
-			reg = chip->type_base +
-				(i * map->reg_stride * d->type_reg_stride);
+			reg = sub_irq_reg(d, d->chip->type_base, i);
 
 			ret = regmap_read(map, reg, &d->type_buf_def[i]);
 
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index 2cc4ecd36298..18910bd809f7 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -1378,6 +1378,9 @@ struct regmap_irq_sub_irq_map {
  *		     status_base. Should contain num_regs arrays.
  *		     Can be provided for chips with more complex mapping than
  *		     1.st bit to 1.st sub-reg, 2.nd bit to 2.nd sub-reg, ...
+ *		     When used with not_fixed_stride, each one-element array
+ *		     member contains the address offset of that peripheral
+ *		     relative to the first peripheral.
  * @num_main_regs: Number of 'main status' irq registers for chips which have
  *		   main_status set.
  *
@@ -1404,6 +1407,9 @@ struct regmap_irq_sub_irq_map {
  * @clear_on_unmask: For chips with interrupts cleared on read: read the status
  *                   registers before unmasking interrupts to clear any bits
  *                   set when they were masked.
+ * @not_fixed_stride: Used when chip peripherals are not laid out with fixed
+ * 		      stride. Must be used with sub_reg_offsets containing the
+ * 		      offsets to each peripheral.
  * @runtime_pm:  Hold a runtime PM lock on the device when accessing it.
  *
  * @num_regs:    Number of registers in each control bank.
@@ -1450,6 +1456,7 @@ struct regmap_irq_chip {
 	bool type_invert:1;
 	bool type_in_mask:1;
 	bool clear_on_unmask:1;
+	bool not_fixed_stride:1;
 
 	int num_regs;
 
-- 
cgit v1.2.3


From 8ccbed2476f2a615d5045a7c5c7b459db7dd9263 Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Fri, 8 Jan 2021 12:55:49 +0200
Subject: thunderbolt: Do not re-establish XDomain DMA paths automatically

This step is actually not needed. The service drivers themselves will
handle this once they have negotiated the service up and running again
with the remote side. Also dropping this makes it easier to add support
for multiple DMA tunnels over a single XDomain connection.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
---
 drivers/thunderbolt/xdomain.c | 35 ++---------------------------------
 include/linux/thunderbolt.h   |  2 --
 2 files changed, 2 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/thunderbolt/xdomain.c b/drivers/thunderbolt/xdomain.c
index 584bb5ec06f8..a1657663a95e 100644
--- a/drivers/thunderbolt/xdomain.c
+++ b/drivers/thunderbolt/xdomain.c
@@ -946,19 +946,6 @@ static int populate_properties(struct tb_xdomain *xd,
 	return 0;
 }
 
-/* Called with @xd->lock held */
-static void tb_xdomain_restore_paths(struct tb_xdomain *xd)
-{
-	if (!xd->resume)
-		return;
-
-	xd->resume = false;
-	if (xd->transmit_path) {
-		dev_dbg(&xd->dev, "re-establishing DMA path\n");
-		tb_domain_approve_xdomain_paths(xd->tb, xd);
-	}
-}
-
 static inline struct tb_switch *tb_xdomain_parent(struct tb_xdomain *xd)
 {
 	return tb_to_switch(xd->dev.parent);
@@ -1084,16 +1071,8 @@ static void tb_xdomain_get_properties(struct work_struct *work)
 	mutex_lock(&xd->lock);
 
 	/* Only accept newer generation properties */
-	if (xd->properties && gen <= xd->property_block_gen) {
-		/*
-		 * On resume it is likely that the properties block is
-		 * not changed (unless the other end added or removed
-		 * services). However, we need to make sure the existing
-		 * DMA paths are restored properly.
-		 */
-		tb_xdomain_restore_paths(xd);
+	if (xd->properties && gen <= xd->property_block_gen)
 		goto err_free_block;
-	}
 
 	dir = tb_property_parse_dir(block, ret);
 	if (!dir) {
@@ -1118,8 +1097,6 @@ static void tb_xdomain_get_properties(struct work_struct *work)
 
 	tb_xdomain_update_link_attributes(xd);
 
-	tb_xdomain_restore_paths(xd);
-
 	mutex_unlock(&xd->lock);
 
 	kfree(block);
@@ -1332,15 +1309,7 @@ static int __maybe_unused tb_xdomain_suspend(struct device *dev)
 
 static int __maybe_unused tb_xdomain_resume(struct device *dev)
 {
-	struct tb_xdomain *xd = tb_to_xdomain(dev);
-
-	/*
-	 * Ask tb_xdomain_get_properties() restore any existing DMA
-	 * paths after properties are re-read.
-	 */
-	xd->resume = true;
-	start_handshake(xd);
-
+	start_handshake(tb_to_xdomain(dev));
 	return 0;
 }
 
diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h
index 659a0a810fa1..7ec977161f5c 100644
--- a/include/linux/thunderbolt.h
+++ b/include/linux/thunderbolt.h
@@ -185,7 +185,6 @@ void tb_unregister_property_dir(const char *key, struct tb_property_dir *dir);
  * @link_speed: Speed of the link in Gb/s
  * @link_width: Width of the link (1 or 2)
  * @is_unplugged: The XDomain is unplugged
- * @resume: The XDomain is being resumed
  * @needs_uuid: If the XDomain does not have @remote_uuid it will be
  *		queried first
  * @transmit_path: HopID which the remote end expects us to transmit
@@ -231,7 +230,6 @@ struct tb_xdomain {
 	unsigned int link_speed;
 	unsigned int link_width;
 	bool is_unplugged;
-	bool resume;
 	bool needs_uuid;
 	u16 transmit_path;
 	u16 transmit_ring;
-- 
cgit v1.2.3


From 7d3084c0b77c6c417a16fc1c5bf3bc3149d20fab Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Fri, 8 Jan 2021 14:38:24 +0200
Subject: thunderbolt: Add tb_property_copy_dir()

This function takes a deep copy of the properties. We need this in order
to support more dynamic properties per XDomain connection as required by
the USB4 inter-domain service spec.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
---
 drivers/thunderbolt/property.c | 71 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/thunderbolt.h    |  1 +
 2 files changed, 72 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/thunderbolt/property.c b/drivers/thunderbolt/property.c
index d5b0cdb8f0b1..dc555cda98e6 100644
--- a/drivers/thunderbolt/property.c
+++ b/drivers/thunderbolt/property.c
@@ -501,6 +501,77 @@ ssize_t tb_property_format_dir(const struct tb_property_dir *dir, u32 *block,
 	return ret < 0 ? ret : 0;
 }
 
+/**
+ * tb_property_copy_dir() - Take a deep copy of directory
+ * @dir: Directory to copy
+ *
+ * This function takes a deep copy of @dir and returns back the copy. In
+ * case of error returns %NULL. The resulting directory needs to be
+ * released by calling tb_property_free_dir().
+ */
+struct tb_property_dir *tb_property_copy_dir(const struct tb_property_dir *dir)
+{
+	struct tb_property *property, *p = NULL;
+	struct tb_property_dir *d;
+
+	if (!dir)
+		return NULL;
+
+	d = tb_property_create_dir(dir->uuid);
+	if (!d)
+		return NULL;
+
+	list_for_each_entry(property, &dir->properties, list) {
+		p = tb_property_alloc(property->key, property->type);
+		if (!p)
+			goto err_free;
+
+		p->length = property->length;
+
+		switch (property->type) {
+		case TB_PROPERTY_TYPE_DIRECTORY:
+			p->value.dir = tb_property_copy_dir(property->value.dir);
+			if (!p->value.dir)
+				goto err_free;
+			break;
+
+		case TB_PROPERTY_TYPE_DATA:
+			p->value.data = kmemdup(property->value.data,
+						property->length * 4,
+						GFP_KERNEL);
+			if (!p->value.data)
+				goto err_free;
+			break;
+
+		case TB_PROPERTY_TYPE_TEXT:
+			p->value.text = kzalloc(p->length * 4, GFP_KERNEL);
+			if (!p->value.text)
+				goto err_free;
+			strcpy(p->value.text, property->value.text);
+			break;
+
+		case TB_PROPERTY_TYPE_VALUE:
+			p->value.immediate = property->value.immediate;
+			break;
+
+		default:
+			break;
+		}
+
+		/* From here on @p is linked into @d's property list */
+		list_add_tail(&p->list, &d->properties);
+	}
+
+	return d;
+
+err_free:
+	/* The failing property was never linked into @d, free it here */
+	kfree(p);
+	tb_property_free_dir(d);
+
+	return NULL;
+}
+
 /**
  * tb_property_add_immediate() - Add immediate property to directory
  * @parent: Directory to add the property
diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h
index 7ec977161f5c..003a9ad29168 100644
--- a/include/linux/thunderbolt.h
+++ b/include/linux/thunderbolt.h
@@ -146,6 +146,7 @@ struct tb_property_dir *tb_property_parse_dir(const u32 *block,
 					      size_t block_len);
 ssize_t tb_property_format_dir(const struct tb_property_dir *dir, u32 *block,
 			       size_t block_len);
+struct tb_property_dir *tb_property_copy_dir(const struct tb_property_dir *dir);
 struct tb_property_dir *tb_property_create_dir(const uuid_t *uuid);
 void tb_property_free_dir(struct tb_property_dir *dir);
 int tb_property_add_immediate(struct tb_property_dir *parent, const char *key,
-- 
cgit v1.2.3


From 46b494f286812a88caba28dd0810cf3a55747431 Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Fri, 8 Jan 2021 14:57:19 +0200
Subject: thunderbolt: Add support for maxhopid XDomain property

USB4 inter-domain spec mandates that the compatible hosts expose a new
property "maxhopid" that tells the connection manager on the other side
what is the maximum supported input HopID over the connection. Since
this depends on the lane adapter the cable is connected to, it needs to
be filled in dynamically.

For this reason we take a copy of the global properties and fill them in
for each XDomain connection upon first connect, and then keep updating
the copy if the generation changes as services are being added/removed.
We also take advantage of this copy to fill in the hostname.

We also expose this maxhopid as an attribute under each XDomain device.

While there, drop the kernel-doc entry for @properties_lock, which seems
to have been left over from when the structure was originally introduced.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
---
 Documentation/ABI/testing/sysfs-bus-thunderbolt |   7 +
 drivers/thunderbolt/xdomain.c                   | 206 ++++++++++++++----------
 include/linux/thunderbolt.h                     |  19 ++-
 3 files changed, 138 insertions(+), 94 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-bus-thunderbolt b/Documentation/ABI/testing/sysfs-bus-thunderbolt
index bfa4ca6f3fc1..c41c68f64693 100644
--- a/Documentation/ABI/testing/sysfs-bus-thunderbolt
+++ b/Documentation/ABI/testing/sysfs-bus-thunderbolt
@@ -134,6 +134,13 @@ Contact:	thunderbolt-software@lists.01.org
 Description:	This attribute contains name of this device extracted from
 		the device DROM.
 
+What:		/sys/bus/thunderbolt/devices/.../maxhopid
+Date:		Jul 2021
+KernelVersion:	5.13
+Contact:	Mika Westerberg <mika.westerberg@linux.intel.com>
+Description:	Only set for XDomains. The maximum HopID the other host
+		supports as its input HopID.
+
 What:		/sys/bus/thunderbolt/devices/.../rx_speed
 Date:		Jan 2020
 KernelVersion:	5.5
diff --git a/drivers/thunderbolt/xdomain.c b/drivers/thunderbolt/xdomain.c
index ffa9cc9e0e7d..ab56757d7c24 100644
--- a/drivers/thunderbolt/xdomain.c
+++ b/drivers/thunderbolt/xdomain.c
@@ -24,6 +24,7 @@
 #define XDOMAIN_PROPERTIES_RETRIES		10
 #define XDOMAIN_PROPERTIES_CHANGED_RETRIES	10
 #define XDOMAIN_BONDING_WAIT			100  /* ms */
+#define XDOMAIN_DEFAULT_MAX_HOPID		15
 
 struct xdomain_request_work {
 	struct work_struct work;
@@ -35,13 +36,15 @@ static bool tb_xdomain_enabled = true;
 module_param_named(xdomain, tb_xdomain_enabled, bool, 0444);
 MODULE_PARM_DESC(xdomain, "allow XDomain protocol (default: true)");
 
-/* Serializes access to the properties and protocol handlers below */
+/*
+ * Serializes access to the properties and protocol handlers below. If
+ * you need to take both this lock and the struct tb_xdomain lock, take
+ * this one first.
+ */
 static DEFINE_MUTEX(xdomain_lock);
 
 /* Properties exposed to the remote domains */
 static struct tb_property_dir *xdomain_property_dir;
-static u32 *xdomain_property_block;
-static u32 xdomain_property_block_len;
 static u32 xdomain_property_block_gen;
 
 /* Additional protocol handlers */
@@ -386,8 +389,7 @@ err:
 }
 
 static int tb_xdp_properties_response(struct tb *tb, struct tb_ctl *ctl,
-	u64 route, u8 sequence, const uuid_t *src_uuid,
-	const struct tb_xdp_properties *req)
+	struct tb_xdomain *xd, u8 sequence, const struct tb_xdp_properties *req)
 {
 	struct tb_xdp_properties_response *res;
 	size_t total_size;
@@ -399,39 +401,39 @@ static int tb_xdp_properties_response(struct tb *tb, struct tb_ctl *ctl,
 	 * protocol supports forwarding, though which we might add
 	 * support later on.
 	 */
-	if (!uuid_equal(src_uuid, &req->dst_uuid)) {
-		tb_xdp_error_response(ctl, route, sequence,
+	if (!uuid_equal(xd->local_uuid, &req->dst_uuid)) {
+		tb_xdp_error_response(ctl, xd->route, sequence,
 				      ERROR_UNKNOWN_DOMAIN);
 		return 0;
 	}
 
-	mutex_lock(&xdomain_lock);
+	mutex_lock(&xd->lock);
 
-	if (req->offset >= xdomain_property_block_len) {
-		mutex_unlock(&xdomain_lock);
+	if (req->offset >= xd->local_property_block_len) {
+		mutex_unlock(&xd->lock);
 		return -EINVAL;
 	}
 
-	len = xdomain_property_block_len - req->offset;
+	len = xd->local_property_block_len - req->offset;
 	len = min_t(u16, len, TB_XDP_PROPERTIES_MAX_DATA_LENGTH);
 	total_size = sizeof(*res) + len * 4;
 
 	res = kzalloc(total_size, GFP_KERNEL);
 	if (!res) {
-		mutex_unlock(&xdomain_lock);
+		mutex_unlock(&xd->lock);
 		return -ENOMEM;
 	}
 
-	tb_xdp_fill_header(&res->hdr, route, sequence, PROPERTIES_RESPONSE,
+	tb_xdp_fill_header(&res->hdr, xd->route, sequence, PROPERTIES_RESPONSE,
 			   total_size);
-	res->generation = xdomain_property_block_gen;
-	res->data_length = xdomain_property_block_len;
+	res->generation = xd->local_property_block_gen;
+	res->data_length = xd->local_property_block_len;
 	res->offset = req->offset;
-	uuid_copy(&res->src_uuid, src_uuid);
+	uuid_copy(&res->src_uuid, xd->local_uuid);
 	uuid_copy(&res->dst_uuid, &req->src_uuid);
-	memcpy(res->data, &xdomain_property_block[req->offset], len * 4);
+	memcpy(res->data, &xd->local_property_block[req->offset], len * 4);
 
-	mutex_unlock(&xdomain_lock);
+	mutex_unlock(&xd->lock);
 
 	ret = __tb_xdomain_response(ctl, res, total_size,
 				    TB_CFG_PKG_XDOMAIN_RESP);
@@ -513,52 +515,63 @@ void tb_unregister_protocol_handler(struct tb_protocol_handler *handler)
 }
 EXPORT_SYMBOL_GPL(tb_unregister_protocol_handler);
 
-static int rebuild_property_block(void)
+static void update_property_block(struct tb_xdomain *xd)
 {
-	u32 *block, len;
-	int ret;
-
-	ret = tb_property_format_dir(xdomain_property_dir, NULL, 0);
-	if (ret < 0)
-		return ret;
-
-	len = ret;
-
-	block = kcalloc(len, sizeof(u32), GFP_KERNEL);
-	if (!block)
-		return -ENOMEM;
+	mutex_lock(&xdomain_lock);
+	mutex_lock(&xd->lock);
+	/*
+	 * If the local property block is not up-to-date, rebuild it now
+	 * based on the global property template.
+	 */
+	if (!xd->local_property_block ||
+	    xd->local_property_block_gen < xdomain_property_block_gen) {
+		struct tb_property_dir *dir;
+		int ret, block_len;
+		u32 *block;
+
+		dir = tb_property_copy_dir(xdomain_property_dir);
+		if (!dir) {
+			dev_warn(&xd->dev, "failed to copy properties\n");
+			goto out_unlock;
+		}
 
-	ret = tb_property_format_dir(xdomain_property_dir, block, len);
-	if (ret) {
-		kfree(block);
-		return ret;
-	}
+		/* Fill in non-static properties now */
+		tb_property_add_text(dir, "deviceid", utsname()->nodename);
+		tb_property_add_immediate(dir, "maxhopid", xd->local_max_hopid);
 
-	kfree(xdomain_property_block);
-	xdomain_property_block = block;
-	xdomain_property_block_len = len;
-	xdomain_property_block_gen++;
+		ret = tb_property_format_dir(dir, NULL, 0);
+		if (ret < 0) {
+			dev_warn(&xd->dev, "local property block creation failed\n");
+			tb_property_free_dir(dir);
+			goto out_unlock;
+		}
 
-	return 0;
-}
+		block_len = ret;
+		block = kcalloc(block_len, sizeof(*block), GFP_KERNEL);
+		if (!block) {
+			tb_property_free_dir(dir);
+			goto out_unlock;
+		}
 
-static void finalize_property_block(void)
-{
-	const struct tb_property *nodename;
+		ret = tb_property_format_dir(dir, block, block_len);
+		if (ret) {
+			dev_warn(&xd->dev, "property block generation failed\n");
+			tb_property_free_dir(dir);
+			kfree(block);
+			goto out_unlock;
+		}
 
-	/*
-	 * On first XDomain connection we set up the the system
-	 * nodename. This delayed here because userspace may not have it
-	 * set when the driver is first probed.
-	 */
-	mutex_lock(&xdomain_lock);
-	nodename = tb_property_find(xdomain_property_dir, "deviceid",
-				    TB_PROPERTY_TYPE_TEXT);
-	if (!nodename) {
-		tb_property_add_text(xdomain_property_dir, "deviceid",
-				     utsname()->nodename);
-		rebuild_property_block();
+		tb_property_free_dir(dir);
+		/* Release the previous block */
+		kfree(xd->local_property_block);
+		/* Assign new one */
+		xd->local_property_block = block;
+		xd->local_property_block_len = block_len;
+		xd->local_property_block_gen = xdomain_property_block_gen;
 	}
+
+out_unlock:
+	mutex_unlock(&xd->lock);
 	mutex_unlock(&xdomain_lock);
 }
 
@@ -569,6 +582,7 @@ static void tb_xdp_handle_request(struct work_struct *work)
 	const struct tb_xdomain_header *xhdr = &pkg->xd_hdr;
 	struct tb *tb = xw->tb;
 	struct tb_ctl *ctl = tb->ctl;
+	struct tb_xdomain *xd;
 	const uuid_t *uuid;
 	int ret = 0;
 	u32 sequence;
@@ -590,19 +604,21 @@ static void tb_xdp_handle_request(struct work_struct *work)
 		goto out;
 	}
 
-	finalize_property_block();
-
 	tb_dbg(tb, "%llx: received XDomain request %#x\n", route, pkg->type);
 
+	xd = tb_xdomain_find_by_route_locked(tb, route);
+	if (xd)
+		update_property_block(xd);
+
 	switch (pkg->type) {
 	case PROPERTIES_REQUEST:
-		ret = tb_xdp_properties_response(tb, ctl, route, sequence, uuid,
-			(const struct tb_xdp_properties *)pkg);
+		if (xd) {
+			ret = tb_xdp_properties_response(tb, ctl, xd, sequence,
+				(const struct tb_xdp_properties *)pkg);
+		}
 		break;
 
-	case PROPERTIES_CHANGED_REQUEST: {
-		struct tb_xdomain *xd;
-
+	case PROPERTIES_CHANGED_REQUEST:
 		ret = tb_xdp_properties_changed_response(ctl, route, sequence);
 
 		/*
@@ -610,17 +626,11 @@ static void tb_xdp_handle_request(struct work_struct *work)
 		 * the xdomain related to this connection as well in
 		 * case there is a change in services it offers.
 		 */
-		xd = tb_xdomain_find_by_route_locked(tb, route);
-		if (xd) {
-			if (device_is_registered(&xd->dev)) {
-				queue_delayed_work(tb->wq, &xd->get_properties_work,
-						   msecs_to_jiffies(50));
-			}
-			tb_xdomain_put(xd);
+		if (xd && device_is_registered(&xd->dev)) {
+			queue_delayed_work(tb->wq, &xd->get_properties_work,
+					   msecs_to_jiffies(50));
 		}
-
 		break;
-	}
 
 	case UUID_REQUEST_OLD:
 	case UUID_REQUEST:
@@ -633,6 +643,8 @@ static void tb_xdp_handle_request(struct work_struct *work)
 		break;
 	}
 
+	tb_xdomain_put(xd);
+
 	if (ret) {
 		tb_warn(tb, "failed to send XDomain response for %#x\n",
 			pkg->type);
@@ -814,7 +826,7 @@ static int remove_missing_service(struct device *dev, void *data)
 	if (!svc)
 		return 0;
 
-	if (!tb_property_find(xd->properties, svc->key,
+	if (!tb_property_find(xd->remote_properties, svc->key,
 			      TB_PROPERTY_TYPE_DIRECTORY))
 		device_unregister(dev);
 
@@ -874,7 +886,7 @@ static void enumerate_services(struct tb_xdomain *xd)
 	device_for_each_child_reverse(&xd->dev, xd, remove_missing_service);
 
 	/* Then re-enumerate properties creating new services as we go */
-	tb_property_for_each(xd->properties, p) {
+	tb_property_for_each(xd->remote_properties, p) {
 		if (p->type != TB_PROPERTY_TYPE_DIRECTORY)
 			continue;
 
@@ -931,6 +943,14 @@ static int populate_properties(struct tb_xdomain *xd,
 		return -EINVAL;
 	xd->vendor = p->value.immediate;
 
+	p = tb_property_find(dir, "maxhopid", TB_PROPERTY_TYPE_VALUE);
+	/*
+	 * USB4 inter-domain spec suggests using 15 as HopID if the
+	 * other end does not announce it in a property. This is for
+	 * TBT3 compatibility.
+	 */
+	xd->remote_max_hopid = p ? p->value.immediate : XDOMAIN_DEFAULT_MAX_HOPID;
+
 	kfree(xd->device_name);
 	xd->device_name = NULL;
 	kfree(xd->vendor_name);
@@ -1072,7 +1092,7 @@ static void tb_xdomain_get_properties(struct work_struct *work)
 	mutex_lock(&xd->lock);
 
 	/* Only accept newer generation properties */
-	if (xd->properties && gen <= xd->property_block_gen)
+	if (xd->remote_properties && gen <= xd->remote_property_block_gen)
 		goto err_free_block;
 
 	dir = tb_property_parse_dir(block, ret);
@@ -1088,13 +1108,13 @@ static void tb_xdomain_get_properties(struct work_struct *work)
 	}
 
 	/* Release the existing one */
-	if (xd->properties) {
-		tb_property_free_dir(xd->properties);
+	if (xd->remote_properties) {
+		tb_property_free_dir(xd->remote_properties);
 		update = true;
 	}
 
-	xd->properties = dir;
-	xd->property_block_gen = gen;
+	xd->remote_properties = dir;
+	xd->remote_property_block_gen = gen;
 
 	tb_xdomain_update_link_attributes(xd);
 
@@ -1180,6 +1200,15 @@ device_name_show(struct device *dev, struct device_attribute *attr, char *buf)
 }
 static DEVICE_ATTR_RO(device_name);
 
+static ssize_t maxhopid_show(struct device *dev, struct device_attribute *attr,
+			     char *buf)
+{
+	struct tb_xdomain *xd = container_of(dev, struct tb_xdomain, dev);
+
+	return sprintf(buf, "%d\n", xd->remote_max_hopid);
+}
+static DEVICE_ATTR_RO(maxhopid);
+
 static ssize_t vendor_show(struct device *dev, struct device_attribute *attr,
 			   char *buf)
 {
@@ -1238,6 +1267,7 @@ static DEVICE_ATTR(tx_lanes, 0444, lanes_show, NULL);
 static struct attribute *xdomain_attrs[] = {
 	&dev_attr_device.attr,
 	&dev_attr_device_name.attr,
+	&dev_attr_maxhopid.attr,
 	&dev_attr_rx_lanes.attr,
 	&dev_attr_rx_speed.attr,
 	&dev_attr_tx_lanes.attr,
@@ -1263,7 +1293,8 @@ static void tb_xdomain_release(struct device *dev)
 
 	put_device(xd->dev.parent);
 
-	tb_property_free_dir(xd->properties);
+	kfree(xd->local_property_block);
+	tb_property_free_dir(xd->remote_properties);
 	ida_destroy(&xd->service_ids);
 
 	kfree(xd->local_uuid);
@@ -1355,6 +1386,7 @@ struct tb_xdomain *tb_xdomain_alloc(struct tb *tb, struct device *parent,
 
 	xd->tb = tb;
 	xd->route = route;
+	xd->local_max_hopid = down->config.max_in_hop_id;
 	ida_init(&xd->service_ids);
 	mutex_init(&xd->lock);
 	INIT_DELAYED_WORK(&xd->get_uuid_work, tb_xdomain_get_uuid);
@@ -1824,11 +1856,7 @@ int tb_register_property_dir(const char *key, struct tb_property_dir *dir)
 	if (ret)
 		goto err_unlock;
 
-	ret = rebuild_property_block();
-	if (ret) {
-		remove_directory(key, dir);
-		goto err_unlock;
-	}
+	xdomain_property_block_gen++;
 
 	mutex_unlock(&xdomain_lock);
 	update_all_xdomains();
@@ -1854,7 +1882,7 @@ void tb_unregister_property_dir(const char *key, struct tb_property_dir *dir)
 
 	mutex_lock(&xdomain_lock);
 	if (remove_directory(key, dir))
-		ret = rebuild_property_block();
+		xdomain_property_block_gen++;
 	mutex_unlock(&xdomain_lock);
 
 	if (!ret)
@@ -1873,7 +1901,8 @@ int tb_xdomain_init(void)
 	 * directories. Those will be added by service drivers
 	 * themselves when they are loaded.
 	 *
-	 * We also add node name later when first connection is made.
+	 * Rest of the properties are filled dynamically based on these
+	 * when the P2P connection is made.
 	 */
 	tb_property_add_immediate(xdomain_property_dir, "vendorid",
 				  PCI_VENDOR_ID_INTEL);
@@ -1887,6 +1916,5 @@ int tb_xdomain_init(void)
 
 void tb_xdomain_exit(void)
 {
-	kfree(xdomain_property_block);
 	tb_property_free_dir(xdomain_property_dir);
 }
diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h
index 003a9ad29168..3e0ce654d60c 100644
--- a/include/linux/thunderbolt.h
+++ b/include/linux/thunderbolt.h
@@ -180,6 +180,8 @@ void tb_unregister_property_dir(const char *key, struct tb_property_dir *dir);
  * @route: Route string the other domain can be reached
  * @vendor: Vendor ID of the remote domain
  * @device: Device ID of the demote domain
+ * @local_max_hopid: Maximum input HopID of this host
+ * @remote_max_hopid: Maximum input HopID of the remote host
  * @lock: Lock to serialize access to the following fields of this structure
  * @vendor_name: Name of the vendor (or %NULL if not known)
  * @device_name: Name of the device (or %NULL if not known)
@@ -193,9 +195,11 @@ void tb_unregister_property_dir(const char *key, struct tb_property_dir *dir);
  * @receive_path: HopID which we expect the remote end to transmit
  * @receive_ring: Local ring (hop) where incoming packets arrive
  * @service_ids: Used to generate IDs for the services
- * @properties: Properties exported by the remote domain
- * @property_block_gen: Generation of @properties
- * @properties_lock: Lock protecting @properties.
+ * @local_property_block: Local block of properties
+ * @local_property_block_gen: Generation of @local_property_block
+ * @local_property_block_len: Length of the @local_property_block in dwords
+ * @remote_properties: Properties exported by the remote domain
+ * @remote_property_block_gen: Generation of @remote_properties
  * @get_uuid_work: Work used to retrieve @remote_uuid
  * @uuid_retries: Number of times left @remote_uuid is requested before
  *		  giving up
@@ -225,6 +229,8 @@ struct tb_xdomain {
 	u64 route;
 	u16 vendor;
 	u16 device;
+	unsigned int local_max_hopid;
+	unsigned int remote_max_hopid;
 	struct mutex lock;
 	const char *vendor_name;
 	const char *device_name;
@@ -237,8 +243,11 @@ struct tb_xdomain {
 	u16 receive_path;
 	u16 receive_ring;
 	struct ida service_ids;
-	struct tb_property_dir *properties;
-	u32 property_block_gen;
+	u32 *local_property_block;
+	u32 local_property_block_gen;
+	u32 local_property_block_len;
+	struct tb_property_dir *remote_properties;
+	u32 remote_property_block_gen;
 	struct delayed_work get_uuid_work;
 	int uuid_retries;
 	struct delayed_work get_properties_work;
-- 
cgit v1.2.3


From 180b0689425c6fb2b35e69a3316ee38371a782df Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Fri, 8 Jan 2021 16:25:39 +0200
Subject: thunderbolt: Allow multiple DMA tunnels over a single XDomain
 connection

So far we have had an artificial limitation of a single DMA tunnel
per XDomain connection. However, hardware-wise there is no such limit
and a software based connection manager can take advantage of all the
DMA rings available on the host to establish tunnels.

For this reason make tb_xdomain_[enable|disable]_paths() take the DMA
ring and HopID as parameters instead of storing them in struct
tb_xdomain. We also add API functions to allocate input and output
HopIDs of the XDomain connection that the service drivers can use
instead of hard-coding them.

Also convert the two existing service drivers over to this API.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
---
 drivers/net/thunderbolt.c      |  49 +++++++++++----
 drivers/thunderbolt/dma_test.c |  35 +++++++++--
 drivers/thunderbolt/domain.c   |  24 +++++--
 drivers/thunderbolt/icm.c      |  32 ++++++----
 drivers/thunderbolt/tb.c       |  48 ++++++++------
 drivers/thunderbolt/tb.h       |  16 +++--
 drivers/thunderbolt/tunnel.c   |  82 ++++++++++++++++++++----
 drivers/thunderbolt/tunnel.h   |   8 ++-
 drivers/thunderbolt/xdomain.c  | 139 ++++++++++++++++++++++++++++-------------
 include/linux/thunderbolt.h    |  32 ++++++----
 10 files changed, 340 insertions(+), 125 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/thunderbolt.c b/drivers/net/thunderbolt.c
index ed3743dc62b9..5c9ec91b6e78 100644
--- a/drivers/net/thunderbolt.c
+++ b/drivers/net/thunderbolt.c
@@ -28,7 +28,6 @@
 #define TBNET_LOGOUT_TIMEOUT	100
 
 #define TBNET_RING_SIZE		256
-#define TBNET_LOCAL_PATH	0xf
 #define TBNET_LOGIN_RETRIES	60
 #define TBNET_LOGOUT_RETRIES	5
 #define TBNET_MATCH_FRAGS_ID	BIT(1)
@@ -154,8 +153,8 @@ struct tbnet_ring {
  * @login_sent: ThunderboltIP login message successfully sent
  * @login_received: ThunderboltIP login message received from the remote
  *		    host
- * @transmit_path: HopID the other end needs to use building the
- *		   opposite side path.
+ * @local_transmit_path: HopID we are using to send out packets
+ * @remote_transmit_path: HopID the other end is using to send packets to us
  * @connection_lock: Lock serializing access to @login_sent,
  *		     @login_received and @transmit_path.
  * @login_retries: Number of login retries currently done
@@ -184,7 +183,8 @@ struct tbnet {
 	atomic_t command_id;
 	bool login_sent;
 	bool login_received;
-	u32 transmit_path;
+	int local_transmit_path;
+	int remote_transmit_path;
 	struct mutex connection_lock;
 	int login_retries;
 	struct delayed_work login_work;
@@ -257,7 +257,7 @@ static int tbnet_login_request(struct tbnet *net, u8 sequence)
 			  atomic_inc_return(&net->command_id));
 
 	request.proto_version = TBIP_LOGIN_PROTO_VERSION;
-	request.transmit_path = TBNET_LOCAL_PATH;
+	request.transmit_path = net->local_transmit_path;
 
 	return tb_xdomain_request(xd, &request, sizeof(request),
 				  TB_CFG_PKG_XDOMAIN_RESP, &reply,
@@ -364,10 +364,10 @@ static void tbnet_tear_down(struct tbnet *net, bool send_logout)
 	mutex_lock(&net->connection_lock);
 
 	if (net->login_sent && net->login_received) {
-		int retries = TBNET_LOGOUT_RETRIES;
+		int ret, retries = TBNET_LOGOUT_RETRIES;
 
 		while (send_logout && retries-- > 0) {
-			int ret = tbnet_logout_request(net);
+			ret = tbnet_logout_request(net);
 			if (ret != -ETIMEDOUT)
 				break;
 		}
@@ -377,8 +377,16 @@ static void tbnet_tear_down(struct tbnet *net, bool send_logout)
 		tbnet_free_buffers(&net->rx_ring);
 		tbnet_free_buffers(&net->tx_ring);
 
-		if (tb_xdomain_disable_paths(net->xd))
+		ret = tb_xdomain_disable_paths(net->xd,
+					       net->local_transmit_path,
+					       net->rx_ring.ring->hop,
+					       net->remote_transmit_path,
+					       net->tx_ring.ring->hop);
+		if (ret)
 			netdev_warn(net->dev, "failed to disable DMA paths\n");
+
+		tb_xdomain_release_in_hopid(net->xd, net->remote_transmit_path);
+		net->remote_transmit_path = 0;
 	}
 
 	net->login_retries = 0;
@@ -424,7 +432,7 @@ static int tbnet_handle_packet(const void *buf, size_t size, void *data)
 		if (!ret) {
 			mutex_lock(&net->connection_lock);
 			net->login_received = true;
-			net->transmit_path = pkg->transmit_path;
+			net->remote_transmit_path = pkg->transmit_path;
 
 			/* If we reached the number of max retries or
 			 * previous logout, schedule another round of
@@ -597,12 +605,18 @@ static void tbnet_connected_work(struct work_struct *work)
 	if (!connected)
 		return;
 
+	ret = tb_xdomain_alloc_in_hopid(net->xd, net->remote_transmit_path);
+	if (ret != net->remote_transmit_path) {
+		netdev_err(net->dev, "failed to allocate Rx HopID\n");
+		return;
+	}
+
 	/* Both logins successful so enable the high-speed DMA paths and
 	 * start the network device queue.
 	 */
-	ret = tb_xdomain_enable_paths(net->xd, TBNET_LOCAL_PATH,
+	ret = tb_xdomain_enable_paths(net->xd, net->local_transmit_path,
 				      net->rx_ring.ring->hop,
-				      net->transmit_path,
+				      net->remote_transmit_path,
 				      net->tx_ring.ring->hop);
 	if (ret) {
 		netdev_err(net->dev, "failed to enable DMA paths\n");
@@ -629,6 +643,7 @@ err_free_rx_buffers:
 err_stop_rings:
 	tb_ring_stop(net->rx_ring.ring);
 	tb_ring_stop(net->tx_ring.ring);
+	tb_xdomain_release_in_hopid(net->xd, net->remote_transmit_path);
 }
 
 static void tbnet_login_work(struct work_struct *work)
@@ -851,6 +866,7 @@ static int tbnet_open(struct net_device *dev)
 	struct tb_xdomain *xd = net->xd;
 	u16 sof_mask, eof_mask;
 	struct tb_ring *ring;
+	int hopid;
 
 	netif_carrier_off(dev);
 
@@ -862,6 +878,15 @@ static int tbnet_open(struct net_device *dev)
 	}
 	net->tx_ring.ring = ring;
 
+	hopid = tb_xdomain_alloc_out_hopid(xd, -1);
+	if (hopid < 0) {
+		netdev_err(dev, "failed to allocate Tx HopID\n");
+		tb_ring_free(net->tx_ring.ring);
+		net->tx_ring.ring = NULL;
+		return hopid;
+	}
+	net->local_transmit_path = hopid;
+
 	sof_mask = BIT(TBIP_PDF_FRAME_START);
 	eof_mask = BIT(TBIP_PDF_FRAME_END);
 
@@ -893,6 +918,8 @@ static int tbnet_stop(struct net_device *dev)
 
 	tb_ring_free(net->rx_ring.ring);
 	net->rx_ring.ring = NULL;
+
+	tb_xdomain_release_out_hopid(net->xd, net->local_transmit_path);
 	tb_ring_free(net->tx_ring.ring);
 	net->tx_ring.ring = NULL;
 
diff --git a/drivers/thunderbolt/dma_test.c b/drivers/thunderbolt/dma_test.c
index 6debaf5a6604..3bedecb236e0 100644
--- a/drivers/thunderbolt/dma_test.c
+++ b/drivers/thunderbolt/dma_test.c
@@ -13,7 +13,6 @@
 #include <linux/sizes.h>
 #include <linux/thunderbolt.h>
 
-#define DMA_TEST_HOPID			8
 #define DMA_TEST_TX_RING_SIZE		64
 #define DMA_TEST_RX_RING_SIZE		256
 #define DMA_TEST_FRAME_SIZE		SZ_4K
@@ -72,7 +71,9 @@ static const char * const dma_test_result_names[] = {
  * @svc: XDomain service the driver is bound to
  * @xd: XDomain the service belongs to
  * @rx_ring: Software ring holding RX frames
+ * @rx_hopid: HopID used for receiving frames
  * @tx_ring: Software ring holding TX frames
+ * @tx_hopid: HopID used for sending fames
  * @packets_to_send: Number of packets to send
  * @packets_to_receive: Number of packets to receive
  * @packets_sent: Actual number of packets sent
@@ -92,7 +93,9 @@ struct dma_test {
 	const struct tb_service *svc;
 	struct tb_xdomain *xd;
 	struct tb_ring *rx_ring;
+	int rx_hopid;
 	struct tb_ring *tx_ring;
+	int tx_hopid;
 	unsigned int packets_to_send;
 	unsigned int packets_to_receive;
 	unsigned int packets_sent;
@@ -119,10 +122,12 @@ static void *dma_test_pattern;
 static void dma_test_free_rings(struct dma_test *dt)
 {
 	if (dt->rx_ring) {
+		tb_xdomain_release_in_hopid(dt->xd, dt->rx_hopid);
 		tb_ring_free(dt->rx_ring);
 		dt->rx_ring = NULL;
 	}
 	if (dt->tx_ring) {
+		tb_xdomain_release_out_hopid(dt->xd, dt->tx_hopid);
 		tb_ring_free(dt->tx_ring);
 		dt->tx_ring = NULL;
 	}
@@ -151,6 +156,14 @@ static int dma_test_start_rings(struct dma_test *dt)
 
 		dt->tx_ring = ring;
 		e2e_tx_hop = ring->hop;
+
+		ret = tb_xdomain_alloc_out_hopid(xd, -1);
+		if (ret < 0) {
+			dma_test_free_rings(dt);
+			return ret;
+		}
+
+		dt->tx_hopid = ret;
 	}
 
 	if (dt->packets_to_receive) {
@@ -168,11 +181,19 @@ static int dma_test_start_rings(struct dma_test *dt)
 		}
 
 		dt->rx_ring = ring;
+
+		ret = tb_xdomain_alloc_in_hopid(xd, -1);
+		if (ret < 0) {
+			dma_test_free_rings(dt);
+			return ret;
+		}
+
+		dt->rx_hopid = ret;
 	}
 
-	ret = tb_xdomain_enable_paths(dt->xd, DMA_TEST_HOPID,
+	ret = tb_xdomain_enable_paths(dt->xd, dt->tx_hopid,
 				      dt->tx_ring ? dt->tx_ring->hop : 0,
-				      DMA_TEST_HOPID,
+				      dt->rx_hopid,
 				      dt->rx_ring ? dt->rx_ring->hop : 0);
 	if (ret) {
 		dma_test_free_rings(dt);
@@ -189,12 +210,18 @@ static int dma_test_start_rings(struct dma_test *dt)
 
 static void dma_test_stop_rings(struct dma_test *dt)
 {
+	int ret;
+
 	if (dt->rx_ring)
 		tb_ring_stop(dt->rx_ring);
 	if (dt->tx_ring)
 		tb_ring_stop(dt->tx_ring);
 
-	if (tb_xdomain_disable_paths(dt->xd))
+	ret = tb_xdomain_disable_paths(dt->xd, dt->tx_hopid,
+				       dt->tx_ring ? dt->tx_ring->hop : 0,
+				       dt->rx_hopid,
+				       dt->rx_ring ? dt->rx_ring->hop : 0);
+	if (ret)
 		dev_warn(&dt->svc->dev, "failed to disable DMA paths\n");
 
 	dma_test_free_rings(dt);
diff --git a/drivers/thunderbolt/domain.c b/drivers/thunderbolt/domain.c
index 039486b61b6a..a7d83eec3d15 100644
--- a/drivers/thunderbolt/domain.c
+++ b/drivers/thunderbolt/domain.c
@@ -791,6 +791,10 @@ int tb_domain_disconnect_pcie_paths(struct tb *tb)
  * tb_domain_approve_xdomain_paths() - Enable DMA paths for XDomain
  * @tb: Domain enabling the DMA paths
  * @xd: XDomain DMA paths are created to
+ * @transmit_path: HopID we are using to send out packets
+ * @transmit_ring: DMA ring used to send out packets
+ * @receive_path: HopID the other end is using to send packets to us
+ * @receive_ring: DMA ring used to receive packets from @receive_path
  *
  * Calls connection manager specific method to enable DMA paths to the
  * XDomain in question.
@@ -799,18 +803,25 @@ int tb_domain_disconnect_pcie_paths(struct tb *tb)
  * particular returns %-ENOTSUPP if the connection manager
  * implementation does not support XDomains.
  */
-int tb_domain_approve_xdomain_paths(struct tb *tb, struct tb_xdomain *xd)
+int tb_domain_approve_xdomain_paths(struct tb *tb, struct tb_xdomain *xd,
+				    int transmit_path, int transmit_ring,
+				    int receive_path, int receive_ring)
 {
 	if (!tb->cm_ops->approve_xdomain_paths)
 		return -ENOTSUPP;
 
-	return tb->cm_ops->approve_xdomain_paths(tb, xd);
+	return tb->cm_ops->approve_xdomain_paths(tb, xd, transmit_path,
+			transmit_ring, receive_path, receive_ring);
 }
 
 /**
  * tb_domain_disconnect_xdomain_paths() - Disable DMA paths for XDomain
  * @tb: Domain disabling the DMA paths
  * @xd: XDomain whose DMA paths are disconnected
+ * @transmit_path: HopID we are using to send out packets
+ * @transmit_ring: DMA ring used to send out packets
+ * @receive_path: HopID the other end is using to send packets to us
+ * @receive_ring: DMA ring used to receive packets from @receive_path
  *
  * Calls connection manager specific method to disconnect DMA paths to
  * the XDomain in question.
@@ -819,12 +830,15 @@ int tb_domain_approve_xdomain_paths(struct tb *tb, struct tb_xdomain *xd)
  * particular returns %-ENOTSUPP if the connection manager
  * implementation does not support XDomains.
  */
-int tb_domain_disconnect_xdomain_paths(struct tb *tb, struct tb_xdomain *xd)
+int tb_domain_disconnect_xdomain_paths(struct tb *tb, struct tb_xdomain *xd,
+				       int transmit_path, int transmit_ring,
+				       int receive_path, int receive_ring)
 {
 	if (!tb->cm_ops->disconnect_xdomain_paths)
 		return -ENOTSUPP;
 
-	return tb->cm_ops->disconnect_xdomain_paths(tb, xd);
+	return tb->cm_ops->disconnect_xdomain_paths(tb, xd, transmit_path,
+			transmit_ring, receive_path, receive_ring);
 }
 
 static int disconnect_xdomain(struct device *dev, void *data)
@@ -835,7 +849,7 @@ static int disconnect_xdomain(struct device *dev, void *data)
 
 	xd = tb_to_xdomain(dev);
 	if (xd && xd->tb == tb)
-		ret = tb_xdomain_disable_paths(xd);
+		ret = tb_xdomain_disable_all_paths(xd);
 
 	return ret;
 }
diff --git a/drivers/thunderbolt/icm.c b/drivers/thunderbolt/icm.c
index c111b946c64d..2f30b816705a 100644
--- a/drivers/thunderbolt/icm.c
+++ b/drivers/thunderbolt/icm.c
@@ -557,7 +557,9 @@ static int icm_fr_challenge_switch_key(struct tb *tb, struct tb_switch *sw,
 	return 0;
 }
 
-static int icm_fr_approve_xdomain_paths(struct tb *tb, struct tb_xdomain *xd)
+static int icm_fr_approve_xdomain_paths(struct tb *tb, struct tb_xdomain *xd,
+					int transmit_path, int transmit_ring,
+					int receive_path, int receive_ring)
 {
 	struct icm_fr_pkg_approve_xdomain_response reply;
 	struct icm_fr_pkg_approve_xdomain request;
@@ -568,10 +570,10 @@ static int icm_fr_approve_xdomain_paths(struct tb *tb, struct tb_xdomain *xd)
 	request.link_info = xd->depth << ICM_LINK_INFO_DEPTH_SHIFT | xd->link;
 	memcpy(&request.remote_uuid, xd->remote_uuid, sizeof(*xd->remote_uuid));
 
-	request.transmit_path = xd->transmit_path;
-	request.transmit_ring = xd->transmit_ring;
-	request.receive_path = xd->receive_path;
-	request.receive_ring = xd->receive_ring;
+	request.transmit_path = transmit_path;
+	request.transmit_ring = transmit_ring;
+	request.receive_path = receive_path;
+	request.receive_ring = receive_ring;
 
 	memset(&reply, 0, sizeof(reply));
 	ret = icm_request(tb, &request, sizeof(request), &reply, sizeof(reply),
@@ -585,7 +587,9 @@ static int icm_fr_approve_xdomain_paths(struct tb *tb, struct tb_xdomain *xd)
 	return 0;
 }
 
-static int icm_fr_disconnect_xdomain_paths(struct tb *tb, struct tb_xdomain *xd)
+static int icm_fr_disconnect_xdomain_paths(struct tb *tb, struct tb_xdomain *xd,
+					   int transmit_path, int transmit_ring,
+					   int receive_path, int receive_ring)
 {
 	u8 phy_port;
 	u8 cmd;
@@ -1122,7 +1126,9 @@ static int icm_tr_challenge_switch_key(struct tb *tb, struct tb_switch *sw,
 	return 0;
 }
 
-static int icm_tr_approve_xdomain_paths(struct tb *tb, struct tb_xdomain *xd)
+static int icm_tr_approve_xdomain_paths(struct tb *tb, struct tb_xdomain *xd,
+					int transmit_path, int transmit_ring,
+					int receive_path, int receive_ring)
 {
 	struct icm_tr_pkg_approve_xdomain_response reply;
 	struct icm_tr_pkg_approve_xdomain request;
@@ -1132,10 +1138,10 @@ static int icm_tr_approve_xdomain_paths(struct tb *tb, struct tb_xdomain *xd)
 	request.hdr.code = ICM_APPROVE_XDOMAIN;
 	request.route_hi = upper_32_bits(xd->route);
 	request.route_lo = lower_32_bits(xd->route);
-	request.transmit_path = xd->transmit_path;
-	request.transmit_ring = xd->transmit_ring;
-	request.receive_path = xd->receive_path;
-	request.receive_ring = xd->receive_ring;
+	request.transmit_path = transmit_path;
+	request.transmit_ring = transmit_ring;
+	request.receive_path = receive_path;
+	request.receive_ring = receive_ring;
 	memcpy(&request.remote_uuid, xd->remote_uuid, sizeof(*xd->remote_uuid));
 
 	memset(&reply, 0, sizeof(reply));
@@ -1176,7 +1182,9 @@ static int icm_tr_xdomain_tear_down(struct tb *tb, struct tb_xdomain *xd,
 	return 0;
 }
 
-static int icm_tr_disconnect_xdomain_paths(struct tb *tb, struct tb_xdomain *xd)
+static int icm_tr_disconnect_xdomain_paths(struct tb *tb, struct tb_xdomain *xd,
+					   int transmit_path, int transmit_ring,
+					   int receive_path, int receive_ring)
 {
 	int ret;
 
diff --git a/drivers/thunderbolt/tb.c b/drivers/thunderbolt/tb.c
index 30e17f7d9e1f..eb15022e4e3e 100644
--- a/drivers/thunderbolt/tb.c
+++ b/drivers/thunderbolt/tb.c
@@ -1075,7 +1075,9 @@ static int tb_tunnel_pci(struct tb *tb, struct tb_switch *sw)
 	return 0;
 }
 
-static int tb_approve_xdomain_paths(struct tb *tb, struct tb_xdomain *xd)
+static int tb_approve_xdomain_paths(struct tb *tb, struct tb_xdomain *xd,
+				    int transmit_path, int transmit_ring,
+				    int receive_path, int receive_ring)
 {
 	struct tb_cm *tcm = tb_priv(tb);
 	struct tb_port *nhi_port, *dst_port;
@@ -1087,9 +1089,8 @@ static int tb_approve_xdomain_paths(struct tb *tb, struct tb_xdomain *xd)
 	nhi_port = tb_switch_find_port(tb->root_switch, TB_TYPE_NHI);
 
 	mutex_lock(&tb->lock);
-	tunnel = tb_tunnel_alloc_dma(tb, nhi_port, dst_port, xd->transmit_ring,
-				     xd->transmit_path, xd->receive_ring,
-				     xd->receive_path);
+	tunnel = tb_tunnel_alloc_dma(tb, nhi_port, dst_port, transmit_path,
+				     transmit_ring, receive_path, receive_ring);
 	if (!tunnel) {
 		mutex_unlock(&tb->lock);
 		return -ENOMEM;
@@ -1108,29 +1109,40 @@ static int tb_approve_xdomain_paths(struct tb *tb, struct tb_xdomain *xd)
 	return 0;
 }
 
-static void __tb_disconnect_xdomain_paths(struct tb *tb, struct tb_xdomain *xd)
+static void __tb_disconnect_xdomain_paths(struct tb *tb, struct tb_xdomain *xd,
+					  int transmit_path, int transmit_ring,
+					  int receive_path, int receive_ring)
 {
-	struct tb_port *dst_port;
-	struct tb_tunnel *tunnel;
+	struct tb_cm *tcm = tb_priv(tb);
+	struct tb_port *nhi_port, *dst_port;
+	struct tb_tunnel *tunnel, *n;
 	struct tb_switch *sw;
 
 	sw = tb_to_switch(xd->dev.parent);
 	dst_port = tb_port_at(xd->route, sw);
+	nhi_port = tb_switch_find_port(tb->root_switch, TB_TYPE_NHI);
 
-	/*
-	 * It is possible that the tunnel was already teared down (in
-	 * case of cable disconnect) so it is fine if we cannot find it
-	 * here anymore.
-	 */
-	tunnel = tb_find_tunnel(tb, TB_TUNNEL_DMA, NULL, dst_port);
-	tb_deactivate_and_free_tunnel(tunnel);
+	list_for_each_entry_safe(tunnel, n, &tcm->tunnel_list, list) {
+		if (!tb_tunnel_is_dma(tunnel))
+			continue;
+		if (tunnel->src_port != nhi_port || tunnel->dst_port != dst_port)
+			continue;
+
+		if (tb_tunnel_match_dma(tunnel, transmit_path, transmit_ring,
+					receive_path, receive_ring))
+			tb_deactivate_and_free_tunnel(tunnel);
+	}
 }
 
-static int tb_disconnect_xdomain_paths(struct tb *tb, struct tb_xdomain *xd)
+static int tb_disconnect_xdomain_paths(struct tb *tb, struct tb_xdomain *xd,
+				       int transmit_path, int transmit_ring,
+				       int receive_path, int receive_ring)
 {
 	if (!xd->is_unplugged) {
 		mutex_lock(&tb->lock);
-		__tb_disconnect_xdomain_paths(tb, xd);
+		__tb_disconnect_xdomain_paths(tb, xd, transmit_path,
+					      transmit_ring, receive_path,
+					      receive_ring);
 		mutex_unlock(&tb->lock);
 	}
 	return 0;
@@ -1206,12 +1218,12 @@ static void tb_handle_hotplug(struct work_struct *work)
 			 * tb_xdomain_remove() so setting XDomain as
 			 * unplugged here prevents deadlock if they call
 			 * tb_xdomain_disable_paths(). We will tear down
-			 * the path below.
+			 * all the tunnels below.
 			 */
 			xd->is_unplugged = true;
 			tb_xdomain_remove(xd);
 			port->xdomain = NULL;
-			__tb_disconnect_xdomain_paths(tb, xd);
+			__tb_disconnect_xdomain_paths(tb, xd, -1, -1, -1, -1);
 			tb_xdomain_put(xd);
 			tb_port_unconfigure_xdomain(port);
 		} else if (tb_port_is_dpout(port) || tb_port_is_dpin(port)) {
diff --git a/drivers/thunderbolt/tb.h b/drivers/thunderbolt/tb.h
index 56f6a3f13678..9790e9f13d2b 100644
--- a/drivers/thunderbolt/tb.h
+++ b/drivers/thunderbolt/tb.h
@@ -406,8 +406,12 @@ struct tb_cm_ops {
 	int (*challenge_switch_key)(struct tb *tb, struct tb_switch *sw,
 				    const u8 *challenge, u8 *response);
 	int (*disconnect_pcie_paths)(struct tb *tb);
-	int (*approve_xdomain_paths)(struct tb *tb, struct tb_xdomain *xd);
-	int (*disconnect_xdomain_paths)(struct tb *tb, struct tb_xdomain *xd);
+	int (*approve_xdomain_paths)(struct tb *tb, struct tb_xdomain *xd,
+				     int transmit_path, int transmit_ring,
+				     int receive_path, int receive_ring);
+	int (*disconnect_xdomain_paths)(struct tb *tb, struct tb_xdomain *xd,
+					int transmit_path, int transmit_ring,
+					int receive_path, int receive_ring);
 	int (*usb4_switch_op)(struct tb_switch *sw, u16 opcode, u32 *metadata,
 			      u8 *status, const void *tx_data, size_t tx_data_len,
 			      void *rx_data, size_t rx_data_len);
@@ -641,8 +645,12 @@ int tb_domain_approve_switch(struct tb *tb, struct tb_switch *sw);
 int tb_domain_approve_switch_key(struct tb *tb, struct tb_switch *sw);
 int tb_domain_challenge_switch_key(struct tb *tb, struct tb_switch *sw);
 int tb_domain_disconnect_pcie_paths(struct tb *tb);
-int tb_domain_approve_xdomain_paths(struct tb *tb, struct tb_xdomain *xd);
-int tb_domain_disconnect_xdomain_paths(struct tb *tb, struct tb_xdomain *xd);
+int tb_domain_approve_xdomain_paths(struct tb *tb, struct tb_xdomain *xd,
+				    int transmit_path, int transmit_ring,
+				    int receive_path, int receive_ring);
+int tb_domain_disconnect_xdomain_paths(struct tb *tb, struct tb_xdomain *xd,
+				       int transmit_path, int transmit_ring,
+				       int receive_path, int receive_ring);
 int tb_domain_disconnect_all_paths(struct tb *tb);
 
 static inline struct tb *tb_domain_get(struct tb *tb)
diff --git a/drivers/thunderbolt/tunnel.c b/drivers/thunderbolt/tunnel.c
index 2e7ec037a73e..e1979bed7146 100644
--- a/drivers/thunderbolt/tunnel.c
+++ b/drivers/thunderbolt/tunnel.c
@@ -815,28 +815,28 @@ static void tb_dma_init_path(struct tb_path *path, unsigned int efc, u32 credits
  * @tb: Pointer to the domain structure
  * @nhi: Host controller port
  * @dst: Destination null port which the other domain is connected to
- * @transmit_ring: NHI ring number used to send packets towards the
- *		   other domain. Set to %0 if TX path is not needed.
  * @transmit_path: HopID used for transmitting packets
- * @receive_ring: NHI ring number used to receive packets from the
- *		  other domain. Set to %0 if RX path is not needed.
+ * @transmit_ring: NHI ring number used to send packets towards the
+ *		   other domain. Set to %-1 if TX path is not needed.
  * @receive_path: HopID used for receiving packets
+ * @receive_ring: NHI ring number used to receive packets from the
+ *		  other domain. Set to %-1 if RX path is not needed.
  *
  * Return: Returns a tb_tunnel on success or NULL on failure.
  */
 struct tb_tunnel *tb_tunnel_alloc_dma(struct tb *tb, struct tb_port *nhi,
-				      struct tb_port *dst, int transmit_ring,
-				      int transmit_path, int receive_ring,
-				      int receive_path)
+				      struct tb_port *dst, int transmit_path,
+				      int transmit_ring, int receive_path,
+				      int receive_ring)
 {
 	struct tb_tunnel *tunnel;
 	size_t npaths = 0, i = 0;
 	struct tb_path *path;
 	u32 credits;
 
-	if (receive_ring)
+	if (receive_ring > 0)
 		npaths++;
-	if (transmit_ring)
+	if (transmit_ring > 0)
 		npaths++;
 
 	if (WARN_ON(!npaths))
@@ -851,7 +851,7 @@ struct tb_tunnel *tb_tunnel_alloc_dma(struct tb *tb, struct tb_port *nhi,
 
 	credits = tb_dma_credits(nhi);
 
-	if (receive_ring) {
+	if (receive_ring > 0) {
 		path = tb_path_alloc(tb, dst, receive_path, nhi, receive_ring, 0,
 				     "DMA RX");
 		if (!path) {
@@ -862,7 +862,7 @@ struct tb_tunnel *tb_tunnel_alloc_dma(struct tb *tb, struct tb_port *nhi,
 		tunnel->paths[i++] = path;
 	}
 
-	if (transmit_ring) {
+	if (transmit_ring > 0) {
 		path = tb_path_alloc(tb, nhi, transmit_ring, dst, transmit_path, 0,
 				     "DMA TX");
 		if (!path) {
@@ -876,6 +876,66 @@ struct tb_tunnel *tb_tunnel_alloc_dma(struct tb *tb, struct tb_port *nhi,
 	return tunnel;
 }
 
+/**
+ * tb_tunnel_match_dma() - Match DMA tunnel
+ * @tunnel: Tunnel to match
+ * @transmit_path: HopID used for transmitting packets. Pass %-1 to ignore.
+ * @transmit_ring: NHI ring number used to send packets towards the
+ *		   other domain. Pass %-1 to ignore.
+ * @receive_path: HopID used for receiving packets. Pass %-1 to ignore.
+ * @receive_ring: NHI ring number used to receive packets from the
+ *		  other domain. Pass %-1 to ignore.
+ *
+ * This function can be used to match a specific DMA tunnel when there
+ * are multiple DMA tunnels going through the same XDomain connection.
+ * Returns true if there is a match and false otherwise.
+ */
+bool tb_tunnel_match_dma(const struct tb_tunnel *tunnel, int transmit_path,
+			 int transmit_ring, int receive_path, int receive_ring)
+{
+	const struct tb_path *tx_path = NULL, *rx_path = NULL;
+	int i;
+
+	if (!receive_ring || !transmit_ring)
+		return false;
+
+	for (i = 0; i < tunnel->npaths; i++) {
+		const struct tb_path *path = tunnel->paths[i];
+
+		if (!path)
+			continue;
+
+		if (tb_port_is_nhi(path->hops[0].in_port))
+			tx_path = path;
+		else if (tb_port_is_nhi(path->hops[path->path_length - 1].out_port))
+			rx_path = path;
+	}
+
+	if (transmit_ring > 0 || transmit_path > 0) {
+		if (!tx_path)
+			return false;
+		if (transmit_ring > 0 &&
+		    (tx_path->hops[0].in_hop_index != transmit_ring))
+			return false;
+		if (transmit_path > 0 &&
+		    (tx_path->hops[tx_path->path_length - 1].next_hop_index != transmit_path))
+			return false;
+	}
+
+	if (receive_ring > 0 || receive_path > 0) {
+		if (!rx_path)
+			return false;
+		if (receive_path > 0 &&
+		    (rx_path->hops[0].in_hop_index != receive_path))
+			return false;
+		if (receive_ring > 0 &&
+		    (rx_path->hops[rx_path->path_length - 1].next_hop_index != receive_ring))
+			return false;
+	}
+
+	return true;
+}
+
 static int tb_usb3_max_link_rate(struct tb_port *up, struct tb_port *down)
 {
 	int ret, up_max_rate, down_max_rate;
diff --git a/drivers/thunderbolt/tunnel.h b/drivers/thunderbolt/tunnel.h
index 1d2a64eb060d..a66994fb4e60 100644
--- a/drivers/thunderbolt/tunnel.h
+++ b/drivers/thunderbolt/tunnel.h
@@ -70,9 +70,11 @@ struct tb_tunnel *tb_tunnel_alloc_dp(struct tb *tb, struct tb_port *in,
 				     struct tb_port *out, int max_up,
 				     int max_down);
 struct tb_tunnel *tb_tunnel_alloc_dma(struct tb *tb, struct tb_port *nhi,
-				      struct tb_port *dst, int transmit_ring,
-				      int transmit_path, int receive_ring,
-				      int receive_path);
+				      struct tb_port *dst, int transmit_path,
+				      int transmit_ring, int receive_path,
+				      int receive_ring);
+bool tb_tunnel_match_dma(const struct tb_tunnel *tunnel, int transmit_path,
+			 int transmit_ring, int receive_path, int receive_ring);
 struct tb_tunnel *tb_tunnel_discover_usb3(struct tb *tb, struct tb_port *down);
 struct tb_tunnel *tb_tunnel_alloc_usb3(struct tb *tb, struct tb_port *up,
 				       struct tb_port *down, int max_up,
diff --git a/drivers/thunderbolt/xdomain.c b/drivers/thunderbolt/xdomain.c
index ab56757d7c24..b21d99d59412 100644
--- a/drivers/thunderbolt/xdomain.c
+++ b/drivers/thunderbolt/xdomain.c
@@ -1295,6 +1295,8 @@ static void tb_xdomain_release(struct device *dev)
 
 	kfree(xd->local_property_block);
 	tb_property_free_dir(xd->remote_properties);
+	ida_destroy(&xd->out_hopids);
+	ida_destroy(&xd->in_hopids);
 	ida_destroy(&xd->service_ids);
 
 	kfree(xd->local_uuid);
@@ -1388,6 +1390,8 @@ struct tb_xdomain *tb_xdomain_alloc(struct tb *tb, struct device *parent,
 	xd->route = route;
 	xd->local_max_hopid = down->config.max_in_hop_id;
 	ida_init(&xd->service_ids);
+	ida_init(&xd->in_hopids);
+	ida_init(&xd->out_hopids);
 	mutex_init(&xd->lock);
 	INIT_DELAYED_WORK(&xd->get_uuid_work, tb_xdomain_get_uuid);
 	INIT_DELAYED_WORK(&xd->get_properties_work, tb_xdomain_get_properties);
@@ -1553,73 +1557,118 @@ void tb_xdomain_lane_bonding_disable(struct tb_xdomain *xd)
 EXPORT_SYMBOL_GPL(tb_xdomain_lane_bonding_disable);
 
 /**
- * tb_xdomain_enable_paths() - Enable DMA paths for XDomain connection
+ * tb_xdomain_alloc_in_hopid() - Allocate input HopID for tunneling
  * @xd: XDomain connection
- * @transmit_path: HopID of the transmit path the other end is using to
- *		   send packets
- * @transmit_ring: DMA ring used to receive packets from the other end
- * @receive_path: HopID of the receive path the other end is using to
- *		  receive packets
- * @receive_ring: DMA ring used to send packets to the other end
- *
- * The function enables DMA paths accordingly so that after successful
- * return the caller can send and receive packets using high-speed DMA
- * path.
+ * @hopid: Preferred HopID or %-1 for next available
  *
- * Return: %0 in case of success and negative errno in case of error
+ * Returns allocated HopID or negative errno. Specifically returns
+ * %-ENOSPC if there are no more available HopIDs. Returned HopID is
+ * guaranteed to be within range supported by the input lane adapter.
+ * Call tb_xdomain_release_in_hopid() to release the allocated HopID.
  */
-int tb_xdomain_enable_paths(struct tb_xdomain *xd, u16 transmit_path,
-			    u16 transmit_ring, u16 receive_path,
-			    u16 receive_ring)
+int tb_xdomain_alloc_in_hopid(struct tb_xdomain *xd, int hopid)
 {
-	int ret;
+	if (hopid < 0)
+		hopid = TB_PATH_MIN_HOPID;
+	if (hopid < TB_PATH_MIN_HOPID || hopid > xd->local_max_hopid)
+		return -EINVAL;
 
-	mutex_lock(&xd->lock);
+	return ida_alloc_range(&xd->in_hopids, hopid, xd->local_max_hopid,
+			       GFP_KERNEL);
+}
+EXPORT_SYMBOL_GPL(tb_xdomain_alloc_in_hopid);
 
-	if (xd->transmit_path) {
-		ret = xd->transmit_path == transmit_path ? 0 : -EBUSY;
-		goto exit_unlock;
-	}
+/**
+ * tb_xdomain_alloc_out_hopid() - Allocate output HopID for tunneling
+ * @xd: XDomain connection
+ * @hopid: Preferred HopID or %-1 for next available
+ *
+ * Returns allocated HopID or negative errno. Specifically returns
+ * %-ENOSPC if there are no more available HopIDs. Returned HopID is
+ * guaranteed to be within range supported by the output lane adapter.
+ * Call tb_xdomain_release_out_hopid() to release the allocated HopID.
+ */
+int tb_xdomain_alloc_out_hopid(struct tb_xdomain *xd, int hopid)
+{
+	if (hopid < 0)
+		hopid = TB_PATH_MIN_HOPID;
+	if (hopid < TB_PATH_MIN_HOPID || hopid > xd->remote_max_hopid)
+		return -EINVAL;
 
-	xd->transmit_path = transmit_path;
-	xd->transmit_ring = transmit_ring;
-	xd->receive_path = receive_path;
-	xd->receive_ring = receive_ring;
+	return ida_alloc_range(&xd->out_hopids, hopid, xd->remote_max_hopid,
+			       GFP_KERNEL);
+}
+EXPORT_SYMBOL_GPL(tb_xdomain_alloc_out_hopid);
 
-	ret = tb_domain_approve_xdomain_paths(xd->tb, xd);
+/**
+ * tb_xdomain_release_in_hopid() - Release input HopID
+ * @xd: XDomain connection
+ * @hopid: HopID to release
+ */
+void tb_xdomain_release_in_hopid(struct tb_xdomain *xd, int hopid)
+{
+	ida_free(&xd->in_hopids, hopid);
+}
+EXPORT_SYMBOL_GPL(tb_xdomain_release_in_hopid);
 
-exit_unlock:
-	mutex_unlock(&xd->lock);
+/**
+ * tb_xdomain_release_out_hopid() - Release output HopID
+ * @xd: XDomain connection
+ * @hopid: HopID to release
+ */
+void tb_xdomain_release_out_hopid(struct tb_xdomain *xd, int hopid)
+{
+	ida_free(&xd->out_hopids, hopid);
+}
+EXPORT_SYMBOL_GPL(tb_xdomain_release_out_hopid);
 
-	return ret;
+/**
+ * tb_xdomain_enable_paths() - Enable DMA paths for XDomain connection
+ * @xd: XDomain connection
+ * @transmit_path: HopID we are using to send out packets
+ * @transmit_ring: DMA ring used to send out packets
+ * @receive_path: HopID the other end is using to send packets to us
+ * @receive_ring: DMA ring used to receive packets from @receive_path
+ *
+ * The function enables DMA paths accordingly so that after successful
+ * return the caller can send and receive packets using high-speed DMA
+ * path. If a transmit or receive path is not needed, pass %-1 for those
+ * parameters.
+ *
+ * Return: %0 in case of success and negative errno in case of error
+ */
+int tb_xdomain_enable_paths(struct tb_xdomain *xd, int transmit_path,
+			    int transmit_ring, int receive_path,
+			    int receive_ring)
+{
+	return tb_domain_approve_xdomain_paths(xd->tb, xd, transmit_path,
+					       transmit_ring, receive_path,
+					       receive_ring);
 }
 EXPORT_SYMBOL_GPL(tb_xdomain_enable_paths);
 
 /**
  * tb_xdomain_disable_paths() - Disable DMA paths for XDomain connection
  * @xd: XDomain connection
+ * @transmit_path: HopID we are using to send out packets
+ * @transmit_ring: DMA ring used to send out packets
+ * @receive_path: HopID the other end is using to send packets to us
+ * @receive_ring: DMA ring used to receive packets from @receive_path
  *
  * This does the opposite of tb_xdomain_enable_paths(). After call to
- * this the caller is not expected to use the rings anymore.
+ * this the caller is not expected to use the rings anymore. Passing %-1
+ * as path/ring parameter means don't care. Normally the callers should
+ * pass the same values here as they do when paths are enabled.
  *
  * Return: %0 in case of success and negative errno in case of error
  */
-int tb_xdomain_disable_paths(struct tb_xdomain *xd)
+int tb_xdomain_disable_paths(struct tb_xdomain *xd, int transmit_path,
+			     int transmit_ring, int receive_path,
+			     int receive_ring)
 {
-	int ret = 0;
-
-	mutex_lock(&xd->lock);
-	if (xd->transmit_path) {
-		xd->transmit_path = 0;
-		xd->transmit_ring = 0;
-		xd->receive_path = 0;
-		xd->receive_ring = 0;
-
-		ret = tb_domain_disconnect_xdomain_paths(xd->tb, xd);
-	}
-	mutex_unlock(&xd->lock);
-
-	return ret;
+	return tb_domain_disconnect_xdomain_paths(xd->tb, xd, transmit_path,
+						  transmit_ring, receive_path,
+						  receive_ring);
 }
 EXPORT_SYMBOL_GPL(tb_xdomain_disable_paths);
 
diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h
index 3e0ce654d60c..e7c96c37174f 100644
--- a/include/linux/thunderbolt.h
+++ b/include/linux/thunderbolt.h
@@ -190,11 +190,9 @@ void tb_unregister_property_dir(const char *key, struct tb_property_dir *dir);
  * @is_unplugged: The XDomain is unplugged
  * @needs_uuid: If the XDomain does not have @remote_uuid it will be
  *		queried first
- * @transmit_path: HopID which the remote end expects us to transmit
- * @transmit_ring: Local ring (hop) where outgoing packets are pushed
- * @receive_path: HopID which we expect the remote end to transmit
- * @receive_ring: Local ring (hop) where incoming packets arrive
  * @service_ids: Used to generate IDs for the services
+ * @in_hopids: Input HopIDs for DMA tunneling
+ * @out_hopids: Output HopIDs for DMA tunneling
  * @local_property_block: Local block of properties
  * @local_property_block_gen: Generation of @local_property_block
  * @local_property_block_len: Length of the @local_property_block in dwords
@@ -238,11 +236,9 @@ struct tb_xdomain {
 	unsigned int link_width;
 	bool is_unplugged;
 	bool needs_uuid;
-	u16 transmit_path;
-	u16 transmit_ring;
-	u16 receive_path;
-	u16 receive_ring;
 	struct ida service_ids;
+	struct ida in_hopids;
+	struct ida out_hopids;
 	u32 *local_property_block;
 	u32 local_property_block_gen;
 	u32 local_property_block_len;
@@ -260,10 +256,22 @@ struct tb_xdomain {
 
 int tb_xdomain_lane_bonding_enable(struct tb_xdomain *xd);
 void tb_xdomain_lane_bonding_disable(struct tb_xdomain *xd);
-int tb_xdomain_enable_paths(struct tb_xdomain *xd, u16 transmit_path,
-			    u16 transmit_ring, u16 receive_path,
-			    u16 receive_ring);
-int tb_xdomain_disable_paths(struct tb_xdomain *xd);
+int tb_xdomain_alloc_in_hopid(struct tb_xdomain *xd, int hopid);
+void tb_xdomain_release_in_hopid(struct tb_xdomain *xd, int hopid);
+int tb_xdomain_alloc_out_hopid(struct tb_xdomain *xd, int hopid);
+void tb_xdomain_release_out_hopid(struct tb_xdomain *xd, int hopid);
+int tb_xdomain_enable_paths(struct tb_xdomain *xd, int transmit_path,
+			    int transmit_ring, int receive_path,
+			    int receive_ring);
+int tb_xdomain_disable_paths(struct tb_xdomain *xd, int transmit_path,
+			     int transmit_ring, int receive_path,
+			     int receive_ring);
+
+static inline int tb_xdomain_disable_all_paths(struct tb_xdomain *xd)
+{
+	return tb_xdomain_disable_paths(xd, -1, -1, -1, -1);
+}
+
 struct tb_xdomain *tb_xdomain_find_by_uuid(struct tb *tb, const uuid_t *uuid);
 struct tb_xdomain *tb_xdomain_find_by_route(struct tb *tb, u64 route);
 
-- 
cgit v1.2.3


From 53e043b2b432ef2294efec04dd8a88d96c024624 Mon Sep 17 00:00:00 2001
From: Stefan Metzmacher <metze@samba.org>
Date: Mon, 15 Mar 2021 12:56:56 +0100
Subject: io_uring: remove structures from include/linux/io_uring.h

Link: https://lore.kernel.org/r/8c1d14f3748105f4caeda01716d47af2fa41d11c.1615809009.git.metze@samba.org
Signed-off-by: Stefan Metzmacher <metze@samba.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.h               | 10 +++++++++-
 fs/io_uring.c            | 16 ++++++++++++++++
 include/linux/io_uring.h | 25 -------------------------
 3 files changed, 25 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/fs/io-wq.h b/fs/io-wq.h
index 1ac2f3248088..80d590564ff9 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -2,7 +2,6 @@
 #define INTERNAL_IO_WQ_H
 
 #include <linux/refcount.h>
-#include <linux/io_uring.h>
 
 struct io_wq;
 
@@ -21,6 +20,15 @@ enum io_wq_cancel {
 	IO_WQ_CANCEL_NOTFOUND,	/* work not found */
 };
 
+struct io_wq_work_node {
+	struct io_wq_work_node *next;
+};
+
+struct io_wq_work_list {
+	struct io_wq_work_node *first;
+	struct io_wq_work_node *last;
+};
+
 static inline void wq_list_add_after(struct io_wq_work_node *node,
 				     struct io_wq_work_node *pos,
 				     struct io_wq_work_list *list)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index a81f7a30ea70..52ba8d7f3eb8 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -456,6 +456,22 @@ struct io_ring_ctx {
 	struct list_head		tctx_list;
 };
 
+struct io_uring_task {
+	/* submission side */
+	struct xarray		xa;
+	struct wait_queue_head	wait;
+	void			*last;
+	void			*io_wq;
+	struct percpu_counter	inflight;
+	atomic_t		in_idle;
+	bool			sqpoll;
+
+	spinlock_t		task_lock;
+	struct io_wq_work_list	task_list;
+	unsigned long		task_state;
+	struct callback_head	task_work;
+};
+
 /*
  * First field must be the file pointer in all the
  * iocb unions! See also 'struct kiocb' in <linux/fs.h>
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 9761a0ec9f95..79cde9906be0 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -5,31 +5,6 @@
 #include <linux/sched.h>
 #include <linux/xarray.h>
 
-struct io_wq_work_node {
-	struct io_wq_work_node *next;
-};
-
-struct io_wq_work_list {
-	struct io_wq_work_node *first;
-	struct io_wq_work_node *last;
-};
-
-struct io_uring_task {
-	/* submission side */
-	struct xarray		xa;
-	struct wait_queue_head	wait;
-	void			*last;
-	void			*io_wq;
-	struct percpu_counter	inflight;
-	atomic_t		in_idle;
-	bool			sqpoll;
-
-	spinlock_t		task_lock;
-	struct io_wq_work_list	task_list;
-	unsigned long		task_state;
-	struct callback_head	task_work;
-};
-
 #if defined(CONFIG_IO_URING)
 struct sock *io_uring_get_socket(struct file *file);
 void __io_uring_task_cancel(void);
-- 
cgit v1.2.3


From 2e8496f31d0be8f43849b2980b069f3a9805d047 Mon Sep 17 00:00:00 2001
From: Richard Gong <richard.gong@intel.com>
Date: Tue, 9 Feb 2021 16:20:27 -0600
Subject: firmware: stratix10-svc: reset COMMAND_RECONFIG_FLAG_PARTIAL to 0

Clean up COMMAND_RECONFIG_FLAG_PARTIAL flag by resetting it to 0, which
aligns with the firmware settings.

Fixes: 36847f9e3e56 ("firmware: stratix10-svc: correct reconfig flag and timeout values")
Signed-off-by: Richard Gong <richard.gong@intel.com>
Reviewed-by: Tom Rix <trix@redhat.com>
Signed-off-by: Moritz Fischer <mdf@kernel.org>
---
 include/linux/firmware/intel/stratix10-svc-client.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/firmware/intel/stratix10-svc-client.h b/include/linux/firmware/intel/stratix10-svc-client.h
index ebc295647581..19781b0f6429 100644
--- a/include/linux/firmware/intel/stratix10-svc-client.h
+++ b/include/linux/firmware/intel/stratix10-svc-client.h
@@ -56,7 +56,7 @@
  * COMMAND_RECONFIG_FLAG_PARTIAL:
  * Set to FPGA configuration type (full or partial).
  */
-#define COMMAND_RECONFIG_FLAG_PARTIAL	1
+#define COMMAND_RECONFIG_FLAG_PARTIAL	0
 
 /*
  * Timeout settings for service clients:
-- 
cgit v1.2.3


From b47e330231acbf4506b049643145cc64268a1940 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Tue, 16 Mar 2021 12:41:03 -0400
Subject: tracing: Pass buffer of event to trigger operations

The ring_buffer_event_time_stamp() is going to be updated to extract the
time stamp for the event without needing it to be set to have absolute
values for all events. But to do so, it needs the buffer that the event is
on as the buffer saves information for the event before it is committed to
the buffer.

If the trace buffer is disabled, a temporary buffer is used, and there's
no access to this buffer from the current histogram triggers, even though
it is passed to the trace event code.

Pass the buffer that the event is on all the way down to the histogram
triggers.

Link: https://lkml.kernel.org/r/20210316164113.542448131@goodmis.org

Reviewed-by: Tom Zanussi <zanussi@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/trace_events.h        |  5 +-
 kernel/trace/trace.c                |  2 +-
 kernel/trace/trace.h                |  4 +-
 kernel/trace/trace_events_hist.c    | 92 ++++++++++++++++++++++++-------------
 kernel/trace/trace_events_trigger.c | 45 +++++++++++-------
 5 files changed, 95 insertions(+), 53 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 28e7af1406f2..8cba64ce23a4 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -640,7 +640,8 @@ enum event_trigger_type {
 extern int filter_match_preds(struct event_filter *filter, void *rec);
 
 extern enum event_trigger_type
-event_triggers_call(struct trace_event_file *file, void *rec,
+event_triggers_call(struct trace_event_file *file,
+		    struct trace_buffer *buffer, void *rec,
 		    struct ring_buffer_event *event);
 extern void
 event_triggers_post_call(struct trace_event_file *file,
@@ -664,7 +665,7 @@ trace_trigger_soft_disabled(struct trace_event_file *file)
 
 	if (!(eflags & EVENT_FILE_FL_TRIGGER_COND)) {
 		if (eflags & EVENT_FILE_FL_TRIGGER_MODE)
-			event_triggers_call(file, NULL, NULL);
+			event_triggers_call(file, NULL, NULL, NULL);
 		if (eflags & EVENT_FILE_FL_SOFT_DISABLED)
 			return true;
 		if (eflags & EVENT_FILE_FL_PID_FILTER)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index eccb4e1187cc..f979220238a5 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6763,7 +6763,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
 	if (tr->trace_marker_file && !list_empty(&tr->trace_marker_file->triggers)) {
 		/* do not add \n before testing triggers, but add \0 */
 		entry->buf[cnt] = '\0';
-		tt = event_triggers_call(tr->trace_marker_file, entry, event);
+		tt = event_triggers_call(tr->trace_marker_file, buffer, entry, event);
 	}
 
 	if (entry->buf[cnt - 1] != '\n') {
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index a6446c03cfbc..798773178d7e 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1329,7 +1329,7 @@ __event_trigger_test_discard(struct trace_event_file *file,
 	unsigned long eflags = file->flags;
 
 	if (eflags & EVENT_FILE_FL_TRIGGER_COND)
-		*tt = event_triggers_call(file, entry, event);
+		*tt = event_triggers_call(file, buffer, entry, event);
 
 	if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) ||
 	    (unlikely(file->flags & EVENT_FILE_FL_FILTERED) &&
@@ -1626,7 +1626,7 @@ extern int register_trigger_hist_enable_disable_cmds(void);
  */
 struct event_trigger_ops {
 	void			(*func)(struct event_trigger_data *data,
-					void *rec,
+					struct trace_buffer *buffer, void *rec,
 					struct ring_buffer_event *rbe);
 	int			(*init)(struct event_trigger_ops *ops,
 					struct event_trigger_data *data);
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 39ebe1826fc3..6978aa3ee4c5 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -81,6 +81,7 @@ struct hist_field;
 
 typedef u64 (*hist_field_fn_t) (struct hist_field *field,
 				struct tracing_map_elt *elt,
+				struct trace_buffer *buffer,
 				struct ring_buffer_event *rbe,
 				void *event);
 
@@ -153,6 +154,7 @@ struct hist_field {
 
 static u64 hist_field_none(struct hist_field *field,
 			   struct tracing_map_elt *elt,
+			   struct trace_buffer *buffer,
 			   struct ring_buffer_event *rbe,
 			   void *event)
 {
@@ -161,6 +163,7 @@ static u64 hist_field_none(struct hist_field *field,
 
 static u64 hist_field_counter(struct hist_field *field,
 			      struct tracing_map_elt *elt,
+			      struct trace_buffer *buffer,
 			      struct ring_buffer_event *rbe,
 			      void *event)
 {
@@ -169,6 +172,7 @@ static u64 hist_field_counter(struct hist_field *field,
 
 static u64 hist_field_string(struct hist_field *hist_field,
 			     struct tracing_map_elt *elt,
+			     struct trace_buffer *buffer,
 			     struct ring_buffer_event *rbe,
 			     void *event)
 {
@@ -179,6 +183,7 @@ static u64 hist_field_string(struct hist_field *hist_field,
 
 static u64 hist_field_dynstring(struct hist_field *hist_field,
 				struct tracing_map_elt *elt,
+				struct trace_buffer *buffer,
 				struct ring_buffer_event *rbe,
 				void *event)
 {
@@ -191,6 +196,7 @@ static u64 hist_field_dynstring(struct hist_field *hist_field,
 
 static u64 hist_field_pstring(struct hist_field *hist_field,
 			      struct tracing_map_elt *elt,
+			      struct trace_buffer *buffer,
 			      struct ring_buffer_event *rbe,
 			      void *event)
 {
@@ -201,52 +207,56 @@ static u64 hist_field_pstring(struct hist_field *hist_field,
 
 static u64 hist_field_log2(struct hist_field *hist_field,
 			   struct tracing_map_elt *elt,
+			   struct trace_buffer *buffer,
 			   struct ring_buffer_event *rbe,
 			   void *event)
 {
 	struct hist_field *operand = hist_field->operands[0];
 
-	u64 val = operand->fn(operand, elt, rbe, event);
+	u64 val = operand->fn(operand, elt, buffer, rbe, event);
 
 	return (u64) ilog2(roundup_pow_of_two(val));
 }
 
 static u64 hist_field_plus(struct hist_field *hist_field,
 			   struct tracing_map_elt *elt,
+			   struct trace_buffer *buffer,
 			   struct ring_buffer_event *rbe,
 			   void *event)
 {
 	struct hist_field *operand1 = hist_field->operands[0];
 	struct hist_field *operand2 = hist_field->operands[1];
 
-	u64 val1 = operand1->fn(operand1, elt, rbe, event);
-	u64 val2 = operand2->fn(operand2, elt, rbe, event);
+	u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
+	u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event);
 
 	return val1 + val2;
 }
 
 static u64 hist_field_minus(struct hist_field *hist_field,
 			    struct tracing_map_elt *elt,
+			    struct trace_buffer *buffer,
 			    struct ring_buffer_event *rbe,
 			    void *event)
 {
 	struct hist_field *operand1 = hist_field->operands[0];
 	struct hist_field *operand2 = hist_field->operands[1];
 
-	u64 val1 = operand1->fn(operand1, elt, rbe, event);
-	u64 val2 = operand2->fn(operand2, elt, rbe, event);
+	u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
+	u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event);
 
 	return val1 - val2;
 }
 
 static u64 hist_field_unary_minus(struct hist_field *hist_field,
 				  struct tracing_map_elt *elt,
+				  struct trace_buffer *buffer,
 				  struct ring_buffer_event *rbe,
 				  void *event)
 {
 	struct hist_field *operand = hist_field->operands[0];
 
-	s64 sval = (s64)operand->fn(operand, elt, rbe, event);
+	s64 sval = (s64)operand->fn(operand, elt, buffer, rbe, event);
 	u64 val = (u64)-sval;
 
 	return val;
@@ -255,6 +265,7 @@ static u64 hist_field_unary_minus(struct hist_field *hist_field,
 #define DEFINE_HIST_FIELD_FN(type)					\
 	static u64 hist_field_##type(struct hist_field *hist_field,	\
 				     struct tracing_map_elt *elt,	\
+				     struct trace_buffer *buffer,	\
 				     struct ring_buffer_event *rbe,	\
 				     void *event)			\
 {									\
@@ -380,7 +391,8 @@ struct hist_trigger_data {
 struct action_data;
 
 typedef void (*action_fn_t) (struct hist_trigger_data *hist_data,
-			     struct tracing_map_elt *elt, void *rec,
+			     struct tracing_map_elt *elt,
+			     struct trace_buffer *buffer, void *rec,
 			     struct ring_buffer_event *rbe, void *key,
 			     struct action_data *data, u64 *var_ref_vals);
 
@@ -608,7 +620,8 @@ static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals,
 }
 
 static void action_trace(struct hist_trigger_data *hist_data,
-			 struct tracing_map_elt *elt, void *rec,
+			 struct tracing_map_elt *elt,
+			 struct trace_buffer *buffer, void *rec,
 			 struct ring_buffer_event *rbe, void *key,
 			 struct action_data *data, u64 *var_ref_vals)
 {
@@ -624,6 +637,7 @@ struct hist_var_data {
 
 static u64 hist_field_timestamp(struct hist_field *hist_field,
 				struct tracing_map_elt *elt,
+				struct trace_buffer *buffer,
 				struct ring_buffer_event *rbe,
 				void *event)
 {
@@ -640,6 +654,7 @@ static u64 hist_field_timestamp(struct hist_field *hist_field,
 
 static u64 hist_field_cpu(struct hist_field *hist_field,
 			  struct tracing_map_elt *elt,
+			  struct trace_buffer *buffer,
 			  struct ring_buffer_event *rbe,
 			  void *event)
 {
@@ -1020,6 +1035,7 @@ static struct hist_field *find_event_var(struct hist_trigger_data *hist_data,
 
 static u64 hist_field_var_ref(struct hist_field *hist_field,
 			      struct tracing_map_elt *elt,
+			      struct trace_buffer *buffer,
 			      struct ring_buffer_event *rbe,
 			      void *event)
 {
@@ -2561,6 +2577,7 @@ find_target_event_var(struct hist_trigger_data *hist_data,
 }
 
 static inline void __update_field_vars(struct tracing_map_elt *elt,
+				       struct trace_buffer *buffer,
 				       struct ring_buffer_event *rbe,
 				       void *rec,
 				       struct field_var **field_vars,
@@ -2576,7 +2593,7 @@ static inline void __update_field_vars(struct tracing_map_elt *elt,
 		struct hist_field *var = field_var->var;
 		struct hist_field *val = field_var->val;
 
-		var_val = val->fn(val, elt, rbe, rec);
+		var_val = val->fn(val, elt, buffer, rbe, rec);
 		var_idx = var->var.idx;
 
 		if (val->flags & HIST_FIELD_FL_STRING) {
@@ -2592,19 +2609,21 @@ static inline void __update_field_vars(struct tracing_map_elt *elt,
 
 static void update_field_vars(struct hist_trigger_data *hist_data,
 			      struct tracing_map_elt *elt,
+			      struct trace_buffer *buffer,
 			      struct ring_buffer_event *rbe,
 			      void *rec)
 {
-	__update_field_vars(elt, rbe, rec, hist_data->field_vars,
+	__update_field_vars(elt, buffer, rbe, rec, hist_data->field_vars,
 			    hist_data->n_field_vars, 0);
 }
 
 static void save_track_data_vars(struct hist_trigger_data *hist_data,
-				 struct tracing_map_elt *elt, void *rec,
+				 struct tracing_map_elt *elt,
+				 struct trace_buffer *buffer,  void *rec,
 				 struct ring_buffer_event *rbe, void *key,
 				 struct action_data *data, u64 *var_ref_vals)
 {
-	__update_field_vars(elt, rbe, rec, hist_data->save_vars,
+	__update_field_vars(elt, buffer, rbe, rec, hist_data->save_vars,
 			    hist_data->n_save_vars, hist_data->n_field_var_str);
 }
 
@@ -2780,12 +2799,14 @@ static void save_track_val(struct hist_trigger_data *hist_data,
 }
 
 static void save_track_data(struct hist_trigger_data *hist_data,
-			    struct tracing_map_elt *elt, void *rec,
+			    struct tracing_map_elt *elt,
+			    struct trace_buffer *buffer, void *rec,
 			    struct ring_buffer_event *rbe, void *key,
 			    struct action_data *data, u64 *var_ref_vals)
 {
 	if (data->track_data.save_data)
-		data->track_data.save_data(hist_data, elt, rec, rbe, key, data, var_ref_vals);
+		data->track_data.save_data(hist_data, elt, buffer, rec, rbe,
+					   key, data, var_ref_vals);
 }
 
 static bool check_track_val(struct tracing_map_elt *elt,
@@ -2836,7 +2857,8 @@ static bool cond_snapshot_update(struct trace_array *tr, void *cond_data)
 }
 
 static void save_track_data_snapshot(struct hist_trigger_data *hist_data,
-				     struct tracing_map_elt *elt, void *rec,
+				     struct tracing_map_elt *elt,
+				     struct trace_buffer *buffer, void *rec,
 				     struct ring_buffer_event *rbe, void *key,
 				     struct action_data *data,
 				     u64 *var_ref_vals)
@@ -2905,7 +2927,8 @@ static bool cond_snapshot_update(struct trace_array *tr, void *cond_data)
 	return false;
 }
 static void save_track_data_snapshot(struct hist_trigger_data *hist_data,
-				     struct tracing_map_elt *elt, void *rec,
+				     struct tracing_map_elt *elt,
+				     struct trace_buffer *buffer, void *rec,
 				     struct ring_buffer_event *rbe, void *key,
 				     struct action_data *data,
 				     u64 *var_ref_vals) {}
@@ -2947,7 +2970,8 @@ static void track_data_print(struct seq_file *m,
 }
 
 static void ontrack_action(struct hist_trigger_data *hist_data,
-			   struct tracing_map_elt *elt, void *rec,
+			   struct tracing_map_elt *elt,
+			   struct trace_buffer *buffer, void *rec,
 			   struct ring_buffer_event *rbe, void *key,
 			   struct action_data *data, u64 *var_ref_vals)
 {
@@ -2955,7 +2979,8 @@ static void ontrack_action(struct hist_trigger_data *hist_data,
 
 	if (check_track_val(elt, data, var_val)) {
 		save_track_val(hist_data, elt, data, var_val);
-		save_track_data(hist_data, elt, rec, rbe, key, data, var_ref_vals);
+		save_track_data(hist_data, elt, buffer, rec, rbe,
+				key, data, var_ref_vals);
 	}
 }
 
@@ -4400,7 +4425,8 @@ create_hist_data(unsigned int map_bits,
 }
 
 static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
-				    struct tracing_map_elt *elt, void *rec,
+				    struct tracing_map_elt *elt,
+				    struct trace_buffer *buffer, void *rec,
 				    struct ring_buffer_event *rbe,
 				    u64 *var_ref_vals)
 {
@@ -4414,7 +4440,7 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
 
 	for_each_hist_val_field(i, hist_data) {
 		hist_field = hist_data->fields[i];
-		hist_val = hist_field->fn(hist_field, elt, rbe, rec);
+		hist_val = hist_field->fn(hist_field, elt, buffer, rbe, rec);
 		if (hist_field->flags & HIST_FIELD_FL_VAR) {
 			var_idx = hist_field->var.idx;
 
@@ -4442,13 +4468,13 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
 	for_each_hist_key_field(i, hist_data) {
 		hist_field = hist_data->fields[i];
 		if (hist_field->flags & HIST_FIELD_FL_VAR) {
-			hist_val = hist_field->fn(hist_field, elt, rbe, rec);
+			hist_val = hist_field->fn(hist_field, elt, buffer, rbe, rec);
 			var_idx = hist_field->var.idx;
 			tracing_map_set_var(elt, var_idx, hist_val);
 		}
 	}
 
-	update_field_vars(hist_data, elt, rbe, rec);
+	update_field_vars(hist_data, elt, buffer, rbe, rec);
 }
 
 static inline void add_to_key(char *compound_key, void *key,
@@ -4478,7 +4504,8 @@ static inline void add_to_key(char *compound_key, void *key,
 
 static void
 hist_trigger_actions(struct hist_trigger_data *hist_data,
-		     struct tracing_map_elt *elt, void *rec,
+		     struct tracing_map_elt *elt,
+		     struct trace_buffer *buffer, void *rec,
 		     struct ring_buffer_event *rbe, void *key,
 		     u64 *var_ref_vals)
 {
@@ -4487,11 +4514,12 @@ hist_trigger_actions(struct hist_trigger_data *hist_data,
 
 	for (i = 0; i < hist_data->n_actions; i++) {
 		data = hist_data->actions[i];
-		data->fn(hist_data, elt, rec, rbe, key, data, var_ref_vals);
+		data->fn(hist_data, elt, buffer, rec, rbe, key, data, var_ref_vals);
 	}
 }
 
-static void event_hist_trigger(struct event_trigger_data *data, void *rec,
+static void event_hist_trigger(struct event_trigger_data *data,
+			       struct trace_buffer *buffer, void *rec,
 			       struct ring_buffer_event *rbe)
 {
 	struct hist_trigger_data *hist_data = data->private_data;
@@ -4516,7 +4544,7 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec,
 					 HIST_STACKTRACE_SKIP);
 			key = entries;
 		} else {
-			field_contents = key_field->fn(key_field, elt, rbe, rec);
+			field_contents = key_field->fn(key_field, elt, buffer, rbe, rec);
 			if (key_field->flags & HIST_FIELD_FL_STRING) {
 				key = (void *)(unsigned long)field_contents;
 				use_compound_key = true;
@@ -4539,10 +4567,10 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec,
 	if (!elt)
 		return;
 
-	hist_trigger_elt_update(hist_data, elt, rec, rbe, var_ref_vals);
+	hist_trigger_elt_update(hist_data, elt, buffer, rec, rbe, var_ref_vals);
 
 	if (resolve_var_refs(hist_data, key, var_ref_vals, true))
-		hist_trigger_actions(hist_data, elt, rec, rbe, key, var_ref_vals);
+		hist_trigger_actions(hist_data, elt, buffer, rec, rbe, key, var_ref_vals);
 }
 
 static void hist_trigger_stacktrace_print(struct seq_file *m,
@@ -5812,7 +5840,8 @@ __init int register_trigger_hist_cmd(void)
 }
 
 static void
-hist_enable_trigger(struct event_trigger_data *data, void *rec,
+hist_enable_trigger(struct event_trigger_data *data,
+		    struct trace_buffer *buffer,  void *rec,
 		    struct ring_buffer_event *event)
 {
 	struct enable_trigger_data *enable_data = data->private_data;
@@ -5830,7 +5859,8 @@ hist_enable_trigger(struct event_trigger_data *data, void *rec,
 }
 
 static void
-hist_enable_count_trigger(struct event_trigger_data *data, void *rec,
+hist_enable_count_trigger(struct event_trigger_data *data,
+			  struct trace_buffer *buffer,  void *rec,
 			  struct ring_buffer_event *event)
 {
 	if (!data->count)
@@ -5839,7 +5869,7 @@ hist_enable_count_trigger(struct event_trigger_data *data, void *rec,
 	if (data->count != -1)
 		(data->count)--;
 
-	hist_enable_trigger(data, rec, event);
+	hist_enable_trigger(data, buffer, rec, event);
 }
 
 static struct event_trigger_ops hist_enable_trigger_ops = {
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index f725802160c0..b8bfa8505b7b 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -53,7 +53,8 @@ void trigger_data_free(struct event_trigger_data *data)
  * any trigger that should be deferred, ETT_NONE if nothing to defer.
  */
 enum event_trigger_type
-event_triggers_call(struct trace_event_file *file, void *rec,
+event_triggers_call(struct trace_event_file *file,
+		    struct trace_buffer *buffer, void *rec,
 		    struct ring_buffer_event *event)
 {
 	struct event_trigger_data *data;
@@ -67,7 +68,7 @@ event_triggers_call(struct trace_event_file *file, void *rec,
 		if (data->paused)
 			continue;
 		if (!rec) {
-			data->ops->func(data, rec, event);
+			data->ops->func(data, buffer, rec, event);
 			continue;
 		}
 		filter = rcu_dereference_sched(data->filter);
@@ -77,7 +78,7 @@ event_triggers_call(struct trace_event_file *file, void *rec,
 			tt |= data->cmd_ops->trigger_type;
 			continue;
 		}
-		data->ops->func(data, rec, event);
+		data->ops->func(data, buffer, rec, event);
 	}
 	return tt;
 }
@@ -105,7 +106,7 @@ event_triggers_post_call(struct trace_event_file *file,
 		if (data->paused)
 			continue;
 		if (data->cmd_ops->trigger_type & tt)
-			data->ops->func(data, NULL, NULL);
+			data->ops->func(data, NULL, NULL, NULL);
 	}
 }
 EXPORT_SYMBOL_GPL(event_triggers_post_call);
@@ -937,7 +938,8 @@ get_named_trigger_data(struct event_trigger_data *data)
 }
 
 static void
-traceon_trigger(struct event_trigger_data *data, void *rec,
+traceon_trigger(struct event_trigger_data *data,
+		struct trace_buffer *buffer, void *rec,
 		struct ring_buffer_event *event)
 {
 	if (tracing_is_on())
@@ -947,7 +949,8 @@ traceon_trigger(struct event_trigger_data *data, void *rec,
 }
 
 static void
-traceon_count_trigger(struct event_trigger_data *data, void *rec,
+traceon_count_trigger(struct event_trigger_data *data,
+		      struct trace_buffer *buffer, void *rec,
 		      struct ring_buffer_event *event)
 {
 	if (tracing_is_on())
@@ -963,7 +966,8 @@ traceon_count_trigger(struct event_trigger_data *data, void *rec,
 }
 
 static void
-traceoff_trigger(struct event_trigger_data *data, void *rec,
+traceoff_trigger(struct event_trigger_data *data,
+		 struct trace_buffer *buffer, void *rec,
 		 struct ring_buffer_event *event)
 {
 	if (!tracing_is_on())
@@ -973,7 +977,8 @@ traceoff_trigger(struct event_trigger_data *data, void *rec,
 }
 
 static void
-traceoff_count_trigger(struct event_trigger_data *data, void *rec,
+traceoff_count_trigger(struct event_trigger_data *data,
+		       struct trace_buffer *buffer, void *rec,
 		       struct ring_buffer_event *event)
 {
 	if (!tracing_is_on())
@@ -1071,7 +1076,8 @@ static struct event_command trigger_traceoff_cmd = {
 
 #ifdef CONFIG_TRACER_SNAPSHOT
 static void
-snapshot_trigger(struct event_trigger_data *data, void *rec,
+snapshot_trigger(struct event_trigger_data *data,
+		 struct trace_buffer *buffer, void *rec,
 		 struct ring_buffer_event *event)
 {
 	struct trace_event_file *file = data->private_data;
@@ -1083,7 +1089,8 @@ snapshot_trigger(struct event_trigger_data *data, void *rec,
 }
 
 static void
-snapshot_count_trigger(struct event_trigger_data *data, void *rec,
+snapshot_count_trigger(struct event_trigger_data *data,
+		       struct trace_buffer *buffer, void *rec,
 		       struct ring_buffer_event *event)
 {
 	if (!data->count)
@@ -1092,7 +1099,7 @@ snapshot_count_trigger(struct event_trigger_data *data, void *rec,
 	if (data->count != -1)
 		(data->count)--;
 
-	snapshot_trigger(data, rec, event);
+	snapshot_trigger(data, buffer, rec, event);
 }
 
 static int
@@ -1176,14 +1183,16 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; }
 #endif
 
 static void
-stacktrace_trigger(struct event_trigger_data *data, void *rec,
+stacktrace_trigger(struct event_trigger_data *data,
+		   struct trace_buffer *buffer,  void *rec,
 		   struct ring_buffer_event *event)
 {
 	trace_dump_stack(STACK_SKIP);
 }
 
 static void
-stacktrace_count_trigger(struct event_trigger_data *data, void *rec,
+stacktrace_count_trigger(struct event_trigger_data *data,
+			 struct trace_buffer *buffer, void *rec,
 			 struct ring_buffer_event *event)
 {
 	if (!data->count)
@@ -1192,7 +1201,7 @@ stacktrace_count_trigger(struct event_trigger_data *data, void *rec,
 	if (data->count != -1)
 		(data->count)--;
 
-	stacktrace_trigger(data, rec, event);
+	stacktrace_trigger(data, buffer, rec, event);
 }
 
 static int
@@ -1254,7 +1263,8 @@ static __init void unregister_trigger_traceon_traceoff_cmds(void)
 }
 
 static void
-event_enable_trigger(struct event_trigger_data *data, void *rec,
+event_enable_trigger(struct event_trigger_data *data,
+		     struct trace_buffer *buffer,  void *rec,
 		     struct ring_buffer_event *event)
 {
 	struct enable_trigger_data *enable_data = data->private_data;
@@ -1266,7 +1276,8 @@ event_enable_trigger(struct event_trigger_data *data, void *rec,
 }
 
 static void
-event_enable_count_trigger(struct event_trigger_data *data, void *rec,
+event_enable_count_trigger(struct event_trigger_data *data,
+			   struct trace_buffer *buffer,  void *rec,
 			   struct ring_buffer_event *event)
 {
 	struct enable_trigger_data *enable_data = data->private_data;
@@ -1281,7 +1292,7 @@ event_enable_count_trigger(struct event_trigger_data *data, void *rec,
 	if (data->count != -1)
 		(data->count)--;
 
-	event_enable_trigger(data, rec, event);
+	event_enable_trigger(data, buffer, rec, event);
 }
 
 int event_enable_trigger_print(struct seq_file *m,
-- 
cgit v1.2.3


From efe6196a6bc5bbc84b856316c4687fd24566a95c Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Tue, 16 Mar 2021 12:41:04 -0400
Subject: ring-buffer: Allow ring_buffer_event_time_stamp() to return time
 stamp of all events

Currently, ring_buffer_event_time_stamp() only returns an accurate time
stamp of the event if it has an absolute extended time stamp attached to
it. To make it more robust, use the event_stamp() in case the event does
not have an absolute value attached to it.

This will allow ring_buffer_event_time_stamp() to be used in more cases
than just histograms, and it will also allow histograms to not require
including absolute values all the time.

Link: https://lkml.kernel.org/r/20210316164113.704830885@goodmis.org

Reviewed-by: Tom Zanussi <zanussi@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/ring_buffer.h      |  3 +-
 kernel/trace/ring_buffer.c       | 60 ++++++++++++++++++++++++++++++----------
 kernel/trace/trace_events_hist.c |  2 +-
 3 files changed, 48 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 136ea0997e6d..057b7ed4fe24 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -61,7 +61,8 @@ enum ring_buffer_type {
 
 unsigned ring_buffer_event_length(struct ring_buffer_event *event);
 void *ring_buffer_event_data(struct ring_buffer_event *event);
-u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event);
+u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer,
+				 struct ring_buffer_event *event);
 
 /*
  * ring_buffer_discard_commit will remove an event that has not
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 470d97169081..8fa2a84f714f 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -298,21 +298,6 @@ static u64 rb_event_time_stamp(struct ring_buffer_event *event)
 	return ts;
 }
 
-/**
- * ring_buffer_event_time_stamp - return the event's extended timestamp
- * @event: the event to get the timestamp of
- *
- * Returns the extended timestamp associated with a data event.
- * An extended time_stamp is a 64-bit timestamp represented
- * internally in a special way that makes the best use of space
- * contained within a ring buffer event.  This function decodes
- * it and maps it to a straight u64 value.
- */
-u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event)
-{
-	return rb_event_time_stamp(event);
-}
-
 /* Flag when events were overwritten */
 #define RB_MISSED_EVENTS	(1 << 31)
 /* Missed count stored at end */
@@ -757,6 +742,51 @@ static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
 }
 #endif
 
+static inline u64 rb_time_stamp(struct trace_buffer *buffer);
+
+/**
+ * ring_buffer_event_time_stamp - return the event's current time stamp
+ * @buffer: The buffer that the event is on
+ * @event: the event to get the time stamp of
+ *
+ * Note, this must be called after @event is reserved, and before it is
+ * committed to the ring buffer. And must be called from the same
+ * context where the event was reserved (normal, softirq, irq, etc).
+ *
+ * Returns the time stamp associated with the current event.
+ * If the event has an extended time stamp, then that is used as
+ * the time stamp to return.
+ * In the highly unlikely case that the event was nested more than
+ * the max nesting, then the write_stamp of the buffer is returned,
+ * otherwise the current time is returned, but really neither of
+ * the last two cases should ever happen.
+ */
+u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer,
+				 struct ring_buffer_event *event)
+{
+	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[smp_processor_id()];
+	unsigned int nest;
+	u64 ts;
+
+	/* If the event includes an absolute time, then just use that */
+	if (event->type_len == RINGBUF_TYPE_TIME_STAMP)
+		return rb_event_time_stamp(event);
+
+	/* Read the current saved nesting level time stamp */
+	nest = local_read(&cpu_buffer->committing) - 1;
+	if (likely(nest < MAX_NEST))
+		return cpu_buffer->event_stamp[nest];
+
+	WARN_ON_ONCE(1);
+
+	/* Can only fail on 32 bit */
+	if (!rb_time_read(&cpu_buffer->write_stamp, &ts))
+		/* Screw it, just read the current time */
+		ts = rb_time_stamp(cpu_buffer->buffer);
+
+	return ts;
+}
+
 /**
  * ring_buffer_nr_pages - get the number of buffer pages in the ring buffer
  * @buffer: The ring_buffer to get the number of pages from
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 6978aa3ee4c5..45986cb4637e 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -644,7 +644,7 @@ static u64 hist_field_timestamp(struct hist_field *hist_field,
 	struct hist_trigger_data *hist_data = hist_field->hist_data;
 	struct trace_array *tr = hist_data->event_file->tr;
 
-	u64 ts = ring_buffer_event_time_stamp(rbe);
+	u64 ts = ring_buffer_event_time_stamp(buffer, rbe);
 
 	if (hist_data->attrs->ts_in_usecs && trace_clock_in_ns(tr))
 		ts = ns2usecs(ts);
-- 
cgit v1.2.3


From f2616c772c768485de18e7fcb2816bcdcd098339 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Wed, 17 Mar 2021 13:34:35 -0400
Subject: seq_buf: Add seq_buf_terminate() API

In the case that the seq_buf buffer needs to be printed directly, add a way
to make sure that the buffer is safe to read by forcing a nul terminating
character at the end of the string, or the last byte of the buffer if the
string has overflowed.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/seq_buf.h | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h
index 9d6c28cc4d8f..5b31c5147969 100644
--- a/include/linux/seq_buf.h
+++ b/include/linux/seq_buf.h
@@ -71,6 +71,31 @@ static inline unsigned int seq_buf_used(struct seq_buf *s)
 	return min(s->len, s->size);
 }
 
+/**
+ * seq_buf_terminate - Make sure buffer is nul terminated
+ * @s: the seq_buf descriptor to terminate.
+ *
+ * This makes sure that the buffer in @s is nul terminated and
+ * safe to read as a string.
+ *
+ * Note, if this is called when the buffer has overflowed, then
+ * the last byte of the buffer is zeroed, and the len will still
+ * point past it.
+ *
+ * After this function is called, s->buffer is safe to use
+ * in string operations.
+ */
+static inline void seq_buf_terminate(struct seq_buf *s)
+{
+	if (WARN_ON(s->size == 0))
+		return;
+
+	if (seq_buf_buffer_left(s))
+		s->buffer[s->len] = 0;
+	else
+		s->buffer[s->size - 1] = 0;
+}
+
 /**
  * seq_buf_get_buf - get buffer to write arbitrary data to
  * @s: the seq_buf handle
-- 
cgit v1.2.3


From 255c04a87f4381849fce9ed81e5efabf78a71a30 Mon Sep 17 00:00:00 2001
From: Antoine Tenart <atenart@kernel.org>
Date: Thu, 18 Mar 2021 19:37:43 +0100
Subject: net: embed num_tc in the xps maps

The xps cpus/rxqs map is accessed using dev->num_tc, which is used when
allocating the map. But later updates of dev->num_tc can lead to having
a mismatch between the maps and how they're accessed. In such cases the
map values do not make any sense and out of bound accesses can occur
(that can be easily seen using KASAN).

This patch aims at fixing this by embedding num_tc into the maps, using
the value at the time the map is created. This brings two improvements:
- The maps can be accessed using the embedded num_tc, so we know for
  sure we won't have out of bound accesses.
- Checks can be made before accessing the maps so we know the values
  retrieved will make sense.

We also update __netif_set_xps_queue to conditionally copy old maps from
dev_maps in the new one only if the number of traffic classes from both
maps match.

Signed-off-by: Antoine Tenart <atenart@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  6 +++++
 net/core/dev.c            | 63 ++++++++++++++++++++++++++++++-----------------
 net/core/net-sysfs.c      | 45 +++++++++++++--------------------
 3 files changed, 64 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 97254c089eb2..c38534c55ea1 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -771,9 +771,15 @@ struct xps_map {
 
 /*
  * This structure holds all XPS maps for device.  Maps are indexed by CPU.
+ *
+ * We keep track of the number of traffic classes used when the struct is
+ * allocated, in num_tc. This will be used to navigate the maps, to ensure we're
+ * not crossing its upper bound, as the original dev->num_tc can be updated in
+ * the meantime.
  */
 struct xps_dev_maps {
 	struct rcu_head rcu;
+	s16 num_tc;
 	struct xps_map __rcu *attr_map[]; /* Either CPUs map or RXQs map */
 };
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 6bc20eabd2b0..4e29d1994fdd 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2491,7 +2491,7 @@ static bool remove_xps_queue_cpu(struct net_device *dev,
 				 struct xps_dev_maps *dev_maps,
 				 int cpu, u16 offset, u16 count)
 {
-	int num_tc = dev->num_tc ? : 1;
+	int num_tc = dev_maps->num_tc;
 	bool active = false;
 	int tci;
 
@@ -2634,10 +2634,10 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
 {
 	const unsigned long *online_mask = NULL, *possible_mask = NULL;
 	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
+	bool active = false, copy = false;
 	int i, j, tci, numa_node_id = -2;
 	int maps_sz, num_tc = 1, tc = 0;
 	struct xps_map *map, *new_map;
-	bool active = false;
 	unsigned int nr_ids;
 
 	if (dev->num_tc) {
@@ -2672,19 +2672,29 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
 	if (maps_sz < L1_CACHE_BYTES)
 		maps_sz = L1_CACHE_BYTES;
 
+	/* The old dev_maps could be larger or smaller than the one we're
+	 * setting up now, as dev->num_tc could have been updated in between. We
+	 * could try to be smart, but let's be safe instead and only copy
+	 * foreign traffic classes if the two map sizes match.
+	 */
+	if (dev_maps && dev_maps->num_tc == num_tc)
+		copy = true;
+
 	/* allocate memory for queue storage */
 	for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
 	     j < nr_ids;) {
-		if (!new_dev_maps)
-			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
 		if (!new_dev_maps) {
-			mutex_unlock(&xps_map_mutex);
-			return -ENOMEM;
+			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
+			if (!new_dev_maps) {
+				mutex_unlock(&xps_map_mutex);
+				return -ENOMEM;
+			}
+
+			new_dev_maps->num_tc = num_tc;
 		}
 
 		tci = j * num_tc + tc;
-		map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
-				 NULL;
+		map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL;
 
 		map = expand_xps_map(map, j, index, is_rxqs_map);
 		if (!map)
@@ -2706,7 +2716,7 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
 	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
 	     j < nr_ids;) {
 		/* copy maps belonging to foreign traffic classes */
-		for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
+		for (i = tc, tci = j * num_tc; copy && i--; tci++) {
 			/* fill in the new device map from the old device map */
 			map = xmap_dereference(dev_maps->attr_map[tci]);
 			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
@@ -2736,14 +2746,14 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
 					numa_node_id = -1;
 			}
 #endif
-		} else if (dev_maps) {
+		} else if (copy) {
 			/* fill in the new device map from the old device map */
 			map = xmap_dereference(dev_maps->attr_map[tci]);
 			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
 		}
 
 		/* copy maps belonging to foreign traffic classes */
-		for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
+		for (i = num_tc - tc, tci++; copy && --i; tci++) {
 			/* fill in the new device map from the old device map */
 			map = xmap_dereference(dev_maps->attr_map[tci]);
 			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
@@ -2761,11 +2771,18 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
 
 	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
 	     j < nr_ids;) {
-		for (i = num_tc, tci = j * num_tc; i--; tci++) {
-			new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
+		for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) {
 			map = xmap_dereference(dev_maps->attr_map[tci]);
-			if (map && map != new_map)
-				kfree_rcu(map, rcu);
+			if (!map)
+				continue;
+
+			if (copy) {
+				new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
+				if (map == new_map)
+					continue;
+			}
+
+			kfree_rcu(map, rcu);
 		}
 	}
 
@@ -2789,12 +2806,12 @@ out_no_new_maps:
 	/* removes tx-queue from unused CPUs/rx-queues */
 	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
 	     j < nr_ids;) {
-		for (i = tc, tci = j * num_tc; i--; tci++)
+		for (i = tc, tci = j * dev_maps->num_tc; i--; tci++)
 			active |= remove_xps_queue(dev_maps, tci, index);
 		if (!netif_attr_test_mask(j, mask, nr_ids) ||
 		    !netif_attr_test_online(j, online_mask, nr_ids))
 			active |= remove_xps_queue(dev_maps, tci, index);
-		for (i = num_tc - tc, tci++; --i; tci++)
+		for (i = dev_maps->num_tc - tc, tci++; --i; tci++)
 			active |= remove_xps_queue(dev_maps, tci, index);
 	}
 
@@ -2812,7 +2829,7 @@ error:
 	     j < nr_ids;) {
 		for (i = num_tc, tci = j * num_tc; i--; tci++) {
 			new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
-			map = dev_maps ?
+			map = copy ?
 			      xmap_dereference(dev_maps->attr_map[tci]) :
 			      NULL;
 			if (new_map && new_map != map)
@@ -3944,13 +3961,15 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
 			       struct xps_dev_maps *dev_maps, unsigned int tci)
 {
+	int tc = netdev_get_prio_tc_map(dev, skb->priority);
 	struct xps_map *map;
 	int queue_index = -1;
 
-	if (dev->num_tc) {
-		tci *= dev->num_tc;
-		tci += netdev_get_prio_tc_map(dev, skb->priority);
-	}
+	if (tc >= dev_maps->num_tc)
+		return queue_index;
+
+	tci *= dev_maps->num_tc;
+	tci += tc;
 
 	map = rcu_dereference(dev_maps->attr_map[tci]);
 	if (map) {
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 5f76183ad5bc..1364d0f39cb0 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1364,9 +1364,9 @@ static const struct attribute_group dql_group = {
 static ssize_t xps_cpus_show(struct netdev_queue *queue,
 			     char *buf)
 {
-	int j, len, ret, num_tc = 1, tc = 0;
 	struct net_device *dev = queue->dev;
 	struct xps_dev_maps *dev_maps;
+	int j, len, ret, tc = 0;
 	unsigned long *mask;
 	unsigned int index;
 
@@ -1378,22 +1378,13 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue,
 	if (!rtnl_trylock())
 		return restart_syscall();
 
-	if (dev->num_tc) {
-		/* Do not allow XPS on subordinate device directly */
-		num_tc = dev->num_tc;
-		if (num_tc < 0) {
-			ret = -EINVAL;
-			goto err_rtnl_unlock;
-		}
-
-		/* If queue belongs to subordinate dev use its map */
-		dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
+	/* If queue belongs to subordinate dev use its map */
+	dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
 
-		tc = netdev_txq_to_tc(dev, index);
-		if (tc < 0) {
-			ret = -EINVAL;
-			goto err_rtnl_unlock;
-		}
+	tc = netdev_txq_to_tc(dev, index);
+	if (tc < 0) {
+		ret = -EINVAL;
+		goto err_rtnl_unlock;
 	}
 
 	mask = bitmap_zalloc(nr_cpu_ids, GFP_KERNEL);
@@ -1404,12 +1395,12 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue,
 
 	rcu_read_lock();
 	dev_maps = rcu_dereference(dev->xps_cpus_map);
-	if (!dev_maps)
+	if (!dev_maps || tc >= dev_maps->num_tc)
 		goto out_no_maps;
 
 	for (j = -1; j = netif_attrmask_next(j, NULL, nr_cpu_ids),
 	     j < nr_cpu_ids;) {
-		int i, tci = j * num_tc + tc;
+		int i, tci = j * dev_maps->num_tc + tc;
 		struct xps_map *map;
 
 		map = rcu_dereference(dev_maps->attr_map[tci]);
@@ -1480,9 +1471,9 @@ static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init
 
 static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf)
 {
-	int j, len, ret, num_tc = 1, tc = 0;
 	struct net_device *dev = queue->dev;
 	struct xps_dev_maps *dev_maps;
+	int j, len, ret, tc = 0;
 	unsigned long *mask;
 	unsigned int index;
 
@@ -1491,14 +1482,12 @@ static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf)
 	if (!rtnl_trylock())
 		return restart_syscall();
 
-	if (dev->num_tc) {
-		num_tc = dev->num_tc;
-		tc = netdev_txq_to_tc(dev, index);
-		if (tc < 0) {
-			ret = -EINVAL;
-			goto err_rtnl_unlock;
-		}
+	tc = netdev_txq_to_tc(dev, index);
+	if (tc < 0) {
+		ret = -EINVAL;
+		goto err_rtnl_unlock;
 	}
+
 	mask = bitmap_zalloc(dev->num_rx_queues, GFP_KERNEL);
 	if (!mask) {
 		ret = -ENOMEM;
@@ -1507,12 +1496,12 @@ static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf)
 
 	rcu_read_lock();
 	dev_maps = rcu_dereference(dev->xps_rxqs_map);
-	if (!dev_maps)
+	if (!dev_maps || tc >= dev_maps->num_tc)
 		goto out_no_maps;
 
 	for (j = -1; j = netif_attrmask_next(j, NULL, dev->num_rx_queues),
 	     j < dev->num_rx_queues;) {
-		int i, tci = j * num_tc + tc;
+		int i, tci = j * dev_maps->num_tc + tc;
 		struct xps_map *map;
 
 		map = rcu_dereference(dev_maps->attr_map[tci]);
-- 
cgit v1.2.3


From 5478fcd0f48322e04ae6c173ad3a1959e066dc83 Mon Sep 17 00:00:00 2001
From: Antoine Tenart <atenart@kernel.org>
Date: Thu, 18 Mar 2021 19:37:44 +0100
Subject: net: embed nr_ids in the xps maps

Embed nr_ids (the number of CPUs for the xps cpus map, and the number of
rxqs for the xps rxqs map) in dev_maps. That helps avoid out-of-bounds
memory accesses if those values change after dev_maps was allocated.

Suggested-by: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: Antoine Tenart <atenart@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  4 ++++
 net/core/dev.c            | 45 +++++++++++++++++++++------------------------
 net/core/net-sysfs.c      | 38 ++++++++++++++++++++++----------------
 3 files changed, 47 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c38534c55ea1..09e73f5a8c78 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -772,6 +772,9 @@ struct xps_map {
 /*
  * This structure holds all XPS maps for device.  Maps are indexed by CPU.
  *
+ * We keep track of the number of cpus/rxqs used when the struct is allocated,
+ * in nr_ids. This will help not accessing out-of-bound memory.
+ *
  * We keep track of the number of traffic classes used when the struct is
  * allocated, in num_tc. This will be used to navigate the maps, to ensure we're
  * not crossing its upper bound, as the original dev->num_tc can be updated in
@@ -779,6 +782,7 @@ struct xps_map {
  */
 struct xps_dev_maps {
 	struct rcu_head rcu;
+	unsigned int nr_ids;
 	s16 num_tc;
 	struct xps_map __rcu *attr_map[]; /* Either CPUs map or RXQs map */
 };
diff --git a/net/core/dev.c b/net/core/dev.c
index 4e29d1994fdd..7530c95970a0 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2524,14 +2524,14 @@ static void reset_xps_maps(struct net_device *dev,
 }
 
 static void clean_xps_maps(struct net_device *dev, const unsigned long *mask,
-			   struct xps_dev_maps *dev_maps, unsigned int nr_ids,
-			   u16 offset, u16 count, bool is_rxqs_map)
+			   struct xps_dev_maps *dev_maps, u16 offset, u16 count,
+			   bool is_rxqs_map)
 {
+	unsigned int nr_ids = dev_maps->nr_ids;
 	bool active = false;
 	int i, j;
 
-	for (j = -1; j = netif_attrmask_next(j, mask, nr_ids),
-	     j < nr_ids;)
+	for (j = -1; j = netif_attrmask_next(j, mask, nr_ids), j < nr_ids;)
 		active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
 					       count);
 	if (!active)
@@ -2551,7 +2551,6 @@ static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
 {
 	const unsigned long *possible_mask = NULL;
 	struct xps_dev_maps *dev_maps;
-	unsigned int nr_ids;
 
 	if (!static_key_false(&xps_needed))
 		return;
@@ -2561,11 +2560,9 @@ static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
 
 	if (static_key_false(&xps_rxqs_needed)) {
 		dev_maps = xmap_dereference(dev->xps_rxqs_map);
-		if (dev_maps) {
-			nr_ids = dev->num_rx_queues;
-			clean_xps_maps(dev, possible_mask, dev_maps, nr_ids,
-				       offset, count, true);
-		}
+		if (dev_maps)
+			clean_xps_maps(dev, possible_mask, dev_maps, offset,
+				       count, true);
 	}
 
 	dev_maps = xmap_dereference(dev->xps_cpus_map);
@@ -2574,9 +2571,7 @@ static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
 
 	if (num_possible_cpus() > 1)
 		possible_mask = cpumask_bits(cpu_possible_mask);
-	nr_ids = nr_cpu_ids;
-	clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count,
-		       false);
+	clean_xps_maps(dev, possible_mask, dev_maps, offset, count, false);
 
 out_no_maps:
 	mutex_unlock(&xps_map_mutex);
@@ -2673,11 +2668,12 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
 		maps_sz = L1_CACHE_BYTES;
 
 	/* The old dev_maps could be larger or smaller than the one we're
-	 * setting up now, as dev->num_tc could have been updated in between. We
-	 * could try to be smart, but let's be safe instead and only copy
-	 * foreign traffic classes if the two map sizes match.
+	 * setting up now, as dev->num_tc or nr_ids could have been updated in
+	 * between. We could try to be smart, but let's be safe instead and only
+	 * copy foreign traffic classes if the two map sizes match.
 	 */
-	if (dev_maps && dev_maps->num_tc == num_tc)
+	if (dev_maps &&
+	    dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids)
 		copy = true;
 
 	/* allocate memory for queue storage */
@@ -2690,6 +2686,7 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
 				return -ENOMEM;
 			}
 
+			new_dev_maps->nr_ids = nr_ids;
 			new_dev_maps->num_tc = num_tc;
 		}
 
@@ -2770,7 +2767,7 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
 		goto out_no_old_maps;
 
 	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
-	     j < nr_ids;) {
+	     j < dev_maps->nr_ids;) {
 		for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) {
 			map = xmap_dereference(dev_maps->attr_map[tci]);
 			if (!map)
@@ -2804,12 +2801,12 @@ out_no_new_maps:
 		goto out_no_maps;
 
 	/* removes tx-queue from unused CPUs/rx-queues */
-	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
-	     j < nr_ids;) {
+	for (j = -1; j = netif_attrmask_next(j, possible_mask, dev_maps->nr_ids),
+	     j < dev_maps->nr_ids;) {
 		for (i = tc, tci = j * dev_maps->num_tc; i--; tci++)
 			active |= remove_xps_queue(dev_maps, tci, index);
-		if (!netif_attr_test_mask(j, mask, nr_ids) ||
-		    !netif_attr_test_online(j, online_mask, nr_ids))
+		if (!netif_attr_test_mask(j, mask, dev_maps->nr_ids) ||
+		    !netif_attr_test_online(j, online_mask, dev_maps->nr_ids))
 			active |= remove_xps_queue(dev_maps, tci, index);
 		for (i = dev_maps->num_tc - tc, tci++; --i; tci++)
 			active |= remove_xps_queue(dev_maps, tci, index);
@@ -3965,7 +3962,7 @@ static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
 	struct xps_map *map;
 	int queue_index = -1;
 
-	if (tc >= dev_maps->num_tc)
+	if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids)
 		return queue_index;
 
 	tci *= dev_maps->num_tc;
@@ -4004,7 +4001,7 @@ static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
 	if (dev_maps) {
 		int tci = sk_rx_queue_get(sk);
 
-		if (tci >= 0 && tci < dev->num_rx_queues)
+		if (tci >= 0)
 			queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
 							  tci);
 	}
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 1364d0f39cb0..bb08bdc88fa9 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1366,9 +1366,9 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue,
 {
 	struct net_device *dev = queue->dev;
 	struct xps_dev_maps *dev_maps;
+	unsigned int index, nr_ids;
 	int j, len, ret, tc = 0;
 	unsigned long *mask;
-	unsigned int index;
 
 	if (!netif_is_multiqueue(dev))
 		return -ENOENT;
@@ -1387,19 +1387,20 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue,
 		goto err_rtnl_unlock;
 	}
 
-	mask = bitmap_zalloc(nr_cpu_ids, GFP_KERNEL);
+	rcu_read_lock();
+	dev_maps = rcu_dereference(dev->xps_cpus_map);
+	nr_ids = dev_maps ? dev_maps->nr_ids : nr_cpu_ids;
+
+	mask = bitmap_zalloc(nr_ids, GFP_KERNEL);
 	if (!mask) {
 		ret = -ENOMEM;
-		goto err_rtnl_unlock;
+		goto err_rcu_unlock;
 	}
 
-	rcu_read_lock();
-	dev_maps = rcu_dereference(dev->xps_cpus_map);
 	if (!dev_maps || tc >= dev_maps->num_tc)
 		goto out_no_maps;
 
-	for (j = -1; j = netif_attrmask_next(j, NULL, nr_cpu_ids),
-	     j < nr_cpu_ids;) {
+	for (j = -1; j = netif_attrmask_next(j, NULL, nr_ids), j < nr_ids;) {
 		int i, tci = j * dev_maps->num_tc + tc;
 		struct xps_map *map;
 
@@ -1419,10 +1420,12 @@ out_no_maps:
 
 	rtnl_unlock();
 
-	len = bitmap_print_to_pagebuf(false, buf, mask, nr_cpu_ids);
+	len = bitmap_print_to_pagebuf(false, buf, mask, nr_ids);
 	bitmap_free(mask);
 	return len < PAGE_SIZE ? len : -EINVAL;
 
+err_rcu_unlock:
+	rcu_read_unlock();
 err_rtnl_unlock:
 	rtnl_unlock();
 	return ret;
@@ -1473,9 +1476,9 @@ static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf)
 {
 	struct net_device *dev = queue->dev;
 	struct xps_dev_maps *dev_maps;
+	unsigned int index, nr_ids;
 	int j, len, ret, tc = 0;
 	unsigned long *mask;
-	unsigned int index;
 
 	index = get_netdev_queue_index(queue);
 
@@ -1488,19 +1491,20 @@ static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf)
 		goto err_rtnl_unlock;
 	}
 
-	mask = bitmap_zalloc(dev->num_rx_queues, GFP_KERNEL);
+	rcu_read_lock();
+	dev_maps = rcu_dereference(dev->xps_rxqs_map);
+	nr_ids = dev_maps ? dev_maps->nr_ids : dev->num_rx_queues;
+
+	mask = bitmap_zalloc(nr_ids, GFP_KERNEL);
 	if (!mask) {
 		ret = -ENOMEM;
-		goto err_rtnl_unlock;
+		goto err_rcu_unlock;
 	}
 
-	rcu_read_lock();
-	dev_maps = rcu_dereference(dev->xps_rxqs_map);
 	if (!dev_maps || tc >= dev_maps->num_tc)
 		goto out_no_maps;
 
-	for (j = -1; j = netif_attrmask_next(j, NULL, dev->num_rx_queues),
-	     j < dev->num_rx_queues;) {
+	for (j = -1; j = netif_attrmask_next(j, NULL, nr_ids), j < nr_ids;) {
 		int i, tci = j * dev_maps->num_tc + tc;
 		struct xps_map *map;
 
@@ -1520,11 +1524,13 @@ out_no_maps:
 
 	rtnl_unlock();
 
-	len = bitmap_print_to_pagebuf(false, buf, mask, dev->num_rx_queues);
+	len = bitmap_print_to_pagebuf(false, buf, mask, nr_ids);
 	bitmap_free(mask);
 
 	return len < PAGE_SIZE ? len : -EINVAL;
 
+err_rcu_unlock:
+	rcu_read_unlock();
 err_rtnl_unlock:
 	rtnl_unlock();
 	return ret;
-- 
cgit v1.2.3


From 044ab86d431b59b88966457dbb62679f274ec442 Mon Sep 17 00:00:00 2001
From: Antoine Tenart <atenart@kernel.org>
Date: Thu, 18 Mar 2021 19:37:46 +0100
Subject: net: move the xps maps to an array

Move the xps maps (xps_cpus_map and xps_rxqs_map) to an array in
net_device. That simplifies the code a lot, removing the need for many
if/else conditionals, as the correct map is available using its
offset in the array.

This should not modify the xps maps behaviour in any way.

Suggested-by: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: Antoine Tenart <atenart@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/virtio_net.c  |  2 +-
 include/linux/netdevice.h | 17 +++++++----
 net/core/dev.c            | 73 ++++++++++++++++++++---------------------------
 net/core/net-sysfs.c      |  6 ++--
 4 files changed, 46 insertions(+), 52 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 77ba8e2fc11c..584a9bd59dda 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -2015,7 +2015,7 @@ static void virtnet_set_affinity(struct virtnet_info *vi)
 		}
 		virtqueue_set_affinity(vi->rq[i].vq, mask);
 		virtqueue_set_affinity(vi->sq[i].vq, mask);
-		__netif_set_xps_queue(vi->dev, cpumask_bits(mask), i, false);
+		__netif_set_xps_queue(vi->dev, cpumask_bits(mask), i, XPS_CPUS);
 		cpumask_clear(mask);
 	}
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 09e73f5a8c78..4940509999be 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -754,6 +754,13 @@ struct rx_queue_attribute {
 			 const char *buf, size_t len);
 };
 
+/* XPS map type and offset of the xps map within net_device->xps_maps[]. */
+enum xps_map_type {
+	XPS_CPUS = 0,
+	XPS_RXQS,
+	XPS_MAPS_MAX,
+};
+
 #ifdef CONFIG_XPS
 /*
  * This structure holds an XPS map which can be of variable length.  The
@@ -1773,8 +1780,7 @@ enum netdev_ml_priv_type {
  *	@tx_queue_len:		Max frames per queue allowed
  *	@tx_global_lock: 	XXX: need comments on this one
  *	@xdp_bulkq:		XDP device bulk queue
- *	@xps_cpus_map:		all CPUs map for XPS device
- *	@xps_rxqs_map:		all RXQs map for XPS device
+ *	@xps_maps:		all CPUs/RXQs maps for XPS device
  *
  *	@xps_maps:	XXX: need comments on this one
  *	@miniq_egress:		clsact qdisc specific data for
@@ -2070,8 +2076,7 @@ struct net_device {
 	struct xdp_dev_bulk_queue __percpu *xdp_bulkq;
 
 #ifdef CONFIG_XPS
-	struct xps_dev_maps __rcu *xps_cpus_map;
-	struct xps_dev_maps __rcu *xps_rxqs_map;
+	struct xps_dev_maps __rcu *xps_maps[XPS_MAPS_MAX];
 #endif
 #ifdef CONFIG_NET_CLS_ACT
 	struct mini_Qdisc __rcu	*miniq_egress;
@@ -3701,7 +3706,7 @@ static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
 			u16 index);
 int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
-			  u16 index, bool is_rxqs_map);
+			  u16 index, enum xps_map_type type);
 
 /**
  *	netif_attr_test_mask - Test a CPU or Rx queue set in a mask
@@ -3796,7 +3801,7 @@ static inline int netif_set_xps_queue(struct net_device *dev,
 
 static inline int __netif_set_xps_queue(struct net_device *dev,
 					const unsigned long *mask,
-					u16 index, bool is_rxqs_map)
+					u16 index, enum xps_map_type type)
 {
 	return 0;
 }
diff --git a/net/core/dev.c b/net/core/dev.c
index 3ed8cb3a4061..af57e32bb543 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2511,31 +2511,34 @@ static bool remove_xps_queue_cpu(struct net_device *dev,
 
 static void reset_xps_maps(struct net_device *dev,
 			   struct xps_dev_maps *dev_maps,
-			   bool is_rxqs_map)
+			   enum xps_map_type type)
 {
-	if (is_rxqs_map) {
-		static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
-		RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
-	} else {
-		RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
-	}
 	static_key_slow_dec_cpuslocked(&xps_needed);
+	if (type == XPS_RXQS)
+		static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
+
+	RCU_INIT_POINTER(dev->xps_maps[type], NULL);
+
 	kfree_rcu(dev_maps, rcu);
 }
 
-static void clean_xps_maps(struct net_device *dev,
-			   struct xps_dev_maps *dev_maps, u16 offset, u16 count,
-			   bool is_rxqs_map)
+static void clean_xps_maps(struct net_device *dev, enum xps_map_type type,
+			   u16 offset, u16 count)
 {
+	struct xps_dev_maps *dev_maps;
 	bool active = false;
 	int i, j;
 
+	dev_maps = xmap_dereference(dev->xps_maps[type]);
+	if (!dev_maps)
+		return;
+
 	for (j = 0; j < dev_maps->nr_ids; j++)
 		active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count);
 	if (!active)
-		reset_xps_maps(dev, dev_maps, is_rxqs_map);
+		reset_xps_maps(dev, dev_maps, type);
 
-	if (!is_rxqs_map) {
+	if (type == XPS_CPUS) {
 		for (i = offset + (count - 1); count--; i--)
 			netdev_queue_numa_node_write(
 				netdev_get_tx_queue(dev, i), NUMA_NO_NODE);
@@ -2545,27 +2548,17 @@ static void clean_xps_maps(struct net_device *dev,
 static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
 				   u16 count)
 {
-	struct xps_dev_maps *dev_maps;
-
 	if (!static_key_false(&xps_needed))
 		return;
 
 	cpus_read_lock();
 	mutex_lock(&xps_map_mutex);
 
-	if (static_key_false(&xps_rxqs_needed)) {
-		dev_maps = xmap_dereference(dev->xps_rxqs_map);
-		if (dev_maps)
-			clean_xps_maps(dev, dev_maps, offset, count, true);
-	}
-
-	dev_maps = xmap_dereference(dev->xps_cpus_map);
-	if (!dev_maps)
-		goto out_no_maps;
+	if (static_key_false(&xps_rxqs_needed))
+		clean_xps_maps(dev, XPS_RXQS, offset, count);
 
-	clean_xps_maps(dev, dev_maps, offset, count, false);
+	clean_xps_maps(dev, XPS_CPUS, offset, count);
 
-out_no_maps:
 	mutex_unlock(&xps_map_mutex);
 	cpus_read_unlock();
 }
@@ -2617,7 +2610,7 @@ static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
 
 /* Must be called under cpus_read_lock */
 int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
-			  u16 index, bool is_rxqs_map)
+			  u16 index, enum xps_map_type type)
 {
 	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
 	const unsigned long *online_mask = NULL;
@@ -2642,15 +2635,15 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
 	}
 
 	mutex_lock(&xps_map_mutex);
-	if (is_rxqs_map) {
+
+	dev_maps = xmap_dereference(dev->xps_maps[type]);
+	if (type == XPS_RXQS) {
 		maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
-		dev_maps = xmap_dereference(dev->xps_rxqs_map);
 		nr_ids = dev->num_rx_queues;
 	} else {
 		maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
 		if (num_possible_cpus() > 1)
 			online_mask = cpumask_bits(cpu_online_mask);
-		dev_maps = xmap_dereference(dev->xps_cpus_map);
 		nr_ids = nr_cpu_ids;
 	}
 
@@ -2683,7 +2676,7 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
 		tci = j * num_tc + tc;
 		map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL;
 
-		map = expand_xps_map(map, j, index, is_rxqs_map);
+		map = expand_xps_map(map, j, index, type == XPS_RXQS);
 		if (!map)
 			goto error;
 
@@ -2696,7 +2689,7 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
 	if (!dev_maps) {
 		/* Increment static keys at most once per type */
 		static_key_slow_inc_cpuslocked(&xps_needed);
-		if (is_rxqs_map)
+		if (type == XPS_RXQS)
 			static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
 	}
 
@@ -2725,7 +2718,7 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
 			if (pos == map->len)
 				map->queues[map->len++] = index;
 #ifdef CONFIG_NUMA
-			if (!is_rxqs_map) {
+			if (type == XPS_CPUS) {
 				if (numa_node_id == -2)
 					numa_node_id = cpu_to_node(j);
 				else if (numa_node_id != cpu_to_node(j))
@@ -2746,10 +2739,7 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
 		}
 	}
 
-	if (is_rxqs_map)
-		rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps);
-	else
-		rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps);
+	rcu_assign_pointer(dev->xps_maps[type], new_dev_maps);
 
 	/* Cleanup old maps */
 	if (!dev_maps)
@@ -2778,12 +2768,11 @@ out_no_old_maps:
 	active = true;
 
 out_no_new_maps:
-	if (!is_rxqs_map) {
+	if (type == XPS_CPUS)
 		/* update Tx queue numa node */
 		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
 					     (numa_node_id >= 0) ?
 					     numa_node_id : NUMA_NO_NODE);
-	}
 
 	if (!dev_maps)
 		goto out_no_maps;
@@ -2801,7 +2790,7 @@ out_no_new_maps:
 
 	/* free map if not active */
 	if (!active)
-		reset_xps_maps(dev, dev_maps, is_rxqs_map);
+		reset_xps_maps(dev, dev_maps, type);
 
 out_no_maps:
 	mutex_unlock(&xps_map_mutex);
@@ -2833,7 +2822,7 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
 	int ret;
 
 	cpus_read_lock();
-	ret =  __netif_set_xps_queue(dev, cpumask_bits(mask), index, false);
+	ret =  __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS);
 	cpus_read_unlock();
 
 	return ret;
@@ -3983,7 +3972,7 @@ static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
 	if (!static_key_false(&xps_rxqs_needed))
 		goto get_cpus_map;
 
-	dev_maps = rcu_dereference(sb_dev->xps_rxqs_map);
+	dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]);
 	if (dev_maps) {
 		int tci = sk_rx_queue_get(sk);
 
@@ -3994,7 +3983,7 @@ static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
 
 get_cpus_map:
 	if (queue_index < 0) {
-		dev_maps = rcu_dereference(sb_dev->xps_cpus_map);
+		dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]);
 		if (dev_maps) {
 			unsigned int tci = skb->sender_cpu - 1;
 
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index c762c435ff76..ca1f3b63cfad 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1388,7 +1388,7 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue,
 	}
 
 	rcu_read_lock();
-	dev_maps = rcu_dereference(dev->xps_cpus_map);
+	dev_maps = rcu_dereference(dev->xps_maps[XPS_CPUS]);
 	nr_ids = dev_maps ? dev_maps->nr_ids : nr_cpu_ids;
 
 	mask = bitmap_zalloc(nr_ids, GFP_KERNEL);
@@ -1492,7 +1492,7 @@ static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf)
 	}
 
 	rcu_read_lock();
-	dev_maps = rcu_dereference(dev->xps_rxqs_map);
+	dev_maps = rcu_dereference(dev->xps_maps[XPS_RXQS]);
 	nr_ids = dev_maps ? dev_maps->nr_ids : dev->num_rx_queues;
 
 	mask = bitmap_zalloc(nr_ids, GFP_KERNEL);
@@ -1566,7 +1566,7 @@ static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,
 	}
 
 	cpus_read_lock();
-	err = __netif_set_xps_queue(dev, mask, index, true);
+	err = __netif_set_xps_queue(dev, mask, index, XPS_RXQS);
 	cpus_read_unlock();
 
 	rtnl_unlock();
-- 
cgit v1.2.3


From 76da35dc99afb460b9c335182ba6a3e7ff924186 Mon Sep 17 00:00:00 2001
From: "Wong, Vee Khee" <vee.khee.wong@intel.com>
Date: Wed, 17 Mar 2021 09:32:47 +0800
Subject: stmmac: intel: Add PSE and PCH PTP clock source selection

The Intel mGbE variant implemented in EHL and TGL can be set to select
different clock frequencies based on GPO bits in MAC_GPIO_STATUS register.

We introduce a new "void (*ptp_clk_freq_config)(void *priv)" callback in
platform data so that a platform that needs to configure the frequency of
its PTP clock source (as Intel mGbE does) has its platform-specific PTP
clock configuration applied when stmmac_ptp_register() is called.

Signed-off-by: Wong, Vee Khee <vee.khee.wong@intel.com>
Signed-off-by: Voon Weifeng <weifeng.voon@intel.com>
Co-developed-by: Ong Boon Leong <boon.leong.ong@intel.com>
Signed-off-by: Ong Boon Leong <boon.leong.ong@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c | 46 +++++++++++++++++++++++
 drivers/net/ethernet/stmicro/stmmac/dwmac4.h      |  7 ++++
 drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c  |  3 ++
 include/linux/stmmac.h                            |  1 +
 4 files changed, 57 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
index c49646773871..763b549e3c2d 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
@@ -12,8 +12,18 @@
 #define INTEL_MGBE_ADHOC_ADDR	0x15
 #define INTEL_MGBE_XPCS_ADDR	0x16
 
+/* Selection for PTP Clock Freq belongs to PSE & PCH GbE */
+#define PSE_PTP_CLK_FREQ_MASK		(GMAC_GPO0 | GMAC_GPO3)
+#define PSE_PTP_CLK_FREQ_19_2MHZ	(GMAC_GPO0)
+#define PSE_PTP_CLK_FREQ_200MHZ		(GMAC_GPO0 | GMAC_GPO3)
+#define PSE_PTP_CLK_FREQ_256MHZ		(0)
+#define PCH_PTP_CLK_FREQ_MASK		(GMAC_GPO0)
+#define PCH_PTP_CLK_FREQ_19_2MHZ	(GMAC_GPO0)
+#define PCH_PTP_CLK_FREQ_200MHZ		(0)
+
 struct intel_priv_data {
 	int mdio_adhoc_addr;	/* mdio address for serdes & etc */
+	bool is_pse;
 };
 
 /* This struct is used to associate PCI Function of MAC controller on a board,
@@ -204,6 +214,32 @@ static void intel_serdes_powerdown(struct net_device *ndev, void *intel_data)
 	}
 }
 
+/* Program PTP Clock Frequency for different variant of
+ * Intel mGBE that has slightly different GPO mapping
+ */
+static void intel_mgbe_ptp_clk_freq_config(void *npriv)
+{
+	struct stmmac_priv *priv = (struct stmmac_priv *)npriv;
+	struct intel_priv_data *intel_priv;
+	u32 gpio_value;
+
+	intel_priv = (struct intel_priv_data *)priv->plat->bsp_priv;
+
+	gpio_value = readl(priv->ioaddr + GMAC_GPIO_STATUS);
+
+	if (intel_priv->is_pse) {
+		/* For PSE GbE, use 200MHz */
+		gpio_value &= ~PSE_PTP_CLK_FREQ_MASK;
+		gpio_value |= PSE_PTP_CLK_FREQ_200MHZ;
+	} else {
+		/* For PCH GbE, use 200MHz */
+		gpio_value &= ~PCH_PTP_CLK_FREQ_MASK;
+		gpio_value |= PCH_PTP_CLK_FREQ_200MHZ;
+	}
+
+	writel(gpio_value, priv->ioaddr + GMAC_GPIO_STATUS);
+}
+
 static void common_default_data(struct plat_stmmacenet_data *plat)
 {
 	plat->clk_csr = 2;	/* clk_csr_i = 20-35MHz & MDC = clk_csr_i/16 */
@@ -322,6 +358,8 @@ static int intel_mgbe_common_data(struct pci_dev *pdev,
 		return ret;
 	}
 
+	plat->ptp_clk_freq_config = intel_mgbe_ptp_clk_freq_config;
+
 	/* Set default value for multicast hash bins */
 	plat->multicast_filter_bins = HASH_TABLE_SIZE;
 
@@ -391,8 +429,12 @@ static struct stmmac_pci_info ehl_rgmii1g_info = {
 static int ehl_pse0_common_data(struct pci_dev *pdev,
 				struct plat_stmmacenet_data *plat)
 {
+	struct intel_priv_data *intel_priv = plat->bsp_priv;
+
+	intel_priv->is_pse = true;
 	plat->bus_id = 2;
 	plat->addr64 = 32;
+
 	return ehl_common_data(pdev, plat);
 }
 
@@ -423,8 +465,12 @@ static struct stmmac_pci_info ehl_pse0_sgmii1g_info = {
 static int ehl_pse1_common_data(struct pci_dev *pdev,
 				struct plat_stmmacenet_data *plat)
 {
+	struct intel_priv_data *intel_priv = plat->bsp_priv;
+
+	intel_priv->is_pse = true;
 	plat->bus_id = 3;
 	plat->addr64 = 32;
+
 	return ehl_common_data(pdev, plat);
 }
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
index 82df91c130f7..ef8502d2b6e6 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
@@ -42,6 +42,7 @@
 #define GMAC_HW_FEATURE3		0x00000128
 #define GMAC_MDIO_ADDR			0x00000200
 #define GMAC_MDIO_DATA			0x00000204
+#define GMAC_GPIO_STATUS		0x0000020C
 #define GMAC_ARP_ADDR			0x00000210
 #define GMAC_ADDR_HIGH(reg)		(0x300 + reg * 8)
 #define GMAC_ADDR_LOW(reg)		(0x304 + reg * 8)
@@ -278,6 +279,12 @@ enum power_event {
 #define GMAC_HW_FEAT_DVLAN		BIT(5)
 #define GMAC_HW_FEAT_NRVF		GENMASK(2, 0)
 
+/* GMAC GPIO Status reg */
+#define GMAC_GPO0			BIT(16)
+#define GMAC_GPO1			BIT(17)
+#define GMAC_GPO2			BIT(18)
+#define GMAC_GPO3			BIT(19)
+
 /* MAC HW ADDR regs */
 #define GMAC_HI_DCS			GENMASK(18, 16)
 #define GMAC_HI_DCS_SHIFT		16
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
index 0989e2bb6ee3..8b10fd10446f 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
@@ -192,6 +192,9 @@ void stmmac_ptp_register(struct stmmac_priv *priv)
 {
 	int i;
 
+	if (priv->plat->ptp_clk_freq_config)
+		priv->plat->ptp_clk_freq_config(priv);
+
 	for (i = 0; i < priv->dma_cap.pps_out_num; i++) {
 		if (i >= STMMAC_PPS_MAX)
 			break;
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 51004ebd0540..10abc80b601e 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -181,6 +181,7 @@ struct plat_stmmacenet_data {
 	void (*fix_mac_speed)(void *priv, unsigned int speed);
 	int (*serdes_powerup)(struct net_device *ndev, void *priv);
 	void (*serdes_powerdown)(struct net_device *ndev, void *priv);
+	void (*ptp_clk_freq_config)(void *priv);
 	int (*init)(struct platform_device *pdev, void *priv);
 	void (*exit)(struct platform_device *pdev, void *priv);
 	struct mac_device_info *(*setup)(void *priv);
-- 
cgit v1.2.3


From 035e9f471691a16c32b389c8b2f236043a2a50d7 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Tue, 16 Mar 2021 20:26:48 -0700
Subject: scsi: sbitmap: Silence a debug kernel warning triggered by
 sbitmap_put()

All sbitmap code uses implied preemption protection to update
sb->alloc_hint except sbitmap_put(). Using implied preemption protection is
safe since the value of sb->alloc_hint only affects performance of sbitmap
allocations but not their correctness. Change this_cpu_ptr() in
sbitmap_put() into raw_cpu_ptr() to suppress the following kernel warning
that appears with preemption debugging enabled (CONFIG_DEBUG_PREEMPT):

BUG: using smp_processor_id() in preemptible [00000000] code: scsi_eh_0/152
caller is debug_smp_processor_id+0x17/0x20
CPU: 1 PID: 152 Comm: scsi_eh_0 Tainted: G        W         5.12.0-rc1-dbg+ #6
Call Trace:
 show_stack+0x52/0x58
 dump_stack+0xaf/0xf3
 check_preemption_disabled+0xce/0xd0
 debug_smp_processor_id+0x17/0x20
 scsi_device_unbusy+0x13a/0x1c0 [scsi_mod]
 scsi_finish_command+0x4d/0x290 [scsi_mod]
 scsi_eh_flush_done_q+0x1e7/0x280 [scsi_mod]
 ata_scsi_port_error_handler+0x592/0x750 [libata]
 ata_scsi_error+0x1a0/0x1f0 [libata]
 scsi_error_handler+0x19e/0x330 [scsi_mod]
 kthread+0x222/0x250
 ret_from_fork+0x1f/0x30

Link: https://lore.kernel.org/r/20210317032648.9080-1-bvanassche@acm.org
Fixes: c548e62bcf6a ("scsi: sbitmap: Move allocation hint into sbitmap")
Cc: Hannes Reinecke <hare@suse.de>
Cc: Omar Sandoval <osandov@fb.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 include/linux/sbitmap.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index 3087e1f15fdd..2713e689ad66 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -324,7 +324,7 @@ static inline void sbitmap_put(struct sbitmap *sb, unsigned int bitnr)
 	sbitmap_deferred_clear_bit(sb, bitnr);
 
 	if (likely(sb->alloc_hint && !sb->round_robin && bitnr < sb->depth))
-		*this_cpu_ptr(sb->alloc_hint) = bitnr;
+		*raw_cpu_ptr(sb->alloc_hint) = bitnr;
 }
 
 static inline int sbitmap_test_bit(struct sbitmap *sb, unsigned int bitnr)
-- 
cgit v1.2.3


From 73f620951b2b594bdc38722c0d647c3b3312af7a Mon Sep 17 00:00:00 2001
From: Claire Chang <tientzu@chromium.org>
Date: Thu, 18 Mar 2021 17:14:22 +0100
Subject: swiotlb: move global variables into a new io_tlb_mem structure

Added a new struct, io_tlb_mem, as the IO TLB memory pool descriptor and
moved relevant global variables into that struct.
This will be useful later to allow for restricted DMA pool.

Signed-off-by: Claire Chang <tientzu@chromium.org>
[hch: rebased]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/xen/swiotlb-xen.c |   2 +-
 include/linux/swiotlb.h   |  43 +++++-
 kernel/dma/swiotlb.c      | 354 +++++++++++++++++++++-------------------------
 3 files changed, 206 insertions(+), 193 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index 4ecfce2c6f72..5329ad54a5f3 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -548,7 +548,7 @@ xen_swiotlb_sync_sg_for_device(struct device *dev, struct scatterlist *sgl,
 static int
 xen_swiotlb_dma_supported(struct device *hwdev, u64 mask)
 {
-	return xen_phys_to_dma(hwdev, io_tlb_end - 1) <= mask;
+	return xen_phys_to_dma(hwdev, io_tlb_default_mem.end - 1) <= mask;
 }
 
 const struct dma_map_ops xen_swiotlb_dma_ops = {
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 0696bdc8072e..5ec5378b17c3 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -6,6 +6,7 @@
 #include <linux/init.h>
 #include <linux/types.h>
 #include <linux/limits.h>
+#include <linux/spinlock.h>
 
 struct device;
 struct page;
@@ -61,11 +62,49 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t phys,
 
 #ifdef CONFIG_SWIOTLB
 extern enum swiotlb_force swiotlb_force;
-extern phys_addr_t io_tlb_start, io_tlb_end;
+
+/**
+ * struct io_tlb_mem - IO TLB Memory Pool Descriptor
+ *
+ * @start:	The start address of the swiotlb memory pool. Used to do a quick
+ *		range check to see if the memory was in fact allocated by this
+ *		API.
+ * @end:	The end address of the swiotlb memory pool. Used to do a quick
+ *		range check to see if the memory was in fact allocated by this
+ *		API.
+ * @nslabs:	The number of IO TLB blocks (in groups of 64) between @start and
+ *		@end. This is command line adjustable via setup_io_tlb_npages.
+ * @used:	The number of used IO TLB block.
+ * @list:	The free list describing the number of free entries available
+ *		from each index.
+ * @index:	The index to start searching in the next round.
+ * @orig_addr:	The original address corresponding to a mapped entry.
+ * @alloc_size:	Size of the allocated buffer.
+ * @lock:	The lock to protect the above data structures in the map and
+ *		unmap calls.
+ * @debugfs:	The dentry to debugfs.
+ * @late_alloc:	%true if allocated using the page allocator
+ */
+struct io_tlb_mem {
+	phys_addr_t start;
+	phys_addr_t end;
+	unsigned long nslabs;
+	unsigned long used;
+	unsigned int *list;
+	unsigned int index;
+	phys_addr_t *orig_addr;
+	size_t *alloc_size;
+	spinlock_t lock;
+	struct dentry *debugfs;
+	bool late_alloc;
+};
+extern struct io_tlb_mem io_tlb_default_mem;
 
 static inline bool is_swiotlb_buffer(phys_addr_t paddr)
 {
-	return paddr >= io_tlb_start && paddr < io_tlb_end;
+	struct io_tlb_mem *mem = &io_tlb_default_mem;
+
+	return paddr >= mem->start && paddr < mem->end;
 }
 
 void __init swiotlb_exit(void);
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 35e24f0ff8b2..d9c097f0f78c 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -59,32 +59,11 @@
  */
 #define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT)
 
-enum swiotlb_force swiotlb_force;
-
-/*
- * Used to do a quick range check in swiotlb_tbl_unmap_single and
- * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this
- * API.
- */
-phys_addr_t io_tlb_start, io_tlb_end;
-
-/*
- * The number of IO TLB blocks (in groups of 64) between io_tlb_start and
- * io_tlb_end.  This is command line adjustable via setup_io_tlb_npages.
- */
-static unsigned long io_tlb_nslabs;
+#define INVALID_PHYS_ADDR (~(phys_addr_t)0)
 
-/*
- * The number of used IO TLB block
- */
-static unsigned long io_tlb_used;
+enum swiotlb_force swiotlb_force;
 
-/*
- * This is a free list describing the number of free entries available from
- * each index
- */
-static unsigned int *io_tlb_list;
-static unsigned int io_tlb_index;
+struct io_tlb_mem io_tlb_default_mem;
 
 /*
  * Max segment that we can provide which (if pages are contingous) will
@@ -92,32 +71,15 @@ static unsigned int io_tlb_index;
  */
 static unsigned int max_segment;
 
-/*
- * We need to save away the original address corresponding to a mapped entry
- * for the sync operations.
- */
-#define INVALID_PHYS_ADDR (~(phys_addr_t)0)
-static phys_addr_t *io_tlb_orig_addr;
-
-/*
- * The mapped buffer's size should be validated during a sync operation.
- */
-static size_t *io_tlb_alloc_size;
-
-/*
- * Protect the above data structures in the map and unmap calls
- */
-static DEFINE_SPINLOCK(io_tlb_lock);
-
-static int late_alloc;
-
 static int __init
 setup_io_tlb_npages(char *str)
 {
+	struct io_tlb_mem *mem = &io_tlb_default_mem;
+
 	if (isdigit(*str)) {
-		io_tlb_nslabs = simple_strtoul(str, &str, 0);
+		mem->nslabs = simple_strtoul(str, &str, 0);
 		/* avoid tail segment of size < IO_TLB_SEGSIZE */
-		io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
+		mem->nslabs = ALIGN(mem->nslabs, IO_TLB_SEGSIZE);
 	}
 	if (*str == ',')
 		++str;
@@ -125,7 +87,7 @@ setup_io_tlb_npages(char *str)
 		swiotlb_force = SWIOTLB_FORCE;
 	} else if (!strcmp(str, "noforce")) {
 		swiotlb_force = SWIOTLB_NO_FORCE;
-		io_tlb_nslabs = 1;
+		mem->nslabs = 1;
 	}
 
 	return 0;
@@ -136,7 +98,7 @@ static bool no_iotlb_memory;
 
 unsigned long swiotlb_nr_tbl(void)
 {
-	return unlikely(no_iotlb_memory) ? 0 : io_tlb_nslabs;
+	return unlikely(no_iotlb_memory) ? 0 : io_tlb_default_mem.nslabs;
 }
 EXPORT_SYMBOL_GPL(swiotlb_nr_tbl);
 
@@ -158,13 +120,14 @@ unsigned long swiotlb_size_or_default(void)
 {
 	unsigned long size;
 
-	size = io_tlb_nslabs << IO_TLB_SHIFT;
+	size = io_tlb_default_mem.nslabs << IO_TLB_SHIFT;
 
 	return size ? size : (IO_TLB_DEFAULT_SIZE);
 }
 
 void __init swiotlb_adjust_size(unsigned long new_size)
 {
+	struct io_tlb_mem *mem = &io_tlb_default_mem;
 	unsigned long size;
 
 	/*
@@ -172,10 +135,10 @@ void __init swiotlb_adjust_size(unsigned long new_size)
 	 * architectures such as those supporting memory encryption to
 	 * adjust/expand SWIOTLB size for their use.
 	 */
-	if (!io_tlb_nslabs) {
+	if (!mem->nslabs) {
 		size = ALIGN(new_size, IO_TLB_SIZE);
-		io_tlb_nslabs = size >> IO_TLB_SHIFT;
-		io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
+		mem->nslabs = size >> IO_TLB_SHIFT;
+		mem->nslabs = ALIGN(mem->nslabs, IO_TLB_SEGSIZE);
 
 		pr_info("SWIOTLB bounce buffer size adjusted to %luMB", size >> 20);
 	}
@@ -183,14 +146,15 @@ void __init swiotlb_adjust_size(unsigned long new_size)
 
 void swiotlb_print_info(void)
 {
-	unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT;
+	struct io_tlb_mem *mem = &io_tlb_default_mem;
+	unsigned long bytes = mem->nslabs << IO_TLB_SHIFT;
 
 	if (no_iotlb_memory) {
 		pr_warn("No low mem\n");
 		return;
 	}
 
-	pr_info("mapped [mem %pa-%pa] (%luMB)\n", &io_tlb_start, &io_tlb_end,
+	pr_info("mapped [mem %pa-%pa] (%luMB)\n", &mem->start, &mem->end,
 	       bytes >> 20);
 }
 
@@ -212,68 +176,71 @@ static inline unsigned long nr_slots(u64 val)
  */
 void __init swiotlb_update_mem_attributes(void)
 {
+	struct io_tlb_mem *mem = &io_tlb_default_mem;
 	void *vaddr;
 	unsigned long bytes;
 
-	if (no_iotlb_memory || late_alloc)
+	if (no_iotlb_memory || mem->late_alloc)
 		return;
 
-	vaddr = phys_to_virt(io_tlb_start);
-	bytes = PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT);
+	vaddr = phys_to_virt(mem->start);
+	bytes = PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT);
 	set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT);
 	memset(vaddr, 0, bytes);
 }
 
 int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
 {
+	struct io_tlb_mem *mem = &io_tlb_default_mem;
 	unsigned long i, bytes;
 	size_t alloc_size;
 
 	/* protect against double initialization */
-	if (WARN_ON_ONCE(io_tlb_start))
+	if (WARN_ON_ONCE(mem->start))
 		return -ENOMEM;
 
 	bytes = nslabs << IO_TLB_SHIFT;
 
-	io_tlb_nslabs = nslabs;
-	io_tlb_start = __pa(tlb);
-	io_tlb_end = io_tlb_start + bytes;
+	mem->nslabs = nslabs;
+	mem->start = __pa(tlb);
+	mem->end = mem->start + bytes;
+	mem->index = 0;
+	spin_lock_init(&mem->lock);
 
 	/*
 	 * Allocate and initialize the free list array.  This array is used
 	 * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
-	 * between io_tlb_start and io_tlb_end.
+	 * between mem->start and mem->end.
 	 */
-	alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(int));
-	io_tlb_list = memblock_alloc(alloc_size, PAGE_SIZE);
-	if (!io_tlb_list)
+	alloc_size = PAGE_ALIGN(mem->nslabs * sizeof(int));
+	mem->list = memblock_alloc(alloc_size, PAGE_SIZE);
+	if (!mem->list)
 		panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
 		      __func__, alloc_size, PAGE_SIZE);
 
-	alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t));
-	io_tlb_orig_addr = memblock_alloc(alloc_size, PAGE_SIZE);
-	if (!io_tlb_orig_addr)
+	alloc_size = PAGE_ALIGN(mem->nslabs * sizeof(phys_addr_t));
+	mem->orig_addr = memblock_alloc(alloc_size, PAGE_SIZE);
+	if (!mem->orig_addr)
 		panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
 		      __func__, alloc_size, PAGE_SIZE);
 
-	alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(size_t));
-	io_tlb_alloc_size = memblock_alloc(alloc_size, PAGE_SIZE);
-	if (!io_tlb_alloc_size)
+	alloc_size = PAGE_ALIGN(mem->nslabs * sizeof(size_t));
+	mem->alloc_size = memblock_alloc(alloc_size, PAGE_SIZE);
+	if (!mem->alloc_size)
 		panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
 		      __func__, alloc_size, PAGE_SIZE);
 
-	for (i = 0; i < io_tlb_nslabs; i++) {
-		io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i);
-		io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;
-		io_tlb_alloc_size[i] = 0;
+	for (i = 0; i < mem->nslabs; i++) {
+		mem->list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i);
+		mem->orig_addr[i] = INVALID_PHYS_ADDR;
+		mem->alloc_size[i] = 0;
 	}
-	io_tlb_index = 0;
 	no_iotlb_memory = false;
 
 	if (verbose)
 		swiotlb_print_info();
 
-	swiotlb_set_max_segment(io_tlb_nslabs << IO_TLB_SHIFT);
+	swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT);
 	return 0;
 }
 
 
@@ -284,26 +251,27 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
 void  __init
 swiotlb_init(int verbose)
 {
+	struct io_tlb_mem *mem = &io_tlb_default_mem;
 	size_t default_size = IO_TLB_DEFAULT_SIZE;
 	unsigned char *vstart;
 	unsigned long bytes;
 
-	if (!io_tlb_nslabs) {
-		io_tlb_nslabs = (default_size >> IO_TLB_SHIFT);
-		io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
+	if (!mem->nslabs) {
+		mem->nslabs = (default_size >> IO_TLB_SHIFT);
+		mem->nslabs = ALIGN(mem->nslabs, IO_TLB_SEGSIZE);
 	}
 
-	bytes = io_tlb_nslabs << IO_TLB_SHIFT;
+	bytes = mem->nslabs << IO_TLB_SHIFT;
 
 	/* Get IO TLB memory from the low pages */
 	vstart = memblock_alloc_low(PAGE_ALIGN(bytes), PAGE_SIZE);
-	if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose))
+	if (vstart && !swiotlb_init_with_tbl(vstart, mem->nslabs, verbose))
 		return;
 
-	if (io_tlb_start) {
-		memblock_free_early(io_tlb_start,
-				    PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
-		io_tlb_start = 0;
+	if (mem->start) {
+		memblock_free_early(mem->start,
+				    PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT));
+		mem->start = 0;
 	}
 	pr_warn("Cannot allocate buffer");
 	no_iotlb_memory = true;
@@ -317,22 +285,23 @@ swiotlb_init(int verbose)
 int
 swiotlb_late_init_with_default_size(size_t default_size)
 {
-	unsigned long bytes, req_nslabs = io_tlb_nslabs;
+	struct io_tlb_mem *mem = &io_tlb_default_mem;
+	unsigned long bytes, req_nslabs = mem->nslabs;
 	unsigned char *vstart = NULL;
 	unsigned int order;
 	int rc = 0;
 
-	if (!io_tlb_nslabs) {
-		io_tlb_nslabs = (default_size >> IO_TLB_SHIFT);
-		io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
+	if (!mem->nslabs) {
+		mem->nslabs = (default_size >> IO_TLB_SHIFT);
+		mem->nslabs = ALIGN(mem->nslabs, IO_TLB_SEGSIZE);
 	}
 
 	/*
 	 * Get IO TLB memory from the low pages
 	 */
-	order = get_order(io_tlb_nslabs << IO_TLB_SHIFT);
-	io_tlb_nslabs = SLABS_PER_PAGE << order;
-	bytes = io_tlb_nslabs << IO_TLB_SHIFT;
+	order = get_order(mem->nslabs << IO_TLB_SHIFT);
+	mem->nslabs = SLABS_PER_PAGE << order;
+	bytes = mem->nslabs << IO_TLB_SHIFT;
 
 	while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) {
 		vstart = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN,
@@ -343,15 +312,15 @@ swiotlb_late_init_with_default_size(size_t default_size)
 	}
 
 	if (!vstart) {
-		io_tlb_nslabs = req_nslabs;
+		mem->nslabs = req_nslabs;
 		return -ENOMEM;
 	}
 	if (order != get_order(bytes)) {
 		pr_warn("only able to allocate %ld MB\n",
 			(PAGE_SIZE << order) >> 20);
-		io_tlb_nslabs = SLABS_PER_PAGE << order;
+		mem->nslabs = SLABS_PER_PAGE << order;
 	}
-	rc = swiotlb_late_init_with_tbl(vstart, io_tlb_nslabs);
+	rc = swiotlb_late_init_with_tbl(vstart, mem->nslabs);
 	if (rc)
 		free_pages((unsigned long)vstart, order);
 
@@ -360,26 +329,32 @@ swiotlb_late_init_with_default_size(size_t default_size)
 
 static void swiotlb_cleanup(void)
 {
-	io_tlb_end = 0;
-	io_tlb_start = 0;
-	io_tlb_nslabs = 0;
+	struct io_tlb_mem *mem = &io_tlb_default_mem;
+
+	mem->end = 0;
+	mem->start = 0;
+	mem->nslabs = 0;
 	max_segment = 0;
 }
 
 int
 swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
 {
+	struct io_tlb_mem *mem = &io_tlb_default_mem;
 	unsigned long i, bytes;
 
 	/* protect against double initialization */
-	if (WARN_ON_ONCE(io_tlb_start))
+	if (WARN_ON_ONCE(mem->start))
 		return -ENOMEM;
 
 	bytes = nslabs << IO_TLB_SHIFT;
 
-	io_tlb_nslabs = nslabs;
-	io_tlb_start = virt_to_phys(tlb);
-	io_tlb_end = io_tlb_start + bytes;
+	mem->nslabs = nslabs;
+	mem->start = virt_to_phys(tlb);
+	mem->end = mem->start + bytes;
+	mem->index = 0;
+	mem->late_alloc = 1;
+	spin_lock_init(&mem->lock);
 
 	set_memory_decrypted((unsigned long)tlb, bytes >> PAGE_SHIFT);
 	memset(tlb, 0, bytes);
@@ -387,52 +362,45 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
 	/*
 	 * Allocate and initialize the free list array.  This array is used
 	 * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
-	 * between io_tlb_start and io_tlb_end.
+	 * between mem->start and mem->end.
 	 */
-	io_tlb_list = (unsigned int *)__get_free_pages(GFP_KERNEL,
-				      get_order(io_tlb_nslabs * sizeof(int)));
-	if (!io_tlb_list)
+	mem->list = (unsigned int *)__get_free_pages(GFP_KERNEL,
+	                              get_order(mem->nslabs * sizeof(int)));
+	if (!mem->list)
 		goto cleanup3;
 
-	io_tlb_orig_addr = (phys_addr_t *)
+	mem->orig_addr = (phys_addr_t *)
 		__get_free_pages(GFP_KERNEL,
-				 get_order(io_tlb_nslabs *
+				 get_order(mem->nslabs *
 					   sizeof(phys_addr_t)));
-	if (!io_tlb_orig_addr)
+	if (!mem->orig_addr)
 		goto cleanup4;
 
-	io_tlb_alloc_size = (size_t *)
+	mem->alloc_size = (size_t *)
 		__get_free_pages(GFP_KERNEL,
-				 get_order(io_tlb_nslabs *
+				 get_order(mem->nslabs *
 					   sizeof(size_t)));
-	if (!io_tlb_alloc_size)
+	if (!mem->alloc_size)
 		goto cleanup5;
 
-
-	for (i = 0; i < io_tlb_nslabs; i++) {
-		io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i);
-		io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;
-		io_tlb_alloc_size[i] = 0;
+	for (i = 0; i < mem->nslabs; i++) {
+		mem->list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i);
+		mem->orig_addr[i] = INVALID_PHYS_ADDR;
+		mem->alloc_size[i] = 0;
 	}
-	io_tlb_index = 0;
 	no_iotlb_memory = false;
 
 	swiotlb_print_info();
-
-	late_alloc = 1;
-
-	swiotlb_set_max_segment(io_tlb_nslabs << IO_TLB_SHIFT);
-
+	swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT);
 	return 0;
 
 cleanup5:
-	free_pages((unsigned long)io_tlb_orig_addr, get_order(io_tlb_nslabs *
-							      sizeof(phys_addr_t)));
-
+	free_pages((unsigned long)mem->orig_addr,
+		   get_order(mem->nslabs * sizeof(phys_addr_t)));
 cleanup4:
-	free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs *
-	                                                 sizeof(int)));
-	io_tlb_list = NULL;
+	free_pages((unsigned long)mem->list,
+		   get_order(mem->nslabs * sizeof(int)));
+	mem->list = NULL;
 cleanup3:
 	swiotlb_cleanup();
 	return -ENOMEM;
@@ -440,27 +408,29 @@ cleanup3:
 
 void __init swiotlb_exit(void)
 {
-	if (!io_tlb_orig_addr)
+	struct io_tlb_mem *mem = &io_tlb_default_mem;
+
+	if (!mem->orig_addr)
 		return;
 
-	if (late_alloc) {
-		free_pages((unsigned long)io_tlb_alloc_size,
-			   get_order(io_tlb_nslabs * sizeof(size_t)));
-		free_pages((unsigned long)io_tlb_orig_addr,
-			   get_order(io_tlb_nslabs * sizeof(phys_addr_t)));
-		free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs *
-								 sizeof(int)));
-		free_pages((unsigned long)phys_to_virt(io_tlb_start),
-			   get_order(io_tlb_nslabs << IO_TLB_SHIFT));
+	if (mem->late_alloc) {
+		free_pages((unsigned long)mem->alloc_size,
+			   get_order(mem->nslabs * sizeof(size_t)));
+		free_pages((unsigned long)mem->orig_addr,
+			   get_order(mem->nslabs * sizeof(phys_addr_t)));
+		free_pages((unsigned long)mem->list,
+			   get_order(mem->nslabs * sizeof(int)));
+		free_pages((unsigned long)phys_to_virt(mem->start),
+			   get_order(mem->nslabs << IO_TLB_SHIFT));
 	} else {
-		memblock_free_late(__pa(io_tlb_orig_addr),
-				   PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)));
-		memblock_free_late(__pa(io_tlb_alloc_size),
-				   PAGE_ALIGN(io_tlb_nslabs * sizeof(size_t)));
-		memblock_free_late(__pa(io_tlb_list),
-				   PAGE_ALIGN(io_tlb_nslabs * sizeof(int)));
-		memblock_free_late(io_tlb_start,
-				   PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
+		memblock_free_late(__pa(mem->alloc_size),
+				   PAGE_ALIGN(mem->nslabs * sizeof(size_t)));
+		memblock_free_late(__pa(mem->orig_addr),
+				   PAGE_ALIGN(mem->nslabs * sizeof(phys_addr_t)));
+		memblock_free_late(__pa(mem->list),
+				   PAGE_ALIGN(mem->nslabs * sizeof(int)));
+		memblock_free_late(mem->start,
+				   PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT));
 	}
 	swiotlb_cleanup();
 }
@@ -471,9 +441,10 @@ void __init swiotlb_exit(void)
 static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size,
 			   enum dma_data_direction dir)
 {
-	int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT;
-	size_t alloc_size = io_tlb_alloc_size[index];
-	phys_addr_t orig_addr = io_tlb_orig_addr[index];
+	struct io_tlb_mem *mem = &io_tlb_default_mem;
+	int index = (tlb_addr - mem->start) >> IO_TLB_SHIFT;
+	phys_addr_t orig_addr = mem->orig_addr[index];
+	size_t alloc_size = mem->alloc_size[index];
 	unsigned long pfn = PFN_DOWN(orig_addr);
 	unsigned char *vaddr = phys_to_virt(tlb_addr);
 
@@ -538,9 +509,9 @@ static inline unsigned long get_max_slots(unsigned long boundary_mask)
 	return nr_slots(boundary_mask + 1);
 }
 
-static unsigned int wrap_index(unsigned int index)
+static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index)
 {
-	if (index >= io_tlb_nslabs)
+	if (index >= mem->nslabs)
 		return 0;
 	return index;
 }
@@ -552,9 +523,10 @@ static unsigned int wrap_index(unsigned int index)
 static int find_slots(struct device *dev, phys_addr_t orig_addr,
 		size_t alloc_size)
 {
+	struct io_tlb_mem *mem = &io_tlb_default_mem;
 	unsigned long boundary_mask = dma_get_seg_boundary(dev);
 	dma_addr_t tbl_dma_addr =
-		phys_to_dma_unencrypted(dev, io_tlb_start) & boundary_mask;
+		phys_to_dma_unencrypted(dev, mem->start) & boundary_mask;
 	unsigned long max_slots = get_max_slots(boundary_mask);
 	unsigned int iotlb_align_mask =
 		dma_get_min_align_mask(dev) & ~(IO_TLB_SIZE - 1);
@@ -573,15 +545,15 @@ static int find_slots(struct device *dev, phys_addr_t orig_addr,
 	if (alloc_size >= PAGE_SIZE)
 		stride = max(stride, stride << (PAGE_SHIFT - IO_TLB_SHIFT));
 
-	spin_lock_irqsave(&io_tlb_lock, flags);
-	if (unlikely(nslots > io_tlb_nslabs - io_tlb_used))
+	spin_lock_irqsave(&mem->lock, flags);
+	if (unlikely(nslots > mem->nslabs - mem->used))
 		goto not_found;
 
-	index = wrap = wrap_index(ALIGN(io_tlb_index, stride));
+	index = wrap = wrap_index(mem, ALIGN(mem->index, stride));
 	do {
 		if ((slot_addr(tbl_dma_addr, index) & iotlb_align_mask) !=
 		    (orig_addr & iotlb_align_mask)) {
-			index = wrap_index(index + 1);
+			index = wrap_index(mem, index + 1);
 			continue;
 		}
 
@@ -593,34 +565,34 @@ static int find_slots(struct device *dev, phys_addr_t orig_addr,
 		if (!iommu_is_span_boundary(index, nslots,
 					    nr_slots(tbl_dma_addr),
 					    max_slots)) {
-			if (io_tlb_list[index] >= nslots)
+			if (mem->list[index] >= nslots)
 				goto found;
 		}
-		index = wrap_index(index + stride);
+		index = wrap_index(mem, index + stride);
 	} while (index != wrap);
 
 not_found:
-	spin_unlock_irqrestore(&io_tlb_lock, flags);
+	spin_unlock_irqrestore(&mem->lock, flags);
 	return -1;
 
 found:
 	for (i = index; i < index + nslots; i++)
-		io_tlb_list[i] = 0;
+		mem->list[i] = 0;
 	for (i = index - 1;
 	     io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 &&
-	     io_tlb_list[i]; i--)
-		io_tlb_list[i] = ++count;
+	     mem->list[i]; i--)
+		mem->list[i] = ++count;
 
 	/*
 	 * Update the indices to avoid searching in the next round.
 	 */
-	if (index + nslots < io_tlb_nslabs)
-		io_tlb_index = index + nslots;
+	if (index + nslots < mem->nslabs)
+		mem->index = index + nslots;
 	else
-		io_tlb_index = 0;
-	io_tlb_used += nslots;
+		mem->index = 0;
+	mem->used += nslots;
 
-	spin_unlock_irqrestore(&io_tlb_lock, flags);
+	spin_unlock_irqrestore(&mem->lock, flags);
 	return index;
 }
 
@@ -628,6 +600,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
 		size_t mapping_size, size_t alloc_size,
 		enum dma_data_direction dir, unsigned long attrs)
 {
+	struct io_tlb_mem *mem = &io_tlb_default_mem;
 	unsigned int offset = swiotlb_align_offset(dev, orig_addr);
 	unsigned int index, i;
 	phys_addr_t tlb_addr;
@@ -649,7 +622,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
 		if (!(attrs & DMA_ATTR_NO_WARN))
 			dev_warn_ratelimited(dev,
 	"swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n",
-				 alloc_size, io_tlb_nslabs, io_tlb_used);
+				 alloc_size, mem->nslabs, mem->used);
 		return (phys_addr_t)DMA_MAPPING_ERROR;
 	}
 
@@ -659,10 +632,10 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
 	 * needed.
 	 */
 	for (i = 0; i < nr_slots(alloc_size + offset); i++) {
-		io_tlb_orig_addr[index + i] = slot_addr(orig_addr, i);
-		io_tlb_alloc_size[index+i] = alloc_size - (i << IO_TLB_SHIFT);
+		mem->orig_addr[index + i] = slot_addr(orig_addr, i);
+		mem->alloc_size[index + i] = alloc_size - (i << IO_TLB_SHIFT);
 	}
-	tlb_addr = slot_addr(io_tlb_start, index) + offset;
+	tlb_addr = slot_addr(mem->start, index) + offset;
 	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
 	    (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
 		swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE);
@@ -676,10 +649,11 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
 			      size_t mapping_size, enum dma_data_direction dir,
 			      unsigned long attrs)
 {
+	struct io_tlb_mem *mem = &io_tlb_default_mem;
 	unsigned long flags;
 	unsigned int offset = swiotlb_align_offset(hwdev, tlb_addr);
-	int index = (tlb_addr - offset - io_tlb_start) >> IO_TLB_SHIFT;
-	int nslots = nr_slots(io_tlb_alloc_size[index] + offset);
+	int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT;
+	int nslots = nr_slots(mem->alloc_size[index] + offset);
 	int count, i;
 
 	/*
@@ -695,9 +669,9 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
 	 * While returning the entries to the free list, we merge the entries
 	 * with slots below and above the pool being returned.
 	 */
-	spin_lock_irqsave(&io_tlb_lock, flags);
+	spin_lock_irqsave(&mem->lock, flags);
 	if (index + nslots < ALIGN(index + 1, IO_TLB_SEGSIZE))
-		count = io_tlb_list[index + nslots];
+		count = mem->list[index + nslots];
 	else
 		count = 0;
 
@@ -706,9 +680,9 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
 	 * superceeding slots
 	 */
 	for (i = index + nslots - 1; i >= index; i--) {
-		io_tlb_list[i] = ++count;
-		io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;
-		io_tlb_alloc_size[i] = 0;
+		mem->list[i] = ++count;
+		mem->orig_addr[i] = INVALID_PHYS_ADDR;
+		mem->alloc_size[i] = 0;
 	}
 
 	/*
@@ -716,11 +690,11 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
 	 * available (non zero)
 	 */
 	for (i = index - 1;
-	     io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && io_tlb_list[i];
+	     io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && mem->list[i];
 	     i--)
-		io_tlb_list[i] = ++count;
-	io_tlb_used -= nslots;
-	spin_unlock_irqrestore(&io_tlb_lock, flags);
+		mem->list[i] = ++count;
+	mem->used -= nslots;
+	spin_unlock_irqrestore(&mem->lock, flags);
 }
 
 void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr,
@@ -783,21 +757,21 @@ size_t swiotlb_max_mapping_size(struct device *dev)
 bool is_swiotlb_active(void)
 {
 	/*
-	 * When SWIOTLB is initialized, even if io_tlb_start points to physical
-	 * address zero, io_tlb_end surely doesn't.
+	 * When SWIOTLB is initialized, even if mem->start points to physical
+	 * address zero, mem->end surely doesn't.
 	 */
-	return io_tlb_end != 0;
+	return io_tlb_default_mem.end != 0;
 }
 
 #ifdef CONFIG_DEBUG_FS
 
 static int __init swiotlb_create_debugfs(void)
 {
-	struct dentry *root;
+	struct io_tlb_mem *mem = &io_tlb_default_mem;
 
-	root = debugfs_create_dir("swiotlb", NULL);
-	debugfs_create_ulong("io_tlb_nslabs", 0400, root, &io_tlb_nslabs);
-	debugfs_create_ulong("io_tlb_used", 0400, root, &io_tlb_used);
+	mem->debugfs = debugfs_create_dir("swiotlb", NULL);
+	debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs);
+	debugfs_create_ulong("io_tlb_used", 0400, mem->debugfs, &mem->used);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 2d29960af0bee8cc6731b9bd3964850c9e7a6840 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 18 Mar 2021 17:14:23 +0100
Subject: swiotlb: dynamically allocate io_tlb_default_mem

Instead of allocating ->list and ->orig_addr separately just do one
dynamic allocation for the actual io_tlb_mem structure.  This simplifies
a lot of the initialization code, and also allows to just check
io_tlb_default_mem to see if swiotlb is in use.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/xen/swiotlb-xen.c |  22 ++--
 include/linux/swiotlb.h   |  18 +--
 kernel/dma/swiotlb.c      | 306 +++++++++++++++-------------------------------
 3 files changed, 117 insertions(+), 229 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index 5329ad54a5f3..4c89afc0df62 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -158,17 +158,14 @@ static const char *xen_swiotlb_error(enum xen_swiotlb_err err)
 int __ref xen_swiotlb_init(void)
 {
 	enum xen_swiotlb_err m_ret = XEN_SWIOTLB_UNKNOWN;
-	unsigned long nslabs, bytes, order;
-	unsigned int repeat = 3;
+	unsigned long bytes = swiotlb_size_or_default();
+	unsigned long nslabs = bytes >> IO_TLB_SHIFT;
+	unsigned int order, repeat = 3;
 	int rc = -ENOMEM;
 	char *start;
 
-	nslabs = swiotlb_nr_tbl();
-	if (!nslabs)
-		nslabs = DEFAULT_NSLABS;
 retry:
 	m_ret = XEN_SWIOTLB_ENOMEM;
-	bytes = nslabs << IO_TLB_SHIFT;
 	order = get_order(bytes);
 
 	/*
@@ -221,19 +218,16 @@ error:
 #ifdef CONFIG_X86
 void __init xen_swiotlb_init_early(void)
 {
-	unsigned long nslabs, bytes;
+	unsigned long bytes = swiotlb_size_or_default();
+	unsigned long nslabs = bytes >> IO_TLB_SHIFT;
 	unsigned int repeat = 3;
 	char *start;
 	int rc;
 
-	nslabs = swiotlb_nr_tbl();
-	if (!nslabs)
-		nslabs = DEFAULT_NSLABS;
 retry:
 	/*
 	 * Get IO TLB memory from any location.
 	 */
-	bytes = nslabs << IO_TLB_SHIFT;
 	start = memblock_alloc(PAGE_ALIGN(bytes), PAGE_SIZE);
 	if (!start)
 		panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
@@ -248,8 +242,8 @@ retry:
 		if (repeat--) {
 			/* Min is 2MB */
 			nslabs = max(1024UL, (nslabs >> 1));
-			pr_info("Lowering to %luMB\n",
-				(nslabs << IO_TLB_SHIFT) >> 20);
+			bytes = nslabs << IO_TLB_SHIFT;
+			pr_info("Lowering to %luMB\n", bytes >> 20);
 			goto retry;
 		}
 		panic("%s (rc:%d)", xen_swiotlb_error(XEN_SWIOTLB_EFIXUP), rc);
@@ -548,7 +542,7 @@ xen_swiotlb_sync_sg_for_device(struct device *dev, struct scatterlist *sgl,
 static int
 xen_swiotlb_dma_supported(struct device *hwdev, u64 mask)
 {
-	return xen_phys_to_dma(hwdev, io_tlb_default_mem.end - 1) <= mask;
+	return xen_phys_to_dma(hwdev, io_tlb_default_mem->end - 1) <= mask;
 }
 
 const struct dma_map_ops xen_swiotlb_dma_ops = {
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 5ec5378b17c3..63f7a63f61d0 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -90,28 +90,30 @@ struct io_tlb_mem {
 	phys_addr_t end;
 	unsigned long nslabs;
 	unsigned long used;
-	unsigned int *list;
 	unsigned int index;
-	phys_addr_t *orig_addr;
-	size_t *alloc_size;
 	spinlock_t lock;
 	struct dentry *debugfs;
 	bool late_alloc;
+	struct io_tlb_slot {
+		phys_addr_t orig_addr;
+		size_t alloc_size;
+		unsigned int list;
+	} slots[];
 };
-extern struct io_tlb_mem io_tlb_default_mem;
+extern struct io_tlb_mem *io_tlb_default_mem;
 
 static inline bool is_swiotlb_buffer(phys_addr_t paddr)
 {
-	struct io_tlb_mem *mem = &io_tlb_default_mem;
+	struct io_tlb_mem *mem = io_tlb_default_mem;
 
-	return paddr >= mem->start && paddr < mem->end;
+	return mem && paddr >= mem->start && paddr < mem->end;
 }
 
 void __init swiotlb_exit(void);
 unsigned int swiotlb_max_segment(void);
 size_t swiotlb_max_mapping_size(struct device *dev);
 bool is_swiotlb_active(void);
-void __init swiotlb_adjust_size(unsigned long new_size);
+void __init swiotlb_adjust_size(unsigned long size);
 #else
 #define swiotlb_force SWIOTLB_NO_FORCE
 static inline bool is_swiotlb_buffer(phys_addr_t paddr)
@@ -135,7 +137,7 @@ static inline bool is_swiotlb_active(void)
 	return false;
 }
 
-static inline void swiotlb_adjust_size(unsigned long new_size)
+static inline void swiotlb_adjust_size(unsigned long size)
 {
 }
 #endif /* CONFIG_SWIOTLB */
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index d9c097f0f78c..13de669a9b46 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -63,7 +63,7 @@
 
 enum swiotlb_force swiotlb_force;
 
-struct io_tlb_mem io_tlb_default_mem;
+struct io_tlb_mem *io_tlb_default_mem;
 
 /*
  * Max segment that we can provide which (if pages are contingous) will
@@ -71,15 +71,15 @@ struct io_tlb_mem io_tlb_default_mem;
  */
 static unsigned int max_segment;
 
+static unsigned long default_nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT;
+
 static int __init
 setup_io_tlb_npages(char *str)
 {
-	struct io_tlb_mem *mem = &io_tlb_default_mem;
-
 	if (isdigit(*str)) {
-		mem->nslabs = simple_strtoul(str, &str, 0);
 		/* avoid tail segment of size < IO_TLB_SEGSIZE */
-		mem->nslabs = ALIGN(mem->nslabs, IO_TLB_SEGSIZE);
+		default_nslabs =
+			ALIGN(simple_strtoul(str, &str, 0), IO_TLB_SEGSIZE);
 	}
 	if (*str == ',')
 		++str;
@@ -87,24 +87,22 @@ setup_io_tlb_npages(char *str)
 		swiotlb_force = SWIOTLB_FORCE;
 	} else if (!strcmp(str, "noforce")) {
 		swiotlb_force = SWIOTLB_NO_FORCE;
-		mem->nslabs = 1;
+		default_nslabs = 1;
 	}
 
 	return 0;
 }
 early_param("swiotlb", setup_io_tlb_npages);
 
-static bool no_iotlb_memory;
-
 unsigned long swiotlb_nr_tbl(void)
 {
-	return unlikely(no_iotlb_memory) ? 0 : io_tlb_default_mem.nslabs;
+	return io_tlb_default_mem ? io_tlb_default_mem->nslabs : 0;
 }
 EXPORT_SYMBOL_GPL(swiotlb_nr_tbl);
 
 unsigned int swiotlb_max_segment(void)
 {
-	return unlikely(no_iotlb_memory) ? 0 : max_segment;
+	return io_tlb_default_mem ? max_segment : 0;
 }
 EXPORT_SYMBOL_GPL(swiotlb_max_segment);
 
@@ -118,44 +116,32 @@ void swiotlb_set_max_segment(unsigned int val)
 
 unsigned long swiotlb_size_or_default(void)
 {
-	unsigned long size;
-
-	size = io_tlb_default_mem.nslabs << IO_TLB_SHIFT;
-
-	return size ? size : (IO_TLB_DEFAULT_SIZE);
+	return default_nslabs << IO_TLB_SHIFT;
 }
 
-void __init swiotlb_adjust_size(unsigned long new_size)
+void __init swiotlb_adjust_size(unsigned long size)
 {
-	struct io_tlb_mem *mem = &io_tlb_default_mem;
-	unsigned long size;
-
 	/*
 	 * If swiotlb parameter has not been specified, give a chance to
 	 * architectures such as those supporting memory encryption to
 	 * adjust/expand SWIOTLB size for their use.
 	 */
-	if (!mem->nslabs) {
-		size = ALIGN(new_size, IO_TLB_SIZE);
-		mem->nslabs = size >> IO_TLB_SHIFT;
-		mem->nslabs = ALIGN(mem->nslabs, IO_TLB_SEGSIZE);
-
-		pr_info("SWIOTLB bounce buffer size adjusted to %luMB", size >> 20);
-	}
+	size = ALIGN(size, IO_TLB_SIZE);
+	default_nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
+	pr_info("SWIOTLB bounce buffer size adjusted to %luMB", size >> 20);
 }
 
 void swiotlb_print_info(void)
 {
-	struct io_tlb_mem *mem = &io_tlb_default_mem;
-	unsigned long bytes = mem->nslabs << IO_TLB_SHIFT;
+	struct io_tlb_mem *mem = io_tlb_default_mem;
 
-	if (no_iotlb_memory) {
+	if (!mem) {
 		pr_warn("No low mem\n");
 		return;
 	}
 
 	pr_info("mapped [mem %pa-%pa] (%luMB)\n", &mem->start, &mem->end,
-	       bytes >> 20);
+	       (mem->nslabs << IO_TLB_SHIFT) >> 20);
 }
 
 static inline unsigned long io_tlb_offset(unsigned long val)
@@ -176,13 +162,12 @@ static inline unsigned long nr_slots(u64 val)
  */
 void __init swiotlb_update_mem_attributes(void)
 {
-	struct io_tlb_mem *mem = &io_tlb_default_mem;
+	struct io_tlb_mem *mem = io_tlb_default_mem;
 	void *vaddr;
 	unsigned long bytes;
 
-	if (no_iotlb_memory || mem->late_alloc)
+	if (!mem || mem->late_alloc)
 		return;
-
 	vaddr = phys_to_virt(mem->start);
 	bytes = PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT);
 	set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT);
@@ -191,55 +176,33 @@ void __init swiotlb_update_mem_attributes(void)
 
 int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
 {
-	struct io_tlb_mem *mem = &io_tlb_default_mem;
-	unsigned long i, bytes;
+	unsigned long bytes = nslabs << IO_TLB_SHIFT, i;
+	struct io_tlb_mem *mem;
 	size_t alloc_size;
 
 	/* protect against double initialization */
-	if (WARN_ON_ONCE(mem->start))
+	if (WARN_ON_ONCE(io_tlb_default_mem))
 		return -ENOMEM;
 
-	bytes = nslabs << IO_TLB_SHIFT;
-
+	alloc_size = PAGE_ALIGN(struct_size(mem, slots, nslabs));
+	mem = memblock_alloc(alloc_size, PAGE_SIZE);
+	if (!mem)
+		panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
+		      __func__, alloc_size, PAGE_SIZE);
 	mem->nslabs = nslabs;
 	mem->start = __pa(tlb);
 	mem->end = mem->start + bytes;
 	mem->index = 0;
 	spin_lock_init(&mem->lock);
-
-	/*
-	 * Allocate and initialize the free list array.  This array is used
-	 * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
-	 * between mem->start and mem->end.
-	 */
-	alloc_size = PAGE_ALIGN(mem->nslabs * sizeof(int));
-	mem->list = memblock_alloc(alloc_size, PAGE_SIZE);
-	if (!mem->list)
-		panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
-		      __func__, alloc_size, PAGE_SIZE);
-
-	alloc_size = PAGE_ALIGN(mem->nslabs * sizeof(phys_addr_t));
-	mem->orig_addr = memblock_alloc(alloc_size, PAGE_SIZE);
-	if (!mem->orig_addr)
-		panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
-		      __func__, alloc_size, PAGE_SIZE);
-
-	alloc_size = PAGE_ALIGN(mem->nslabs * sizeof(size_t));
-	mem->alloc_size = memblock_alloc(alloc_size, PAGE_SIZE);
-	if (mem->alloc_size)
-		panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
-		      __func__, alloc_size, PAGE_SIZE);
-
 	for (i = 0; i < mem->nslabs; i++) {
-		mem->list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i);
-		mem->orig_addr[i] = INVALID_PHYS_ADDR;
-		mem->alloc_size[i] = 0;
+		mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i);
+		mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
+		mem->slots[i].alloc_size = 0;
 	}
-	no_iotlb_memory = false;
 
+	io_tlb_default_mem = mem;
 	if (verbose)
 		swiotlb_print_info();
-
 	swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT);
 	return 0;
 }
@@ -251,30 +214,21 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
 void  __init
 swiotlb_init(int verbose)
 {
-	struct io_tlb_mem *mem = &io_tlb_default_mem;
-	size_t default_size = IO_TLB_DEFAULT_SIZE;
-	unsigned char *vstart;
-	unsigned long bytes;
-
-	if (!mem->nslabs) {
-		mem->nslabs = (default_size >> IO_TLB_SHIFT);
-		mem->nslabs = ALIGN(mem->nslabs, IO_TLB_SEGSIZE);
-	}
-
-	bytes = mem->nslabs << IO_TLB_SHIFT;
+	size_t bytes = PAGE_ALIGN(default_nslabs << IO_TLB_SHIFT);
+	void *tlb;
 
 	/* Get IO TLB memory from the low pages */
-	vstart = memblock_alloc_low(PAGE_ALIGN(bytes), PAGE_SIZE);
-	if (vstart && !swiotlb_init_with_tbl(vstart, mem->nslabs, verbose))
-		return;
-
-	if (mem->start) {
-		memblock_free_early(mem->start,
-				    PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT));
-		mem->start = 0;
-	}
+	tlb = memblock_alloc_low(bytes, PAGE_SIZE);
+	if (!tlb)
+		goto fail;
+	if (swiotlb_init_with_tbl(tlb, default_nslabs, verbose))
+		goto fail_free_mem;
+	return;
+
+fail_free_mem:
+	memblock_free_early(__pa(tlb), bytes);
+fail:
 	pr_warn("Cannot allocate buffer");
-	no_iotlb_memory = true;
 }
 
 /*
@@ -285,23 +239,19 @@ swiotlb_init(int verbose)
 int
 swiotlb_late_init_with_default_size(size_t default_size)
 {
-	struct io_tlb_mem *mem = &io_tlb_default_mem;
-	unsigned long bytes, req_nslabs = mem->nslabs;
+	unsigned long nslabs =
+		ALIGN(default_size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
+	unsigned long bytes;
 	unsigned char *vstart = NULL;
 	unsigned int order;
 	int rc = 0;
 
-	if (!mem->nslabs) {
-		mem->nslabs = (default_size >> IO_TLB_SHIFT);
-		mem->nslabs = ALIGN(mem->nslabs, IO_TLB_SEGSIZE);
-	}
-
 	/*
 	 * Get IO TLB memory from the low pages
 	 */
-	order = get_order(mem->nslabs << IO_TLB_SHIFT);
-	mem->nslabs = SLABS_PER_PAGE << order;
-	bytes = mem->nslabs << IO_TLB_SHIFT;
+	order = get_order(nslabs << IO_TLB_SHIFT);
+	nslabs = SLABS_PER_PAGE << order;
+	bytes = nslabs << IO_TLB_SHIFT;
 
 	while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) {
 		vstart = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN,
@@ -311,43 +261,35 @@ swiotlb_late_init_with_default_size(size_t default_size)
 		order--;
 	}
 
-	if (!vstart) {
-		mem->nslabs = req_nslabs;
+	if (!vstart)
 		return -ENOMEM;
-	}
+
 	if (order != get_order(bytes)) {
 		pr_warn("only able to allocate %ld MB\n",
 			(PAGE_SIZE << order) >> 20);
-		mem->nslabs = SLABS_PER_PAGE << order;
+		nslabs = SLABS_PER_PAGE << order;
 	}
-	rc = swiotlb_late_init_with_tbl(vstart, mem->nslabs);
+	rc = swiotlb_late_init_with_tbl(vstart, nslabs);
 	if (rc)
 		free_pages((unsigned long)vstart, order);
 
 	return rc;
 }
 
-static void swiotlb_cleanup(void)
-{
-	struct io_tlb_mem *mem = &io_tlb_default_mem;
-
-	mem->end = 0;
-	mem->start = 0;
-	mem->nslabs = 0;
-	max_segment = 0;
-}
-
 int
 swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
 {
-	struct io_tlb_mem *mem = &io_tlb_default_mem;
-	unsigned long i, bytes;
+	unsigned long bytes = nslabs << IO_TLB_SHIFT, i;
+	struct io_tlb_mem *mem;
 
 	/* protect against double initialization */
-	if (WARN_ON_ONCE(mem->start))
+	if (WARN_ON_ONCE(io_tlb_default_mem))
 		return -ENOMEM;
 
-	bytes = nslabs << IO_TLB_SHIFT;
+	mem = (void *)__get_free_pages(GFP_KERNEL,
+		get_order(struct_size(mem, slots, nslabs)));
+	if (!mem)
+		return -ENOMEM;
 
 	mem->nslabs = nslabs;
 	mem->start = virt_to_phys(tlb);
@@ -355,84 +297,35 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
 	mem->index = 0;
 	mem->late_alloc = 1;
 	spin_lock_init(&mem->lock);
+	for (i = 0; i < mem->nslabs; i++) {
+		mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i);
+		mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
+		mem->slots[i].alloc_size = 0;
+	}
 
 	set_memory_decrypted((unsigned long)tlb, bytes >> PAGE_SHIFT);
 	memset(tlb, 0, bytes);
 
-	/*
-	 * Allocate and initialize the free list array.  This array is used
-	 * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
-	 * between mem->start and mem->end.
-	 */
-	mem->list = (unsigned int *)__get_free_pages(GFP_KERNEL,
-	                              get_order(mem->nslabs * sizeof(int)));
-	if (!mem->list)
-		goto cleanup3;
-
-	mem->orig_addr = (phys_addr_t *)
-		__get_free_pages(GFP_KERNEL,
-				 get_order(mem->nslabs *
-					   sizeof(phys_addr_t)));
-	if (!mem->orig_addr)
-		goto cleanup4;
-
-	mem->alloc_size = (size_t *)
-		__get_free_pages(GFP_KERNEL,
-				 get_order(mem->nslabs *
-					   sizeof(size_t)));
-	if (!mem->alloc_size)
-		goto cleanup5;
-
-	for (i = 0; i < mem->nslabs; i++) {
-		mem->list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i);
-		mem->orig_addr[i] = INVALID_PHYS_ADDR;
-		mem->alloc_size[i] = 0;
-	}
-	no_iotlb_memory = false;
-
+	io_tlb_default_mem = mem;
 	swiotlb_print_info();
 	swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT);
 	return 0;
-
-cleanup5:
-	free_pages((unsigned long)mem->orig_addr,
-		   get_order(mem->nslabs * sizeof(phys_addr_t)));
-cleanup4:
-	free_pages((unsigned long)mem->list,
-		   get_order(mem->nslabs * sizeof(int)));
-	mem->list = NULL;
-cleanup3:
-	swiotlb_cleanup();
-	return -ENOMEM;
 }
 
 void __init swiotlb_exit(void)
 {
-	struct io_tlb_mem *mem = &io_tlb_default_mem;
+	struct io_tlb_mem *mem = io_tlb_default_mem;
+	size_t size;
 
-	if (!mem->orig_addr)
+	if (!mem)
 		return;
 
-	if (mem->late_alloc) {
-		free_pages((unsigned long)mem->alloc_size,
-			   get_order(mem->nslabs * sizeof(size_t)));
-		free_pages((unsigned long)mem->orig_addr,
-			   get_order(mem->nslabs * sizeof(phys_addr_t)));
-		free_pages((unsigned long)mem->list,
-			   get_order(mem->nslabs * sizeof(int)));
-		free_pages((unsigned long)phys_to_virt(mem->start),
-			   get_order(mem->nslabs << IO_TLB_SHIFT));
-	} else {
-		memblock_free_late(__pa(mem->alloc_size),
-				   PAGE_ALIGN(mem->nslabs * sizeof(size_t)));
-		memblock_free_late(__pa(mem->orig_addr),
-				   PAGE_ALIGN(mem->nslabs * sizeof(phys_addr_t)));
-		memblock_free_late(__pa(mem->list),
-				   PAGE_ALIGN(mem->nslabs * sizeof(int)));
-		memblock_free_late(mem->start,
-				   PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT));
-	}
-	swiotlb_cleanup();
+	size = struct_size(mem, slots, mem->nslabs);
+	if (mem->late_alloc)
+		free_pages((unsigned long)mem, get_order(size));
+	else
+		memblock_free_late(__pa(mem), PAGE_ALIGN(size));
+	io_tlb_default_mem = NULL;
 }
 
 /*
@@ -441,10 +334,10 @@ void __init swiotlb_exit(void)
 static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size,
 			   enum dma_data_direction dir)
 {
-	struct io_tlb_mem *mem = &io_tlb_default_mem;
+	struct io_tlb_mem *mem = io_tlb_default_mem;
 	int index = (tlb_addr - mem->start) >> IO_TLB_SHIFT;
-	phys_addr_t orig_addr = mem->orig_addr[index];
-	size_t alloc_size = mem->alloc_size[index];
+	phys_addr_t orig_addr = mem->slots[index].orig_addr;
+	size_t alloc_size = mem->slots[index].alloc_size;
 	unsigned long pfn = PFN_DOWN(orig_addr);
 	unsigned char *vaddr = phys_to_virt(tlb_addr);
 
@@ -523,7 +416,7 @@ static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index)
 static int find_slots(struct device *dev, phys_addr_t orig_addr,
 		size_t alloc_size)
 {
-	struct io_tlb_mem *mem = &io_tlb_default_mem;
+	struct io_tlb_mem *mem = io_tlb_default_mem;
 	unsigned long boundary_mask = dma_get_seg_boundary(dev);
 	dma_addr_t tbl_dma_addr =
 		phys_to_dma_unencrypted(dev, mem->start) & boundary_mask;
@@ -565,7 +458,7 @@ static int find_slots(struct device *dev, phys_addr_t orig_addr,
 		if (!iommu_is_span_boundary(index, nslots,
 					    nr_slots(tbl_dma_addr),
 					    max_slots)) {
-			if (mem->list[index] >= nslots)
+			if (mem->slots[index].list >= nslots)
 				goto found;
 		}
 		index = wrap_index(mem, index + stride);
@@ -577,11 +470,11 @@ not_found:
 
 found:
 	for (i = index; i < index + nslots; i++)
-		mem->list[i] = 0;
+		mem->slots[i].list = 0;
 	for (i = index - 1;
 	     io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 &&
-	     mem->list[i]; i--)
-		mem->list[i] = ++count;
+	     mem->slots[i].list; i--)
+		mem->slots[i].list = ++count;
 
 	/*
 	 * Update the indices to avoid searching in the next round.
@@ -600,12 +493,12 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
 		size_t mapping_size, size_t alloc_size,
 		enum dma_data_direction dir, unsigned long attrs)
 {
-	struct io_tlb_mem *mem = &io_tlb_default_mem;
+	struct io_tlb_mem *mem = io_tlb_default_mem;
 	unsigned int offset = swiotlb_align_offset(dev, orig_addr);
 	unsigned int index, i;
 	phys_addr_t tlb_addr;
 
-	if (no_iotlb_memory)
+	if (!mem)
 		panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
 
 	if (mem_encrypt_active())
@@ -632,8 +525,9 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
 	 * needed.
 	 */
 	for (i = 0; i < nr_slots(alloc_size + offset); i++) {
-		mem->orig_addr[index + i] = slot_addr(orig_addr, i);
-		mem->alloc_size[index + i] = alloc_size - (i << IO_TLB_SHIFT);
+		mem->slots[index + i].orig_addr = slot_addr(orig_addr, i);
+		mem->slots[index + i].alloc_size =
+			alloc_size - (i << IO_TLB_SHIFT);
 	}
 	tlb_addr = slot_addr(mem->start, index) + offset;
 	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
@@ -649,11 +543,11 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
 			      size_t mapping_size, enum dma_data_direction dir,
 			      unsigned long attrs)
 {
-	struct io_tlb_mem *mem = &io_tlb_default_mem;
+	struct io_tlb_mem *mem = io_tlb_default_mem;
 	unsigned long flags;
 	unsigned int offset = swiotlb_align_offset(hwdev, tlb_addr);
 	int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT;
-	int nslots = nr_slots(mem->alloc_size[index] + offset);
+	int nslots = nr_slots(mem->slots[index].alloc_size + offset);
 	int count, i;
 
 	/*
@@ -671,7 +565,7 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
 	 */
 	spin_lock_irqsave(&mem->lock, flags);
 	if (index + nslots < ALIGN(index + 1, IO_TLB_SEGSIZE))
-		count = mem->list[index + nslots];
+		count = mem->slots[index + nslots].list;
 	else
 		count = 0;
 
@@ -680,9 +574,9 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
 	 * superceeding slots
 	 */
 	for (i = index + nslots - 1; i >= index; i--) {
-		mem->list[i] = ++count;
-		mem->orig_addr[i] = INVALID_PHYS_ADDR;
-		mem->alloc_size[i] = 0;
+		mem->slots[i].list = ++count;
+		mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
+		mem->slots[i].alloc_size = 0;
 	}
 
 	/*
@@ -690,9 +584,9 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
 	 * available (non zero)
 	 */
 	for (i = index - 1;
-	     io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && mem->list[i];
+	     io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && mem->slots[i].list;
 	     i--)
-		mem->list[i] = ++count;
+		mem->slots[i].list = ++count;
 	mem->used -= nslots;
 	spin_unlock_irqrestore(&mem->lock, flags);
 }
@@ -756,19 +650,17 @@ size_t swiotlb_max_mapping_size(struct device *dev)
 
 bool is_swiotlb_active(void)
 {
-	/*
-	 * When SWIOTLB is initialized, even if mem->start points to physical
-	 * address zero, mem->end surely doesn't.
-	 */
-	return io_tlb_default_mem.end != 0;
+	return io_tlb_default_mem != NULL;
 }
 
 #ifdef CONFIG_DEBUG_FS
 
 static int __init swiotlb_create_debugfs(void)
 {
-	struct io_tlb_mem *mem = &io_tlb_default_mem;
+	struct io_tlb_mem *mem = io_tlb_default_mem;
 
+	if (!mem)
+		return 0;
 	mem->debugfs = debugfs_create_dir("swiotlb", NULL);
 	debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs);
 	debugfs_create_ulong("io_tlb_used", 0400, mem->debugfs, &mem->used);
-- 
cgit v1.2.3


From 2cbc2776efe4faed0e17c48ae076aa03a0fcc61f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 18 Mar 2021 17:14:24 +0100
Subject: swiotlb: remove swiotlb_nr_tbl

All callers just use it to check if swiotlb is active at all, for which
they can just use is_swiotlb_active.  In the longer run drivers need
to stop using is_swiotlb_active as well, but let's do the simple step
first.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/gpu/drm/i915/gem/i915_gem_internal.c | 2 +-
 drivers/gpu/drm/nouveau/nouveau_ttm.c        | 2 +-
 drivers/pci/xen-pcifront.c                   | 2 +-
 include/linux/swiotlb.h                      | 1 -
 kernel/dma/swiotlb.c                         | 7 +------
 5 files changed, 4 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_internal.c b/drivers/gpu/drm/i915/gem/i915_gem_internal.c
index ad22f42541bd..a9d65fc8aa0e 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_internal.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_internal.c
@@ -42,7 +42,7 @@ static int i915_gem_object_get_pages_internal(struct drm_i915_gem_object *obj)
 
 	max_order = MAX_ORDER;
 #ifdef CONFIG_SWIOTLB
-	if (swiotlb_nr_tbl()) {
+	if (is_swiotlb_active()) {
 		unsigned int max_segment;
 
 		max_segment = swiotlb_max_segment();
diff --git a/drivers/gpu/drm/nouveau/nouveau_ttm.c b/drivers/gpu/drm/nouveau/nouveau_ttm.c
index a37bc3d7b38b..9662522aa066 100644
--- a/drivers/gpu/drm/nouveau/nouveau_ttm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_ttm.c
@@ -321,7 +321,7 @@ nouveau_ttm_init(struct nouveau_drm *drm)
 	}
 
 #if IS_ENABLED(CONFIG_SWIOTLB) && IS_ENABLED(CONFIG_X86)
-	need_swiotlb = !!swiotlb_nr_tbl();
+	need_swiotlb = is_swiotlb_active();
 #endif
 
 	ret = ttm_bo_device_init(&drm->ttm.bdev, &nouveau_bo_driver,
diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c
index 2d7502648219..b7a8f3a1921f 100644
--- a/drivers/pci/xen-pcifront.c
+++ b/drivers/pci/xen-pcifront.c
@@ -693,7 +693,7 @@ static int pcifront_connect_and_init_dma(struct pcifront_device *pdev)
 
 	spin_unlock(&pcifront_dev_lock);
 
-	if (!err && !swiotlb_nr_tbl()) {
+	if (!err && !is_swiotlb_active()) {
 		err = pci_xen_swiotlb_init_late();
 		if (err)
 			dev_err(&pdev->xdev->dev, "Could not setup SWIOTLB!\n");
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 63f7a63f61d0..216854a5e513 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -37,7 +37,6 @@ enum swiotlb_force {
 
 extern void swiotlb_init(int verbose);
 int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose);
-extern unsigned long swiotlb_nr_tbl(void);
 unsigned long swiotlb_size_or_default(void);
 extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs);
 extern int swiotlb_late_init_with_default_size(size_t default_size);
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 13de669a9b46..539c76beb52e 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -94,12 +94,6 @@ setup_io_tlb_npages(char *str)
 }
 early_param("swiotlb", setup_io_tlb_npages);
 
-unsigned long swiotlb_nr_tbl(void)
-{
-	return io_tlb_default_mem ? io_tlb_default_mem->nslabs : 0;
-}
-EXPORT_SYMBOL_GPL(swiotlb_nr_tbl);
-
 unsigned int swiotlb_max_segment(void)
 {
 	return io_tlb_default_mem ? max_segment : 0;
@@ -652,6 +646,7 @@ bool is_swiotlb_active(void)
 {
 	return io_tlb_default_mem != NULL;
 }
+EXPORT_SYMBOL_GPL(is_swiotlb_active);
 
 #ifdef CONFIG_DEBUG_FS
 
-- 
cgit v1.2.3


From fb98cc0b3af2ba4d87301dff2b381b12eee35d7d Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Wed, 10 Mar 2021 08:33:19 +0100
Subject: efi: use 32-bit alignment for efi_guid_t literals

Commit 494c704f9af0 ("efi: Use 32-bit alignment for efi_guid_t") updated
the type definition of efi_guid_t to ensure that it always appears
sufficiently aligned (the UEFI spec is ambiguous about this, but given
the fact that its EFI_GUID type is defined in terms of a struct carrying
a uint32_t, the natural alignment is definitely >= 32 bits).

However, we missed the EFI_GUID() macro which is used to instantiate
efi_guid_t literals: that macro is still based on the guid_t type,
which does not have a minimum alignment at all. This results in warnings
such as

  In file included from drivers/firmware/efi/mokvar-table.c:35:
  include/linux/efi.h:1093:34: warning: passing 1-byte aligned argument to
      4-byte aligned parameter 2 of 'get_var' may result in an unaligned pointer
      access [-Walign-mismatch]
          status = get_var(L"SecureBoot", &EFI_GLOBAL_VARIABLE_GUID, NULL, &size,
                                          ^
  include/linux/efi.h:1101:24: warning: passing 1-byte aligned argument to
      4-byte aligned parameter 2 of 'get_var' may result in an unaligned pointer
      access [-Walign-mismatch]
          get_var(L"SetupMode", &EFI_GLOBAL_VARIABLE_GUID, NULL, &size, &setupmode);

The distinction only matters on CPUs that do not support misaligned loads
fully, but 32-bit ARM's load-multiple instructions fall into that category,
and these are likely to be emitted by the compiler that built the firmware
for loading word-aligned 128-bit GUIDs from memory.

So re-implement the initializer in terms of our own efi_guid_t type, so that
the alignment becomes a property of the literal's type.

Fixes: 494c704f9af0 ("efi: Use 32-bit alignment for efi_guid_t")
Reported-by: Nathan Chancellor <nathan@kernel.org>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Link: https://github.com/ClangBuiltLinux/linux/issues/1327
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 include/linux/efi.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/efi.h b/include/linux/efi.h
index 8710f5710c1d..6b5d36babfcc 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -72,8 +72,10 @@ typedef void *efi_handle_t;
  */
 typedef guid_t efi_guid_t __aligned(__alignof__(u32));
 
-#define EFI_GUID(a,b,c,d0,d1,d2,d3,d4,d5,d6,d7) \
-	GUID_INIT(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7)
+#define EFI_GUID(a, b, c, d...) (efi_guid_t){ {					\
+	(a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff,	\
+	(b) & 0xff, ((b) >> 8) & 0xff,						\
+	(c) & 0xff, ((c) >> 8) & 0xff, d } }
 
 /*
  * Generic EFI table header
-- 
cgit v1.2.3


From 5261ced47f8e89173c3b015f6152a05f11a418c3 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Thu, 18 Mar 2021 13:28:12 -0400
Subject: locking/ww_mutex: Remove DEFINE_WW_MUTEX() macro

The current DEFINE_WW_MUTEX() macro fails to properly set up the lockdep
key of the ww_mutexes, causing a potential circular locking dependency
splat. Though it is possible to add more macro magic to make it work,
the result is rather ugly.

Since locktorture was the only user of DEFINE_WW_MUTEX() and the
previous commit has just removed its use, it is easier to just remove
the macro to force future users of ww_mutexes to use ww_mutex_init()
for initialization.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20210318172814.4400-4-longman@redhat.com
---
 include/linux/ww_mutex.h | 17 ++---------------
 1 file changed, 2 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h
index 6ecf2a0220db..b77f39f319ad 100644
--- a/include/linux/ww_mutex.h
+++ b/include/linux/ww_mutex.h
@@ -48,39 +48,26 @@ struct ww_acquire_ctx {
 #endif
 };
 
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-# define __WW_CLASS_MUTEX_INITIALIZER(lockname, class) \
-		, .ww_class = class
-#else
-# define __WW_CLASS_MUTEX_INITIALIZER(lockname, class)
-#endif
-
 #define __WW_CLASS_INITIALIZER(ww_class, _is_wait_die)	    \
 		{ .stamp = ATOMIC_LONG_INIT(0) \
 		, .acquire_name = #ww_class "_acquire" \
 		, .mutex_name = #ww_class "_mutex" \
 		, .is_wait_die = _is_wait_die }
 
-#define __WW_MUTEX_INITIALIZER(lockname, class) \
-		{ .base =  __MUTEX_INITIALIZER(lockname.base) \
-		__WW_CLASS_MUTEX_INITIALIZER(lockname, class) }
-
 #define DEFINE_WD_CLASS(classname) \
 	struct ww_class classname = __WW_CLASS_INITIALIZER(classname, 1)
 
 #define DEFINE_WW_CLASS(classname) \
 	struct ww_class classname = __WW_CLASS_INITIALIZER(classname, 0)
 
-#define DEFINE_WW_MUTEX(mutexname, ww_class) \
-	struct ww_mutex mutexname = __WW_MUTEX_INITIALIZER(mutexname, ww_class)
-
 /**
  * ww_mutex_init - initialize the w/w mutex
  * @lock: the mutex to be initialized
  * @ww_class: the w/w class the mutex should belong to
  *
  * Initialize the w/w mutex to unlocked state and associate it with the given
- * class.
+ * class. Static define macro for w/w mutex is not provided and this function
+ * is the only way to properly initialize the w/w mutex.
  *
  * It is not allowed to initialize an already locked mutex.
  */
-- 
cgit v1.2.3


From 84fcfbdadbfdd86c9a43a52703203e05fe7efd92 Mon Sep 17 00:00:00 2001
From: Wang Qing <wangqing@vivo.com>
Date: Fri, 12 Mar 2021 10:19:12 +0800
Subject: dma-mapping: remove a pointless empty line in dma_alloc_coherent

Signed-off-by: Wang Qing <wangqing@vivo.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/dma-mapping.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 954847f9a3e0..e9d19b974f26 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -435,7 +435,6 @@ static inline void dma_sync_sgtable_for_device(struct device *dev,
 static inline void *dma_alloc_coherent(struct device *dev, size_t size,
 		dma_addr_t *dma_handle, gfp_t gfp)
 {
-
 	return dma_alloc_attrs(dev, size, dma_handle, gfp,
 			(gfp & __GFP_NOWARN) ? DMA_ATTR_NO_WARN : 0);
 }
-- 
cgit v1.2.3


From 919067cc845f323a80b6fe987b64238bd82d309e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 19 Mar 2021 10:39:33 -0700
Subject: net: add CONFIG_PCPU_DEV_REFCNT

I was working on a syzbot issue, claiming one device could not be
dismantled because its refcount was -1:

unregister_netdevice: waiting for sit0 to become free. Usage count = -1

It would be nice if syzbot could trigger a warning at the time
this reference count became negative.

This patch adds the CONFIG_PCPU_DEV_REFCNT option, which defaults
to per-cpu variables (as before this patch) on SMP builds.

v2: free_dev label in alloc_netdev_mqs() is moved to avoid
    a compiler warning (-Wunused-label), as reported
    by kernel test robot <lkp@intel.com>

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 13 +++++++++++++
 net/Kconfig               |  8 ++++++++
 net/core/dev.c            | 10 ++++++++++
 3 files changed, 31 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4940509999be..8f003955c485 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2092,7 +2092,12 @@ struct net_device {
 	u32                     proto_down_reason;
 
 	struct list_head	todo_list;
+
+#ifdef CONFIG_PCPU_DEV_REFCNT
 	int __percpu		*pcpu_refcnt;
+#else
+	refcount_t		dev_refcnt;
+#endif
 
 	struct list_head	link_watch_list;
 
@@ -4044,7 +4049,11 @@ void netdev_run_todo(void);
  */
 static inline void dev_put(struct net_device *dev)
 {
+#ifdef CONFIG_PCPU_DEV_REFCNT
 	this_cpu_dec(*dev->pcpu_refcnt);
+#else
+	refcount_dec(&dev->dev_refcnt);
+#endif
 }
 
 /**
@@ -4055,7 +4064,11 @@ static inline void dev_put(struct net_device *dev)
  */
 static inline void dev_hold(struct net_device *dev)
 {
+#ifdef CONFIG_PCPU_DEV_REFCNT
 	this_cpu_inc(*dev->pcpu_refcnt);
+#else
+	refcount_inc(&dev->dev_refcnt);
+#endif
 }
 
 /* Carrier loss detection, dial on demand. The functions netif_carrier_on
diff --git a/net/Kconfig b/net/Kconfig
index 0ead7ec0d2bd..9c456acc379e 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -245,6 +245,14 @@ source "net/l3mdev/Kconfig"
 source "net/qrtr/Kconfig"
 source "net/ncsi/Kconfig"
 
+config PCPU_DEV_REFCNT
+	bool "Use percpu variables to maintain network device refcount"
+	depends on SMP
+	default y
+	help
+	  network device refcount are using per cpu variables if this option is set.
+	  This can be forced to N to detect underflows (with a performance drop).
+
 config RPS
 	bool
 	depends on SMP && SYSFS
diff --git a/net/core/dev.c b/net/core/dev.c
index 4961fc2e9b19..be941ed754ac 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -10312,11 +10312,15 @@ EXPORT_SYMBOL(register_netdev);
 
 int netdev_refcnt_read(const struct net_device *dev)
 {
+#ifdef CONFIG_PCPU_DEV_REFCNT
 	int i, refcnt = 0;
 
 	for_each_possible_cpu(i)
 		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
 	return refcnt;
+#else
+	return refcount_read(&dev->dev_refcnt);
+#endif
 }
 EXPORT_SYMBOL(netdev_refcnt_read);
 
@@ -10674,9 +10678,11 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
 	dev->padded = (char *)dev - (char *)p;
 
+#ifdef CONFIG_PCPU_DEV_REFCNT
 	dev->pcpu_refcnt = alloc_percpu(int);
 	if (!dev->pcpu_refcnt)
 		goto free_dev;
+#endif
 
 	if (dev_addr_init(dev))
 		goto free_pcpu;
@@ -10740,8 +10746,10 @@ free_all:
 	return NULL;
 
 free_pcpu:
+#ifdef CONFIG_PCPU_DEV_REFCNT
 	free_percpu(dev->pcpu_refcnt);
 free_dev:
+#endif
 	netdev_freemem(dev);
 	return NULL;
 }
@@ -10783,8 +10791,10 @@ void free_netdev(struct net_device *dev)
 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
 		netif_napi_del(p);
 
+#ifdef CONFIG_PCPU_DEV_REFCNT
 	free_percpu(dev->pcpu_refcnt);
 	dev->pcpu_refcnt = NULL;
+#endif
 	free_percpu(dev->xdp_bulkq);
 	dev->xdp_bulkq = NULL;
 
-- 
cgit v1.2.3


From f60a85cad677c4f9bb4cadd764f1d106c38c7cf8 Mon Sep 17 00:00:00 2001
From: Zqiang <qiang.zhang@windriver.com>
Date: Wed, 17 Mar 2021 11:09:15 +0800
Subject: bpf: Fix umd memory leak in copy_process()

The syzbot reported a memleak as follows:

BUG: memory leak
unreferenced object 0xffff888101b41d00 (size 120):
  comm "kworker/u4:0", pid 8, jiffies 4294944270 (age 12.780s)
  backtrace:
    [<ffffffff8125dc56>] alloc_pid+0x66/0x560
    [<ffffffff81226405>] copy_process+0x1465/0x25e0
    [<ffffffff81227943>] kernel_clone+0xf3/0x670
    [<ffffffff812281a1>] kernel_thread+0x61/0x80
    [<ffffffff81253464>] call_usermodehelper_exec_work
    [<ffffffff81253464>] call_usermodehelper_exec_work+0xc4/0x120
    [<ffffffff812591c9>] process_one_work+0x2c9/0x600
    [<ffffffff81259ab9>] worker_thread+0x59/0x5d0
    [<ffffffff812611c8>] kthread+0x178/0x1b0
    [<ffffffff8100227f>] ret_from_fork+0x1f/0x30

unreferenced object 0xffff888110ef5c00 (size 232):
  comm "kworker/u4:0", pid 8414, jiffies 4294944270 (age 12.780s)
  backtrace:
    [<ffffffff8154a0cf>] kmem_cache_zalloc
    [<ffffffff8154a0cf>] __alloc_file+0x1f/0xf0
    [<ffffffff8154a809>] alloc_empty_file+0x69/0x120
    [<ffffffff8154a8f3>] alloc_file+0x33/0x1b0
    [<ffffffff8154ab22>] alloc_file_pseudo+0xb2/0x140
    [<ffffffff81559218>] create_pipe_files+0x138/0x2e0
    [<ffffffff8126c793>] umd_setup+0x33/0x220
    [<ffffffff81253574>] call_usermodehelper_exec_async+0xb4/0x1b0
    [<ffffffff8100227f>] ret_from_fork+0x1f/0x30

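After the UMD process exits, the pipe_to_umh/pipe_from_umh files and the
tgid reference need to be released. Add umd_cleanup_helper() to release
them, and call it once the UMD thread group has exited.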

Fixes: d71fa5c9763c ("bpf: Add kernel module with user mode driver that populates bpffs.")
Reported-by: syzbot+44908bb56d2bfe56b28e@syzkaller.appspotmail.com
Signed-off-by: Zqiang <qiang.zhang@windriver.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210317030915.2865-1-qiang.zhang@windriver.com
---
 include/linux/usermode_driver.h       |  1 +
 kernel/bpf/preload/bpf_preload_kern.c | 19 +++++++++++++++----
 kernel/usermode_driver.c              | 21 +++++++++++++++------
 3 files changed, 31 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/usermode_driver.h b/include/linux/usermode_driver.h
index 073a9e0ec07d..ad970416260d 100644
--- a/include/linux/usermode_driver.h
+++ b/include/linux/usermode_driver.h
@@ -14,5 +14,6 @@ struct umd_info {
 int umd_load_blob(struct umd_info *info, const void *data, size_t len);
 int umd_unload_blob(struct umd_info *info);
 int fork_usermode_driver(struct umd_info *info);
+void umd_cleanup_helper(struct umd_info *info);
 
 #endif /* __LINUX_USERMODE_DRIVER_H__ */
diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c
index 79c5772465f1..53736e52c1df 100644
--- a/kernel/bpf/preload/bpf_preload_kern.c
+++ b/kernel/bpf/preload/bpf_preload_kern.c
@@ -60,9 +60,12 @@ static int finish(void)
 			 &magic, sizeof(magic), &pos);
 	if (n != sizeof(magic))
 		return -EPIPE;
+
 	tgid = umd_ops.info.tgid;
-	wait_event(tgid->wait_pidfd, thread_group_exited(tgid));
-	umd_ops.info.tgid = NULL;
+	if (tgid) {
+		wait_event(tgid->wait_pidfd, thread_group_exited(tgid));
+		umd_cleanup_helper(&umd_ops.info);
+	}
 	return 0;
 }
 
@@ -80,10 +83,18 @@ static int __init load_umd(void)
 
 static void __exit fini_umd(void)
 {
+	struct pid *tgid;
+
 	bpf_preload_ops = NULL;
+
 	/* kill UMD in case it's still there due to earlier error */
-	kill_pid(umd_ops.info.tgid, SIGKILL, 1);
-	umd_ops.info.tgid = NULL;
+	tgid = umd_ops.info.tgid;
+	if (tgid) {
+		kill_pid(tgid, SIGKILL, 1);
+
+		wait_event(tgid->wait_pidfd, thread_group_exited(tgid));
+		umd_cleanup_helper(&umd_ops.info);
+	}
 	umd_unload_blob(&umd_ops.info);
 }
 late_initcall(load_umd);
diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c
index 0b35212ffc3d..bb7bb3b478ab 100644
--- a/kernel/usermode_driver.c
+++ b/kernel/usermode_driver.c
@@ -139,13 +139,22 @@ static void umd_cleanup(struct subprocess_info *info)
 	struct umd_info *umd_info = info->data;
 
 	/* cleanup if umh_setup() was successful but exec failed */
-	if (info->retval) {
-		fput(umd_info->pipe_to_umh);
-		fput(umd_info->pipe_from_umh);
-		put_pid(umd_info->tgid);
-		umd_info->tgid = NULL;
-	}
+	if (info->retval)
+		umd_cleanup_helper(umd_info);
+}
+
+/**
+ * umd_cleanup_helper - release the resources which were allocated in umd_setup
+ * @info: information about usermode driver
+ */
+void umd_cleanup_helper(struct umd_info *info)
+{
+	fput(info->pipe_to_umh);
+	fput(info->pipe_from_umh);
+	put_pid(info->tgid);
+	info->tgid = NULL;
 }
+EXPORT_SYMBOL_GPL(umd_cleanup_helper);
 
 /**
  * fork_usermode_driver - fork a usermode driver
-- 
cgit v1.2.3


From a2bbe66493ee380eb25e080e7fcdd1278a847f7e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 7 Jul 2019 09:57:53 -0400
Subject: constify dentry argument of dentry_path()/dentry_path_raw()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/d_path.c            | 10 +++++-----
 include/linux/dcache.h |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/fs/d_path.c b/fs/d_path.c
index a69e2cd36e6e..270d62133996 100644
--- a/fs/d_path.c
+++ b/fs/d_path.c
@@ -326,9 +326,9 @@ char *simple_dname(struct dentry *dentry, char *buffer, int buflen)
 /*
  * Write full pathname from the root of the filesystem into the buffer.
  */
-static char *__dentry_path(struct dentry *d, char *buf, int buflen)
+static char *__dentry_path(const struct dentry *d, char *buf, int buflen)
 {
-	struct dentry *dentry;
+	const struct dentry *dentry;
 	char *end, *retval;
 	int len, seq = 0;
 	int error = 0;
@@ -347,7 +347,7 @@ restart:
 	*retval = '/';
 	read_seqbegin_or_lock(&rename_lock, &seq);
 	while (!IS_ROOT(dentry)) {
-		struct dentry *parent = dentry->d_parent;
+		const struct dentry *parent = dentry->d_parent;
 
 		prefetch(parent);
 		error = prepend_name(&end, &len, &dentry->d_name);
@@ -371,13 +371,13 @@ Elong:
 	return ERR_PTR(-ENAMETOOLONG);
 }
 
-char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen)
+char *dentry_path_raw(const struct dentry *dentry, char *buf, int buflen)
 {
 	return __dentry_path(dentry, buf, buflen);
 }
 EXPORT_SYMBOL(dentry_path_raw);
 
-char *dentry_path(struct dentry *dentry, char *buf, int buflen)
+char *dentry_path(const struct dentry *dentry, char *buf, int buflen)
 {
 	char *p = NULL;
 	char *retval;
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index c1e48014106f..4ecde5d8250c 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -300,8 +300,8 @@ char *dynamic_dname(struct dentry *, char *, int, const char *, ...);
 extern char *__d_path(const struct path *, const struct path *, char *, int);
 extern char *d_absolute_path(const struct path *, char *, int);
 extern char *d_path(const struct path *, char *, int);
-extern char *dentry_path_raw(struct dentry *, char *, int);
-extern char *dentry_path(struct dentry *, char *, int);
+extern char *dentry_path_raw(const struct dentry *, char *, int);
+extern char *dentry_path(const struct dentry *, char *, int);
 
 /* Allocation counts.. */
 
-- 
cgit v1.2.3


From 4eed7f5a8334a179f40b2c78c1ead572b9dd04a0 Mon Sep 17 00:00:00 2001
From: LI Qingwu <Qing-wu.Li@leica-geosystems.com.cn>
Date: Fri, 19 Mar 2021 11:02:36 +0000
Subject: power: supply: bq27xxx: Add support for BQ78Z100

Add support for the TI BQ78Z100, a gas gauge with an I2C interface.
It provides fully integrated safety protection and
authentication for 1- to 2-series cell Li-Ion and
Li-Polymer battery packs.

The patch was tested with BQ78Z100 equipment.

CASE I:  Discharging:
	POWER_SUPPLY_NAME=bq78z100-0
	POWER_SUPPLY_STATUS=Discharging
	POWER_SUPPLY_PRESENT=1
	POWER_SUPPLY_VOLTAGE_NOW=3386000
	POWER_SUPPLY_CURRENT_NOW=-5000
	POWER_SUPPLY_CAPACITY=27
	POWER_SUPPLY_CAPACITY_LEVEL=Normal
	POWER_SUPPLY_TEMP=269
	POWER_SUPPLY_TIME_TO_EMPTY_NOW=1249920
	POWER_SUPPLY_TECHNOLOGY=Li-ion
	POWER_SUPPLY_CHARGE_FULL=6494000
	POWER_SUPPLY_CHARGE_NOW=1736000
	POWER_SUPPLY_CHARGE_FULL_DESIGN=6000000
	POWER_SUPPLY_CYCLE_COUNT=1
	POWER_SUPPLY_POWER_AVG=-20000
	POWER_SUPPLY_HEALTH=Good
	POWER_SUPPLY_MANUFACTURER=Texas Instruments

CASE II : No discharging current:
	POWER_SUPPLY_NAME=bq78z100-0
	POWER_SUPPLY_STATUS=Not charging
	POWER_SUPPLY_PRESENT=1
	POWER_SUPPLY_VOLTAGE_NOW=3386000
	POWER_SUPPLY_CURRENT_NOW=0
	POWER_SUPPLY_CAPACITY=27
	POWER_SUPPLY_CAPACITY_LEVEL=Normal
	POWER_SUPPLY_TEMP=270
	POWER_SUPPLY_TECHNOLOGY=Li-ion
	POWER_SUPPLY_CHARGE_FULL=6494000
	POWER_SUPPLY_CHARGE_NOW=1734000
	POWER_SUPPLY_CHARGE_FULL_DESIGN=6000000
	POWER_SUPPLY_CYCLE_COUNT=1
	POWER_SUPPLY_POWER_AVG=0
	POWER_SUPPLY_HEALTH=Good
	POWER_SUPPLY_MANUFACTURER=Texas Instruments

Signed-off-by: LI Qingwu <Qing-wu.Li@leica-geosystems.com.cn>
Reviewed-by: Krzysztof Kozlowski <krzk@kernel.org>
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 drivers/power/supply/bq27xxx_battery.c     | 44 ++++++++++++++++++++++++++++++
 drivers/power/supply/bq27xxx_battery_i2c.c |  2 ++
 include/linux/power/bq27xxx_battery.h      |  1 +
 3 files changed, 47 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/power/supply/bq27xxx_battery.c b/drivers/power/supply/bq27xxx_battery.c
index b62a8cfd9d09..7e5e24b585d8 100644
--- a/drivers/power/supply/bq27xxx_battery.c
+++ b/drivers/power/supply/bq27xxx_battery.c
@@ -39,6 +39,7 @@
  * https://www.ti.com/product/bq27z561
  * https://www.ti.com/product/bq28z610
  * https://www.ti.com/product/bq34z100-g1
+ * https://www.ti.com/product/bq78z100
  */
 
 #include <linux/device.h>
@@ -515,6 +516,27 @@ static u8
 		[BQ27XXX_REG_DCAP] = 0x3c,
 		[BQ27XXX_REG_AP] = 0x22,
 		BQ27XXX_DM_REG_ROWS,
+	},
+	bq78z100_regs[BQ27XXX_REG_MAX] = {
+		[BQ27XXX_REG_CTRL] = 0x00,
+		[BQ27XXX_REG_TEMP] = 0x06,
+		[BQ27XXX_REG_INT_TEMP] = 0x28,
+		[BQ27XXX_REG_VOLT] = 0x08,
+		[BQ27XXX_REG_AI] = 0x14,
+		[BQ27XXX_REG_FLAGS] = 0x0a,
+		[BQ27XXX_REG_TTE] = 0x16,
+		[BQ27XXX_REG_TTF] = 0x18,
+		[BQ27XXX_REG_TTES] = 0x1c,
+		[BQ27XXX_REG_TTECP] = INVALID_REG_ADDR,
+		[BQ27XXX_REG_NAC] = INVALID_REG_ADDR,
+		[BQ27XXX_REG_RC] = 0x10,
+		[BQ27XXX_REG_FCC] = 0x12,
+		[BQ27XXX_REG_CYCT] = 0x2a,
+		[BQ27XXX_REG_AE] = INVALID_REG_ADDR,
+		[BQ27XXX_REG_SOC] = 0x2c,
+		[BQ27XXX_REG_DCAP] = 0x3c,
+		[BQ27XXX_REG_AP] = 0x22,
+		BQ27XXX_DM_REG_ROWS,
 	};
 
 static enum power_supply_property bq27000_props[] = {
@@ -813,6 +835,26 @@ static enum power_supply_property bq34z100_props[] = {
 	POWER_SUPPLY_PROP_MANUFACTURER,
 };
 
+static enum power_supply_property bq78z100_props[] = {
+	POWER_SUPPLY_PROP_STATUS,
+	POWER_SUPPLY_PROP_PRESENT,
+	POWER_SUPPLY_PROP_VOLTAGE_NOW,
+	POWER_SUPPLY_PROP_CURRENT_NOW,
+	POWER_SUPPLY_PROP_CAPACITY,
+	POWER_SUPPLY_PROP_CAPACITY_LEVEL,
+	POWER_SUPPLY_PROP_TEMP,
+	POWER_SUPPLY_PROP_TIME_TO_EMPTY_NOW,
+	POWER_SUPPLY_PROP_TIME_TO_FULL_NOW,
+	POWER_SUPPLY_PROP_TECHNOLOGY,
+	POWER_SUPPLY_PROP_CHARGE_FULL,
+	POWER_SUPPLY_PROP_CHARGE_NOW,
+	POWER_SUPPLY_PROP_CHARGE_FULL_DESIGN,
+	POWER_SUPPLY_PROP_CYCLE_COUNT,
+	POWER_SUPPLY_PROP_POWER_AVG,
+	POWER_SUPPLY_PROP_HEALTH,
+	POWER_SUPPLY_PROP_MANUFACTURER,
+};
+
 struct bq27xxx_dm_reg {
 	u8 subclass_id;
 	u8 offset;
@@ -911,6 +953,7 @@ static struct bq27xxx_dm_reg bq27621_dm_regs[] = {
 #define bq27z561_dm_regs 0
 #define bq28z610_dm_regs 0
 #define bq34z100_dm_regs 0
+#define bq78z100_dm_regs 0
 
 #define BQ27XXX_O_ZERO		BIT(0)
 #define BQ27XXX_O_OTDC		BIT(1) /* has OTC/OTD overtemperature flags */
@@ -969,6 +1012,7 @@ static struct {
 	[BQ28Z610]  = BQ27XXX_DATA(bq28z610,  0         , BQ27Z561_O_BITS),
 	[BQ34Z100]  = BQ27XXX_DATA(bq34z100,  0         , BQ27XXX_O_OTDC | BQ27XXX_O_SOC_SI | \
 							  BQ27XXX_O_HAS_CI | BQ27XXX_O_MUL_CHEM),
+	[BQ78Z100]  = BQ27XXX_DATA(bq78z100,  0         , BQ27Z561_O_BITS),
 };
 
 static DEFINE_MUTEX(bq27xxx_list_lock);
diff --git a/drivers/power/supply/bq27xxx_battery_i2c.c b/drivers/power/supply/bq27xxx_battery_i2c.c
index eb4f4284982f..46f078350fd3 100644
--- a/drivers/power/supply/bq27xxx_battery_i2c.c
+++ b/drivers/power/supply/bq27xxx_battery_i2c.c
@@ -248,6 +248,7 @@ static const struct i2c_device_id bq27xxx_i2c_id_table[] = {
 	{ "bq27z561", BQ27Z561 },
 	{ "bq28z610", BQ28Z610 },
 	{ "bq34z100", BQ34Z100 },
+	{ "bq78z100", BQ78Z100 },
 	{},
 };
 MODULE_DEVICE_TABLE(i2c, bq27xxx_i2c_id_table);
@@ -284,6 +285,7 @@ static const struct of_device_id bq27xxx_battery_i2c_of_match_table[] = {
 	{ .compatible = "ti,bq27z561" },
 	{ .compatible = "ti,bq28z610" },
 	{ .compatible = "ti,bq34z100" },
+	{ .compatible = "ti,bq78z100" },
 	{},
 };
 MODULE_DEVICE_TABLE(of, bq27xxx_battery_i2c_of_match_table);
diff --git a/include/linux/power/bq27xxx_battery.h b/include/linux/power/bq27xxx_battery.h
index 8d5f4f40fb41..a1aa68141d0b 100644
--- a/include/linux/power/bq27xxx_battery.h
+++ b/include/linux/power/bq27xxx_battery.h
@@ -33,6 +33,7 @@ enum bq27xxx_chip {
 	BQ27Z561,
 	BQ28Z610,
 	BQ34Z100,
+	BQ78Z100,
 };
 
 struct bq27xxx_device_info;
-- 
cgit v1.2.3


From 3b03706fa621ce31a3e9ef6307020fde4e6aae16 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 18 Mar 2021 13:38:50 +0100
Subject: sched: Fix various typos

Fix ~42 single-word typos in scheduler code comments.

We have accumulated a few fun ones over the years. :-)

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ben Segall <bsegall@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: linux-kernel@vger.kernel.org
---
 include/linux/sched.h            |  2 +-
 kernel/sched/clock.c             |  2 +-
 kernel/sched/core.c              |  2 +-
 kernel/sched/cpuacct.c           |  2 +-
 kernel/sched/cpufreq_schedutil.c |  2 +-
 kernel/sched/cpupri.c            |  4 ++--
 kernel/sched/cputime.c           |  2 +-
 kernel/sched/deadline.c          | 12 ++++++------
 kernel/sched/debug.c             |  2 +-
 kernel/sched/fair.c              | 18 +++++++++---------
 kernel/sched/features.h          |  2 +-
 kernel/sched/idle.c              |  4 ++--
 kernel/sched/loadavg.c           |  2 +-
 kernel/sched/pelt.c              |  2 +-
 kernel/sched/pelt.h              |  2 +-
 kernel/sched/psi.c               |  6 +++---
 kernel/sched/rt.c                |  6 +++---
 kernel/sched/sched.h             |  8 ++++----
 kernel/sched/stats.c             |  2 +-
 kernel/sched/topology.c          |  2 +-
 20 files changed, 42 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index cf245bc237e7..05572e2140ad 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1097,7 +1097,7 @@ struct task_struct {
 #ifdef CONFIG_CPUSETS
 	/* Protected by ->alloc_lock: */
 	nodemask_t			mems_allowed;
-	/* Seqence number to catch updates: */
+	/* Sequence number to catch updates: */
 	seqcount_spinlock_t		mems_allowed_seq;
 	int				cpuset_mem_spread_rotor;
 	int				cpuset_slab_spread_rotor;
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 12bca64dff73..c2b2859ddd82 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -41,7 +41,7 @@
  * Otherwise it tries to create a semi stable clock from a mixture of other
  * clocks, including:
  *
- *  - GTOD (clock monotomic)
+ *  - GTOD (clock monotonic)
  *  - sched_clock()
  *  - explicit idle events
  *
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 28c4df6dfd97..3384ea74cad4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8975,7 +8975,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 		return -EINVAL;
 
 	/*
-	 * Likewise, bound things on the otherside by preventing insane quota
+	 * Likewise, bound things on the other side by preventing insane quota
 	 * periods.  This also allows us to normalize in computing quota
 	 * feasibility.
 	 */
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 941c28cf9738..104a1bade14f 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -104,7 +104,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
 
 	/*
 	 * We allow index == CPUACCT_STAT_NSTATS here to read
-	 * the sum of suages.
+	 * the sum of usages.
 	 */
 	BUG_ON(index > CPUACCT_STAT_NSTATS);
 
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 50cbad89f7fa..7cc2e115a5c4 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -471,7 +471,7 @@ static void sugov_work(struct kthread_work *work)
 
 	/*
 	 * Hold sg_policy->update_lock shortly to handle the case where:
-	 * incase sg_policy->next_freq is read here, and then updated by
+	 * in case sg_policy->next_freq is read here, and then updated by
 	 * sugov_deferred_update() just before work_in_progress is set to false
 	 * here, we may miss queueing the new update.
 	 *
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index ec9be789c7e2..d583f2aa744e 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -77,7 +77,7 @@ static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p,
 	 * When looking at the vector, we need to read the counter,
 	 * do a memory barrier, then read the mask.
 	 *
-	 * Note: This is still all racey, but we can deal with it.
+	 * Note: This is still all racy, but we can deal with it.
 	 *  Ideally, we only want to look at masks that are set.
 	 *
 	 *  If a mask is not set, then the only thing wrong is that we
@@ -186,7 +186,7 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,
 	 * The cost of this trade-off is not entirely clear and will probably
 	 * be good for some workloads and bad for others.
 	 *
-	 * The main idea here is that if some CPUs were overcommitted, we try
+	 * The main idea here is that if some CPUs were over-committed, we try
 	 * to spread which is what the scheduler traditionally did. Sys admins
 	 * must do proper RT planning to avoid overloading the system if they
 	 * really care.
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 5f611658eeab..3b36644fa80a 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -563,7 +563,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
 
 	/*
 	 * If either stime or utime are 0, assume all runtime is userspace.
-	 * Once a task gets some ticks, the monotonicy code at 'update:'
+	 * Once a task gets some ticks, the monotonicity code at 'update:'
 	 * will ensure things converge to the observed ratio.
 	 */
 	if (stime == 0) {
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index aac3539aa0fe..9a2989749b8d 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -245,7 +245,7 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw)
 		p->dl.dl_non_contending = 0;
 		/*
 		 * If the timer handler is currently running and the
-		 * timer cannot be cancelled, inactive_task_timer()
+		 * timer cannot be canceled, inactive_task_timer()
 		 * will see that dl_not_contending is not set, and
 		 * will not touch the rq's active utilization,
 		 * so we are still safe.
@@ -267,7 +267,7 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw)
  * fires.
  *
  * If the task wakes up again before the inactive timer fires,
- * the timer is cancelled, whereas if the task wakes up after the
+ * the timer is canceled, whereas if the task wakes up after the
  * inactive timer fired (and running_bw has been decreased) the
  * task's utilization has to be added to running_bw again.
  * A flag in the deadline scheduling entity (dl_non_contending)
@@ -385,7 +385,7 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags)
 		dl_se->dl_non_contending = 0;
 		/*
 		 * If the timer handler is currently running and the
-		 * timer cannot be cancelled, inactive_task_timer()
+		 * timer cannot be canceled, inactive_task_timer()
 		 * will see that dl_not_contending is not set, and
 		 * will not touch the rq's active utilization,
 		 * so we are still safe.
@@ -1206,7 +1206,7 @@ extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
  * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations
  * multiplied by 2^BW_SHIFT, the result has to be shifted right by
  * BW_SHIFT.
- * Since rq->dl.bw_ratio contains 1 / Umax multipled by 2^RATIO_SHIFT,
+ * Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT,
  * dl_bw is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT.
  * Since delta is a 64 bit variable, to have an overflow its value
  * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds.
@@ -1737,7 +1737,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
 		p->dl.dl_non_contending = 0;
 		/*
 		 * If the timer handler is currently running and the
-		 * timer cannot be cancelled, inactive_task_timer()
+		 * timer cannot be canceled, inactive_task_timer()
 		 * will see that dl_not_contending is not set, and
 		 * will not touch the rq's active utilization,
 		 * so we are still safe.
@@ -2745,7 +2745,7 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
 
 /*
  * Default limits for DL period; on the top end we guard against small util
- * tasks still getting rediculous long effective runtimes, on the bottom end we
+ * tasks still getting ridiculously long effective runtimes, on the bottom end we
  * guard against timer DoS.
  */
 unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 486f403a778b..4b49cc2af5c4 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -815,7 +815,7 @@ void sysrq_sched_debug_show(void)
 }
 
 /*
- * This itererator needs some explanation.
+ * This iterator needs some explanation.
  * It returns 1 for the header position.
  * This means 2 is CPU 0.
  * In a hotplugged system some CPUs, including CPU 0, may be missing so we have
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2e2ab1e00ef9..6aad02876346 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1125,7 +1125,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
 	return rss / nr_scan_pages;
 }
 
-/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
+/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
 #define MAX_SCAN_WINDOW 2560
 
 static unsigned int task_scan_min(struct task_struct *p)
@@ -2577,7 +2577,7 @@ no_join:
 }
 
 /*
- * Get rid of NUMA staticstics associated with a task (either current or dead).
+ * Get rid of NUMA statistics associated with a task (either current or dead).
  * If @final is set, the task is dead and has reached refcount zero, so we can
  * safely free all relevant data structures. Otherwise, there might be
  * concurrent reads from places like load balancing and procfs, and we should
@@ -3952,7 +3952,7 @@ static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
  *
  *     abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
  *
- * NOTE: this only works when value + maring < INT_MAX.
+ * NOTE: this only works when value + margin < INT_MAX.
  */
 static inline bool within_margin(int value, int margin)
 {
@@ -4256,7 +4256,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	/*
 	 * When bandwidth control is enabled, cfs might have been removed
 	 * because of a parent been throttled but cfs->nr_running > 1. Try to
-	 * add it unconditionnally.
+	 * add it unconditionally.
 	 */
 	if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())
 		list_add_leaf_cfs_rq(cfs_rq);
@@ -5311,7 +5311,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
  * bits doesn't do much.
  */
 
-/* cpu online calback */
+/* cpu online callback */
 static void __maybe_unused update_runtime_enabled(struct rq *rq)
 {
 	struct task_group *tg;
@@ -6963,7 +6963,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 
 	/*
 	 * This is possible from callers such as attach_tasks(), in which we
-	 * unconditionally check_prempt_curr() after an enqueue (which may have
+	 * unconditionally check_preempt_curr() after an enqueue (which may have
 	 * lead to a throttle).  This both saves work and prevents false
 	 * next-buddy nomination below.
 	 */
@@ -7595,7 +7595,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 		return 0;
 	}
 
-	/* Record that we found atleast one task that could run on dst_cpu */
+	/* Record that we found at least one task that could run on dst_cpu */
 	env->flags &= ~LBF_ALL_PINNED;
 
 	if (task_running(env->src_rq, p)) {
@@ -9690,7 +9690,7 @@ more_balance:
 		 * load to given_cpu. In rare situations, this may cause
 		 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
 		 * _independently_ and at _same_ time to move some load to
-		 * given_cpu) causing exceess load to be moved to given_cpu.
+		 * given_cpu) causing excess load to be moved to given_cpu.
 		 * This however should not happen so much in practice and
 		 * moreover subsequent load balance cycles should correct the
 		 * excess load moved.
@@ -9834,7 +9834,7 @@ out_one_pinned:
 	/*
 	 * newidle_balance() disregards balance intervals, so we could
 	 * repeatedly reach this code, which would lead to balance_interval
-	 * skyrocketting in a short amount of time. Skip the balance_interval
+	 * skyrocketing in a short amount of time. Skip the balance_interval
 	 * increase logic to avoid that.
 	 */
 	if (env.idle == CPU_NEWLY_IDLE)
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 1bc2b158fc51..422fa68c0ee9 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -27,7 +27,7 @@ SCHED_FEAT(NEXT_BUDDY, false)
 SCHED_FEAT(LAST_BUDDY, true)
 
 /*
- * Consider buddies to be cache hot, decreases the likelyness of a
+ * Consider buddies to be cache hot, decreases the likeliness of a
  * cache buddy being migrated away, increases cache locality.
  */
 SCHED_FEAT(CACHE_HOT_BUDDY, true)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 7a92d6054aba..7ca3d3d86c2a 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -163,7 +163,7 @@ static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
  *
  * NOTE: no locks or semaphores should be used here
  *
- * On archs that support TIF_POLLING_NRFLAG, is called with polling
+ * On architectures that support TIF_POLLING_NRFLAG, is called with polling
  * set, and it returns with polling set.  If it ever stops polling, it
  * must clear the polling bit.
  */
@@ -199,7 +199,7 @@ static void cpuidle_idle_call(void)
 	 * Suspend-to-idle ("s2idle") is a system state in which all user space
 	 * has been frozen, all I/O devices have been suspended and the only
 	 * activity happens here and in interrupts (if any). In that case bypass
-	 * the cpuidle governor and go stratight for the deepest idle state
+	 * the cpuidle governor and go straight for the deepest idle state
 	 * available.  Possibly also suspend the local tick and the entire
 	 * timekeeping to prevent timer interrupts from kicking us out of idle
 	 * until a proper wakeup interrupt happens.
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index d2a655643a02..1c79896f1bc0 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -189,7 +189,7 @@ calc_load_n(unsigned long load, unsigned long exp,
  *    w:0 1 1           0 0           1 1           0 0
  *
  *    This ensures we'll fold the old NO_HZ contribution in this window while
- *    accumlating the new one.
+ *    accumulating the new one.
  *
  *  - When we wake up from NO_HZ during the window, we push up our
  *    contribution, since we effectively move our sample point to a known
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index 2c613e1cff3a..a554e3bbab2b 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -133,7 +133,7 @@ accumulate_sum(u64 delta, struct sched_avg *sa,
 			 *	runnable = running = 0;
 			 *
 			 * clause from ___update_load_sum(); this results in
-			 * the below usage of @contrib to dissapear entirely,
+			 * the below usage of @contrib to disappear entirely,
 			 * so no point in calculating it.
 			 */
 			contrib = __accumulate_pelt_segments(periods,
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 795e43e02afc..1462846d244e 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -130,7 +130,7 @@ static inline void update_idle_rq_clock_pelt(struct rq *rq)
 	 * Reflecting stolen time makes sense only if the idle
 	 * phase would be present at max capacity. As soon as the
 	 * utilization of a rq has reached the maximum value, it is
-	 * considered as an always runnig rq without idle time to
+	 * considered as an always running rq without idle time to
 	 * steal. This potential idle time is considered as lost in
 	 * this case. We keep track of this lost idle time compare to
 	 * rq's clock_task.
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index ee3c5b48622f..c8480d785987 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -62,7 +62,7 @@
  * states, we would have to conclude a CPU SOME pressure number of
  * 100%, since *somebody* is waiting on a runqueue at all
  * times. However, that is clearly not the amount of contention the
- * workload is experiencing: only one out of 256 possible exceution
+ * workload is experiencing: only one out of 256 possible execution
  * threads will be contended at any given time, or about 0.4%.
  *
  * Conversely, consider a scenario of 4 tasks and 4 CPUs where at any
@@ -76,7 +76,7 @@
  * we have to base our calculation on the number of non-idle tasks in
  * conjunction with the number of available CPUs, which is the number
  * of potential execution threads. SOME becomes then the proportion of
- * delayed tasks to possibe threads, and FULL is the share of possible
+ * delayed tasks to possible threads, and FULL is the share of possible
  * threads that are unproductive due to delays:
  *
  *	threads = min(nr_nonidle_tasks, nr_cpus)
@@ -446,7 +446,7 @@ static void psi_avgs_work(struct work_struct *work)
 	mutex_unlock(&group->avgs_lock);
 }
 
-/* Trigger tracking window manupulations */
+/* Trigger tracking window manipulations */
 static void window_reset(struct psi_window *win, u64 now, u64 value,
 			 u64 prev_growth)
 {
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 8f720b71d13d..c286e5ba3c94 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -700,7 +700,7 @@ static void do_balance_runtime(struct rt_rq *rt_rq)
 		/*
 		 * Either all rqs have inf runtime and there's nothing to steal
 		 * or __disable_runtime() below sets a specific rq to inf to
-		 * indicate its been disabled and disalow stealing.
+		 * indicate it's been disabled and disallow stealing.
 		 */
 		if (iter->rt_runtime == RUNTIME_INF)
 			goto next;
@@ -1998,7 +1998,7 @@ static void push_rt_tasks(struct rq *rq)
  *
  * Each root domain has its own irq work function that can iterate over
  * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
- * tassk must be checked if there's one or many CPUs that are lowering
+ * tasks must be checked if there's one or many CPUs that are lowering
  * their priority, there's a single irq work iterator that will try to
  * push off RT tasks that are waiting to run.
  *
@@ -2216,7 +2216,7 @@ static void pull_rt_task(struct rq *this_rq)
 			/*
 			 * There's a chance that p is higher in priority
 			 * than what's currently running on its CPU.
-			 * This is just that p is wakeing up and hasn't
+			 * This is just that p is waking up and hasn't
 			 * had a chance to schedule. We only pull
 			 * p if it is lower in priority than the
 			 * current task on the run queue
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d2e09a647c4f..cbb0b011e9e0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1155,7 +1155,7 @@ static inline u64 __rq_clock_broken(struct rq *rq)
  *
  *	if (rq-clock_update_flags >= RQCF_UPDATED)
  *
- * to check if %RQCF_UPADTED is set. It'll never be shifted more than
+ * to check if %RQCF_UPDATED is set. It'll never be shifted more than
  * one position though, because the next rq_unpin_lock() will shift it
  * back.
  */
@@ -1214,7 +1214,7 @@ static inline void rq_clock_skip_update(struct rq *rq)
 
 /*
  * See rt task throttling, which is the only time a skip
- * request is cancelled.
+ * request is canceled.
  */
 static inline void rq_clock_cancel_skipupdate(struct rq *rq)
 {
@@ -1861,7 +1861,7 @@ struct sched_class {
 
 	/*
 	 * The switched_from() call is allowed to drop rq->lock, therefore we
-	 * cannot assume the switched_from/switched_to pair is serliazed by
+	 * cannot assume the switched_from/switched_to pair is serialized by
 	 * rq->lock. They are however serialized by p->pi_lock.
 	 */
 	void (*switched_from)(struct rq *this_rq, struct task_struct *task);
@@ -2452,7 +2452,7 @@ DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
 
 /*
  * Returns the irqtime minus the softirq time computed by ksoftirqd.
- * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime
+ * Otherwise ksoftirqd's sum_exec_runtime is subtracted its own runtime
  * and never move forward.
  */
 static inline u64 irq_time_read(int cpu)
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 750fb3c67eed..3f93fc3b5648 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -74,7 +74,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
 }
 
 /*
- * This itererator needs some explanation.
+ * This iterator needs some explanation.
  * It returns 1 for the header position.
  * This means 2 is cpu 0.
  * In a hotplugged system some CPUs, including cpu 0, may be missing so we have
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 12f80587e127..f2066d682cd8 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2159,7 +2159,7 @@ static cpumask_var_t			*doms_cur;
 /* Number of sched domains in 'doms_cur': */
 static int				ndoms_cur;
 
-/* Attribues of custom domains in 'doms_cur' */
+/* Attributes of custom domains in 'doms_cur' */
 static struct sched_domain_attr		*dattr_cur;
 
 /*
-- 
cgit v1.2.3


From e2db7592be8e83df47519116621411e1056b21c7 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Mon, 22 Mar 2021 02:35:05 +0100
Subject: locking: Fix typos in comments

Fix ~16 single-word typos in locking code comments.

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/arm/include/asm/spinlock.h | 2 +-
 include/linux/lockdep.h         | 2 +-
 include/linux/rwsem.h           | 2 +-
 kernel/locking/lockdep.c        | 4 ++--
 kernel/locking/lockdep_proc.c   | 2 +-
 kernel/locking/mcs_spinlock.h   | 2 +-
 kernel/locking/mutex.c          | 4 ++--
 kernel/locking/osq_lock.c       | 4 ++--
 kernel/locking/rtmutex.c        | 4 ++--
 kernel/locking/rwsem.c          | 2 +-
 kernel/locking/spinlock.c       | 4 ++--
 11 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/include/asm/spinlock.h b/arch/arm/include/asm/spinlock.h
index 8f009e788ad4..f610a773f2be 100644
--- a/arch/arm/include/asm/spinlock.h
+++ b/arch/arm/include/asm/spinlock.h
@@ -22,7 +22,7 @@
  * assembler to insert a extra (16-bit) IT instruction, depending on the
  * presence or absence of neighbouring conditional instructions.
  *
- * To avoid this unpredictableness, an approprite IT is inserted explicitly:
+ * To avoid this unpredictability, an appropriate IT is inserted explicitly:
  * the assembler won't change IT instructions which are explicitly present
  * in the input.
  */
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 17805aac0e85..09ac2e8348d2 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -155,7 +155,7 @@ extern void lockdep_set_selftest_task(struct task_struct *task);
 extern void lockdep_init_task(struct task_struct *task);
 
 /*
- * Split the recrursion counter in two to readily detect 'off' vs recursion.
+ * Split the recursion counter in two to readily detect 'off' vs recursion.
  */
 #define LOCKDEP_RECURSION_BITS	16
 #define LOCKDEP_OFF		(1U << LOCKDEP_RECURSION_BITS)
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 4c715be48717..a66038d88878 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -110,7 +110,7 @@ do {								\
 
 /*
  * This is the same regardless of which rwsem implementation that is being used.
- * It is just a heuristic meant to be called by somebody alreadying holding the
+ * It is just a heuristic meant to be called by somebody already holding the
  * rwsem to see if somebody from an incompatible type is wanting access to the
  * lock.
  */
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index c0b8926a67f0..0e97287891db 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1747,7 +1747,7 @@ static enum bfs_result __bfs(struct lock_list *source_entry,
 
 		/*
 		 * Step 4: if not match, expand the path by adding the
-		 *         forward or backwards dependencis in the search
+		 *         forward or backwards dependencies in the search
 		 *
 		 */
 		first = true;
@@ -1916,7 +1916,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
  * -> B is -(ER)-> or -(EN)->, then we don't need to add A -> B into the
  * dependency graph, as any strong path ..-> A -> B ->.. we can get with
  * having dependency A -> B, we could already get a equivalent path ..-> A ->
- * .. -> B -> .. with A -> .. -> B. Therefore A -> B is reduntant.
+ * .. -> B -> .. with A -> .. -> B. Therefore A -> B is redundant.
  *
  * We need to make sure both the start and the end of A -> .. -> B is not
  * weaker than A -> B. For the start part, please see the comment in
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index 02ef87f50df2..806978314496 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -348,7 +348,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
 			debug_locks);
 
 	/*
-	 * Zappped classes and lockdep data buffers reuse statistics.
+	 * Zapped classes and lockdep data buffers reuse statistics.
 	 */
 	seq_puts(m, "\n");
 	seq_printf(m, " zapped classes:                %11lu\n",
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index 5e10153b4d3c..85251d8771d9 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -7,7 +7,7 @@
  * The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spin-lock
  * with the desirable properties of being fair, and with each cpu trying
  * to acquire the lock spinning on a local variable.
- * It avoids expensive cache bouncings that common test-and-set spin-lock
+ * It avoids expensive cache bounces that common test-and-set spin-lock
  * implementations incur.
  */
 #ifndef __LINUX_MCS_SPINLOCK_H
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 622ebdfcd083..cb6b112ce155 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -92,7 +92,7 @@ static inline unsigned long __owner_flags(unsigned long owner)
 }
 
 /*
- * Trylock variant that retuns the owning task on failure.
+ * Trylock variant that returns the owning task on failure.
  */
 static inline struct task_struct *__mutex_trylock_or_owner(struct mutex *lock)
 {
@@ -207,7 +207,7 @@ __mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
 
 /*
  * Give up ownership to a specific task, when @task = NULL, this is equivalent
- * to a regular unlock. Sets PICKUP on a handoff, clears HANDOF, preserves
+ * to a regular unlock. Sets PICKUP on a handoff, clears HANDOFF, preserves
  * WAITERS. Provides RELEASE semantics like a regular unlock, the
  * __mutex_trylock() provides a matching ACQUIRE semantics for the handoff.
  */
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index 1de006ed3aa8..d5610ad52b92 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -135,7 +135,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
 	 */
 
 	/*
-	 * Wait to acquire the lock or cancelation. Note that need_resched()
+	 * Wait to acquire the lock or cancellation. Note that need_resched()
 	 * will come with an IPI, which will wake smp_cond_load_relaxed() if it
 	 * is implemented with a monitor-wait. vcpu_is_preempted() relies on
 	 * polling, be careful.
@@ -164,7 +164,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
 
 		/*
 		 * We can only fail the cmpxchg() racing against an unlock(),
-		 * in which case we should observe @node->locked becomming
+		 * in which case we should observe @node->locked becoming
 		 * true.
 		 */
 		if (smp_load_acquire(&node->locked))
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 29f09d0b8224..db31bce114f8 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -706,7 +706,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
 	} else if (prerequeue_top_waiter == waiter) {
 		/*
 		 * The waiter was the top waiter on the lock, but is
-		 * no longer the top prority waiter. Replace waiter in
+		 * no longer the top priority waiter. Replace waiter in
 		 * the owner tasks pi waiters tree with the new top
 		 * (highest priority) waiter and adjust the priority
 		 * of the owner.
@@ -1194,7 +1194,7 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
 		return;
 
 	/*
-	 * Yell lowdly and stop the task right here.
+	 * Yell loudly and stop the task right here.
 	 */
 	rt_mutex_print_deadlock(w);
 	while (1) {
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index fe9cc65cd522..809b0016d344 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -819,7 +819,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
 		 *    we try to get it. The new owner may be a spinnable
 		 *    writer.
 		 *
-		 * To take advantage of two scenarios listed agove, the RT
+		 * To take advantage of two scenarios listed above, the RT
 		 * task is made to retry one more time to see if it can
 		 * acquire the lock or continue spinning on the new owning
 		 * writer. Of course, if the time lag is long enough or the
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index 0ff08380f531..c8d7ad9fb9b2 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -58,10 +58,10 @@ EXPORT_PER_CPU_SYMBOL(__mmiowb_state);
 /*
  * We build the __lock_function inlines here. They are too large for
  * inlining all over the place, but here is only one user per function
- * which embedds them into the calling _lock_function below.
+ * which embeds them into the calling _lock_function below.
  *
  * This could be a long-held lock. We both prepare to spin for a long
- * time (making _this_ CPU preemptable if possible), and we also signal
+ * time (making _this_ CPU preemptible if possible), and we also signal
  * towards that other CPU that it should break the lock ASAP.
  */
 #define BUILD_LOCK_OPS(op, locktype)					\
-- 
cgit v1.2.3


From 97258ce902d1e1c396a4d7c38f6ae7085adb73c5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Mon, 22 Mar 2021 03:55:50 +0100
Subject: entry: Fix typos in comments

Fix 3 single-word typos in the generic syscall entry code.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/entry-common.h | 4 ++--
 kernel/entry/common.c        | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index 883acef895bc..2e2b8d6140ed 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -360,7 +360,7 @@ void syscall_exit_to_user_mode_work(struct pt_regs *regs);
  *
  * This is a combination of syscall_exit_to_user_mode_work() (1,2) and
  * exit_to_user_mode(). This function is preferred unless there is a
- * compelling architectural reason to use the seperate functions.
+ * compelling architectural reason to use the separate functions.
  */
 void syscall_exit_to_user_mode(struct pt_regs *regs);
 
@@ -381,7 +381,7 @@ void irqentry_enter_from_user_mode(struct pt_regs *regs);
  * irqentry_exit_to_user_mode - Interrupt exit work
  * @regs:	Pointer to current's pt_regs
  *
- * Invoked with interrupts disbled and fully valid regs. Returns with all
+ * Invoked with interrupts disabled and fully valid regs. Returns with all
  * work handled, interrupts disabled such that the caller can immediately
  * switch to user mode. Called from architecture specific interrupt
  * handling code.
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 8442e5c9cfa2..8d996dddf8b9 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -341,7 +341,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
 	 * Checking for rcu_is_watching() here would prevent the nesting
 	 * interrupt to invoke rcu_irq_enter(). If that nested interrupt is
 	 * the tick then rcu_flavor_sched_clock_irq() would wrongfully
-	 * assume that it is the first interupt and eventually claim
+	 * assume that it is the first interrupt and eventually claim
 	 * quiescent state and end grace periods prematurely.
 	 *
 	 * Unconditionally invoke rcu_irq_enter() so RCU state stays
-- 
cgit v1.2.3


From a359f757965aafd0f58570de95dc6bc06cf12a9c Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Mon, 22 Mar 2021 04:21:30 +0100
Subject: irq: Fix typos in comments

Fix ~36 single-word typos in the IRQ, irqchip and irqdomain code comments.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 drivers/irqchip/irq-aspeed-vic.c       |  4 ++--
 drivers/irqchip/irq-bcm7120-l2.c       |  2 +-
 drivers/irqchip/irq-csky-apb-intc.c    |  2 +-
 drivers/irqchip/irq-gic-v2m.c          |  2 +-
 drivers/irqchip/irq-gic-v3-its.c       | 10 +++++-----
 drivers/irqchip/irq-gic-v3.c           |  2 +-
 drivers/irqchip/irq-loongson-pch-pic.c |  2 +-
 drivers/irqchip/irq-meson-gpio.c       |  2 +-
 drivers/irqchip/irq-mtk-cirq.c         |  2 +-
 drivers/irqchip/irq-mxs.c              |  4 ++--
 drivers/irqchip/irq-sun4i.c            |  2 +-
 drivers/irqchip/irq-ti-sci-inta.c      |  2 +-
 drivers/irqchip/irq-vic.c              |  4 ++--
 drivers/irqchip/irq-xilinx-intc.c      |  2 +-
 include/linux/irq.h                    |  4 ++--
 include/linux/irqdesc.h                |  2 +-
 kernel/irq/chip.c                      |  2 +-
 kernel/irq/dummychip.c                 |  2 +-
 kernel/irq/irqdesc.c                   |  2 +-
 kernel/irq/irqdomain.c                 |  8 ++++----
 kernel/irq/manage.c                    |  6 +++---
 kernel/irq/msi.c                       |  2 +-
 kernel/irq/timings.c                   |  2 +-
 23 files changed, 36 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/irqchip/irq-aspeed-vic.c b/drivers/irqchip/irq-aspeed-vic.c
index 6567ed782f82..58717cd44f99 100644
--- a/drivers/irqchip/irq-aspeed-vic.c
+++ b/drivers/irqchip/irq-aspeed-vic.c
@@ -71,7 +71,7 @@ static void vic_init_hw(struct aspeed_vic *vic)
 	writel(0, vic->base + AVIC_INT_SELECT);
 	writel(0, vic->base + AVIC_INT_SELECT + 4);
 
-	/* Some interrupts have a programable high/low level trigger
+	/* Some interrupts have a programmable high/low level trigger
 	 * (4 GPIO direct inputs), for now we assume this was configured
 	 * by firmware. We read which ones are edge now.
 	 */
@@ -203,7 +203,7 @@ static int __init avic_of_init(struct device_node *node,
 	}
 	vic->base = regs;
 
-	/* Initialize soures, all masked */
+	/* Initialize sources, all masked */
 	vic_init_hw(vic);
 
 	/* Ready to receive interrupts */
diff --git a/drivers/irqchip/irq-bcm7120-l2.c b/drivers/irqchip/irq-bcm7120-l2.c
index c7c9e976acbb..ad59656ccc28 100644
--- a/drivers/irqchip/irq-bcm7120-l2.c
+++ b/drivers/irqchip/irq-bcm7120-l2.c
@@ -309,7 +309,7 @@ static int __init bcm7120_l2_intc_probe(struct device_node *dn,
 
 		if (data->can_wake) {
 			/* This IRQ chip can wake the system, set all
-			 * relevant child interupts in wake_enabled mask
+			 * relevant child interrupts in wake_enabled mask
 			 */
 			gc->wake_enabled = 0xffffffff;
 			gc->wake_enabled &= ~gc->unused;
diff --git a/drivers/irqchip/irq-csky-apb-intc.c b/drivers/irqchip/irq-csky-apb-intc.c
index 5a2ec43b7ddd..ab91afa86755 100644
--- a/drivers/irqchip/irq-csky-apb-intc.c
+++ b/drivers/irqchip/irq-csky-apb-intc.c
@@ -176,7 +176,7 @@ gx_intc_init(struct device_node *node, struct device_node *parent)
 	writel(0x0, reg_base + GX_INTC_NEN63_32);
 
 	/*
-	 * Initial mask reg with all unmasked, because we only use enalbe reg
+	 * Initial mask reg with all unmasked, because we only use enable reg
 	 */
 	writel(0x0, reg_base + GX_INTC_NMASK31_00);
 	writel(0x0, reg_base + GX_INTC_NMASK63_32);
diff --git a/drivers/irqchip/irq-gic-v2m.c b/drivers/irqchip/irq-gic-v2m.c
index fbec07d634ad..4116b48e60af 100644
--- a/drivers/irqchip/irq-gic-v2m.c
+++ b/drivers/irqchip/irq-gic-v2m.c
@@ -371,7 +371,7 @@ static int __init gicv2m_init_one(struct fwnode_handle *fwnode,
 	 * the MSI data is the absolute value within the range from
 	 * spi_start to (spi_start + num_spis).
 	 *
-	 * Broadom NS2 GICv2m implementation has an erratum where the MSI data
+	 * Broadcom NS2 GICv2m implementation has an erratum where the MSI data
 	 * is 'spi_number - 32'
 	 *
 	 * Reading that register fails on the Graviton implementation
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index ed46e6057e33..c3485b230d70 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -1492,7 +1492,7 @@ static void its_vlpi_set_doorbell(struct irq_data *d, bool enable)
 	 *
 	 * Ideally, we'd issue a VMAPTI to set the doorbell to its LPI
 	 * value or to 1023, depending on the enable bit. But that
-	 * would be issueing a mapping for an /existing/ DevID+EventID
+	 * would be issuing a mapping for an /existing/ DevID+EventID
 	 * pair, which is UNPREDICTABLE. Instead, let's issue a VMOVI
 	 * to the /same/ vPE, using this opportunity to adjust the
 	 * doorbell. Mouahahahaha. We loves it, Precious.
@@ -3122,7 +3122,7 @@ static void its_cpu_init_lpis(void)
 
 		/*
 		 * It's possible for CPU to receive VLPIs before it is
-		 * sheduled as a vPE, especially for the first CPU, and the
+		 * scheduled as a vPE, especially for the first CPU, and the
 		 * VLPI with INTID larger than 2^(IDbits+1) will be considered
 		 * as out of range and dropped by GIC.
 		 * So we initialize IDbits to known value to avoid VLPI drop.
@@ -3616,7 +3616,7 @@ static void its_irq_domain_free(struct irq_domain *domain, unsigned int virq,
 
 	/*
 	 * If all interrupts have been freed, start mopping the
-	 * floor. This is conditionned on the device not being shared.
+	 * floor. This is conditioned on the device not being shared.
 	 */
 	if (!its_dev->shared &&
 	    bitmap_empty(its_dev->event_map.lpi_map,
@@ -4194,7 +4194,7 @@ static int its_sgi_set_affinity(struct irq_data *d,
 {
 	/*
 	 * There is no notion of affinity for virtual SGIs, at least
-	 * not on the host (since they can only be targetting a vPE).
+	 * not on the host (since they can only be targeting a vPE).
 	 * Tell the kernel we've done whatever it asked for.
 	 */
 	irq_data_update_effective_affinity(d, mask_val);
@@ -4239,7 +4239,7 @@ static int its_sgi_get_irqchip_state(struct irq_data *d,
 	/*
 	 * Locking galore! We can race against two different events:
 	 *
-	 * - Concurent vPE affinity change: we must make sure it cannot
+	 * - Concurrent vPE affinity change: we must make sure it cannot
 	 *   happen, or we'll talk to the wrong redistributor. This is
 	 *   identical to what happens with vLPIs.
 	 *
diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
index eb0ee356a629..94b89258d045 100644
--- a/drivers/irqchip/irq-gic-v3.c
+++ b/drivers/irqchip/irq-gic-v3.c
@@ -1379,7 +1379,7 @@ static int gic_irq_domain_translate(struct irq_domain *d,
 
 		/*
 		 * Make it clear that broken DTs are... broken.
-		 * Partitionned PPIs are an unfortunate exception.
+		 * Partitioned PPIs are an unfortunate exception.
 		 */
 		WARN_ON(*type == IRQ_TYPE_NONE &&
 			fwspec->param[0] != GIC_IRQ_TYPE_PARTITION);
diff --git a/drivers/irqchip/irq-loongson-pch-pic.c b/drivers/irqchip/irq-loongson-pch-pic.c
index 9bf6b9a5f734..f790ca6d78aa 100644
--- a/drivers/irqchip/irq-loongson-pch-pic.c
+++ b/drivers/irqchip/irq-loongson-pch-pic.c
@@ -163,7 +163,7 @@ static void pch_pic_reset(struct pch_pic *priv)
 	int i;
 
 	for (i = 0; i < PIC_COUNT; i++) {
-		/* Write vectore ID */
+		/* Write vector ID */
 		writeb(priv->ht_vec_base + i, priv->base + PCH_INT_HTVEC(i));
 		/* Hardcode route to HT0 Lo */
 		writeb(1, priv->base + PCH_INT_ROUTE(i));
diff --git a/drivers/irqchip/irq-meson-gpio.c b/drivers/irqchip/irq-meson-gpio.c
index bc7aebcc96e9..e50676ce2ec8 100644
--- a/drivers/irqchip/irq-meson-gpio.c
+++ b/drivers/irqchip/irq-meson-gpio.c
@@ -227,7 +227,7 @@ meson_gpio_irq_request_channel(struct meson_gpio_irq_controller *ctl,
 
 	/*
 	 * Get the hwirq number assigned to this channel through
-	 * a pointer the channel_irq table. The added benifit of this
+	 * a pointer to the channel_irq table. The added benefit of this
 	 * method is that we can also retrieve the channel index with
 	 * it, using the table base.
 	 */
diff --git a/drivers/irqchip/irq-mtk-cirq.c b/drivers/irqchip/irq-mtk-cirq.c
index 69ba8ce3c178..9bca0918078e 100644
--- a/drivers/irqchip/irq-mtk-cirq.c
+++ b/drivers/irqchip/irq-mtk-cirq.c
@@ -217,7 +217,7 @@ static void mtk_cirq_resume(void)
 {
 	u32 value;
 
-	/* flush recored interrupts, will send signals to parent controller */
+	/* flush recorded interrupts, will send signals to parent controller */
 	value = readl_relaxed(cirq_data->base + CIRQ_CONTROL);
 	writel_relaxed(value | CIRQ_FLUSH, cirq_data->base + CIRQ_CONTROL);
 
diff --git a/drivers/irqchip/irq-mxs.c b/drivers/irqchip/irq-mxs.c
index a671938fd97f..d1f5740cd575 100644
--- a/drivers/irqchip/irq-mxs.c
+++ b/drivers/irqchip/irq-mxs.c
@@ -58,7 +58,7 @@ struct icoll_priv {
 static struct icoll_priv icoll_priv;
 static struct irq_domain *icoll_domain;
 
-/* calculate bit offset depending on number of intterupt per register */
+/* calculate bit offset depending on number of interrupts per register */
 static u32 icoll_intr_bitshift(struct irq_data *d, u32 bit)
 {
 	/*
@@ -68,7 +68,7 @@ static u32 icoll_intr_bitshift(struct irq_data *d, u32 bit)
 	return bit << ((d->hwirq & 3) << 3);
 }
 
-/* calculate mem offset depending on number of intterupt per register */
+/* calculate mem offset depending on number of interrupts per register */
 static void __iomem *icoll_intr_reg(struct irq_data *d)
 {
 	/* offset = hwirq / intr_per_reg * 0x10 */
diff --git a/drivers/irqchip/irq-sun4i.c b/drivers/irqchip/irq-sun4i.c
index fb78d6623556..9ea94456b178 100644
--- a/drivers/irqchip/irq-sun4i.c
+++ b/drivers/irqchip/irq-sun4i.c
@@ -189,7 +189,7 @@ static void __exception_irq_entry sun4i_handle_irq(struct pt_regs *regs)
 	 * 3) spurious irq
 	 * So if we immediately get a reading of 0, check the irq-pending reg
 	 * to differentiate between 2 and 3. We only do this once to avoid
-	 * the extra check in the common case of 1 hapening after having
+	 * the extra check in the common case of 1 happening after having
 	 * read the vector-reg once.
 	 */
 	hwirq = readl(irq_ic_data->irq_base + SUN4I_IRQ_VECTOR_REG) >> 2;
diff --git a/drivers/irqchip/irq-ti-sci-inta.c b/drivers/irqchip/irq-ti-sci-inta.c
index 532d0ae172d9..ca1f593f4d13 100644
--- a/drivers/irqchip/irq-ti-sci-inta.c
+++ b/drivers/irqchip/irq-ti-sci-inta.c
@@ -78,7 +78,7 @@ struct ti_sci_inta_vint_desc {
  * struct ti_sci_inta_irq_domain - Structure representing a TISCI based
  *				   Interrupt Aggregator IRQ domain.
  * @sci:		Pointer to TISCI handle
- * @vint:		TISCI resource pointer representing IA inerrupts.
+ * @vint:		TISCI resource pointer representing IA interrupts.
  * @global_event:	TISCI resource pointer representing global events.
  * @vint_list:		List of the vints active in the system
  * @vint_mutex:		Mutex to protect vint_list
diff --git a/drivers/irqchip/irq-vic.c b/drivers/irqchip/irq-vic.c
index e46036374227..62f3d29f9042 100644
--- a/drivers/irqchip/irq-vic.c
+++ b/drivers/irqchip/irq-vic.c
@@ -163,7 +163,7 @@ static struct syscore_ops vic_syscore_ops = {
 };
 
 /**
- * vic_pm_init - initicall to register VIC pm
+ * vic_pm_init - initcall to register VIC pm
  *
  * This is called via late_initcall() to register
  * the resources for the VICs due to the early
@@ -397,7 +397,7 @@ static void __init vic_clear_interrupts(void __iomem *base)
 /*
  * The PL190 cell from ARM has been modified by ST to handle 64 interrupts.
  * The original cell has 32 interrupts, while the modified one has 64,
- * replocating two blocks 0x00..0x1f in 0x20..0x3f. In that case
+ * replicating two blocks 0x00..0x1f in 0x20..0x3f. In that case
  * the probe function is called twice, with base set to offset 000
  *  and 020 within the page. We call this "second block".
  */
diff --git a/drivers/irqchip/irq-xilinx-intc.c b/drivers/irqchip/irq-xilinx-intc.c
index 1d3d273309bd..8cd1bfc73057 100644
--- a/drivers/irqchip/irq-xilinx-intc.c
+++ b/drivers/irqchip/irq-xilinx-intc.c
@@ -210,7 +210,7 @@ static int __init xilinx_intc_of_init(struct device_node *intc,
 
 	/*
 	 * Disable all external interrupts until they are
-	 * explicity requested.
+	 * explicitly requested.
 	 */
 	xintc_write(irqc, IER, 0);
 
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 2efde6a79b7e..bee82809107c 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -116,7 +116,7 @@ enum {
  * IRQ_SET_MASK_NOCPY	- OK, chip did update irq_common_data.affinity
  * IRQ_SET_MASK_OK_DONE	- Same as IRQ_SET_MASK_OK for core. Special code to
  *			  support stacked irqchips, which indicates skipping
- *			  all descendent irqchips.
+ *			  all descendant irqchips.
  */
 enum {
 	IRQ_SET_MASK_OK = 0,
@@ -302,7 +302,7 @@ static inline bool irqd_is_level_type(struct irq_data *d)
 
 /*
  * Must only be called of irqchip.irq_set_affinity() or low level
- * hieararchy domain allocation functions.
+ * hierarchy domain allocation functions.
  */
 static inline void irqd_set_single_target(struct irq_data *d)
 {
diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index 891b323266df..df4651250785 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -32,7 +32,7 @@ struct pt_regs;
  * @last_unhandled:	aging timer for unhandled count
  * @irqs_unhandled:	stats field for spurious unhandled interrupts
  * @threads_handled:	stats field for deferred spurious detection of threaded handlers
- * @threads_handled_last: comparator field for deferred spurious detection of theraded handlers
+ * @threads_handled_last: comparator field for deferred spurious detection of threaded handlers
  * @lock:		locking for SMP
  * @affinity_hint:	hint to user space for preferred irq affinity
  * @affinity_notify:	context for notification of affinity changes
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 042399cb5caf..8cc8e5713287 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -808,7 +808,7 @@ void handle_edge_irq(struct irq_desc *desc)
 		/*
 		 * When another irq arrived while we were handling
 		 * one, we could have masked the irq.
-		 * Renable it, if it was not disabled in meantime.
+		 * Reenable it, if it was not disabled in meantime.
 		 */
 		if (unlikely(desc->istate & IRQS_PENDING)) {
 			if (!irqd_irq_disabled(&desc->irq_data) &&
diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c
index 0b0cdf206dc4..7fe6cffe7d0d 100644
--- a/kernel/irq/dummychip.c
+++ b/kernel/irq/dummychip.c
@@ -13,7 +13,7 @@
 
 /*
  * What should we do if we get a hw irq event on an illegal vector?
- * Each architecture has to answer this themself.
+ * Each architecture has to answer this themselves.
  */
 static void ack_bad(struct irq_data *data)
 {
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index cc1a09406c6e..4a617d7312a4 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -31,7 +31,7 @@ static int __init irq_affinity_setup(char *str)
 	cpulist_parse(str, irq_default_affinity);
 	/*
 	 * Set at least the boot cpu. We don't want to end up with
-	 * bugreports caused by random comandline masks
+	 * bugreports caused by random commandline masks
 	 */
 	cpumask_set_cpu(smp_processor_id(), irq_default_affinity);
 	return 1;
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 288151393a06..6cb7a9d7e8b1 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -62,7 +62,7 @@ EXPORT_SYMBOL_GPL(irqchip_fwnode_ops);
  * @name:	Optional user provided domain name
  * @pa:		Optional user-provided physical address
  *
- * Allocate a struct irqchip_fwid, and return a poiner to the embedded
+ * Allocate a struct irqchip_fwid, and return a pointer to the embedded
  * fwnode_handle (or NULL on failure).
  *
  * Note: The types IRQCHIP_FWNODE_NAMED and IRQCHIP_FWNODE_NAMED_ID are
@@ -665,7 +665,7 @@ unsigned int irq_create_mapping_affinity(struct irq_domain *domain,
 
 	pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
 
-	/* Look for default domain if nececssary */
+	/* Look for default domain if necessary */
 	if (domain == NULL)
 		domain = irq_default_domain;
 	if (domain == NULL) {
@@ -906,7 +906,7 @@ unsigned int irq_find_mapping(struct irq_domain *domain,
 {
 	struct irq_data *data;
 
-	/* Look for default domain if nececssary */
+	/* Look for default domain if necessary */
 	if (domain == NULL)
 		domain = irq_default_domain;
 	if (domain == NULL)
@@ -1436,7 +1436,7 @@ int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain,
  * The whole process to setup an IRQ has been split into two steps.
  * The first step, __irq_domain_alloc_irqs(), is to allocate IRQ
  * descriptor and required hardware resources. The second step,
- * irq_domain_activate_irq(), is to program hardwares with preallocated
+ * irq_domain_activate_irq(), is to program the hardware with preallocated
  * resources. In this way, it's easier to rollback when failing to
  * allocate resources.
  */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 07ed2e4ff9a3..e976c4927b25 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -326,7 +326,7 @@ static bool irq_set_affinity_deactivated(struct irq_data *data,
 	 * If the interrupt is not yet activated, just store the affinity
 	 * mask and do not call the chip driver at all. On activation the
 	 * driver has to make sure anyway that the interrupt is in a
-	 * useable state so startup works.
+	 * usable state so startup works.
 	 */
 	if (!IS_ENABLED(CONFIG_IRQ_DOMAIN_HIERARCHY) ||
 	    irqd_is_activated(data) || !irqd_affinity_on_activate(data))
@@ -1054,7 +1054,7 @@ again:
 	 * to IRQS_INPROGRESS and the irq line is masked forever.
 	 *
 	 * This also serializes the state of shared oneshot handlers
-	 * versus "desc->threads_onehsot |= action->thread_mask;" in
+	 * versus "desc->threads_oneshot |= action->thread_mask;" in
 	 * irq_wake_thread(). See the comment there which explains the
 	 * serialization.
 	 */
@@ -1909,7 +1909,7 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id)
 	/* Last action releases resources */
 	if (!desc->action) {
 		/*
-		 * Reaquire bus lock as irq_release_resources() might
+		 * Reacquire bus lock as irq_release_resources() might
 		 * require it to deallocate resources over the slow bus.
 		 */
 		chip_bus_lock(desc);
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index b338d622f26e..c41965e348b5 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -5,7 +5,7 @@
  *
  * This file is licensed under GPLv2.
  *
- * This file contains common code to support Message Signalled Interrupt for
+ * This file contains common code to support Message Signaled Interrupts for
  * PCI compatible and non PCI compatible devices.
  */
 #include <linux/types.h>
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
index c31860569931..d309d6fbf5bd 100644
--- a/kernel/irq/timings.c
+++ b/kernel/irq/timings.c
@@ -485,7 +485,7 @@ static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts)
 
 	/*
 	 * The interrupt triggered more than one second apart, that
-	 * ends the sequence as predictible for our purpose. In this
+	 * ends the sequence as predictable for our purpose. In this
 	 * case, assume we have the beginning of a sequence and the
 	 * timestamp is the first value. As it is impossible to
 	 * predict anything at this point, return.
-- 
cgit v1.2.3


From 4c38f2df71c8e33c0b64865992d693f5022eeaad Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 23 Jun 2020 15:49:40 +0530
Subject: cpufreq: CPPC: Add support for frequency invariance

The Frequency Invariance Engine (FIE) is providing a frequency scaling
correction factor that helps achieve more accurate load-tracking.

Normally, this scaling factor can be obtained directly with the help of
the cpufreq drivers as they know the exact frequency the hardware is
running at. But that isn't the case for CPPC cpufreq driver.

Another way of obtaining that is using the arch specific counter
support, which is already present in kernel, but that hardware is
optional for platforms.

This patch updates the CPPC driver to register itself with the topology
core to provide its own implementation (cppc_scale_freq_tick()) of
topology_scale_freq_tick() which gets called by the scheduler on every
tick. Note that the arch specific counters have higher priority than
CPPC counters, if available, though the CPPC driver doesn't need to have
any special handling for that.

On an invocation of cppc_scale_freq_tick(), we schedule an irq work
(since we reach here from hard-irq context), which then schedules a
normal work item and cppc_scale_freq_workfn() updates the per_cpu
arch_freq_scale variable based on the counter updates since the last
tick.

To allow platforms to disable this CPPC counter-based frequency
invariance support, this is all done under CONFIG_ACPI_CPPC_CPUFREQ_FIE,
which is enabled by default.

This also exports sched_setattr_nocheck() as the CPPC driver can be
built as a module.

Cc: linux-acpi@vger.kernel.org
Reviewed-by: Ionela Voinescu <ionela.voinescu@arm.com>
Tested-by: Ionela Voinescu <ionela.voinescu@arm.com>
Tested-by: Vincent Guittot <vincent.guittot@linaro.org>
Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/Kconfig.arm    |  10 ++
 drivers/cpufreq/cppc_cpufreq.c | 245 +++++++++++++++++++++++++++++++++++++++--
 include/linux/arch_topology.h  |   1 +
 kernel/sched/core.c            |   1 +
 4 files changed, 245 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm
index e65e0a43be64..a5c5f70acfc9 100644
--- a/drivers/cpufreq/Kconfig.arm
+++ b/drivers/cpufreq/Kconfig.arm
@@ -19,6 +19,16 @@ config ACPI_CPPC_CPUFREQ
 
 	  If in doubt, say N.
 
+config ACPI_CPPC_CPUFREQ_FIE
+	bool "Frequency Invariance support for CPPC cpufreq driver"
+	depends on ACPI_CPPC_CPUFREQ && GENERIC_ARCH_TOPOLOGY
+	default y
+	help
+	  This extends frequency invariance support in the CPPC cpufreq driver,
+	  by using CPPC delivered and reference performance counters.
+
+	  If in doubt, say N.
+
 config ARM_ALLWINNER_SUN50I_CPUFREQ_NVMEM
 	tristate "Allwinner nvmem based SUN50I CPUFreq driver"
 	depends on ARCH_SUNXI
diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c
index 8a482c434ea6..b8e1b8ea628c 100644
--- a/drivers/cpufreq/cppc_cpufreq.c
+++ b/drivers/cpufreq/cppc_cpufreq.c
@@ -10,14 +10,18 @@
 
 #define pr_fmt(fmt)	"CPPC Cpufreq:"	fmt
 
+#include <linux/arch_topology.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/delay.h>
 #include <linux/cpu.h>
 #include <linux/cpufreq.h>
 #include <linux/dmi.h>
+#include <linux/irq_work.h>
+#include <linux/kthread.h>
 #include <linux/time.h>
 #include <linux/vmalloc.h>
+#include <uapi/linux/sched/types.h>
 
 #include <asm/unaligned.h>
 
@@ -57,6 +61,204 @@ static struct cppc_workaround_oem_info wa_info[] = {
 	}
 };
 
+#ifdef CONFIG_ACPI_CPPC_CPUFREQ_FIE
+
+/* Frequency invariance support */
+struct cppc_freq_invariance {
+	int cpu;
+	struct irq_work irq_work;
+	struct kthread_work work;
+	struct cppc_perf_fb_ctrs prev_perf_fb_ctrs;
+	struct cppc_cpudata *cpu_data;
+};
+
+static DEFINE_PER_CPU(struct cppc_freq_invariance, cppc_freq_inv);
+static struct kthread_worker *kworker_fie;
+static bool fie_disabled;
+
+static struct cpufreq_driver cppc_cpufreq_driver;
+static unsigned int hisi_cppc_cpufreq_get_rate(unsigned int cpu);
+static int cppc_perf_from_fbctrs(struct cppc_cpudata *cpu_data,
+				 struct cppc_perf_fb_ctrs fb_ctrs_t0,
+				 struct cppc_perf_fb_ctrs fb_ctrs_t1);
+
+/**
+ * cppc_scale_freq_workfn - CPPC arch_freq_scale updater for frequency invariance
+ * @work: The work item.
+ *
+ * The CPPC driver registers itself with the topology core to provide its own
+ * implementation (cppc_scale_freq_tick()) of topology_scale_freq_tick() which
+ * gets called by the scheduler on every tick.
+ *
+ * Note that the arch specific counters have higher priority than CPPC counters,
+ * if available, though the CPPC driver doesn't need to have any special
+ * handling for that.
+ *
+ * On an invocation of cppc_scale_freq_tick(), we schedule an irq work (since we
+ * reach here from hard-irq context), which then schedules a normal work item
+ * and cppc_scale_freq_workfn() updates the per_cpu arch_freq_scale variable
+ * based on the counter updates since the last tick.
+ */
+static void cppc_scale_freq_workfn(struct kthread_work *work)
+{
+	struct cppc_freq_invariance *cppc_fi;
+	struct cppc_perf_fb_ctrs fb_ctrs = {0};
+	struct cppc_cpudata *cpu_data;
+	unsigned long local_freq_scale;
+	u64 perf;
+
+	cppc_fi = container_of(work, struct cppc_freq_invariance, work);
+	cpu_data = cppc_fi->cpu_data;
+
+	if (cppc_get_perf_ctrs(cppc_fi->cpu, &fb_ctrs)) {
+		pr_warn("%s: failed to read perf counters\n", __func__);
+		return;
+	}
+
+	perf = cppc_perf_from_fbctrs(cpu_data, cppc_fi->prev_perf_fb_ctrs,
+				     fb_ctrs);
+	cppc_fi->prev_perf_fb_ctrs = fb_ctrs;
+
+	perf <<= SCHED_CAPACITY_SHIFT;
+	local_freq_scale = div64_u64(perf, cpu_data->perf_caps.highest_perf);
+	if (WARN_ON(local_freq_scale > 1024))
+		local_freq_scale = 1024;
+
+	per_cpu(arch_freq_scale, cppc_fi->cpu) = local_freq_scale;
+}
+
+static void cppc_irq_work(struct irq_work *irq_work)
+{
+	struct cppc_freq_invariance *cppc_fi;
+
+	cppc_fi = container_of(irq_work, struct cppc_freq_invariance, irq_work);
+	kthread_queue_work(kworker_fie, &cppc_fi->work);
+}
+
+static void cppc_scale_freq_tick(void)
+{
+	struct cppc_freq_invariance *cppc_fi = &per_cpu(cppc_freq_inv, smp_processor_id());
+
+	/*
+	 * cppc_get_perf_ctrs() can potentially sleep, call that from the right
+	 * context.
+	 */
+	irq_work_queue(&cppc_fi->irq_work);
+}
+
+static struct scale_freq_data cppc_sftd = {
+	.source = SCALE_FREQ_SOURCE_CPPC,
+	.set_freq_scale = cppc_scale_freq_tick,
+};
+
+static void cppc_freq_invariance_policy_init(struct cpufreq_policy *policy,
+					     struct cppc_cpudata *cpu_data)
+{
+	struct cppc_perf_fb_ctrs fb_ctrs = {0};
+	struct cppc_freq_invariance *cppc_fi;
+	int i, ret;
+
+	if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate)
+		return;
+
+	if (fie_disabled)
+		return;
+
+	for_each_cpu(i, policy->cpus) {
+		cppc_fi = &per_cpu(cppc_freq_inv, i);
+		cppc_fi->cpu = i;
+		cppc_fi->cpu_data = cpu_data;
+		kthread_init_work(&cppc_fi->work, cppc_scale_freq_workfn);
+		init_irq_work(&cppc_fi->irq_work, cppc_irq_work);
+
+		ret = cppc_get_perf_ctrs(i, &fb_ctrs);
+		if (ret) {
+			pr_warn("%s: failed to read perf counters: %d\n",
+				__func__, ret);
+			fie_disabled = true;
+		} else {
+			cppc_fi->prev_perf_fb_ctrs = fb_ctrs;
+		}
+	}
+}
+
+static void __init cppc_freq_invariance_init(void)
+{
+	struct sched_attr attr = {
+		.size		= sizeof(struct sched_attr),
+		.sched_policy	= SCHED_DEADLINE,
+		.sched_nice	= 0,
+		.sched_priority	= 0,
+		/*
+		 * Fake (unused) bandwidth; workaround to "fix"
+		 * priority inheritance.
+		 */
+		.sched_runtime	= 1000000,
+		.sched_deadline = 10000000,
+		.sched_period	= 10000000,
+	};
+	int ret;
+
+	if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate)
+		return;
+
+	if (fie_disabled)
+		return;
+
+	kworker_fie = kthread_create_worker(0, "cppc_fie");
+	if (IS_ERR(kworker_fie))
+		return;
+
+	ret = sched_setattr_nocheck(kworker_fie->task, &attr);
+	if (ret) {
+		pr_warn("%s: failed to set SCHED_DEADLINE: %d\n", __func__,
+			ret);
+		kthread_destroy_worker(kworker_fie);
+		return;
+	}
+
+	/* Register for freq-invariance */
+	topology_set_scale_freq_source(&cppc_sftd, cpu_present_mask);
+}
+
+static void cppc_freq_invariance_exit(void)
+{
+	struct cppc_freq_invariance *cppc_fi;
+	int i;
+
+	if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate)
+		return;
+
+	if (fie_disabled)
+		return;
+
+	topology_clear_scale_freq_source(SCALE_FREQ_SOURCE_CPPC, cpu_present_mask);
+
+	for_each_possible_cpu(i) {
+		cppc_fi = &per_cpu(cppc_freq_inv, i);
+		irq_work_sync(&cppc_fi->irq_work);
+	}
+
+	kthread_destroy_worker(kworker_fie);
+	kworker_fie = NULL;
+}
+
+#else
+static inline void
+cppc_freq_invariance_policy_init(struct cpufreq_policy *policy,
+				 struct cppc_cpudata *cpu_data)
+{
+}
+
+static inline void cppc_freq_invariance_init(void)
+{
+}
+
+static inline void cppc_freq_invariance_exit(void)
+{
+}
+#endif /* CONFIG_ACPI_CPPC_CPUFREQ_FIE */
+
 /* Callback function used to retrieve the max frequency from DMI */
 static void cppc_find_dmi_mhz(const struct dmi_header *dm, void *private)
 {
@@ -355,9 +557,12 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	cpu_data->perf_ctrls.desired_perf =  caps->highest_perf;
 
 	ret = cppc_set_perf(cpu, &cpu_data->perf_ctrls);
-	if (ret)
+	if (ret) {
 		pr_debug("Err setting perf value:%d on CPU:%d. ret:%d\n",
 			 caps->highest_perf, cpu, ret);
+	} else {
+		cppc_freq_invariance_policy_init(policy, cpu_data);
+	}
 
 	return ret;
 }
@@ -370,12 +575,12 @@ static inline u64 get_delta(u64 t1, u64 t0)
 	return (u32)t1 - (u32)t0;
 }
 
-static int cppc_get_rate_from_fbctrs(struct cppc_cpudata *cpu_data,
-				     struct cppc_perf_fb_ctrs fb_ctrs_t0,
-				     struct cppc_perf_fb_ctrs fb_ctrs_t1)
+static int cppc_perf_from_fbctrs(struct cppc_cpudata *cpu_data,
+				 struct cppc_perf_fb_ctrs fb_ctrs_t0,
+				 struct cppc_perf_fb_ctrs fb_ctrs_t1)
 {
 	u64 delta_reference, delta_delivered;
-	u64 reference_perf, delivered_perf;
+	u64 reference_perf;
 
 	reference_perf = fb_ctrs_t0.reference_perf;
 
@@ -384,12 +589,21 @@ static int cppc_get_rate_from_fbctrs(struct cppc_cpudata *cpu_data,
 	delta_delivered = get_delta(fb_ctrs_t1.delivered,
 				    fb_ctrs_t0.delivered);
 
-	/* Check to avoid divide-by zero */
-	if (delta_reference || delta_delivered)
-		delivered_perf = (reference_perf * delta_delivered) /
-					delta_reference;
-	else
-		delivered_perf = cpu_data->perf_ctrls.desired_perf;
+	/* Check to avoid divide-by zero and invalid delivered_perf */
+	if (!delta_reference || !delta_delivered)
+		return cpu_data->perf_ctrls.desired_perf;
+
+	return (reference_perf * delta_delivered) / delta_reference;
+}
+
+static int cppc_get_rate_from_fbctrs(struct cppc_cpudata *cpu_data,
+				     struct cppc_perf_fb_ctrs fb_ctrs_t0,
+				     struct cppc_perf_fb_ctrs fb_ctrs_t1)
+{
+	u64 delivered_perf;
+
+	delivered_perf = cppc_perf_from_fbctrs(cpu_data, fb_ctrs_t0,
+					       fb_ctrs_t1);
 
 	return cppc_cpufreq_perf_to_khz(cpu_data, delivered_perf);
 }
@@ -514,6 +728,8 @@ static void cppc_check_hisi_workaround(void)
 
 static int __init cppc_cpufreq_init(void)
 {
+	int ret;
+
 	if ((acpi_disabled) || !acpi_cpc_valid())
 		return -ENODEV;
 
@@ -521,7 +737,11 @@ static int __init cppc_cpufreq_init(void)
 
 	cppc_check_hisi_workaround();
 
-	return cpufreq_register_driver(&cppc_cpufreq_driver);
+	ret = cpufreq_register_driver(&cppc_cpufreq_driver);
+	if (!ret)
+		cppc_freq_invariance_init();
+
+	return ret;
 }
 
 static inline void free_cpu_data(void)
@@ -538,6 +758,7 @@ static inline void free_cpu_data(void)
 
 static void __exit cppc_cpufreq_exit(void)
 {
+	cppc_freq_invariance_exit();
 	cpufreq_unregister_driver(&cppc_cpufreq_driver);
 
 	free_cpu_data();
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index 11e555cfaecb..f180240dc95f 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -37,6 +37,7 @@ bool topology_scale_freq_invariant(void);
 enum scale_freq_source {
 	SCALE_FREQ_SOURCE_CPUFREQ = 0,
 	SCALE_FREQ_SOURCE_ARCH,
+	SCALE_FREQ_SOURCE_CPPC,
 };
 
 struct scale_freq_data {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ca2bb629595f..3adedc7b1725 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6386,6 +6386,7 @@ int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
 {
 	return __sched_setscheduler(p, attr, false, true);
 }
+EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
 
 /**
  * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
-- 
cgit v1.2.3


From 60250052a1a6e3d605d736fea240226a42e4b3ee Mon Sep 17 00:00:00 2001
From: Fabio Estevam <festevam@gmail.com>
Date: Mon, 15 Mar 2021 20:44:45 +0100
Subject: media: camera-mx2: Remove unused header file

The imx27/imx25 camera driver has been removed a long time ago,
so get rid of this unused header file.

Signed-off-by: Fabio Estevam <festevam@gmail.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
 include/linux/platform_data/media/camera-mx2.h | 31 --------------------------
 1 file changed, 31 deletions(-)
 delete mode 100644 include/linux/platform_data/media/camera-mx2.h

(limited to 'include/linux')

diff --git a/include/linux/platform_data/media/camera-mx2.h b/include/linux/platform_data/media/camera-mx2.h
deleted file mode 100644
index 8cfa76b6e1e1..000000000000
--- a/include/linux/platform_data/media/camera-mx2.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * mx2-cam.h - i.MX27/i.MX25 camera driver header file
- *
- * Copyright (C) 2003, Intel Corporation
- * Copyright (C) 2008, Sascha Hauer <s.hauer@pengutronix.de>
- * Copyright (C) 2010, Baruch Siach <baruch@tkos.co.il>
- */
-
-#ifndef __MACH_MX2_CAM_H_
-#define __MACH_MX2_CAM_H_
-
-#define MX2_CAMERA_EXT_VSYNC		(1 << 1)
-#define MX2_CAMERA_CCIR			(1 << 2)
-#define MX2_CAMERA_CCIR_INTERLACE	(1 << 3)
-#define MX2_CAMERA_HSYNC_HIGH		(1 << 4)
-#define MX2_CAMERA_GATED_CLOCK		(1 << 5)
-#define MX2_CAMERA_INV_DATA		(1 << 6)
-#define MX2_CAMERA_PCLK_SAMPLE_RISING	(1 << 7)
-
-/**
- * struct mx2_camera_platform_data - optional platform data for mx2_camera
- * @flags: any combination of MX2_CAMERA_*
- * @clk: clock rate of the csi block / 2
- */
-struct mx2_camera_platform_data {
-	unsigned long flags;
-	unsigned long clk;
-};
-
-#endif /* __MACH_MX2_CAM_H_ */
-- 
cgit v1.2.3


From 3f5b610b45a36049392883f3754c616ad11b7c7e Mon Sep 17 00:00:00 2001
From: Fabio Estevam <festevam@gmail.com>
Date: Mon, 15 Mar 2021 20:44:46 +0100
Subject: media: camera-mx3: Remove unused header file

The imx3 camera driver has been removed a long time ago, so get
rid of this unused header file.

Signed-off-by: Fabio Estevam <festevam@gmail.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
 include/linux/platform_data/media/camera-mx3.h | 43 --------------------------
 1 file changed, 43 deletions(-)
 delete mode 100644 include/linux/platform_data/media/camera-mx3.h

(limited to 'include/linux')

diff --git a/include/linux/platform_data/media/camera-mx3.h b/include/linux/platform_data/media/camera-mx3.h
deleted file mode 100644
index 781c004e5596..000000000000
--- a/include/linux/platform_data/media/camera-mx3.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * mx3_camera.h - i.MX3x camera driver header file
- *
- * Copyright (C) 2008, Guennadi Liakhovetski, DENX Software Engineering, <lg@denx.de>
- */
-
-#ifndef _MX3_CAMERA_H_
-#define _MX3_CAMERA_H_
-
-#include <linux/device.h>
-
-#define MX3_CAMERA_CLK_SRC	1
-#define MX3_CAMERA_EXT_VSYNC	2
-#define MX3_CAMERA_DP		4
-#define MX3_CAMERA_PCP		8
-#define MX3_CAMERA_HSP		0x10
-#define MX3_CAMERA_VSP		0x20
-#define MX3_CAMERA_DATAWIDTH_4	0x40
-#define MX3_CAMERA_DATAWIDTH_8	0x80
-#define MX3_CAMERA_DATAWIDTH_10	0x100
-#define MX3_CAMERA_DATAWIDTH_15	0x200
-
-#define MX3_CAMERA_DATAWIDTH_MASK (MX3_CAMERA_DATAWIDTH_4 | MX3_CAMERA_DATAWIDTH_8 | \
-				   MX3_CAMERA_DATAWIDTH_10 | MX3_CAMERA_DATAWIDTH_15)
-
-struct v4l2_async_subdev;
-
-/**
- * struct mx3_camera_pdata - i.MX3x camera platform data
- * @flags:	MX3_CAMERA_* flags
- * @mclk_10khz:	master clock frequency in 10kHz units
- * @dma_dev:	IPU DMA device to match against in channel allocation
- */
-struct mx3_camera_pdata {
-	unsigned long flags;
-	unsigned long mclk_10khz;
-	struct device *dma_dev;
-	struct v4l2_async_subdev **asd;	/* Flat array, arranged in groups */
-	int *asd_sizes;			/* 0-terminated array of asd group sizes */
-};
-
-#endif
-- 
cgit v1.2.3


From 9666cec380d60808eb86d3be4caf84faeebe3081 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <uwe@kleine-koenig.org>
Date: Mon, 7 Dec 2020 14:45:56 +0100
Subject: pwm: Drop function pwmchip_add_with_polarity()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

pwmchip_add() only calls pwmchip_add_with_polarity() and nothing else. All
other users of pwmchip_add_with_polarity() are gone. So drop
pwmchip_add_with_polarity() and move the code instead to pwmchip_add().

The initial assignment to pwm->state.polarity is dropped. In every correct
usage of the PWM API this value is overwritten later anyhow.

Signed-off-by: Uwe Kleine-König <uwe@kleine-koenig.org>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 drivers/pwm/core.c  | 25 +++----------------------
 include/linux/pwm.h |  2 --
 2 files changed, 3 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index 25ee06a14bb3..c4d5c0667137 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -250,17 +250,14 @@ static bool pwm_ops_check(const struct pwm_chip *chip)
 }
 
 /**
- * pwmchip_add_with_polarity() - register a new PWM chip
+ * pwmchip_add() - register a new PWM chip
  * @chip: the PWM chip to add
- * @polarity: initial polarity of PWM channels
  *
- * Register a new PWM chip. The initial polarity for all channels is specified
- * by the @polarity parameter.
+ * Register a new PWM chip.
  *
  * Returns: 0 on success or a negative error code on failure.
  */
-int pwmchip_add_with_polarity(struct pwm_chip *chip,
-			      enum pwm_polarity polarity)
+int pwmchip_add(struct pwm_chip *chip)
 {
 	struct pwm_device *pwm;
 	unsigned int i;
@@ -292,7 +289,6 @@ int pwmchip_add_with_polarity(struct pwm_chip *chip,
 		pwm->chip = chip;
 		pwm->pwm = chip->base + i;
 		pwm->hwpwm = i;
-		pwm->state.polarity = polarity;
 
 		radix_tree_insert(&pwm_tree, pwm->pwm, pwm);
 	}
@@ -315,21 +311,6 @@ out:
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(pwmchip_add_with_polarity);
-
-/**
- * pwmchip_add() - register a new PWM chip
- * @chip: the PWM chip to add
- *
- * Register a new PWM chip. If chip->base < 0 then a dynamically assigned base
- * will be used. The initial polarity for all channels is normal.
- *
- * Returns: 0 on success or a negative error code on failure.
- */
-int pwmchip_add(struct pwm_chip *chip)
-{
-	return pwmchip_add_with_polarity(chip, PWM_POLARITY_NORMAL);
-}
 EXPORT_SYMBOL_GPL(pwmchip_add);
 
 /**
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index e4d84d4db293..8f4eefd129aa 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -392,8 +392,6 @@ int pwm_capture(struct pwm_device *pwm, struct pwm_capture *result,
 int pwm_set_chip_data(struct pwm_device *pwm, void *data);
 void *pwm_get_chip_data(struct pwm_device *pwm);
 
-int pwmchip_add_with_polarity(struct pwm_chip *chip,
-			      enum pwm_polarity polarity);
 int pwmchip_add(struct pwm_chip *chip);
 int pwmchip_remove(struct pwm_chip *chip);
 struct pwm_device *pwm_request_from_chip(struct pwm_chip *chip,
-- 
cgit v1.2.3


From 6558b667a7297418b8951ba54da68d551035ecc5 Mon Sep 17 00:00:00 2001
From: Vinod Koul <vkoul@kernel.org>
Date: Tue, 2 Mar 2021 15:51:03 +0800
Subject: soundwire: add override addr ops

Platform firmware may have incorrect _ADR values causing the driver
probes to fail. Add the override_ops, which when configured will allow
for quirks based on DMI etc to override the addr values.

Co-developed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Signed-off-by: Vinod Koul <vkoul@kernel.org>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Reviewed-by: Guennadi Liakhovetski <guennadi.liakhovetski@linux.intel.com>
Link: https://lore.kernel.org/r/20210302075105.11515-2-yung-chuan.liao@linux.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/soundwire/slave.c     | 8 +++++++-
 include/linux/soundwire/sdw.h | 4 +++-
 2 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/soundwire/slave.c b/drivers/soundwire/slave.c
index 180f38bd003b..112b21967c7a 100644
--- a/drivers/soundwire/slave.c
+++ b/drivers/soundwire/slave.c
@@ -95,7 +95,7 @@ static bool find_slave(struct sdw_bus *bus,
 		       struct acpi_device *adev,
 		       struct sdw_slave_id *id)
 {
-	unsigned long long addr;
+	u64 addr;
 	unsigned int link_id;
 	acpi_status status;
 
@@ -108,6 +108,12 @@ static bool find_slave(struct sdw_bus *bus,
 		return false;
 	}
 
+	if (bus->ops->override_adr)
+		addr = bus->ops->override_adr(bus, addr);
+
+	if (!addr)
+		return false;
+
 	/* Extract link id from ADR, Bit 51 to 48 (included) */
 	link_id = SDW_DISCO_LINK_ID(addr);
 
diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index d08039d65825..f0a3895e8faf 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -804,6 +804,7 @@ struct sdw_defer {
 /**
  * struct sdw_master_ops - Master driver ops
  * @read_prop: Read Master properties
+ * @override_adr: Override value read from firmware (quirk for buggy firmware)
  * @xfer_msg: Transfer message callback
  * @xfer_msg_defer: Defer version of transfer message callback
  * @reset_page_addr: Reset the SCP page address registers
@@ -813,7 +814,8 @@ struct sdw_defer {
  */
 struct sdw_master_ops {
 	int (*read_prop)(struct sdw_bus *bus);
-
+	u64 (*override_adr)
+			(struct sdw_bus *bus, u64 addr);
 	enum sdw_command_response (*xfer_msg)
 			(struct sdw_bus *bus, struct sdw_msg *msg);
 	enum sdw_command_response (*xfer_msg_defer)
-- 
cgit v1.2.3


From 5bb643c39b97c440b2981c69f05596c7ea868b73 Mon Sep 17 00:00:00 2001
From: Bard Liao <yung-chuan.liao@linux.intel.com>
Date: Tue, 2 Mar 2021 16:27:18 +0800
Subject: soundwire: add master quirks for bus clash and parity

Currently quirks are only allowed for Slave devices. This patch
describes the need for two quirks at the Master level.

a) bus clash
The SoundWire specification allows a Slave device to report a bus clash
with the in-band interrupt mechanism when it detects a conflict while
driving a bitSlot it owns. This can be a symptom of an electrical conflict
or a programming error, and it's vital to detect reliably.

Unfortunately, on some platforms, bus clashes are randomly reported by
Slave devices after a bus reset, with an interrupt status set even before
the bus clash interrupt is enabled. These initial spurious interrupts are
not relevant and should optionally be filtered out, while leaving the
interrupt mechanism enabled to detect 'true' issues.

This patch suggests the addition of a Master level quirk to discard such
interrupts. The quirk should in theory have been added at the Slave level,
but since the problem was detected with different generations of Slave
devices it's hard to point to a specific IP. The problem might also be
board-dependent and hence dealing with a Master quirk is simpler.

b) parity

Additional tests on a new platform with the Maxim 98373 amplifier
showed a rare case where the parity interrupt is also thrown on
startup, at the same time as bus clashes. This issue only seems to
happen infrequently and was only observed during suspend-resume stress
tests while audio is streaming. We could make the problem go away by
adding a Slave-level quirk, but there is no evidence that the issue is
actually a Slave problem: the parity is provided by the Master, which
could also set an invalid parity in corner cases.

BugLink: https://github.com/thesofproject/linux/issues/2578
BugLink: https://github.com/thesofproject/linux/issues/2533
Co-developed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Reviewed-by: Guennadi Liakhovetski <guennadi.liakhovetski@linux.intel.com>
Link: https://lore.kernel.org/r/20210302082720.12322-2-yung-chuan.liao@linux.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/soundwire/sdw.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index f0a3895e8faf..eaa1486bdca9 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -405,6 +405,7 @@ struct sdw_slave_prop {
  * command
  * @mclk_freq: clock reference passed to SoundWire Master, in Hz.
  * @hw_disabled: if true, the Master is not functional, typically due to pin-mux
+ * @quirks: bitmask identifying optional behavior beyond the scope of the MIPI specification
  */
 struct sdw_master_prop {
 	u32 revision;
@@ -421,8 +422,29 @@ struct sdw_master_prop {
 	u32 err_threshold;
 	u32 mclk_freq;
 	bool hw_disabled;
+	u64 quirks;
 };
 
+/* Definitions for Master quirks */
+
+/*
+ * In a number of platforms bus clashes are reported after a hardware
+ * reset but without any explanations or evidence of a real problem.
+ * The following quirk will discard all initial bus clash interrupts
+ * but will leave the detection on should real bus clashes happen
+ */
+#define SDW_MASTER_QUIRKS_CLEAR_INITIAL_CLASH	BIT(0)
+
+/*
+ * Some Slave devices have known issues with incorrect parity errors
+ * reported after a hardware reset. However during integration unexplained
+ * parity errors can be reported by Slave devices, possibly due to electrical
+ * issues at the Master level.
+ * The following quirk will discard all initial parity errors but will leave
+ * the detection on should real parity errors happen.
+ */
+#define SDW_MASTER_QUIRKS_CLEAR_INITIAL_PARITY	BIT(1)
+
 int sdw_master_read_prop(struct sdw_bus *bus);
 int sdw_slave_read_prop(struct sdw_slave *slave);
 
-- 
cgit v1.2.3


From 458025f6c16188a9f28219578448fac24b2cd487 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 22 Mar 2021 11:53:00 +0100
Subject: vgaarb: avoid -Wempty-body warnings

Building with W=1 shows a few warnings for an empty macro:

drivers/gpu/drm/qxl/qxl_drv.c: In function 'qxl_pci_probe':
drivers/gpu/drm/qxl/qxl_drv.c:131:50: error: suggest braces around empty body in an 'if' statement [-Werror=empty-body]
  131 |                 vga_put(pdev, VGA_RSRC_LEGACY_IO);
      |                                                  ^
drivers/gpu/drm/qxl/qxl_drv.c: In function 'qxl_pci_remove':
drivers/gpu/drm/qxl/qxl_drv.c:159:50: error: suggest braces around empty body in an 'if' statement [-Werror=empty-body]
  159 |                 vga_put(pdev, VGA_RSRC_LEGACY_IO);

Change this to an inline function to make it more robust and avoid
the warning.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20210322105307.1291840-2-arnd@kernel.org
---
 include/linux/vgaarb.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/vgaarb.h b/include/linux/vgaarb.h
index fc6dfeba04a5..dc6ddce92066 100644
--- a/include/linux/vgaarb.h
+++ b/include/linux/vgaarb.h
@@ -112,7 +112,9 @@ static inline int vga_get_uninterruptible(struct pci_dev *pdev,
 #if defined(CONFIG_VGA_ARB)
 extern void vga_put(struct pci_dev *pdev, unsigned int rsrc);
 #else
-#define vga_put(pdev, rsrc)
+static inline void vga_put(struct pci_dev *pdev, unsigned int rsrc)
+{
+}
 #endif
 
 
-- 
cgit v1.2.3


From bddfdbcddbe267519cd36aeb115fdf8620980111 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 27 Oct 2020 15:53:42 -0400
Subject: NFSD: Extract the svcxdr_init_encode() helper

NFSD initializes an encode xdr_stream only after the RPC layer has
already inserted the RPC Reply header. Thus it behaves differently
than xdr_init_encode does, which assumes the passed-in xdr_buf is
entirely devoid of content.

nfs4proc.c has this server-side stream initialization helper, but
it is visible only to the NFSv4 code. Move this helper to a place
that can be accessed by NFSv2 and NFSv3 server XDR functions.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4proc.c         |  31 ++++---------
 fs/nfsd/nfs4state.c        |   6 +--
 fs/nfsd/nfs4xdr.c          | 110 ++++++++++++++++++++++-----------------------
 fs/nfsd/nfssvc.c           |   4 +-
 fs/nfsd/xdr4.h             |   2 +-
 include/linux/sunrpc/svc.h |  25 +++++++++++
 6 files changed, 94 insertions(+), 84 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index dd9f38d072dd..c57cf09a8b60 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -2262,25 +2262,6 @@ static bool need_wrongsec_check(struct svc_rqst *rqstp)
 	return !(nextd->op_flags & OP_HANDLES_WRONGSEC);
 }
 
-static void svcxdr_init_encode(struct svc_rqst *rqstp,
-			       struct nfsd4_compoundres *resp)
-{
-	struct xdr_stream *xdr = &resp->xdr;
-	struct xdr_buf *buf = &rqstp->rq_res;
-	struct kvec *head = buf->head;
-
-	xdr->buf = buf;
-	xdr->iov = head;
-	xdr->p   = head->iov_base + head->iov_len;
-	xdr->end = head->iov_base + PAGE_SIZE - rqstp->rq_auth_slack;
-	/* Tail and page_len should be zero at this point: */
-	buf->len = buf->head[0].iov_len;
-	xdr_reset_scratch_buffer(xdr);
-	xdr->page_ptr = buf->pages - 1;
-	buf->buflen = PAGE_SIZE * (1 + rqstp->rq_page_end - buf->pages)
-		- rqstp->rq_auth_slack;
-}
-
 #ifdef CONFIG_NFSD_V4_2_INTER_SSC
 static void
 check_if_stalefh_allowed(struct nfsd4_compoundargs *args)
@@ -2335,10 +2316,14 @@ nfsd4_proc_compound(struct svc_rqst *rqstp)
 	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
 	__be32		status;
 
-	svcxdr_init_encode(rqstp, resp);
-	resp->tagp = resp->xdr.p;
+	resp->xdr = &rqstp->rq_res_stream;
+
+	/* reserve space for: NFS status code */
+	xdr_reserve_space(resp->xdr, XDR_UNIT);
+
+	resp->tagp = resp->xdr->p;
 	/* reserve space for: taglen, tag, and opcnt */
-	xdr_reserve_space(&resp->xdr, 8 + args->taglen);
+	xdr_reserve_space(resp->xdr, XDR_UNIT * 2 + args->taglen);
 	resp->taglen = args->taglen;
 	resp->tag = args->tag;
 	resp->rqstp = rqstp;
@@ -2444,7 +2429,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp)
 encode_op:
 		if (op->status == nfserr_replay_me) {
 			op->replay = &cstate->replay_owner->so_replay;
-			nfsd4_encode_replay(&resp->xdr, op);
+			nfsd4_encode_replay(resp->xdr, op);
 			status = op->status = op->replay->rp_status;
 		} else {
 			nfsd4_encode_operation(resp, op);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 97447a64bad0..3290d0a7bb95 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2903,7 +2903,7 @@ out_err:
 static void
 nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
 {
-	struct xdr_buf *buf = resp->xdr.buf;
+	struct xdr_buf *buf = resp->xdr->buf;
 	struct nfsd4_slot *slot = resp->cstate.slot;
 	unsigned int base;
 
@@ -2973,7 +2973,7 @@ nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
 			 struct nfsd4_sequence *seq)
 {
 	struct nfsd4_slot *slot = resp->cstate.slot;
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 	__be32 *p;
 	__be32 status;
 
@@ -3708,7 +3708,7 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 {
 	struct nfsd4_sequence *seq = &u->sequence;
 	struct nfsd4_compoundres *resp = rqstp->rq_resp;
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 	struct nfsd4_session *session;
 	struct nfs4_client *clp;
 	struct nfsd4_slot *slot;
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index eaaa1605b5b5..e0f06d3cbd44 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3581,7 +3581,7 @@ nfsd4_encode_stateid(struct xdr_stream *xdr, stateid_t *sid)
 static __be32
 nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access)
 {
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 	__be32 *p;
 
 	p = xdr_reserve_space(xdr, 8);
@@ -3594,7 +3594,7 @@ nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
 
 static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_bind_conn_to_session *bcts)
 {
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 	__be32 *p;
 
 	p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 8);
@@ -3611,7 +3611,7 @@ static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp,
 static __be32
 nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close)
 {
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 
 	return nfsd4_encode_stateid(xdr, &close->cl_stateid);
 }
@@ -3620,7 +3620,7 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c
 static __be32
 nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_commit *commit)
 {
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 	__be32 *p;
 
 	p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE);
@@ -3634,7 +3634,7 @@ nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
 static __be32
 nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create *create)
 {
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 	__be32 *p;
 
 	p = xdr_reserve_space(xdr, 20);
@@ -3649,7 +3649,7 @@ static __be32
 nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_getattr *getattr)
 {
 	struct svc_fh *fhp = getattr->ga_fhp;
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 
 	return nfsd4_encode_fattr(xdr, fhp, fhp->fh_export, fhp->fh_dentry,
 				    getattr->ga_bmval, resp->rqstp, 0);
@@ -3658,7 +3658,7 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
 static __be32
 nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh **fhpp)
 {
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 	struct svc_fh *fhp = *fhpp;
 	unsigned int len;
 	__be32 *p;
@@ -3713,7 +3713,7 @@ again:
 static __be32
 nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lock *lock)
 {
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 
 	if (!nfserr)
 		nfserr = nfsd4_encode_stateid(xdr, &lock->lk_resp_stateid);
@@ -3726,7 +3726,7 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo
 static __be32
 nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lockt *lockt)
 {
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 
 	if (nfserr == nfserr_denied)
 		nfsd4_encode_lock_denied(xdr, &lockt->lt_denied);
@@ -3736,7 +3736,7 @@ nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l
 static __be32
 nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_locku *locku)
 {
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 
 	return nfsd4_encode_stateid(xdr, &locku->lu_stateid);
 }
@@ -3745,7 +3745,7 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l
 static __be32
 nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_link *link)
 {
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 	__be32 *p;
 
 	p = xdr_reserve_space(xdr, 20);
@@ -3759,7 +3759,7 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_li
 static __be32
 nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open)
 {
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 	__be32 *p;
 
 	nfserr = nfsd4_encode_stateid(xdr, &open->op_stateid);
@@ -3853,7 +3853,7 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
 static __be32
 nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc)
 {
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 
 	return nfsd4_encode_stateid(xdr, &oc->oc_resp_stateid);
 }
@@ -3861,7 +3861,7 @@ nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct
 static __be32
 nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od)
 {
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 
 	return nfsd4_encode_stateid(xdr, &od->od_stateid);
 }
@@ -3871,7 +3871,7 @@ static __be32 nfsd4_encode_splice_read(
 				struct nfsd4_read *read,
 				struct file *file, unsigned long maxcount)
 {
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 	struct xdr_buf *buf = xdr->buf;
 	int status, space_left;
 	u32 eof;
@@ -3937,7 +3937,7 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
 				 struct nfsd4_read *read,
 				 struct file *file, unsigned long maxcount)
 {
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 	u32 eof;
 	int starting_len = xdr->buf->len - 8;
 	__be32 nfserr;
@@ -3976,7 +3976,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
 		  struct nfsd4_read *read)
 {
 	unsigned long maxcount;
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 	struct file *file;
 	int starting_len = xdr->buf->len;
 	__be32 *p;
@@ -3990,7 +3990,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
 		WARN_ON_ONCE(test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags));
 		return nfserr_resource;
 	}
-	if (resp->xdr.buf->page_len &&
+	if (resp->xdr->buf->page_len &&
 	    test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) {
 		WARN_ON_ONCE(1);
 		return nfserr_resource;
@@ -4020,7 +4020,7 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd
 	int maxcount;
 	__be32 wire_count;
 	int zero = 0;
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 	int length_offset = xdr->buf->len;
 	int status;
 	__be32 *p;
@@ -4072,7 +4072,7 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
 	int bytes_left;
 	loff_t offset;
 	__be64 wire_offset;
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 	int starting_len = xdr->buf->len;
 	__be32 *p;
 
@@ -4083,8 +4083,8 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
 	/* XXX: Following NFSv3, we ignore the READDIR verifier for now. */
 	*p++ = cpu_to_be32(0);
 	*p++ = cpu_to_be32(0);
-	resp->xdr.buf->head[0].iov_len = ((char *)resp->xdr.p)
-				- (char *)resp->xdr.buf->head[0].iov_base;
+	xdr->buf->head[0].iov_len = (char *)xdr->p -
+				    (char *)xdr->buf->head[0].iov_base;
 
 	/*
 	 * Number of bytes left for directory entries allowing for the
@@ -4159,7 +4159,7 @@ err_no_verf:
 static __be32
 nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_remove *remove)
 {
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 	__be32 *p;
 
 	p = xdr_reserve_space(xdr, 20);
@@ -4172,7 +4172,7 @@ nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
 static __be32
 nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_rename *rename)
 {
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 	__be32 *p;
 
 	p = xdr_reserve_space(xdr, 40);
@@ -4255,7 +4255,7 @@ static __be32
 nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
 		     struct nfsd4_secinfo *secinfo)
 {
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 
 	return nfsd4_do_encode_secinfo(xdr, secinfo->si_exp);
 }
@@ -4264,7 +4264,7 @@ static __be32
 nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr,
 		     struct nfsd4_secinfo_no_name *secinfo)
 {
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 
 	return nfsd4_do_encode_secinfo(xdr, secinfo->sin_exp);
 }
@@ -4276,7 +4276,7 @@ nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr,
 static __be32
 nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setattr *setattr)
 {
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 	__be32 *p;
 
 	p = xdr_reserve_space(xdr, 16);
@@ -4300,7 +4300,7 @@ nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
 static __be32
 nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setclientid *scd)
 {
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 	__be32 *p;
 
 	if (!nfserr) {
@@ -4324,7 +4324,7 @@ nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct n
 static __be32
 nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_write *write)
 {
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 	__be32 *p;
 
 	p = xdr_reserve_space(xdr, 16);
@@ -4341,7 +4341,7 @@ static __be32
 nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
 			 struct nfsd4_exchange_id *exid)
 {
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 	__be32 *p;
 	char *major_id;
 	char *server_scope;
@@ -4419,7 +4419,7 @@ static __be32
 nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr,
 			    struct nfsd4_create_session *sess)
 {
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 	__be32 *p;
 
 	p = xdr_reserve_space(xdr, 24);
@@ -4472,7 +4472,7 @@ static __be32
 nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr,
 		      struct nfsd4_sequence *seq)
 {
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 	__be32 *p;
 
 	p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 20);
@@ -4495,7 +4495,7 @@ static __be32
 nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
 			  struct nfsd4_test_stateid *test_stateid)
 {
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 	struct nfsd4_test_stateid_id *stateid, *next;
 	__be32 *p;
 
@@ -4516,7 +4516,7 @@ static __be32
 nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
 		struct nfsd4_getdeviceinfo *gdev)
 {
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 	const struct nfsd4_layout_ops *ops;
 	u32 starting_len = xdr->buf->len, needed_len;
 	__be32 *p;
@@ -4572,7 +4572,7 @@ static __be32
 nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr,
 		struct nfsd4_layoutget *lgp)
 {
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 	const struct nfsd4_layout_ops *ops;
 	__be32 *p;
 
@@ -4599,7 +4599,7 @@ static __be32
 nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr,
 			  struct nfsd4_layoutcommit *lcp)
 {
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 	__be32 *p;
 
 	p = xdr_reserve_space(xdr, 4);
@@ -4620,7 +4620,7 @@ static __be32
 nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr,
 		struct nfsd4_layoutreturn *lrp)
 {
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 	__be32 *p;
 
 	p = xdr_reserve_space(xdr, 4);
@@ -4638,7 +4638,7 @@ nfsd42_encode_write_res(struct nfsd4_compoundres *resp,
 		struct nfsd42_write_res *write, bool sync)
 {
 	__be32 *p;
-	p = xdr_reserve_space(&resp->xdr, 4);
+	p = xdr_reserve_space(resp->xdr, 4);
 	if (!p)
 		return nfserr_resource;
 
@@ -4647,11 +4647,11 @@ nfsd42_encode_write_res(struct nfsd4_compoundres *resp,
 	else {
 		__be32 nfserr;
 		*p++ = cpu_to_be32(1);
-		nfserr = nfsd4_encode_stateid(&resp->xdr, &write->cb_stateid);
+		nfserr = nfsd4_encode_stateid(resp->xdr, &write->cb_stateid);
 		if (nfserr)
 			return nfserr;
 	}
-	p = xdr_reserve_space(&resp->xdr, 8 + 4 + NFS4_VERIFIER_SIZE);
+	p = xdr_reserve_space(resp->xdr, 8 + 4 + NFS4_VERIFIER_SIZE);
 	if (!p)
 		return nfserr_resource;
 
@@ -4665,7 +4665,7 @@ nfsd42_encode_write_res(struct nfsd4_compoundres *resp,
 static __be32
 nfsd42_encode_nl4_server(struct nfsd4_compoundres *resp, struct nl4_server *ns)
 {
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 	struct nfs42_netaddr *addr;
 	__be32 *p;
 
@@ -4713,7 +4713,7 @@ nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr,
 	if (nfserr)
 		return nfserr;
 
-	p = xdr_reserve_space(&resp->xdr, 4 + 4);
+	p = xdr_reserve_space(resp->xdr, 4 + 4);
 	*p++ = xdr_one; /* cr_consecutive */
 	*p++ = cpu_to_be32(copy->cp_synchronous);
 	return 0;
@@ -4723,7 +4723,7 @@ static __be32
 nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr,
 			    struct nfsd4_offload_status *os)
 {
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 	__be32 *p;
 
 	p = xdr_reserve_space(xdr, 8 + 4);
@@ -4740,7 +4740,7 @@ nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp,
 			    unsigned long *maxcount, u32 *eof,
 			    loff_t *pos)
 {
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 	struct file *file = read->rd_nf->nf_file;
 	int starting_len = xdr->buf->len;
 	loff_t hole_pos;
@@ -4799,7 +4799,7 @@ nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp,
 	count = data_pos - read->rd_offset;
 
 	/* Content type, offset, byte count */
-	p = xdr_reserve_space(&resp->xdr, 4 + 8 + 8);
+	p = xdr_reserve_space(resp->xdr, 4 + 8 + 8);
 	if (!p)
 		return nfserr_resource;
 
@@ -4817,7 +4817,7 @@ nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
 		       struct nfsd4_read *read)
 {
 	unsigned long maxcount, count;
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 	struct file *file;
 	int starting_len = xdr->buf->len;
 	int last_segment = xdr->buf->len;
@@ -4888,7 +4888,7 @@ static __be32
 nfsd4_encode_copy_notify(struct nfsd4_compoundres *resp, __be32 nfserr,
 			 struct nfsd4_copy_notify *cn)
 {
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 	__be32 *p;
 
 	if (nfserr)
@@ -4924,7 +4924,7 @@ nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
 {
 	__be32 *p;
 
-	p = xdr_reserve_space(&resp->xdr, 4 + 8);
+	p = xdr_reserve_space(resp->xdr, 4 + 8);
 	*p++ = cpu_to_be32(seek->seek_eof);
 	p = xdr_encode_hyper(p, seek->seek_pos);
 
@@ -4985,7 +4985,7 @@ static __be32
 nfsd4_encode_getxattr(struct nfsd4_compoundres *resp, __be32 nfserr,
 		      struct nfsd4_getxattr *getxattr)
 {
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 	__be32 *p, err;
 
 	p = xdr_reserve_space(xdr, 4);
@@ -5009,7 +5009,7 @@ static __be32
 nfsd4_encode_setxattr(struct nfsd4_compoundres *resp, __be32 nfserr,
 		      struct nfsd4_setxattr *setxattr)
 {
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 	__be32 *p;
 
 	p = xdr_reserve_space(xdr, 20);
@@ -5050,7 +5050,7 @@ static __be32
 nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr,
 			struct nfsd4_listxattrs *listxattrs)
 {
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 	u32 cookie_offset, count_offset, eof;
 	u32 left, xdrleft, slen, count;
 	u32 xdrlen, offset;
@@ -5161,7 +5161,7 @@ static __be32
 nfsd4_encode_removexattr(struct nfsd4_compoundres *resp, __be32 nfserr,
 			 struct nfsd4_removexattr *removexattr)
 {
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 	__be32 *p;
 
 	p = xdr_reserve_space(xdr, 20);
@@ -5301,7 +5301,7 @@ __be32 nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 respsize)
 void
 nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
 {
-	struct xdr_stream *xdr = &resp->xdr;
+	struct xdr_stream *xdr = resp->xdr;
 	struct nfs4_stateowner *so = resp->cstate.replay_owner;
 	struct svc_rqst *rqstp = resp->rqstp;
 	const struct nfsd4_operation *opdesc = op->opdesc;
@@ -5430,14 +5430,14 @@ int
 nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p)
 {
 	struct nfsd4_compoundres *resp = rqstp->rq_resp;
-	struct xdr_buf *buf = resp->xdr.buf;
+	struct xdr_buf *buf = resp->xdr->buf;
 
 	WARN_ON_ONCE(buf->len != buf->head[0].iov_len + buf->page_len +
 				 buf->tail[0].iov_len);
 
 	*p = resp->cstate.status;
 
-	rqstp->rq_next_page = resp->xdr.page_ptr + 1;
+	rqstp->rq_next_page = resp->xdr->page_ptr + 1;
 
 	p = resp->tagp;
 	*p++ = htonl(resp->taglen);
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 6de406322106..d909e4956244 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -997,7 +997,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
 	 * NFSv4 does some encoding while processing
 	 */
 	p = resv->iov_base + resv->iov_len;
-	resv->iov_len += sizeof(__be32);
+	svcxdr_init_encode(rqstp);
 
 	*statp = proc->pc_func(rqstp);
 	if (*statp == rpc_drop_reply || test_bit(RQ_DROPME, &rqstp->rq_flags))
@@ -1052,7 +1052,7 @@ int nfssvc_decode_voidarg(struct svc_rqst *rqstp, __be32 *p)
  */
 int nfssvc_encode_voidres(struct svc_rqst *rqstp, __be32 *p)
 {
-        return xdr_ressize_check(rqstp, p);
+	return 1;
 }
 
 int nfsd_pool_stats_open(struct inode *inode, struct file *file)
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index c300885ae75d..fe540a3415c6 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -698,7 +698,7 @@ struct nfsd4_compoundargs {
 
 struct nfsd4_compoundres {
 	/* scratch variables for XDR encode */
-	struct xdr_stream		xdr;
+	struct xdr_stream		*xdr;
 	struct svc_rqst *		rqstp;
 
 	u32				taglen;
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 31ee3b6047c3..e91d51ea028b 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -248,6 +248,7 @@ struct svc_rqst {
 	size_t			rq_xprt_hlen;	/* xprt header len */
 	struct xdr_buf		rq_arg;
 	struct xdr_stream	rq_arg_stream;
+	struct xdr_stream	rq_res_stream;
 	struct page		*rq_scratch_page;
 	struct xdr_buf		rq_res;
 	struct page		*rq_pages[RPCSVC_MAXPAGES + 1];
@@ -574,4 +575,28 @@ static inline void svcxdr_init_decode(struct svc_rqst *rqstp)
 	xdr_set_scratch_page(xdr, rqstp->rq_scratch_page);
 }
 
+/**
+ * svcxdr_init_encode - Prepare an xdr_stream for svc Reply encoding
+ * @rqstp: controlling server RPC transaction context
+ *
+ */
+static inline void svcxdr_init_encode(struct svc_rqst *rqstp)
+{
+	struct xdr_stream *xdr = &rqstp->rq_res_stream;
+	struct xdr_buf *buf = &rqstp->rq_res;
+	struct kvec *resv = buf->head;
+
+	xdr_reset_scratch_buffer(xdr);
+
+	xdr->buf = buf;
+	xdr->iov = resv;
+	xdr->p   = resv->iov_base + resv->iov_len;
+	xdr->end = resv->iov_base + PAGE_SIZE - rqstp->rq_auth_slack;
+	buf->len = resv->iov_len;
+	xdr->page_ptr = buf->pages - 1;
+	buf->buflen = PAGE_SIZE * (1 + rqstp->rq_page_end - buf->pages);
+	buf->buflen -= rqstp->rq_auth_slack;
+	xdr->rqst = NULL;
+}
+
 #endif /* SUNRPC_SVC_H */
-- 
cgit v1.2.3


From cc9bcdad7773c295375e66c892c7ac00524706f2 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 22 Oct 2020 15:23:50 -0400
Subject: NFSD: Update the NFSv3 READ3res encode to use struct xdr_stream

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs3proc.c         |  1 +
 fs/nfsd/nfs3xdr.c          | 43 +++++++++++++++++++++++--------------------
 fs/nfsd/xdr3.h             |  1 +
 include/linux/sunrpc/xdr.h | 20 ++++++++++++++++++++
 4 files changed, 45 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index e55a1d14ede2..93d196752f87 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -159,6 +159,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp)
 
 	v = 0;
 	len = argp->count;
+	resp->pages = rqstp->rq_next_page;
 	while (len > 0) {
 		struct page *page = *(rqstp->rq_next_page++);
 
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 352691c3e246..859cc6c51c1a 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -1005,30 +1005,33 @@ nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p)
 int
 nfs3svc_encode_readres(struct svc_rqst *rqstp, __be32 *p)
 {
+	struct xdr_stream *xdr = &rqstp->rq_res_stream;
 	struct nfsd3_readres *resp = rqstp->rq_resp;
 	struct kvec *head = rqstp->rq_res.head;
 
-	*p++ = resp->status;
-	p = encode_post_op_attr(rqstp, p, &resp->fh);
-	if (resp->status == 0) {
-		*p++ = htonl(resp->count);
-		*p++ = htonl(resp->eof);
-		*p++ = htonl(resp->count);	/* xdr opaque count */
-		xdr_ressize_check(rqstp, p);
-		/* now update rqstp->rq_res to reflect data as well */
-		rqstp->rq_res.page_len = resp->count;
-		if (resp->count & 3) {
-			/* need to pad the tail */
-			rqstp->rq_res.tail[0].iov_base = p;
-			*p = 0;
-			rqstp->rq_res.tail[0].iov_len = 4 - (resp->count & 3);
-		}
-		if (svc_encode_result_payload(rqstp, head->iov_len,
-					      resp->count))
+	if (!svcxdr_encode_nfsstat3(xdr, resp->status))
+		return 0;
+	switch (resp->status) {
+	case nfs_ok:
+		if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
 			return 0;
-		return 1;
-	} else
-		return xdr_ressize_check(rqstp, p);
+		if (xdr_stream_encode_u32(xdr, resp->count) < 0)
+			return 0;
+		if (xdr_stream_encode_bool(xdr, resp->eof) < 0)
+			return 0;
+		if (xdr_stream_encode_u32(xdr, resp->count) < 0)
+			return 0;
+		xdr_write_pages(xdr, resp->pages, rqstp->rq_res.page_base,
+				resp->count);
+		if (svc_encode_result_payload(rqstp, head->iov_len, resp->count) < 0)
+			return 0;
+		break;
+	default:
+		if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
+			return 0;
+	}
+
+	return 1;
 }
 
 /* WRITE */
diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h
index 1d633c5d5fa2..8073350418ae 100644
--- a/fs/nfsd/xdr3.h
+++ b/fs/nfsd/xdr3.h
@@ -145,6 +145,7 @@ struct nfsd3_readres {
 	struct svc_fh		fh;
 	unsigned long		count;
 	__u32			eof;
+	struct page		**pages;
 };
 
 struct nfsd3_writeres {
diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 2bc75c167f00..9dda7171b7b4 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -394,6 +394,26 @@ static inline int xdr_stream_encode_item_absent(struct xdr_stream *xdr)
 	return len;
 }
 
+/**
+ * xdr_stream_encode_bool - Encode a "not present" list item
+ * @xdr: pointer to xdr_stream
+ * @n: boolean value to encode
+ *
+ * Return values:
+ *   On success, returns length in bytes of XDR buffer consumed
+ *   %-EMSGSIZE on XDR buffer overflow
+ */
+static inline int xdr_stream_encode_bool(struct xdr_stream *xdr, __u32 n)
+{
+	const size_t len = XDR_UNIT;
+	__be32 *p = xdr_reserve_space(xdr, len);
+
+	if (unlikely(!p))
+		return -EMSGSIZE;
+	*p = n ? xdr_one : xdr_zero;
+	return len;
+}
+
 /**
  * xdr_stream_encode_u32 - Encode a 32-bit integer
  * @xdr: pointer to xdr_stream
-- 
cgit v1.2.3


From ded04a587f6ceaaba3caefad4021f2212b46c9ff Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 6 Nov 2020 13:15:09 -0500
Subject: NFSD: Update the NFSv3 PATHCONF3res encoder to use struct xdr_stream

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs3xdr.c          | 44 +++++++++++++++++++++++++++++++++-----------
 include/linux/sunrpc/xdr.h | 18 ++++++++++++++++--
 2 files changed, 49 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 514f53ad7302..1467bba02e18 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -1501,25 +1501,47 @@ nfs3svc_encode_fsinfores(struct svc_rqst *rqstp, __be32 *p)
 	return 1;
 }
 
+static bool
+svcxdr_encode_pathconf3resok(struct xdr_stream *xdr,
+			     const struct nfsd3_pathconfres *resp)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, XDR_UNIT * 6);
+	if (!p)
+		return false;
+	*p++ = cpu_to_be32(resp->p_link_max);
+	*p++ = cpu_to_be32(resp->p_name_max);
+	p = xdr_encode_bool(p, resp->p_no_trunc);
+	p = xdr_encode_bool(p, resp->p_chown_restricted);
+	p = xdr_encode_bool(p, resp->p_case_insensitive);
+	xdr_encode_bool(p, resp->p_case_preserving);
+
+	return true;
+}
+
 /* PATHCONF */
 int
 nfs3svc_encode_pathconfres(struct svc_rqst *rqstp, __be32 *p)
 {
+	struct xdr_stream *xdr = &rqstp->rq_res_stream;
 	struct nfsd3_pathconfres *resp = rqstp->rq_resp;
 
-	*p++ = resp->status;
-	*p++ = xdr_zero;	/* no post_op_attr */
-
-	if (resp->status == 0) {
-		*p++ = htonl(resp->p_link_max);
-		*p++ = htonl(resp->p_name_max);
-		*p++ = htonl(resp->p_no_trunc);
-		*p++ = htonl(resp->p_chown_restricted);
-		*p++ = htonl(resp->p_case_insensitive);
-		*p++ = htonl(resp->p_case_preserving);
+	if (!svcxdr_encode_nfsstat3(xdr, resp->status))
+		return 0;
+	switch (resp->status) {
+	case nfs_ok:
+		if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh))
+			return 0;
+		if (!svcxdr_encode_pathconf3resok(xdr, resp))
+			return 0;
+		break;
+	default:
+		if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh))
+			return 0;
 	}
 
-	return xdr_ressize_check(rqstp, p);
+	return 1;
 }
 
 /* COMMIT */
diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 9dda7171b7b4..a965cbc136ad 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -395,7 +395,21 @@ static inline int xdr_stream_encode_item_absent(struct xdr_stream *xdr)
 }
 
 /**
- * xdr_stream_encode_bool - Encode a "not present" list item
+ * xdr_encode_bool - Encode a boolean item
+ * @p: address in a buffer into which to encode
+ * @n: boolean value to encode
+ *
+ * Return value:
+ *   Address of item following the encoded boolean
+ */
+static inline __be32 *xdr_encode_bool(__be32 *p, u32 n)
+{
+	*p++ = n ? xdr_one : xdr_zero;	/* store, then step past the item */
+	return p;			/* address following the boolean */
+}
+
+/**
+ * xdr_stream_encode_bool - Encode a boolean item
  * @xdr: pointer to xdr_stream
  * @n: boolean value to encode
  *
@@ -410,7 +424,7 @@ static inline int xdr_stream_encode_bool(struct xdr_stream *xdr, __u32 n)
 
 	if (unlikely(!p))
 		return -EMSGSIZE;
-	*p = n ? xdr_one : xdr_zero;
+	xdr_encode_bool(p, n);
 	return len;
 }
 
-- 
cgit v1.2.3


From 8edc0648880a151026fe625fa1b76772b5766f68 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 18 Nov 2020 14:55:05 -0500
Subject: NFSD: Add an xdr_stream-based encoder for NFSv2/3 ACLs

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfs_common/nfsacl.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/nfsacl.h |  3 +++
 2 files changed, 74 insertions(+)

(limited to 'include/linux')

diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index 79c563c1a5e8..5a5bd85d08f8 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -136,6 +136,77 @@ int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
 }
 EXPORT_SYMBOL_GPL(nfsacl_encode);
 
+/**
+ * nfs_stream_encode_acl - Encode an NFSv3 ACL
+ *
+ * @xdr: an xdr_stream positioned to receive an encoded ACL
+ * @inode: inode of file whose ACL this is
+ * @acl: posix_acl to encode
+ * @encode_entries: whether to encode ACEs as well
+ * @typeflag: ACL type: NFS_ACL_DEFAULT or zero
+ *
+ * Return values:
+ *   %false: The ACL could not be encoded
+ *   %true: @xdr is advanced to the next available position
+ */
+bool nfs_stream_encode_acl(struct xdr_stream *xdr, struct inode *inode,
+			   struct posix_acl *acl, int encode_entries,
+			   int typeflag)
+{
+	const size_t elem_size = XDR_UNIT * 3;
+	u32 entries = (acl && acl->a_count) ? max_t(int, acl->a_count, 4) : 0;
+	struct nfsacl_encode_desc nfsacl_desc = {
+		.desc = {
+			.elem_size = elem_size,
+			.array_len = encode_entries ? entries : 0,
+			.xcode = xdr_nfsace_encode,
+		},
+		.acl = acl,
+		.typeflag = typeflag,
+		.uid = inode->i_uid,
+		.gid = inode->i_gid,
+	};
+	struct nfsacl_simple_acl aclbuf;
+	unsigned int base;
+	int err;
+
+	if (entries > NFS_ACL_MAX_ENTRIES)
+		return false;
+	if (xdr_stream_encode_u32(xdr, entries) < 0)
+		return false;
+
+	if (encode_entries && acl && acl->a_count == 3) {
+		struct posix_acl *acl2 = &aclbuf.acl;
+
+		/* Avoid the use of posix_acl_alloc().  nfsacl_encode() is
+		 * invoked in contexts where a memory allocation failure is
+		 * fatal.  Fortunately this fake ACL is small enough to
+		 * construct on the stack. */
+		posix_acl_init(acl2, 4);
+
+		/* Insert entries in canonical order: other orders seem
+		 to confuse Solaris VxFS. */
+		acl2->a_entries[0] = acl->a_entries[0];  /* ACL_USER_OBJ */
+		acl2->a_entries[1] = acl->a_entries[1];  /* ACL_GROUP_OBJ */
+		acl2->a_entries[2] = acl->a_entries[1];  /* ACL_MASK */
+		acl2->a_entries[2].e_tag = ACL_MASK;
+		acl2->a_entries[3] = acl->a_entries[2];  /* ACL_OTHER */
+		nfsacl_desc.acl = acl2;
+	}
+
+	base = xdr_stream_pos(xdr);
+	if (!xdr_reserve_space(xdr, XDR_UNIT +
+			       elem_size * nfsacl_desc.desc.array_len))
+		return false;
+	err = xdr_encode_array2(xdr->buf, base, &nfsacl_desc.desc);
+	if (err)
+		return false;
+
+	return true;
+}
+EXPORT_SYMBOL_GPL(nfs_stream_encode_acl);
+
+
 struct nfsacl_decode_desc {
 	struct xdr_array2_desc desc;
 	unsigned int count;
diff --git a/include/linux/nfsacl.h b/include/linux/nfsacl.h
index 0ba99c513649..8e76a79cdc6a 100644
--- a/include/linux/nfsacl.h
+++ b/include/linux/nfsacl.h
@@ -41,5 +41,8 @@ nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt,
 extern bool
 nfs_stream_decode_acl(struct xdr_stream *xdr, unsigned int *aclcnt,
 		      struct posix_acl **pacl);
+extern bool
+nfs_stream_encode_acl(struct xdr_stream *xdr, struct inode *inode,
+		      struct posix_acl *acl, int encode_entries, int typeflag);
 
 #endif  /* __LINUX_NFSACL_H */
-- 
cgit v1.2.3


From 417c0fc24dd4dbd60d94fa8deb36bf1176930e06 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Fri, 12 Mar 2021 09:36:02 +0100
Subject: mfd/power: ab8500: Push data to power supply code

The global definition of platform data for the battery
management code has no utility after the OF conversion,
move the <linux/mfd/abx500/ab8500-bm.h> to be a local
file in drivers/power/supply and stop defining the
platform data in drivers/power/supply/ab8500_bmdata.c
and broadcast to the kernel only to have it assigned
as platform data to the MFD cells and then picked back
into the same subsystem that defined it in the first
place. This kills off a layer of indirection.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Acked-by: Sebastian Reichel <sebastian.reichel@collabora.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/ab8500-core.c              |  17 +-
 drivers/power/supply/ab8500-bm.h       | 463 ++++++++++++++++++++++++++++++++
 drivers/power/supply/ab8500_bmdata.c   |   3 +-
 drivers/power/supply/ab8500_btemp.c    |  35 +--
 drivers/power/supply/ab8500_charger.c  |  10 +-
 drivers/power/supply/ab8500_fg.c       |  10 +-
 drivers/power/supply/abx500_chargalg.c |  10 +-
 drivers/power/supply/pm2301_charger.c  |   2 +-
 include/linux/mfd/abx500/ab8500-bm.h   | 476 ---------------------------------
 9 files changed, 487 insertions(+), 539 deletions(-)
 create mode 100644 drivers/power/supply/ab8500-bm.h
 delete mode 100644 include/linux/mfd/abx500/ab8500-bm.h

(limited to 'include/linux')

diff --git a/drivers/mfd/ab8500-core.c b/drivers/mfd/ab8500-core.c
index a9037911162b..22c0e3d87629 100644
--- a/drivers/mfd/ab8500-core.c
+++ b/drivers/mfd/ab8500-core.c
@@ -19,7 +19,6 @@
 #include <linux/mfd/core.h>
 #include <linux/mfd/abx500.h>
 #include <linux/mfd/abx500/ab8500.h>
-#include <linux/mfd/abx500/ab8500-bm.h>
 #include <linux/mfd/dbx500-prcmu.h>
 #include <linux/of.h>
 #include <linux/of_device.h>
@@ -609,14 +608,14 @@ int ab8500_suspend(struct ab8500 *ab8500)
 }
 
 static const struct mfd_cell ab8500_bm_devs[] = {
-	MFD_CELL_OF("ab8500-charger", NULL, &ab8500_bm_data,
-		    sizeof(ab8500_bm_data), 0, "stericsson,ab8500-charger"),
-	MFD_CELL_OF("ab8500-btemp", NULL, &ab8500_bm_data,
-		    sizeof(ab8500_bm_data), 0, "stericsson,ab8500-btemp"),
-	MFD_CELL_OF("ab8500-fg", NULL, &ab8500_bm_data,
-		    sizeof(ab8500_bm_data), 0, "stericsson,ab8500-fg"),
-	MFD_CELL_OF("ab8500-chargalg", NULL, &ab8500_bm_data,
-		    sizeof(ab8500_bm_data), 0, "stericsson,ab8500-chargalg"),
+	MFD_CELL_OF("ab8500-charger", NULL, NULL, 0, 0,
+		    "stericsson,ab8500-charger"),
+	MFD_CELL_OF("ab8500-btemp", NULL, NULL, 0, 0,
+		    "stericsson,ab8500-btemp"),
+	MFD_CELL_OF("ab8500-fg", NULL, NULL, 0, 0,
+		    "stericsson,ab8500-fg"),
+	MFD_CELL_OF("ab8500-chargalg", NULL, NULL, 0, 0,
+		    "stericsson,ab8500-chargalg"),
 };
 
 static const struct mfd_cell ab8500_devs[] = {
diff --git a/drivers/power/supply/ab8500-bm.h b/drivers/power/supply/ab8500-bm.h
new file mode 100644
index 000000000000..a1b31c971a45
--- /dev/null
+++ b/drivers/power/supply/ab8500-bm.h
@@ -0,0 +1,463 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef _AB8500_CHARGER_H_
+#define _AB8500_CHARGER_H_
+
+#include <linux/kernel.h>
+#include <linux/mfd/abx500.h>
+
+/*
+ * System control 2 register offsets.
+ * bank = 0x02
+ */
+#define AB8500_MAIN_WDOG_CTRL_REG	0x01
+#define AB8500_LOW_BAT_REG		0x03
+#define AB8500_BATT_OK_REG		0x04
+/*
+ * USB/ULPI register offsets
+ * Bank : 0x5
+ */
+#define AB8500_USB_LINE_STAT_REG	0x80
+#define AB8500_USB_LINE_CTRL2_REG	0x82
+#define AB8500_USB_LINK1_STAT_REG	0x94
+
+/*
+ * Charger / status register offsets
+ * Bank : 0x0B
+ */
+#define AB8500_CH_STATUS1_REG		0x00
+#define AB8500_CH_STATUS2_REG		0x01
+#define AB8500_CH_USBCH_STAT1_REG	0x02
+#define AB8500_CH_USBCH_STAT2_REG	0x03
+#define AB8540_CH_USBCH_STAT3_REG	0x04
+#define AB8500_CH_STAT_REG		0x05
+
+/*
+ * Charger / control register offsets
+ * Bank : 0x0B
+ */
+#define AB8500_CH_VOLT_LVL_REG		0x40
+#define AB8500_CH_VOLT_LVL_MAX_REG	0x41  /*Only in Cut2.0*/
+#define AB8500_CH_OPT_CRNTLVL_REG	0x42
+#define AB8500_CH_OPT_CRNTLVL_MAX_REG	0x43  /*Only in Cut2.0*/
+#define AB8500_CH_WD_TIMER_REG		0x50
+#define AB8500_CHARG_WD_CTRL		0x51
+#define AB8500_BTEMP_HIGH_TH		0x52
+#define AB8500_LED_INDICATOR_PWM_CTRL	0x53
+#define AB8500_LED_INDICATOR_PWM_DUTY	0x54
+#define AB8500_BATT_OVV			0x55
+#define AB8500_CHARGER_CTRL		0x56
+#define AB8500_BAT_CTRL_CURRENT_SOURCE	0x60  /*Only in Cut2.0*/
+
+/*
+ * Charger / main control register offsets
+ * Bank : 0x0B
+ */
+#define AB8500_MCH_CTRL1		0x80
+#define AB8500_MCH_CTRL2		0x81
+#define AB8500_MCH_IPT_CURLVL_REG	0x82
+#define AB8500_CH_WD_REG		0x83
+
+/*
+ * Charger / USB control register offsets
+ * Bank : 0x0B
+ */
+#define AB8500_USBCH_CTRL1_REG		0xC0
+#define AB8500_USBCH_CTRL2_REG		0xC1
+#define AB8500_USBCH_IPT_CRNTLVL_REG	0xC2
+#define AB8540_USB_PP_MODE_REG		0xC5
+#define AB8540_USB_PP_CHR_REG		0xC6
+
+/*
+ * Gas Gauge register offsets
+ * Bank : 0x0C
+ */
+#define AB8500_GASG_CC_CTRL_REG		0x00
+#define AB8500_GASG_CC_ACCU1_REG	0x01
+#define AB8500_GASG_CC_ACCU2_REG	0x02
+#define AB8500_GASG_CC_ACCU3_REG	0x03
+#define AB8500_GASG_CC_ACCU4_REG	0x04
+#define AB8500_GASG_CC_SMPL_CNTRL_REG	0x05
+#define AB8500_GASG_CC_SMPL_CNTRH_REG	0x06
+#define AB8500_GASG_CC_SMPL_CNVL_REG	0x07
+#define AB8500_GASG_CC_SMPL_CNVH_REG	0x08
+#define AB8500_GASG_CC_CNTR_AVGOFF_REG	0x09
+#define AB8500_GASG_CC_OFFSET_REG	0x0A
+#define AB8500_GASG_CC_NCOV_ACCU	0x10
+#define AB8500_GASG_CC_NCOV_ACCU_CTRL	0x11
+#define AB8500_GASG_CC_NCOV_ACCU_LOW	0x12
+#define AB8500_GASG_CC_NCOV_ACCU_MED	0x13
+#define AB8500_GASG_CC_NCOV_ACCU_HIGH	0x14
+
+/*
+ * Interrupt register offsets
+ * Bank : 0x0E
+ */
+#define AB8500_IT_SOURCE2_REG		0x01
+#define AB8500_IT_SOURCE21_REG		0x14
+
+/*
+ * RTC register offsets
+ * Bank: 0x0F
+ */
+#define AB8500_RTC_BACKUP_CHG_REG	0x0C
+#define AB8500_RTC_CC_CONF_REG		0x01
+#define AB8500_RTC_CTRL_REG		0x0B
+#define AB8500_RTC_CTRL1_REG		0x11
+
+/*
+ * OTP register offsets
+ * Bank : 0x15
+ */
+#define AB8500_OTP_CONF_15		0x0E
+
+/* GPADC constants from AB8500 spec, UM0836 */
+#define ADC_RESOLUTION			1024
+#define ADC_CH_MAIN_MIN			0
+#define ADC_CH_MAIN_MAX			20030
+#define ADC_CH_VBUS_MIN			0
+#define ADC_CH_VBUS_MAX			20030
+#define ADC_CH_VBAT_MIN			2300
+#define ADC_CH_VBAT_MAX			4800
+#define ADC_CH_BKBAT_MIN		0
+#define ADC_CH_BKBAT_MAX		3200
+
+/* Main charge i/p current */
+#define MAIN_CH_IP_CUR_0P9A		0x80
+#define MAIN_CH_IP_CUR_1P0A		0x90
+#define MAIN_CH_IP_CUR_1P1A		0xA0
+#define MAIN_CH_IP_CUR_1P2A		0xB0
+#define MAIN_CH_IP_CUR_1P3A		0xC0
+#define MAIN_CH_IP_CUR_1P4A		0xD0
+#define MAIN_CH_IP_CUR_1P5A		0xE0
+
+/* ChVoltLevel */
+#define CH_VOL_LVL_3P5			0x00
+#define CH_VOL_LVL_4P0			0x14
+#define CH_VOL_LVL_4P05			0x16
+#define CH_VOL_LVL_4P1			0x1B
+#define CH_VOL_LVL_4P15			0x20
+#define CH_VOL_LVL_4P2			0x25
+#define CH_VOL_LVL_4P6			0x4D
+
+/* ChOutputCurrentLevel */
+#define CH_OP_CUR_LVL_0P1		0x00
+#define CH_OP_CUR_LVL_0P2		0x01
+#define CH_OP_CUR_LVL_0P3		0x02
+#define CH_OP_CUR_LVL_0P4		0x03
+#define CH_OP_CUR_LVL_0P5		0x04
+#define CH_OP_CUR_LVL_0P6		0x05
+#define CH_OP_CUR_LVL_0P7		0x06
+#define CH_OP_CUR_LVL_0P8		0x07
+#define CH_OP_CUR_LVL_0P9		0x08
+#define CH_OP_CUR_LVL_1P4		0x0D
+#define CH_OP_CUR_LVL_1P5		0x0E
+#define CH_OP_CUR_LVL_1P6		0x0F
+#define CH_OP_CUR_LVL_2P		0x3F
+
+/* BTEMP High thermal limits */
+#define BTEMP_HIGH_TH_57_0		0x00
+#define BTEMP_HIGH_TH_52		0x01
+#define BTEMP_HIGH_TH_57_1		0x02
+#define BTEMP_HIGH_TH_62		0x03
+
+/* current is mA */
+#define USB_0P1A			100
+#define USB_0P2A			200
+#define USB_0P3A			300
+#define USB_0P4A			400
+#define USB_0P5A			500
+
+#define LOW_BAT_3P1V			0x20
+#define LOW_BAT_2P3V			0x00
+#define LOW_BAT_RESET			0x01
+#define LOW_BAT_ENABLE			0x01
+
+/* Backup battery constants */
+#define BUP_ICH_SEL_50UA		0x00
+#define BUP_ICH_SEL_150UA		0x04
+#define BUP_ICH_SEL_300UA		0x08
+#define BUP_ICH_SEL_700UA		0x0C
+
+enum bup_vch_sel {
+	BUP_VCH_SEL_2P5V,
+	BUP_VCH_SEL_2P6V,
+	BUP_VCH_SEL_2P8V,
+	BUP_VCH_SEL_3P1V,
+	/*
+	 * Note that the following 5 values 2.7v, 2.9v, 3.0v, 3.2v, 3.3v
+	 * are only available on ab8540. You can't choose these 5
+	 * voltage on ab8500/ab8505/ab9540.
+	 */
+	BUP_VCH_SEL_2P7V,
+	BUP_VCH_SEL_2P9V,
+	BUP_VCH_SEL_3P0V,
+	BUP_VCH_SEL_3P2V,
+	BUP_VCH_SEL_3P3V,
+};
+
+#define BUP_VCH_RANGE		0x02
+#define VBUP33_VRTCN		0x01
+
+/* Battery OVV constants */
+#define BATT_OVV_ENA			0x02
+#define BATT_OVV_TH_3P7			0x00
+#define BATT_OVV_TH_4P75		0x01
+
+/* A value to indicate over voltage */
+#define BATT_OVV_VALUE			4750
+
+/* VBUS OVV constants */
+#define VBUS_OVV_SELECT_MASK		0x78
+#define VBUS_OVV_SELECT_5P6V		0x00
+#define VBUS_OVV_SELECT_5P7V		0x08
+#define VBUS_OVV_SELECT_5P8V		0x10
+#define VBUS_OVV_SELECT_5P9V		0x18
+#define VBUS_OVV_SELECT_6P0V		0x20
+#define VBUS_OVV_SELECT_6P1V		0x28
+#define VBUS_OVV_SELECT_6P2V		0x30
+#define VBUS_OVV_SELECT_6P3V		0x38
+
+#define VBUS_AUTO_IN_CURR_LIM_ENA	0x04
+
+/* Fuel Gauge constants */
+#define RESET_ACCU			0x02
+#define READ_REQ			0x01
+#define CC_DEEP_SLEEP_ENA		0x02
+#define CC_PWR_UP_ENA			0x01
+#define CC_SAMPLES_40			0x28
+#define RD_NCONV_ACCU_REQ		0x01
+#define CC_CALIB			0x08
+#define CC_INTAVGOFFSET_ENA		0x10
+#define CC_MUXOFFSET			0x80
+#define CC_INT_CAL_N_AVG_MASK		0x60
+#define CC_INT_CAL_SAMPLES_16		0x40
+#define CC_INT_CAL_SAMPLES_8		0x20
+#define CC_INT_CAL_SAMPLES_4		0x00
+
+/* RTC constants */
+#define RTC_BUP_CH_ENA			0x10
+
+/* BatCtrl Current Source Constants */
+#define BAT_CTRL_7U_ENA			0x01
+#define BAT_CTRL_20U_ENA		0x02
+#define BAT_CTRL_18U_ENA		0x01
+#define BAT_CTRL_16U_ENA		0x02
+#define BAT_CTRL_CMP_ENA		0x04
+#define FORCE_BAT_CTRL_CMP_HIGH		0x08
+#define BAT_CTRL_PULL_UP_ENA		0x10
+
+/* Battery type */
+#define BATTERY_UNKNOWN			00
+
+/* Registers for pcut feature in ab8505 and ab9540 */
+#define AB8505_RTC_PCUT_CTL_STATUS_REG	0x12
+#define AB8505_RTC_PCUT_TIME_REG	0x13
+#define AB8505_RTC_PCUT_MAX_TIME_REG	0x14
+#define AB8505_RTC_PCUT_FLAG_TIME_REG	0x15
+#define AB8505_RTC_PCUT_RESTART_REG	0x16
+#define AB8505_RTC_PCUT_DEBOUNCE_REG	0x17
+
+/* USB Power Path constants for ab8540 */
+#define BUS_VSYS_VOL_SELECT_MASK		0x06
+#define BUS_VSYS_VOL_SELECT_3P6V		0x00
+#define BUS_VSYS_VOL_SELECT_3P325V		0x02
+#define BUS_VSYS_VOL_SELECT_3P9V		0x04
+#define BUS_VSYS_VOL_SELECT_4P3V		0x06
+#define BUS_POWER_PATH_MODE_ENA			0x01
+#define BUS_PP_PRECHG_CURRENT_MASK		0x0E
+#define BUS_POWER_PATH_PRECHG_ENA		0x01
+
+/**
+ * struct res_to_temp - defines one point in a temp to res curve. To
+ * be used in battery packs that combines the identification resistor with a
+ * NTC resistor.
+ * @temp:			battery pack temperature in Celsius
+ * @resist:			NTC resistor net total resistance
+ */
+struct res_to_temp {
+	int temp;
+	int resist;
+};
+
+/**
+ * struct batres_vs_temp - defines one point in a temp vs battery internal
+ * resistance curve.
+ * @temp:			battery pack temperature in Celsius
+ * @resist:			battery internal resistance in mOhm
+ */
+struct batres_vs_temp {
+	int temp;
+	int resist;
+};
+
+/* Forward declaration */
+struct ab8500_fg;
+
+/**
+ * struct ab8500_fg_parameters - Fuel gauge algorithm parameters, in seconds
+ * if not specified
+ * @recovery_sleep_timer:	Time between measurements while recovering
+ * @recovery_total_time:	Total recovery time
+ * @init_timer:			Measurement interval during startup
+ * @init_discard_time:		Time we discard voltage measurement at startup
+ * @init_total_time:		Total init time during startup
+ * @high_curr_time:		Time current has to be high to go to recovery
+ * @accu_charging:		FG accumulation time while charging
+ * @accu_high_curr:		FG accumulation time in high current mode
+ * @high_curr_threshold:	High current threshold, in mA
+ * @lowbat_threshold:		Low battery threshold, in mV
+ * @battok_falling_th_sel0:	Threshold in mV for battOk signal sel0
+ *				Resolution in 50 mV step.
+ * @battok_raising_th_sel1:	Threshold in mV for battOk signal sel1
+ *				Resolution in 50 mV step.
+ * @user_cap_limit:		Capacity reported from user must be within this
+ *				limit to be considered as sane, in percentage
+ *				points.
+ * @maint_thres:		This is the threshold where we stop reporting
+ *				battery full while in maintenance, in per cent
+ * @pcut_enable:		Enable power cut feature in ab8505
+ * @pcut_max_time:		Max time threshold
+ * @pcut_flag_time:		Flagtime threshold
+ * @pcut_max_restart:		Max number of restarts
+ * @pcut_debunce_time:		Sets battery debounce time
+ */
+struct ab8500_fg_parameters {
+	int recovery_sleep_timer;
+	int recovery_total_time;
+	int init_timer;
+	int init_discard_time;
+	int init_total_time;
+	int high_curr_time;
+	int accu_charging;
+	int accu_high_curr;
+	int high_curr_threshold;
+	int lowbat_threshold;
+	int battok_falling_th_sel0;
+	int battok_raising_th_sel1;
+	int user_cap_limit;
+	int maint_thres;
+	bool pcut_enable;
+	u8 pcut_max_time;
+	u8 pcut_flag_time;
+	u8 pcut_max_restart;
+	u8 pcut_debunce_time;
+};
+
+/**
+ * struct ab8500_maxim_parameters - struct used by the board config.
+ * @ena_maxi:		Enable maximization for this battery type
+ * @chg_curr:		Maximum charger current allowed
+ * @wait_cycles:	cycles to wait before setting charger current
+ * @charger_curr_step:	delta between two charger current settings (mA)
+ */
+struct ab8500_maxim_parameters {
+	bool ena_maxi;
+	int chg_curr;
+	int wait_cycles;
+	int charger_curr_step;
+};
+
+/**
+ * struct ab8500_bm_capacity_levels - ab8500 capacity level data
+ * @critical:		critical capacity level in percent
+ * @low:		low capacity level in percent
+ * @normal:		normal capacity level in percent
+ * @high:		high capacity level in percent
+ * @full:		full capacity level in percent
+ */
+struct ab8500_bm_capacity_levels {
+	int critical;
+	int low;
+	int normal;
+	int high;
+	int full;
+};
+
+/**
+ * struct ab8500_bm_charger_parameters - Charger specific parameters
+ * @usb_volt_max:	maximum allowed USB charger voltage in mV
+ * @usb_curr_max:	maximum allowed USB charger current in mA
+ * @ac_volt_max:	maximum allowed AC charger voltage in mV
+ * @ac_curr_max:	maximum allowed AC charger current in mA
+ */
+struct ab8500_bm_charger_parameters {
+	int usb_volt_max;
+	int usb_curr_max;
+	int ac_volt_max;
+	int ac_curr_max;
+};
+
+/**
+ * struct ab8500_bm_data - ab8500 battery management data
+ * @temp_under:		under this temp, charging is stopped
+ * @temp_low:		between this temp and temp_under charging is reduced
+ * @temp_high:		between this temp and temp_over charging is reduced
+ * @temp_over:		over this temp, charging is stopped
+ * @temp_interval_chg:	temperature measurement interval in s when charging
+ * @temp_interval_nochg: temperature measurement interval in s when not charging
+ * @main_safety_tmr_h:	safety timer for main charger
+ * @usb_safety_tmr_h:	safety timer for usb charger
+ * @bkup_bat_v:		voltage which we charge the backup battery with
+ * @bkup_bat_i:		current which we charge the backup battery with
+ * @no_maintenance:	indicates that maintenance charging is disabled
+ * @capacity_scaling:	indicates whether capacity scaling is to be used
+ * @adc_therm:		placement of thermistor, batctrl or battemp adc
+ * @chg_unknown_bat:	flag to enable charging of unknown batteries
+ * @enable_overshoot:	flag to enable VBAT overshoot control
+ * @fg_res:		resistance of FG resistor in 0.1mOhm
+ * @n_btypes:		number of elements in array bat_type
+ * @batt_id:		index of the identified battery in array bat_type
+ * @interval_charging:	charge alg cycle period time when charging (sec)
+ * @interval_not_charging: charge alg cycle period time when not charging (sec)
+ * @temp_hysteresis:	temperature hysteresis
+ * @gnd_lift_resistance: Battery ground to phone ground resistance (mOhm)
+ * @maxi:		maximization parameters
+ * @cap_levels:		capacity in percent for the different capacity levels
+ * @bat_type:		table of supported battery types (NOTE(review): not a member of this struct -- stale entry?)
+ * @chg_params:		charger parameters
+ * @fg_params:		fuel gauge parameters
+ */
+struct ab8500_bm_data {
+	int temp_under;
+	int temp_low;
+	int temp_high;
+	int temp_over;
+	int temp_interval_chg;
+	int temp_interval_nochg;
+	int main_safety_tmr_h;
+	int usb_safety_tmr_h;
+	int bkup_bat_v;
+	int bkup_bat_i;
+	bool no_maintenance;
+	bool capacity_scaling;
+	bool chg_unknown_bat;
+	bool enable_overshoot;
+	enum abx500_adc_therm adc_therm;
+	int fg_res;
+	int n_btypes;
+	int batt_id;
+	int interval_charging;
+	int interval_not_charging;
+	int temp_hysteresis;
+	int gnd_lift_resistance;
+	const struct ab8500_maxim_parameters *maxi;
+	const struct ab8500_bm_capacity_levels *cap_levels;
+	const struct ab8500_bm_charger_parameters *chg_params;
+	const struct ab8500_fg_parameters *fg_params;
+};
+
+struct ab8500_btemp;
+struct ab8500_fg;
+
+extern struct abx500_bm_data ab8500_bm_data;
+
+void ab8500_charger_usb_state_changed(u8 bm_usb_state, u16 mA);
+struct ab8500_fg *ab8500_fg_get(void);
+int ab8500_fg_inst_curr_blocking(struct ab8500_fg *dev);
+int ab8500_fg_inst_curr_start(struct ab8500_fg *di);
+int ab8500_fg_inst_curr_finalize(struct ab8500_fg *di, int *res);
+int ab8500_fg_inst_curr_started(struct ab8500_fg *di);
+int ab8500_fg_inst_curr_done(struct ab8500_fg *di);
+
+#endif /* _AB8500_CHARGER_H_ */
diff --git a/drivers/power/supply/ab8500_bmdata.c b/drivers/power/supply/ab8500_bmdata.c
index f6a66979cbb5..c2b8c0bb77e2 100644
--- a/drivers/power/supply/ab8500_bmdata.c
+++ b/drivers/power/supply/ab8500_bmdata.c
@@ -4,7 +4,8 @@
 #include <linux/of.h>
 #include <linux/mfd/abx500.h>
 #include <linux/mfd/abx500/ab8500.h>
-#include <linux/mfd/abx500/ab8500-bm.h>
+
+#include "ab8500-bm.h"
 
 /*
  * These are the defined batteries that uses a NTC and ID resistor placed
diff --git a/drivers/power/supply/ab8500_btemp.c b/drivers/power/supply/ab8500_btemp.c
index 3cec0affd866..fdfcd59fc43e 100644
--- a/drivers/power/supply/ab8500_btemp.c
+++ b/drivers/power/supply/ab8500_btemp.c
@@ -25,9 +25,10 @@
 #include <linux/mfd/core.h>
 #include <linux/mfd/abx500.h>
 #include <linux/mfd/abx500/ab8500.h>
-#include <linux/mfd/abx500/ab8500-bm.h>
 #include <linux/iio/consumer.h>
 
+#include "ab8500-bm.h"
+
 #define VTVOUT_V			1800
 
 #define BTEMP_THERMAL_LOW_LIMIT		-10
@@ -119,16 +120,6 @@ static enum power_supply_property ab8500_btemp_props[] = {
 
 static LIST_HEAD(ab8500_btemp_list);
 
-/**
- * ab8500_btemp_get() - returns a reference to the primary AB8500 BTEMP
- * (i.e. the first BTEMP in the instance list)
- */
-struct ab8500_btemp *ab8500_btemp_get(void)
-{
-	return list_first_entry(&ab8500_btemp_list, struct ab8500_btemp, node);
-}
-EXPORT_SYMBOL(ab8500_btemp_get);
-
 /**
  * ab8500_btemp_batctrl_volt_to_res() - convert batctrl voltage to resistance
  * @di:		pointer to the ab8500_btemp structure
@@ -754,7 +745,7 @@ static void ab8500_btemp_periodic(struct ab8500_btemp *di,
  *
  * Returns battery temperature
  */
-int ab8500_btemp_get_temp(struct ab8500_btemp *di)
+static int ab8500_btemp_get_temp(struct ab8500_btemp *di)
 {
 	int temp = 0;
 
@@ -790,19 +781,6 @@ int ab8500_btemp_get_temp(struct ab8500_btemp *di)
 	}
 	return temp;
 }
-EXPORT_SYMBOL(ab8500_btemp_get_temp);
-
-/**
- * ab8500_btemp_get_batctrl_temp() - get the temperature
- * @btemp:      pointer to the btemp structure
- *
- * Returns the batctrl temperature in millidegrees
- */
-int ab8500_btemp_get_batctrl_temp(struct ab8500_btemp *btemp)
-{
-	return btemp->bat_temp * 1000;
-}
-EXPORT_SYMBOL(ab8500_btemp_get_batctrl_temp);
 
 /**
  * ab8500_btemp_get_property() - get the btemp properties
@@ -991,7 +969,6 @@ static const struct power_supply_desc ab8500_btemp_desc = {
 static int ab8500_btemp_probe(struct platform_device *pdev)
 {
 	struct device_node *np = pdev->dev.of_node;
-	struct abx500_bm_data *plat = pdev->dev.platform_data;
 	struct power_supply_config psy_cfg = {};
 	struct device *dev = &pdev->dev;
 	struct ab8500_btemp *di;
@@ -1002,11 +979,7 @@ static int ab8500_btemp_probe(struct platform_device *pdev)
 	if (!di)
 		return -ENOMEM;
 
-	if (!plat) {
-		dev_err(dev, "no battery management data supplied\n");
-		return -EINVAL;
-	}
-	di->bm = plat;
+	di->bm = &ab8500_bm_data;
 
 	ret = ab8500_bm_of_probe(dev, np, di->bm);
 	if (ret) {
diff --git a/drivers/power/supply/ab8500_charger.c b/drivers/power/supply/ab8500_charger.c
index aa573cd299e2..50989a5ec95c 100644
--- a/drivers/power/supply/ab8500_charger.c
+++ b/drivers/power/supply/ab8500_charger.c
@@ -28,12 +28,13 @@
 #include <linux/mfd/core.h>
 #include <linux/mfd/abx500/ab8500.h>
 #include <linux/mfd/abx500.h>
-#include <linux/mfd/abx500/ab8500-bm.h>
 #include <linux/mfd/abx500/ux500_chargalg.h>
 #include <linux/usb/otg.h>
 #include <linux/mutex.h>
 #include <linux/iio/consumer.h>
 
+#include "ab8500-bm.h"
+
 /* Charger constants */
 #define NO_PW_CONN			0
 #define AC_PW_CONN			1
@@ -3344,7 +3345,6 @@ static const struct power_supply_desc ab8500_usb_chg_desc = {
 static int ab8500_charger_probe(struct platform_device *pdev)
 {
 	struct device_node *np = pdev->dev.of_node;
-	struct abx500_bm_data *plat = pdev->dev.platform_data;
 	struct power_supply_config ac_psy_cfg = {}, usb_psy_cfg = {};
 	struct ab8500_charger *di;
 	int irq, i, charger_status, ret = 0, ch_stat;
@@ -3354,11 +3354,7 @@ static int ab8500_charger_probe(struct platform_device *pdev)
 	if (!di)
 		return -ENOMEM;
 
-	if (!plat) {
-		dev_err(dev, "no battery management data supplied\n");
-		return -EINVAL;
-	}
-	di->bm = plat;
+	di->bm = &ab8500_bm_data;
 
 	ret = ab8500_bm_of_probe(dev, np, di->bm);
 	if (ret) {
diff --git a/drivers/power/supply/ab8500_fg.c b/drivers/power/supply/ab8500_fg.c
index 079e11325a81..0c7c01a0d979 100644
--- a/drivers/power/supply/ab8500_fg.c
+++ b/drivers/power/supply/ab8500_fg.c
@@ -31,10 +31,11 @@
 #include <linux/mfd/core.h>
 #include <linux/mfd/abx500.h>
 #include <linux/mfd/abx500/ab8500.h>
-#include <linux/mfd/abx500/ab8500-bm.h>
 #include <linux/iio/consumer.h>
 #include <linux/kernel.h>
 
+#include "ab8500-bm.h"
+
 #define MILLI_TO_MICRO			1000
 #define FG_LSB_IN_MA			1627
 #define QLSB_NANO_AMP_HOURS_X10		1071
@@ -3026,7 +3027,6 @@ static const struct power_supply_desc ab8500_fg_desc = {
 static int ab8500_fg_probe(struct platform_device *pdev)
 {
 	struct device_node *np = pdev->dev.of_node;
-	struct abx500_bm_data *plat = pdev->dev.platform_data;
 	struct power_supply_config psy_cfg = {};
 	struct device *dev = &pdev->dev;
 	struct ab8500_fg *di;
@@ -3037,11 +3037,7 @@ static int ab8500_fg_probe(struct platform_device *pdev)
 	if (!di)
 		return -ENOMEM;
 
-	if (!plat) {
-		dev_err(dev, "no battery management data supplied\n");
-		return -EINVAL;
-	}
-	di->bm = plat;
+	di->bm = &ab8500_bm_data;
 
 	ret = ab8500_bm_of_probe(dev, np, di->bm);
 	if (ret) {
diff --git a/drivers/power/supply/abx500_chargalg.c b/drivers/power/supply/abx500_chargalg.c
index 591ddd2987a3..5b28d58041b4 100644
--- a/drivers/power/supply/abx500_chargalg.c
+++ b/drivers/power/supply/abx500_chargalg.c
@@ -29,9 +29,10 @@
 #include <linux/mfd/abx500.h>
 #include <linux/mfd/abx500/ab8500.h>
 #include <linux/mfd/abx500/ux500_chargalg.h>
-#include <linux/mfd/abx500/ab8500-bm.h>
 #include <linux/notifier.h>
 
+#include "ab8500-bm.h"
+
 /* Watchdog kick interval */
 #define CHG_WD_INTERVAL			(6 * HZ)
 
@@ -1980,7 +1981,6 @@ static const struct power_supply_desc abx500_chargalg_desc = {
 static int abx500_chargalg_probe(struct platform_device *pdev)
 {
 	struct device_node *np = pdev->dev.of_node;
-	struct abx500_bm_data *plat = pdev->dev.platform_data;
 	struct power_supply_config psy_cfg = {};
 	struct abx500_chargalg *di;
 	int ret = 0;
@@ -1991,11 +1991,7 @@ static int abx500_chargalg_probe(struct platform_device *pdev)
 		return -ENOMEM;
 	}
 
-	if (!plat) {
-		dev_err(&pdev->dev, "no battery management data supplied\n");
-		return -EINVAL;
-	}
-	di->bm = plat;
+	di->bm = &ab8500_bm_data;
 
 	ret = ab8500_bm_of_probe(&pdev->dev, np, di->bm);
 	if (ret) {
diff --git a/drivers/power/supply/pm2301_charger.c b/drivers/power/supply/pm2301_charger.c
index ac06ecf7fc9c..5aeff75db33b 100644
--- a/drivers/power/supply/pm2301_charger.c
+++ b/drivers/power/supply/pm2301_charger.c
@@ -18,13 +18,13 @@
 #include <linux/i2c.h>
 #include <linux/workqueue.h>
 #include <linux/mfd/abx500/ab8500.h>
-#include <linux/mfd/abx500/ab8500-bm.h>
 #include <linux/mfd/abx500/ux500_chargalg.h>
 #include <linux/pm2301_charger.h>
 #include <linux/gpio.h>
 #include <linux/pm_runtime.h>
 #include <linux/pm.h>
 
+#include "ab8500-bm.h"
 #include "pm2301_charger.h"
 
 #define to_pm2xxx_charger_ac_device_info(x) container_of((x), \
diff --git a/include/linux/mfd/abx500/ab8500-bm.h b/include/linux/mfd/abx500/ab8500-bm.h
deleted file mode 100644
index 903e94c189d8..000000000000
--- a/include/linux/mfd/abx500/ab8500-bm.h
+++ /dev/null
@@ -1,476 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright ST-Ericsson 2012.
- *
- * Author: Arun Murthy <arun.murthy@stericsson.com>
- */
-
-#ifndef _AB8500_BM_H
-#define _AB8500_BM_H
-
-#include <linux/kernel.h>
-#include <linux/mfd/abx500.h>
-
-/*
- * System control 2 register offsets.
- * bank = 0x02
- */
-#define AB8500_MAIN_WDOG_CTRL_REG	0x01
-#define AB8500_LOW_BAT_REG		0x03
-#define AB8500_BATT_OK_REG		0x04
-/*
- * USB/ULPI register offsets
- * Bank : 0x5
- */
-#define AB8500_USB_LINE_STAT_REG	0x80
-#define AB8500_USB_LINE_CTRL2_REG	0x82
-#define AB8500_USB_LINK1_STAT_REG	0x94
-
-/*
- * Charger / status register offfsets
- * Bank : 0x0B
- */
-#define AB8500_CH_STATUS1_REG		0x00
-#define AB8500_CH_STATUS2_REG		0x01
-#define AB8500_CH_USBCH_STAT1_REG	0x02
-#define AB8500_CH_USBCH_STAT2_REG	0x03
-#define AB8540_CH_USBCH_STAT3_REG	0x04
-#define AB8500_CH_STAT_REG		0x05
-
-/*
- * Charger / control register offfsets
- * Bank : 0x0B
- */
-#define AB8500_CH_VOLT_LVL_REG		0x40
-#define AB8500_CH_VOLT_LVL_MAX_REG	0x41  /*Only in Cut2.0*/
-#define AB8500_CH_OPT_CRNTLVL_REG	0x42
-#define AB8500_CH_OPT_CRNTLVL_MAX_REG	0x43  /*Only in Cut2.0*/
-#define AB8500_CH_WD_TIMER_REG		0x50
-#define AB8500_CHARG_WD_CTRL		0x51
-#define AB8500_BTEMP_HIGH_TH		0x52
-#define AB8500_LED_INDICATOR_PWM_CTRL	0x53
-#define AB8500_LED_INDICATOR_PWM_DUTY	0x54
-#define AB8500_BATT_OVV			0x55
-#define AB8500_CHARGER_CTRL		0x56
-#define AB8500_BAT_CTRL_CURRENT_SOURCE	0x60  /*Only in Cut2.0*/
-
-/*
- * Charger / main control register offsets
- * Bank : 0x0B
- */
-#define AB8500_MCH_CTRL1		0x80
-#define AB8500_MCH_CTRL2		0x81
-#define AB8500_MCH_IPT_CURLVL_REG	0x82
-#define AB8500_CH_WD_REG		0x83
-
-/*
- * Charger / USB control register offsets
- * Bank : 0x0B
- */
-#define AB8500_USBCH_CTRL1_REG		0xC0
-#define AB8500_USBCH_CTRL2_REG		0xC1
-#define AB8500_USBCH_IPT_CRNTLVL_REG	0xC2
-#define AB8540_USB_PP_MODE_REG		0xC5
-#define AB8540_USB_PP_CHR_REG		0xC6
-
-/*
- * Gas Gauge register offsets
- * Bank : 0x0C
- */
-#define AB8500_GASG_CC_CTRL_REG		0x00
-#define AB8500_GASG_CC_ACCU1_REG	0x01
-#define AB8500_GASG_CC_ACCU2_REG	0x02
-#define AB8500_GASG_CC_ACCU3_REG	0x03
-#define AB8500_GASG_CC_ACCU4_REG	0x04
-#define AB8500_GASG_CC_SMPL_CNTRL_REG	0x05
-#define AB8500_GASG_CC_SMPL_CNTRH_REG	0x06
-#define AB8500_GASG_CC_SMPL_CNVL_REG	0x07
-#define AB8500_GASG_CC_SMPL_CNVH_REG	0x08
-#define AB8500_GASG_CC_CNTR_AVGOFF_REG	0x09
-#define AB8500_GASG_CC_OFFSET_REG	0x0A
-#define AB8500_GASG_CC_NCOV_ACCU	0x10
-#define AB8500_GASG_CC_NCOV_ACCU_CTRL	0x11
-#define AB8500_GASG_CC_NCOV_ACCU_LOW	0x12
-#define AB8500_GASG_CC_NCOV_ACCU_MED	0x13
-#define AB8500_GASG_CC_NCOV_ACCU_HIGH	0x14
-
-/*
- * Interrupt register offsets
- * Bank : 0x0E
- */
-#define AB8500_IT_SOURCE2_REG		0x01
-#define AB8500_IT_SOURCE21_REG		0x14
-
-/*
- * RTC register offsets
- * Bank: 0x0F
- */
-#define AB8500_RTC_BACKUP_CHG_REG	0x0C
-#define AB8500_RTC_CC_CONF_REG		0x01
-#define AB8500_RTC_CTRL_REG		0x0B
-#define AB8500_RTC_CTRL1_REG		0x11
-
-/*
- * OTP register offsets
- * Bank : 0x15
- */
-#define AB8500_OTP_CONF_15		0x0E
-
-/* GPADC constants from AB8500 spec, UM0836 */
-#define ADC_RESOLUTION			1024
-#define ADC_CH_MAIN_MIN			0
-#define ADC_CH_MAIN_MAX			20030
-#define ADC_CH_VBUS_MIN			0
-#define ADC_CH_VBUS_MAX			20030
-#define ADC_CH_VBAT_MIN			2300
-#define ADC_CH_VBAT_MAX			4800
-#define ADC_CH_BKBAT_MIN		0
-#define ADC_CH_BKBAT_MAX		3200
-
-/* Main charge i/p current */
-#define MAIN_CH_IP_CUR_0P9A		0x80
-#define MAIN_CH_IP_CUR_1P0A		0x90
-#define MAIN_CH_IP_CUR_1P1A		0xA0
-#define MAIN_CH_IP_CUR_1P2A		0xB0
-#define MAIN_CH_IP_CUR_1P3A		0xC0
-#define MAIN_CH_IP_CUR_1P4A		0xD0
-#define MAIN_CH_IP_CUR_1P5A		0xE0
-
-/* ChVoltLevel */
-#define CH_VOL_LVL_3P5			0x00
-#define CH_VOL_LVL_4P0			0x14
-#define CH_VOL_LVL_4P05			0x16
-#define CH_VOL_LVL_4P1			0x1B
-#define CH_VOL_LVL_4P15			0x20
-#define CH_VOL_LVL_4P2			0x25
-#define CH_VOL_LVL_4P6			0x4D
-
-/* ChOutputCurrentLevel */
-#define CH_OP_CUR_LVL_0P1		0x00
-#define CH_OP_CUR_LVL_0P2		0x01
-#define CH_OP_CUR_LVL_0P3		0x02
-#define CH_OP_CUR_LVL_0P4		0x03
-#define CH_OP_CUR_LVL_0P5		0x04
-#define CH_OP_CUR_LVL_0P6		0x05
-#define CH_OP_CUR_LVL_0P7		0x06
-#define CH_OP_CUR_LVL_0P8		0x07
-#define CH_OP_CUR_LVL_0P9		0x08
-#define CH_OP_CUR_LVL_1P4		0x0D
-#define CH_OP_CUR_LVL_1P5		0x0E
-#define CH_OP_CUR_LVL_1P6		0x0F
-#define CH_OP_CUR_LVL_2P		0x3F
-
-/* BTEMP High thermal limits */
-#define BTEMP_HIGH_TH_57_0		0x00
-#define BTEMP_HIGH_TH_52		0x01
-#define BTEMP_HIGH_TH_57_1		0x02
-#define BTEMP_HIGH_TH_62		0x03
-
-/* current is mA */
-#define USB_0P1A			100
-#define USB_0P2A			200
-#define USB_0P3A			300
-#define USB_0P4A			400
-#define USB_0P5A			500
-
-#define LOW_BAT_3P1V			0x20
-#define LOW_BAT_2P3V			0x00
-#define LOW_BAT_RESET			0x01
-#define LOW_BAT_ENABLE			0x01
-
-/* Backup battery constants */
-#define BUP_ICH_SEL_50UA		0x00
-#define BUP_ICH_SEL_150UA		0x04
-#define BUP_ICH_SEL_300UA		0x08
-#define BUP_ICH_SEL_700UA		0x0C
-
-enum bup_vch_sel {
-	BUP_VCH_SEL_2P5V,
-	BUP_VCH_SEL_2P6V,
-	BUP_VCH_SEL_2P8V,
-	BUP_VCH_SEL_3P1V,
-	/*
-	 * Note that the following 5 values 2.7v, 2.9v, 3.0v, 3.2v, 3.3v
-	 * are only available on ab8540. You can't choose these 5
-	 * voltage on ab8500/ab8505/ab9540.
-	 */
-	BUP_VCH_SEL_2P7V,
-	BUP_VCH_SEL_2P9V,
-	BUP_VCH_SEL_3P0V,
-	BUP_VCH_SEL_3P2V,
-	BUP_VCH_SEL_3P3V,
-};
-
-#define BUP_VCH_RANGE		0x02
-#define VBUP33_VRTCN		0x01
-
-/* Battery OVV constants */
-#define BATT_OVV_ENA			0x02
-#define BATT_OVV_TH_3P7			0x00
-#define BATT_OVV_TH_4P75		0x01
-
-/* A value to indicate over voltage */
-#define BATT_OVV_VALUE			4750
-
-/* VBUS OVV constants */
-#define VBUS_OVV_SELECT_MASK		0x78
-#define VBUS_OVV_SELECT_5P6V		0x00
-#define VBUS_OVV_SELECT_5P7V		0x08
-#define VBUS_OVV_SELECT_5P8V		0x10
-#define VBUS_OVV_SELECT_5P9V		0x18
-#define VBUS_OVV_SELECT_6P0V		0x20
-#define VBUS_OVV_SELECT_6P1V		0x28
-#define VBUS_OVV_SELECT_6P2V		0x30
-#define VBUS_OVV_SELECT_6P3V		0x38
-
-#define VBUS_AUTO_IN_CURR_LIM_ENA	0x04
-
-/* Fuel Gauge constants */
-#define RESET_ACCU			0x02
-#define READ_REQ			0x01
-#define CC_DEEP_SLEEP_ENA		0x02
-#define CC_PWR_UP_ENA			0x01
-#define CC_SAMPLES_40			0x28
-#define RD_NCONV_ACCU_REQ		0x01
-#define CC_CALIB			0x08
-#define CC_INTAVGOFFSET_ENA		0x10
-#define CC_MUXOFFSET			0x80
-#define CC_INT_CAL_N_AVG_MASK		0x60
-#define CC_INT_CAL_SAMPLES_16		0x40
-#define CC_INT_CAL_SAMPLES_8		0x20
-#define CC_INT_CAL_SAMPLES_4		0x00
-
-/* RTC constants */
-#define RTC_BUP_CH_ENA			0x10
-
-/* BatCtrl Current Source Constants */
-#define BAT_CTRL_7U_ENA			0x01
-#define BAT_CTRL_20U_ENA		0x02
-#define BAT_CTRL_18U_ENA		0x01
-#define BAT_CTRL_16U_ENA		0x02
-#define BAT_CTRL_CMP_ENA		0x04
-#define FORCE_BAT_CTRL_CMP_HIGH		0x08
-#define BAT_CTRL_PULL_UP_ENA		0x10
-
-/* Battery type */
-#define BATTERY_UNKNOWN			00
-
-/* Registers for pcut feature in ab8505 and ab9540 */
-#define AB8505_RTC_PCUT_CTL_STATUS_REG	0x12
-#define AB8505_RTC_PCUT_TIME_REG	0x13
-#define AB8505_RTC_PCUT_MAX_TIME_REG	0x14
-#define AB8505_RTC_PCUT_FLAG_TIME_REG	0x15
-#define AB8505_RTC_PCUT_RESTART_REG	0x16
-#define AB8505_RTC_PCUT_DEBOUNCE_REG	0x17
-
-/* USB Power Path constants for ab8540 */
-#define BUS_VSYS_VOL_SELECT_MASK		0x06
-#define BUS_VSYS_VOL_SELECT_3P6V		0x00
-#define BUS_VSYS_VOL_SELECT_3P325V		0x02
-#define BUS_VSYS_VOL_SELECT_3P9V		0x04
-#define BUS_VSYS_VOL_SELECT_4P3V		0x06
-#define BUS_POWER_PATH_MODE_ENA			0x01
-#define BUS_PP_PRECHG_CURRENT_MASK		0x0E
-#define BUS_POWER_PATH_PRECHG_ENA		0x01
-
-/**
- * struct res_to_temp - defines one point in a temp to res curve. To
- * be used in battery packs that combines the identification resistor with a
- * NTC resistor.
- * @temp:			battery pack temperature in Celsius
- * @resist:			NTC resistor net total resistance
- */
-struct res_to_temp {
-	int temp;
-	int resist;
-};
-
-/**
- * struct batres_vs_temp - defines one point in a temp vs battery internal
- * resistance curve.
- * @temp:			battery pack temperature in Celsius
- * @resist:			battery internal reistance in mOhm
- */
-struct batres_vs_temp {
-	int temp;
-	int resist;
-};
-
-/* Forward declaration */
-struct ab8500_fg;
-
-/**
- * struct ab8500_fg_parameters - Fuel gauge algorithm parameters, in seconds
- * if not specified
- * @recovery_sleep_timer:	Time between measurements while recovering
- * @recovery_total_time:	Total recovery time
- * @init_timer:			Measurement interval during startup
- * @init_discard_time:		Time we discard voltage measurement at startup
- * @init_total_time:		Total init time during startup
- * @high_curr_time:		Time current has to be high to go to recovery
- * @accu_charging:		FG accumulation time while charging
- * @accu_high_curr:		FG accumulation time in high current mode
- * @high_curr_threshold:	High current threshold, in mA
- * @lowbat_threshold:		Low battery threshold, in mV
- * @battok_falling_th_sel0	Threshold in mV for battOk signal sel0
- *				Resolution in 50 mV step.
- * @battok_raising_th_sel1	Threshold in mV for battOk signal sel1
- *				Resolution in 50 mV step.
- * @user_cap_limit		Capacity reported from user must be within this
- *				limit to be considered as sane, in percentage
- *				points.
- * @maint_thres			This is the threshold where we stop reporting
- *				battery full while in maintenance, in per cent
- * @pcut_enable:			Enable power cut feature in ab8505
- * @pcut_max_time:		Max time threshold
- * @pcut_flag_time:		Flagtime threshold
- * @pcut_max_restart:		Max number of restarts
- * @pcut_debunce_time:	Sets battery debounce time
- */
-struct ab8500_fg_parameters {
-	int recovery_sleep_timer;
-	int recovery_total_time;
-	int init_timer;
-	int init_discard_time;
-	int init_total_time;
-	int high_curr_time;
-	int accu_charging;
-	int accu_high_curr;
-	int high_curr_threshold;
-	int lowbat_threshold;
-	int battok_falling_th_sel0;
-	int battok_raising_th_sel1;
-	int user_cap_limit;
-	int maint_thres;
-	bool pcut_enable;
-	u8 pcut_max_time;
-	u8 pcut_flag_time;
-	u8 pcut_max_restart;
-	u8 pcut_debunce_time;
-};
-
-/**
- * struct ab8500_charger_maximization - struct used by the board config.
- * @use_maxi:		Enable maximization for this battery type
- * @maxi_chg_curr:	Maximum charger current allowed
- * @maxi_wait_cycles:	cycles to wait before setting charger current
- * @charger_curr_step	delta between two charger current settings (mA)
- */
-struct ab8500_maxim_parameters {
-	bool ena_maxi;
-	int chg_curr;
-	int wait_cycles;
-	int charger_curr_step;
-};
-
-/**
- * struct ab8500_bm_capacity_levels - ab8500 capacity level data
- * @critical:		critical capacity level in percent
- * @low:		low capacity level in percent
- * @normal:		normal capacity level in percent
- * @high:		high capacity level in percent
- * @full:		full capacity level in percent
- */
-struct ab8500_bm_capacity_levels {
-	int critical;
-	int low;
-	int normal;
-	int high;
-	int full;
-};
-
-/**
- * struct ab8500_bm_charger_parameters - Charger specific parameters
- * @usb_volt_max:	maximum allowed USB charger voltage in mV
- * @usb_curr_max:	maximum allowed USB charger current in mA
- * @ac_volt_max:	maximum allowed AC charger voltage in mV
- * @ac_curr_max:	maximum allowed AC charger current in mA
- */
-struct ab8500_bm_charger_parameters {
-	int usb_volt_max;
-	int usb_curr_max;
-	int ac_volt_max;
-	int ac_curr_max;
-};
-
-/**
- * struct ab8500_bm_data - ab8500 battery management data
- * @temp_under		under this temp, charging is stopped
- * @temp_low		between this temp and temp_under charging is reduced
- * @temp_high		between this temp and temp_over charging is reduced
- * @temp_over		over this temp, charging is stopped
- * @temp_interval_chg	temperature measurement interval in s when charging
- * @temp_interval_nochg	temperature measurement interval in s when not charging
- * @main_safety_tmr_h	safety timer for main charger
- * @usb_safety_tmr_h	safety timer for usb charger
- * @bkup_bat_v		voltage which we charge the backup battery with
- * @bkup_bat_i		current which we charge the backup battery with
- * @no_maintenance	indicates that maintenance charging is disabled
- * @capacity_scaling    indicates whether capacity scaling is to be used
- * @adc_therm		placement of thermistor, batctrl or battemp adc
- * @chg_unknown_bat	flag to enable charging of unknown batteries
- * @enable_overshoot	flag to enable VBAT overshoot control
- * @fg_res		resistance of FG resistor in 0.1mOhm
- * @n_btypes		number of elements in array bat_type
- * @batt_id		index of the identified battery in array bat_type
- * @interval_charging	charge alg cycle period time when charging (sec)
- * @interval_not_charging charge alg cycle period time when not charging (sec)
- * @temp_hysteresis	temperature hysteresis
- * @gnd_lift_resistance	Battery ground to phone ground resistance (mOhm)
- * @maxi:		maximization parameters
- * @cap_levels		capacity in percent for the different capacity levels
- * @bat_type		table of supported battery types
- * @chg_params		charger parameters
- * @fg_params		fuel gauge parameters
- */
-struct ab8500_bm_data {
-	int temp_under;
-	int temp_low;
-	int temp_high;
-	int temp_over;
-	int temp_interval_chg;
-	int temp_interval_nochg;
-	int main_safety_tmr_h;
-	int usb_safety_tmr_h;
-	int bkup_bat_v;
-	int bkup_bat_i;
-	bool no_maintenance;
-	bool capacity_scaling;
-	bool chg_unknown_bat;
-	bool enable_overshoot;
-	enum abx500_adc_therm adc_therm;
-	int fg_res;
-	int n_btypes;
-	int batt_id;
-	int interval_charging;
-	int interval_not_charging;
-	int temp_hysteresis;
-	int gnd_lift_resistance;
-	const struct ab8500_maxim_parameters *maxi;
-	const struct ab8500_bm_capacity_levels *cap_levels;
-	const struct ab8500_bm_charger_parameters *chg_params;
-	const struct ab8500_fg_parameters *fg_params;
-};
-
-struct ab8500_btemp;
-struct ab8500_gpadc;
-struct ab8500_fg;
-
-#ifdef CONFIG_AB8500_BM
-extern struct abx500_bm_data ab8500_bm_data;
-
-void ab8500_charger_usb_state_changed(u8 bm_usb_state, u16 mA);
-struct ab8500_btemp *ab8500_btemp_get(void);
-int ab8500_btemp_get_batctrl_temp(struct ab8500_btemp *btemp);
-int ab8500_btemp_get_temp(struct ab8500_btemp *btemp);
-struct ab8500_fg *ab8500_fg_get(void);
-int ab8500_fg_inst_curr_blocking(struct ab8500_fg *dev);
-int ab8500_fg_inst_curr_start(struct ab8500_fg *di);
-int ab8500_fg_inst_curr_finalize(struct ab8500_fg *di, int *res);
-int ab8500_fg_inst_curr_started(struct ab8500_fg *di);
-int ab8500_fg_inst_curr_done(struct ab8500_fg *di);
-
-#else
-static struct abx500_bm_data ab8500_bm_data;
-#endif
-#endif /* _AB8500_BM_H */
-- 
cgit v1.2.3


From a65aa0ce23bd3bc29c9f3a6a90fe032e1dbd8f34 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Fri, 12 Mar 2021 09:36:03 +0100
Subject: mfd/power: ab8500: Push algorithm to power supply code

The charging algorithm header is only used locally in the
power supply subsystem so push this down into
drivers/power/supply and rename from the confusing
"ux500_chargalg.h" to "ab8500-chargalg.h" for clarity:
it is only used with the AB8500.

This is another remnant of non-DT code needing to pass
data from boardfiles, which we don't do anymore.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Acked-by: Sebastian Reichel <sebastian.reichel@collabora.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/power/supply/ab8500-chargalg.h    | 51 +++++++++++++++++++++++++++++++
 drivers/power/supply/ab8500_charger.c     |  2 +-
 drivers/power/supply/abx500_chargalg.c    |  2 +-
 drivers/power/supply/pm2301_charger.c     |  2 +-
 include/linux/mfd/abx500/ux500_chargalg.h | 51 -------------------------------
 5 files changed, 54 insertions(+), 54 deletions(-)
 create mode 100644 drivers/power/supply/ab8500-chargalg.h
 delete mode 100644 include/linux/mfd/abx500/ux500_chargalg.h

(limited to 'include/linux')

diff --git a/drivers/power/supply/ab8500-chargalg.h b/drivers/power/supply/ab8500-chargalg.h
new file mode 100644
index 000000000000..94a6f9068bc5
--- /dev/null
+++ b/drivers/power/supply/ab8500-chargalg.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) ST-Ericsson SA 2012
+ * Author: Johan Gardsmark <johan.gardsmark@stericsson.com> for ST-Ericsson.
+ */
+
+#ifndef _AB8500_CHARGALG_H_
+#define _AB8500_CHARGALG_H_
+
+#include <linux/power_supply.h>
+
+/*
+ * Valid only for supplies of type:
+ * - POWER_SUPPLY_TYPE_MAINS,
+ * - POWER_SUPPLY_TYPE_USB,
+ * because only they store a pointer to struct ux500_charger as drv_data.
+ */
+#define psy_to_ux500_charger(x) power_supply_get_drvdata(psy)
+
+/* Forward declaration */
+struct ux500_charger;
+
+struct ux500_charger_ops {
+	int (*enable) (struct ux500_charger *, int, int, int);
+	int (*check_enable) (struct ux500_charger *, int, int);
+	int (*kick_wd) (struct ux500_charger *);
+	int (*update_curr) (struct ux500_charger *, int);
+};
+
+/**
+ * struct ux500_charger - power supply ux500 charger sub class
+ * @psy			power supply base class
+ * @ops			ux500 charger operations
+ * @max_out_volt	maximum output charger voltage in mV
+ * @max_out_curr	maximum output charger current in mA
+ * @enabled		indicates if this charger is used or not
+ * @external		external charger unit (pm2xxx)
+ */
+struct ux500_charger {
+	struct power_supply *psy;
+	struct ux500_charger_ops ops;
+	int max_out_volt;
+	int max_out_curr;
+	int wdt_refresh;
+	bool enabled;
+	bool external;
+};
+
+extern struct blocking_notifier_head charger_notifier_list;
+
+#endif /* _AB8500_CHARGALG_H_ */
diff --git a/drivers/power/supply/ab8500_charger.c b/drivers/power/supply/ab8500_charger.c
index 50989a5ec95c..a9be10eb2c22 100644
--- a/drivers/power/supply/ab8500_charger.c
+++ b/drivers/power/supply/ab8500_charger.c
@@ -28,12 +28,12 @@
 #include <linux/mfd/core.h>
 #include <linux/mfd/abx500/ab8500.h>
 #include <linux/mfd/abx500.h>
-#include <linux/mfd/abx500/ux500_chargalg.h>
 #include <linux/usb/otg.h>
 #include <linux/mutex.h>
 #include <linux/iio/consumer.h>
 
 #include "ab8500-bm.h"
+#include "ab8500-chargalg.h"
 
 /* Charger constants */
 #define NO_PW_CONN			0
diff --git a/drivers/power/supply/abx500_chargalg.c b/drivers/power/supply/abx500_chargalg.c
index 5b28d58041b4..f5b792243727 100644
--- a/drivers/power/supply/abx500_chargalg.c
+++ b/drivers/power/supply/abx500_chargalg.c
@@ -28,10 +28,10 @@
 #include <linux/mfd/core.h>
 #include <linux/mfd/abx500.h>
 #include <linux/mfd/abx500/ab8500.h>
-#include <linux/mfd/abx500/ux500_chargalg.h>
 #include <linux/notifier.h>
 
 #include "ab8500-bm.h"
+#include "ab8500-chargalg.h"
 
 /* Watchdog kick interval */
 #define CHG_WD_INTERVAL			(6 * HZ)
diff --git a/drivers/power/supply/pm2301_charger.c b/drivers/power/supply/pm2301_charger.c
index 5aeff75db33b..d53e0c37c059 100644
--- a/drivers/power/supply/pm2301_charger.c
+++ b/drivers/power/supply/pm2301_charger.c
@@ -18,13 +18,13 @@
 #include <linux/i2c.h>
 #include <linux/workqueue.h>
 #include <linux/mfd/abx500/ab8500.h>
-#include <linux/mfd/abx500/ux500_chargalg.h>
 #include <linux/pm2301_charger.h>
 #include <linux/gpio.h>
 #include <linux/pm_runtime.h>
 #include <linux/pm.h>
 
 #include "ab8500-bm.h"
+#include "ab8500-chargalg.h"
 #include "pm2301_charger.h"
 
 #define to_pm2xxx_charger_ac_device_info(x) container_of((x), \
diff --git a/include/linux/mfd/abx500/ux500_chargalg.h b/include/linux/mfd/abx500/ux500_chargalg.h
deleted file mode 100644
index 9b97d284d0ce..000000000000
--- a/include/linux/mfd/abx500/ux500_chargalg.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) ST-Ericsson SA 2012
- * Author: Johan Gardsmark <johan.gardsmark@stericsson.com> for ST-Ericsson.
- */
-
-#ifndef _UX500_CHARGALG_H
-#define _UX500_CHARGALG_H
-
-#include <linux/power_supply.h>
-
-/*
- * Valid only for supplies of type:
- * - POWER_SUPPLY_TYPE_MAINS,
- * - POWER_SUPPLY_TYPE_USB,
- * because only them store as drv_data pointer to struct ux500_charger.
- */
-#define psy_to_ux500_charger(x) power_supply_get_drvdata(psy)
-
-/* Forward declaration */
-struct ux500_charger;
-
-struct ux500_charger_ops {
-	int (*enable) (struct ux500_charger *, int, int, int);
-	int (*check_enable) (struct ux500_charger *, int, int);
-	int (*kick_wd) (struct ux500_charger *);
-	int (*update_curr) (struct ux500_charger *, int);
-};
-
-/**
- * struct ux500_charger - power supply ux500 charger sub class
- * @psy			power supply base class
- * @ops			ux500 charger operations
- * @max_out_volt	maximum output charger voltage in mV
- * @max_out_curr	maximum output charger current in mA
- * @enabled		indicates if this charger is used or not
- * @external		external charger unit (pm2xxx)
- */
-struct ux500_charger {
-	struct power_supply *psy;
-	struct ux500_charger_ops ops;
-	int max_out_volt;
-	int max_out_curr;
-	int wdt_refresh;
-	bool enabled;
-	bool external;
-};
-
-extern struct blocking_notifier_head charger_notifier_list;
-
-#endif
-- 
cgit v1.2.3


From ee0975c3089e1c3357ccc3ada7a94a95b61e708c Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Fri, 12 Mar 2021 09:36:04 +0100
Subject: mfd/power: ab8500: Push data to power supply code

There is a slew of defines, structs and enums and even a
function call only relevant for the charging code that
still lives in <linux/mfd/abx500.h>. Push it down to the
"ab8500-bm.h" header in the power supply subsystem where
it is actually used.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Acked-by: Sebastian Reichel <sebastian.reichel@collabora.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/power/supply/ab8500-bm.h | 278 ++++++++++++++++++++++++++++++++++++++-
 include/linux/mfd/abx500.h       | 276 --------------------------------------
 2 files changed, 274 insertions(+), 280 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/power/supply/ab8500-bm.h b/drivers/power/supply/ab8500-bm.h
index a1b31c971a45..41c69a4f2a1f 100644
--- a/drivers/power/supply/ab8500-bm.h
+++ b/drivers/power/supply/ab8500-bm.h
@@ -4,7 +4,6 @@
 #define _AB8500_CHARGER_H_
 
 #include <linux/kernel.h>
-#include <linux/mfd/abx500.h>
 
 /*
  * System control 2 register offsets.
@@ -268,6 +267,277 @@ enum bup_vch_sel {
 #define BUS_PP_PRECHG_CURRENT_MASK		0x0E
 #define BUS_POWER_PATH_PRECHG_ENA		0x01
 
+/*
+ * ADC for the battery thermistor.
+ * When using the ABx500_ADC_THERM_BATCTRL the battery ID resistor is combined
+ * with a NTC resistor to both identify the battery and to measure its
+ * temperature. Different phone manufacturers use different techniques to both
+ * identify the battery and to read its temperature.
+ */
+enum abx500_adc_therm {
+	ABx500_ADC_THERM_BATCTRL,
+	ABx500_ADC_THERM_BATTEMP,
+};
+
+/**
+ * struct abx500_res_to_temp - defines one point in a temp to res curve. To
+ * be used in battery packs that combine the identification resistor with a
+ * NTC resistor.
+ * @temp:			battery pack temperature in Celsius
+ * @resist:			NTC resistor net total resistance
+ */
+struct abx500_res_to_temp {
+	int temp;
+	int resist;
+};
+
+/**
+ * struct abx500_v_to_cap - Table for translating voltage to capacity
+ * @voltage:		Voltage in mV
+ * @capacity:		Capacity in percent
+ */
+struct abx500_v_to_cap {
+	int voltage;
+	int capacity;
+};
+
+/* Forward declaration */
+struct abx500_fg;
+
+/**
+ * struct abx500_fg_parameters - Fuel gauge algorithm parameters, in seconds
+ * if not specified
+ * @recovery_sleep_timer:	Time between measurements while recovering
+ * @recovery_total_time:	Total recovery time
+ * @init_timer:			Measurement interval during startup
+ * @init_discard_time:		Time we discard voltage measurement at startup
+ * @init_total_time:		Total init time during startup
+ * @high_curr_time:		Time current has to be high to go to recovery
+ * @accu_charging:		FG accumulation time while charging
+ * @accu_high_curr:		FG accumulation time in high current mode
+ * @high_curr_threshold:	High current threshold, in mA
+ * @lowbat_threshold:		Low battery threshold, in mV
+ * @overbat_threshold:		Over battery threshold, in mV
+ * @battok_falling_th_sel0	Threshold in mV for battOk signal sel0
+ *				Resolution in 50 mV step.
+ * @battok_raising_th_sel1	Threshold in mV for battOk signal sel1
+ *				Resolution in 50 mV step.
+ * @user_cap_limit		Capacity reported from user must be within this
+ *				limit to be considered as sane, in percentage
+ *				points.
+ * @maint_thres			This is the threshold where we stop reporting
+ *				battery full while in maintenance, in per cent
+ * @pcut_enable:			Enable power cut feature in ab8505
+ * @pcut_max_time:		Max time threshold
+ * @pcut_flag_time:		Flagtime threshold
+ * @pcut_max_restart:		Max number of restarts
+ * @pcut_debounce_time:		Sets battery debounce time
+ */
+struct abx500_fg_parameters {
+	int recovery_sleep_timer;
+	int recovery_total_time;
+	int init_timer;
+	int init_discard_time;
+	int init_total_time;
+	int high_curr_time;
+	int accu_charging;
+	int accu_high_curr;
+	int high_curr_threshold;
+	int lowbat_threshold;
+	int overbat_threshold;
+	int battok_falling_th_sel0;
+	int battok_raising_th_sel1;
+	int user_cap_limit;
+	int maint_thres;
+	bool pcut_enable;
+	u8 pcut_max_time;
+	u8 pcut_flag_time;
+	u8 pcut_max_restart;
+	u8 pcut_debounce_time;
+};
+
+/**
+ * struct abx500_maxim_parameters - struct used by the board config.
+ * @ena_maxi:		Enable maximization for this battery type
+ * @chg_curr:		Maximum charger current allowed
+ * @wait_cycles:	cycles to wait before setting charger current
+ * @charger_curr_step:	delta between two charger current settings (mA)
+ */
+struct abx500_maxim_parameters {
+	bool ena_maxi;
+	int chg_curr;
+	int wait_cycles;
+	int charger_curr_step;
+};
+
+/**
+ * struct abx500_battery_type - different batteries supported
+ * @name:			battery technology
+ * @resis_high:			battery upper resistance limit
+ * @resis_low:			battery lower resistance limit
+ * @charge_full_design:		Maximum battery capacity in mAh
+ * @nominal_voltage:		Nominal voltage of the battery in mV
+ * @termination_vol:		max voltage up to which battery can be charged
+ * @termination_curr		battery charging termination current in mA
+ * @recharge_cap		battery capacity limit that will trigger a new
+ *				full charging cycle in the case where
+ *				maintenance charging has been disabled
+ * @normal_cur_lvl:		charger current in normal state in mA
+ * @normal_vol_lvl:		charger voltage in normal state in mV
+ * @maint_a_cur_lvl:		charger current in maintenance A state in mA
+ * @maint_a_vol_lvl:		charger voltage in maintenance A state in mV
+ * @maint_a_chg_timer_h:	charge time in maintenance A state
+ * @maint_b_cur_lvl:		charger current in maintenance B state in mA
+ * @maint_b_vol_lvl:		charger voltage in maintenance B state in mV
+ * @maint_b_chg_timer_h:	charge time in maintenance B state
+ * @low_high_cur_lvl:		charger current in temp low/high state in mA
+ * @low_high_vol_lvl:		charger voltage in temp low/high state in mV
+ * @battery_resistance:		battery inner resistance in mOhm.
+ * @n_temp_tbl_elements:	number of elements in r_to_t_tbl
+ * @r_to_t_tbl:			table containing resistance to temp points
+ * @n_v_cap_tbl_elements:	number of elements in v_to_cap_tbl
+ * @v_to_cap_tbl:		Voltage to capacity (in %) table
+ * @n_batres_tbl_elements	number of elements in the batres_tbl
+ * @batres_tbl			battery internal resistance vs temperature table
+ */
+struct abx500_battery_type {
+	int name;
+	int resis_high;
+	int resis_low;
+	int charge_full_design;
+	int nominal_voltage;
+	int termination_vol;
+	int termination_curr;
+	int recharge_cap;
+	int normal_cur_lvl;
+	int normal_vol_lvl;
+	int maint_a_cur_lvl;
+	int maint_a_vol_lvl;
+	int maint_a_chg_timer_h;
+	int maint_b_cur_lvl;
+	int maint_b_vol_lvl;
+	int maint_b_chg_timer_h;
+	int low_high_cur_lvl;
+	int low_high_vol_lvl;
+	int battery_resistance;
+	int n_temp_tbl_elements;
+	const struct abx500_res_to_temp *r_to_t_tbl;
+	int n_v_cap_tbl_elements;
+	const struct abx500_v_to_cap *v_to_cap_tbl;
+	int n_batres_tbl_elements;
+	const struct batres_vs_temp *batres_tbl;
+};
+
+/**
+ * struct abx500_bm_capacity_levels - abx500 capacity level data
+ * @critical:		critical capacity level in percent
+ * @low:		low capacity level in percent
+ * @normal:		normal capacity level in percent
+ * @high:		high capacity level in percent
+ * @full:		full capacity level in percent
+ */
+struct abx500_bm_capacity_levels {
+	int critical;
+	int low;
+	int normal;
+	int high;
+	int full;
+};
+
+/**
+ * struct abx500_bm_charger_parameters - Charger specific parameters
+ * @usb_volt_max:	maximum allowed USB charger voltage in mV
+ * @usb_curr_max:	maximum allowed USB charger current in mA
+ * @ac_volt_max:	maximum allowed AC charger voltage in mV
+ * @ac_curr_max:	maximum allowed AC charger current in mA
+ */
+struct abx500_bm_charger_parameters {
+	int usb_volt_max;
+	int usb_curr_max;
+	int ac_volt_max;
+	int ac_curr_max;
+};
+
+/**
+ * struct abx500_bm_data - abx500 battery management data
+ * @temp_under		under this temp, charging is stopped
+ * @temp_low		between this temp and temp_under charging is reduced
+ * @temp_high		between this temp and temp_over charging is reduced
+ * @temp_over		over this temp, charging is stopped
+ * @temp_now		present battery temperature
+ * @temp_interval_chg	temperature measurement interval in s when charging
+ * @temp_interval_nochg	temperature measurement interval in s when not charging
+ * @main_safety_tmr_h	safety timer for main charger
+ * @usb_safety_tmr_h	safety timer for usb charger
+ * @bkup_bat_v		voltage which we charge the backup battery with
+ * @bkup_bat_i		current which we charge the backup battery with
+ * @no_maintenance	indicates that maintenance charging is disabled
+ * @capacity_scaling    indicates whether capacity scaling is to be used
+ * @adc_therm		placement of thermistor, batctrl or battemp adc
+ * @chg_unknown_bat	flag to enable charging of unknown batteries
+ * @enable_overshoot	flag to enable VBAT overshoot control
+ * @auto_trig		flag to enable auto adc trigger
+ * @fg_res		resistance of FG resistor in 0.1mOhm
+ * @n_btypes		number of elements in array bat_type
+ * @batt_id		index of the identified battery in array bat_type
+ * @interval_charging	charge alg cycle period time when charging (sec)
+ * @interval_not_charging charge alg cycle period time when not charging (sec)
+ * @temp_hysteresis	temperature hysteresis
+ * @gnd_lift_resistance	Battery ground to phone ground resistance (mOhm)
+ * @n_chg_out_curr		number of elements in array chg_output_curr
+ * @n_chg_in_curr		number of elements in array chg_input_curr
+ * @chg_output_curr	charger output current level map
+ * @chg_input_curr		charger input current level map
+ * @maxi		maximization parameters
+ * @cap_levels		capacity in percent for the different capacity levels
+ * @bat_type		table of supported battery types
+ * @chg_params		charger parameters
+ * @fg_params		fuel gauge parameters
+ */
+struct abx500_bm_data {
+	int temp_under;
+	int temp_low;
+	int temp_high;
+	int temp_over;
+	int temp_now;
+	int temp_interval_chg;
+	int temp_interval_nochg;
+	int main_safety_tmr_h;
+	int usb_safety_tmr_h;
+	int bkup_bat_v;
+	int bkup_bat_i;
+	bool autopower_cfg;
+	bool ac_enabled;
+	bool usb_enabled;
+	bool no_maintenance;
+	bool capacity_scaling;
+	bool chg_unknown_bat;
+	bool enable_overshoot;
+	bool auto_trig;
+	enum abx500_adc_therm adc_therm;
+	int fg_res;
+	int n_btypes;
+	int batt_id;
+	int interval_charging;
+	int interval_not_charging;
+	int temp_hysteresis;
+	int gnd_lift_resistance;
+	int n_chg_out_curr;
+	int n_chg_in_curr;
+	int *chg_output_curr;
+	int *chg_input_curr;
+	const struct abx500_maxim_parameters *maxi;
+	const struct abx500_bm_capacity_levels *cap_levels;
+	struct abx500_battery_type *bat_type;
+	const struct abx500_bm_charger_parameters *chg_params;
+	const struct abx500_fg_parameters *fg_params;
+};
+
+enum {
+	NTC_EXTERNAL = 0,
+	NTC_INTERNAL,
+};
+
 /**
  * struct res_to_temp - defines one point in a temp to res curve. To
  * be used in battery packs that combines the identification resistor with a
@@ -447,9 +717,6 @@ struct ab8500_bm_data {
 	const struct ab8500_fg_parameters *fg_params;
 };
 
-struct ab8500_btemp;
-struct ab8500_fg;
-
 extern struct abx500_bm_data ab8500_bm_data;
 
 void ab8500_charger_usb_state_changed(u8 bm_usb_state, u16 mA);
@@ -459,5 +726,8 @@ int ab8500_fg_inst_curr_start(struct ab8500_fg *di);
 int ab8500_fg_inst_curr_finalize(struct ab8500_fg *di, int *res);
 int ab8500_fg_inst_curr_started(struct ab8500_fg *di);
 int ab8500_fg_inst_curr_done(struct ab8500_fg *di);
+int ab8500_bm_of_probe(struct device *dev,
+		       struct device_node *np,
+		       struct abx500_bm_data *bm);
 
 #endif /* _AB8500_CHARGER_H_ */
diff --git a/include/linux/mfd/abx500.h b/include/linux/mfd/abx500.h
index 23040b6f1615..7f07cfe44753 100644
--- a/include/linux/mfd/abx500.h
+++ b/include/linux/mfd/abx500.h
@@ -28,282 +28,6 @@ struct abx500_init_settings {
 	u8 setting;
 };
 
-/* Battery driver related data */
-/*
- * ADC for the battery thermistor.
- * When using the ABx500_ADC_THERM_BATCTRL the battery ID resistor is combined
- * with a NTC resistor to both identify the battery and to measure its
- * temperature. Different phone manufactures uses different techniques to both
- * identify the battery and to read its temperature.
- */
-enum abx500_adc_therm {
-	ABx500_ADC_THERM_BATCTRL,
-	ABx500_ADC_THERM_BATTEMP,
-};
-
-/**
- * struct abx500_res_to_temp - defines one point in a temp to res curve. To
- * be used in battery packs that combines the identification resistor with a
- * NTC resistor.
- * @temp:			battery pack temperature in Celsius
- * @resist:			NTC resistor net total resistance
- */
-struct abx500_res_to_temp {
-	int temp;
-	int resist;
-};
-
-/**
- * struct abx500_v_to_cap - Table for translating voltage to capacity
- * @voltage:		Voltage in mV
- * @capacity:		Capacity in percent
- */
-struct abx500_v_to_cap {
-	int voltage;
-	int capacity;
-};
-
-/* Forward declaration */
-struct abx500_fg;
-
-/**
- * struct abx500_fg_parameters - Fuel gauge algorithm parameters, in seconds
- * if not specified
- * @recovery_sleep_timer:	Time between measurements while recovering
- * @recovery_total_time:	Total recovery time
- * @init_timer:			Measurement interval during startup
- * @init_discard_time:		Time we discard voltage measurement at startup
- * @init_total_time:		Total init time during startup
- * @high_curr_time:		Time current has to be high to go to recovery
- * @accu_charging:		FG accumulation time while charging
- * @accu_high_curr:		FG accumulation time in high current mode
- * @high_curr_threshold:	High current threshold, in mA
- * @lowbat_threshold:		Low battery threshold, in mV
- * @overbat_threshold:		Over battery threshold, in mV
- * @battok_falling_th_sel0	Threshold in mV for battOk signal sel0
- *				Resolution in 50 mV step.
- * @battok_raising_th_sel1	Threshold in mV for battOk signal sel1
- *				Resolution in 50 mV step.
- * @user_cap_limit		Capacity reported from user must be within this
- *				limit to be considered as sane, in percentage
- *				points.
- * @maint_thres			This is the threshold where we stop reporting
- *				battery full while in maintenance, in per cent
- * @pcut_enable:			Enable power cut feature in ab8505
- * @pcut_max_time:		Max time threshold
- * @pcut_flag_time:		Flagtime threshold
- * @pcut_max_restart:		Max number of restarts
- * @pcut_debounce_time:		Sets battery debounce time
- */
-struct abx500_fg_parameters {
-	int recovery_sleep_timer;
-	int recovery_total_time;
-	int init_timer;
-	int init_discard_time;
-	int init_total_time;
-	int high_curr_time;
-	int accu_charging;
-	int accu_high_curr;
-	int high_curr_threshold;
-	int lowbat_threshold;
-	int overbat_threshold;
-	int battok_falling_th_sel0;
-	int battok_raising_th_sel1;
-	int user_cap_limit;
-	int maint_thres;
-	bool pcut_enable;
-	u8 pcut_max_time;
-	u8 pcut_flag_time;
-	u8 pcut_max_restart;
-	u8 pcut_debounce_time;
-};
-
-/**
- * struct abx500_charger_maximization - struct used by the board config.
- * @use_maxi:		Enable maximization for this battery type
- * @maxi_chg_curr:	Maximum charger current allowed
- * @maxi_wait_cycles:	cycles to wait before setting charger current
- * @charger_curr_step	delta between two charger current settings (mA)
- */
-struct abx500_maxim_parameters {
-	bool ena_maxi;
-	int chg_curr;
-	int wait_cycles;
-	int charger_curr_step;
-};
-
-/**
- * struct abx500_battery_type - different batteries supported
- * @name:			battery technology
- * @resis_high:			battery upper resistance limit
- * @resis_low:			battery lower resistance limit
- * @charge_full_design:		Maximum battery capacity in mAh
- * @nominal_voltage:		Nominal voltage of the battery in mV
- * @termination_vol:		max voltage upto which battery can be charged
- * @termination_curr		battery charging termination current in mA
- * @recharge_cap		battery capacity limit that will trigger a new
- *				full charging cycle in the case where maintenan-
- *				-ce charging has been disabled
- * @normal_cur_lvl:		charger current in normal state in mA
- * @normal_vol_lvl:		charger voltage in normal state in mV
- * @maint_a_cur_lvl:		charger current in maintenance A state in mA
- * @maint_a_vol_lvl:		charger voltage in maintenance A state in mV
- * @maint_a_chg_timer_h:	charge time in maintenance A state
- * @maint_b_cur_lvl:		charger current in maintenance B state in mA
- * @maint_b_vol_lvl:		charger voltage in maintenance B state in mV
- * @maint_b_chg_timer_h:	charge time in maintenance B state
- * @low_high_cur_lvl:		charger current in temp low/high state in mA
- * @low_high_vol_lvl:		charger voltage in temp low/high state in mV'
- * @battery_resistance:		battery inner resistance in mOhm.
- * @n_r_t_tbl_elements:		number of elements in r_to_t_tbl
- * @r_to_t_tbl:			table containing resistance to temp points
- * @n_v_cap_tbl_elements:	number of elements in v_to_cap_tbl
- * @v_to_cap_tbl:		Voltage to capacity (in %) table
- * @n_batres_tbl_elements	number of elements in the batres_tbl
- * @batres_tbl			battery internal resistance vs temperature table
- */
-struct abx500_battery_type {
-	int name;
-	int resis_high;
-	int resis_low;
-	int charge_full_design;
-	int nominal_voltage;
-	int termination_vol;
-	int termination_curr;
-	int recharge_cap;
-	int normal_cur_lvl;
-	int normal_vol_lvl;
-	int maint_a_cur_lvl;
-	int maint_a_vol_lvl;
-	int maint_a_chg_timer_h;
-	int maint_b_cur_lvl;
-	int maint_b_vol_lvl;
-	int maint_b_chg_timer_h;
-	int low_high_cur_lvl;
-	int low_high_vol_lvl;
-	int battery_resistance;
-	int n_temp_tbl_elements;
-	const struct abx500_res_to_temp *r_to_t_tbl;
-	int n_v_cap_tbl_elements;
-	const struct abx500_v_to_cap *v_to_cap_tbl;
-	int n_batres_tbl_elements;
-	const struct batres_vs_temp *batres_tbl;
-};
-
-/**
- * struct abx500_bm_capacity_levels - abx500 capacity level data
- * @critical:		critical capacity level in percent
- * @low:		low capacity level in percent
- * @normal:		normal capacity level in percent
- * @high:		high capacity level in percent
- * @full:		full capacity level in percent
- */
-struct abx500_bm_capacity_levels {
-	int critical;
-	int low;
-	int normal;
-	int high;
-	int full;
-};
-
-/**
- * struct abx500_bm_charger_parameters - Charger specific parameters
- * @usb_volt_max:	maximum allowed USB charger voltage in mV
- * @usb_curr_max:	maximum allowed USB charger current in mA
- * @ac_volt_max:	maximum allowed AC charger voltage in mV
- * @ac_curr_max:	maximum allowed AC charger current in mA
- */
-struct abx500_bm_charger_parameters {
-	int usb_volt_max;
-	int usb_curr_max;
-	int ac_volt_max;
-	int ac_curr_max;
-};
-
-/**
- * struct abx500_bm_data - abx500 battery management data
- * @temp_under		under this temp, charging is stopped
- * @temp_low		between this temp and temp_under charging is reduced
- * @temp_high		between this temp and temp_over charging is reduced
- * @temp_over		over this temp, charging is stopped
- * @temp_now		present battery temperature
- * @temp_interval_chg	temperature measurement interval in s when charging
- * @temp_interval_nochg	temperature measurement interval in s when not charging
- * @main_safety_tmr_h	safety timer for main charger
- * @usb_safety_tmr_h	safety timer for usb charger
- * @bkup_bat_v		voltage which we charge the backup battery with
- * @bkup_bat_i		current which we charge the backup battery with
- * @no_maintenance	indicates that maintenance charging is disabled
- * @capacity_scaling    indicates whether capacity scaling is to be used
- * @abx500_adc_therm	placement of thermistor, batctrl or battemp adc
- * @chg_unknown_bat	flag to enable charging of unknown batteries
- * @enable_overshoot	flag to enable VBAT overshoot control
- * @auto_trig		flag to enable auto adc trigger
- * @fg_res		resistance of FG resistor in 0.1mOhm
- * @n_btypes		number of elements in array bat_type
- * @batt_id		index of the identified battery in array bat_type
- * @interval_charging	charge alg cycle period time when charging (sec)
- * @interval_not_charging charge alg cycle period time when not charging (sec)
- * @temp_hysteresis	temperature hysteresis
- * @gnd_lift_resistance	Battery ground to phone ground resistance (mOhm)
- * @n_chg_out_curr		number of elements in array chg_output_curr
- * @n_chg_in_curr		number of elements in array chg_input_curr
- * @chg_output_curr	charger output current level map
- * @chg_input_curr		charger input current level map
- * @maxi		maximization parameters
- * @cap_levels		capacity in percent for the different capacity levels
- * @bat_type		table of supported battery types
- * @chg_params		charger parameters
- * @fg_params		fuel gauge parameters
- */
-struct abx500_bm_data {
-	int temp_under;
-	int temp_low;
-	int temp_high;
-	int temp_over;
-	int temp_now;
-	int temp_interval_chg;
-	int temp_interval_nochg;
-	int main_safety_tmr_h;
-	int usb_safety_tmr_h;
-	int bkup_bat_v;
-	int bkup_bat_i;
-	bool autopower_cfg;
-	bool ac_enabled;
-	bool usb_enabled;
-	bool no_maintenance;
-	bool capacity_scaling;
-	bool chg_unknown_bat;
-	bool enable_overshoot;
-	bool auto_trig;
-	enum abx500_adc_therm adc_therm;
-	int fg_res;
-	int n_btypes;
-	int batt_id;
-	int interval_charging;
-	int interval_not_charging;
-	int temp_hysteresis;
-	int gnd_lift_resistance;
-	int n_chg_out_curr;
-	int n_chg_in_curr;
-	int *chg_output_curr;
-	int *chg_input_curr;
-	const struct abx500_maxim_parameters *maxi;
-	const struct abx500_bm_capacity_levels *cap_levels;
-	struct abx500_battery_type *bat_type;
-	const struct abx500_bm_charger_parameters *chg_params;
-	const struct abx500_fg_parameters *fg_params;
-};
-
-enum {
-	NTC_EXTERNAL = 0,
-	NTC_INTERNAL,
-};
-
-int ab8500_bm_of_probe(struct device *dev,
-		       struct device_node *np,
-		       struct abx500_bm_data *bm);
-
 int abx500_set_register_interruptible(struct device *dev, u8 bank, u8 reg,
 	u8 value);
 int abx500_get_register_interruptible(struct device *dev, u8 bank, u8 reg,
-- 
cgit v1.2.3


From 7abb18bd7567480e34f46d3512369ec49499064e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 25 Feb 2021 16:10:38 -0800
Subject: rcu: Provide polling interfaces for Tree RCU grace periods

There is a need for a non-blocking polling interface for RCU grace
periods, so this commit supplies start_poll_synchronize_rcu() and
poll_state_synchronize_rcu() for this purpose.  Note that the existing
get_state_synchronize_rcu() may be used if future grace periods are
inevitable (perhaps due to a later call_rcu() invocation).  The new
start_poll_synchronize_rcu() is to be used if future grace periods
might not otherwise happen.  Finally, poll_state_synchronize_rcu()
provides a lockless check for a grace period having elapsed since
the corresponding call to either of the get_state_synchronize_rcu()
or start_poll_synchronize_rcu().

As with get_state_synchronize_rcu(), the return value from either
get_state_synchronize_rcu() or start_poll_synchronize_rcu() is passed in
to a later call to either poll_state_synchronize_rcu() or the existing
(might_sleep) cond_synchronize_rcu().

[ paulmck: Remove redundant smp_mb() per Frederic Weisbecker feedback. ]
[ Update poll_state_synchronize_rcu() docbook per Frederic Weisbecker feedback. ]
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcutree.h |  2 ++
 kernel/rcu/tree.c       | 75 ++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 70 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index df578b73960f..b89b54130f49 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -41,6 +41,8 @@ void rcu_momentary_dyntick_idle(void);
 void kfree_rcu_scheduler_running(void);
 bool rcu_gp_might_be_stalled(void);
 unsigned long get_state_synchronize_rcu(void);
+unsigned long start_poll_synchronize_rcu(void);
+bool poll_state_synchronize_rcu(unsigned long oldstate);
 void cond_synchronize_rcu(unsigned long oldstate);
 
 void rcu_idle_enter(void);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index da6f5213fb74..07e812261474 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3774,8 +3774,8 @@ EXPORT_SYMBOL_GPL(synchronize_rcu);
  * get_state_synchronize_rcu - Snapshot current RCU state
  *
  * Returns a cookie that is used by a later call to cond_synchronize_rcu()
- * to determine whether or not a full grace period has elapsed in the
- * meantime.
+ * or poll_state_synchronize_rcu() to determine whether or not a full
+ * grace period has elapsed in the meantime.
  */
 unsigned long get_state_synchronize_rcu(void)
 {
@@ -3788,14 +3788,77 @@ unsigned long get_state_synchronize_rcu(void)
 }
 EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
 
+/**
+ * start_poll_synchronize_rcu - Snapshot and start RCU grace period
+ *
+ * Returns a cookie that is used by a later call to cond_synchronize_rcu()
+ * or poll_state_synchronize_rcu() to determine whether or not a full
+ * grace period has elapsed in the meantime.  If the needed grace period
+ * is not already slated to start, notifies RCU core of the need for that
+ * grace period.
+ *
+ * Interrupts must be enabled for the case where it is necessary to awaken
+ * the grace-period kthread.
+ */
+unsigned long start_poll_synchronize_rcu(void)
+{
+	unsigned long flags;
+	unsigned long gp_seq = get_state_synchronize_rcu();
+	bool needwake;
+	struct rcu_data *rdp;
+	struct rcu_node *rnp;
+
+	lockdep_assert_irqs_enabled();
+	local_irq_save(flags);
+	rdp = this_cpu_ptr(&rcu_data);
+	rnp = rdp->mynode;
+	raw_spin_lock_rcu_node(rnp); // irqs already disabled.
+	needwake = rcu_start_this_gp(rnp, rdp, gp_seq);
+	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+	if (needwake)
+		rcu_gp_kthread_wake();
+	return gp_seq;
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
+
+/**
+ * poll_state_synchronize_rcu - Has the specified RCU grace period completed?
+ *
+ * @oldstate: return from call to get_state_synchronize_rcu() or start_poll_synchronize_rcu()
+ *
+ * If a full RCU grace period has elapsed since the earlier call from
+ * which oldstate was obtained, return @true, otherwise return @false.
+ * If @false is returned, it is the caller's responsibility to invoke this
+ * function later on until it does return @true.  Alternatively, the caller
+ * can explicitly wait for a grace period, for example, by passing @oldstate
+ * to cond_synchronize_rcu() or by directly invoking synchronize_rcu().
+ *
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless.  If the counter wraps, we have waited for
+ * more than 2 billion grace periods (and way more on a 64-bit system!).
+ * Those needing to keep oldstate values for very long time periods
+ * (many hours even on 32-bit systems) should check them occasionally
+ * and either refresh them or set a flag indicating that the grace period
+ * has completed.
+ */
+bool poll_state_synchronize_rcu(unsigned long oldstate)
+{
+	if (rcu_seq_done(&rcu_state.gp_seq, oldstate)) {
+		smp_mb(); /* Ensure GP ends before subsequent accesses. */
+		return true;
+	}
+	return false;
+}
+EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
+
 /**
  * cond_synchronize_rcu - Conditionally wait for an RCU grace period
  *
  * @oldstate: return value from earlier call to get_state_synchronize_rcu()
  *
  * If a full RCU grace period has elapsed since the earlier call to
- * get_state_synchronize_rcu(), just return.  Otherwise, invoke
- * synchronize_rcu() to wait for a full grace period.
+ * get_state_synchronize_rcu() or start_poll_synchronize_rcu(), just return.
+ * Otherwise, invoke synchronize_rcu() to wait for a full grace period.
  *
  * Yes, this function does not take counter wrap into account.  But
  * counter wrap is harmless.  If the counter wraps, we have waited for
@@ -3804,10 +3867,8 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
  */
 void cond_synchronize_rcu(unsigned long oldstate)
 {
-	if (!rcu_seq_done(&rcu_state.gp_seq, oldstate))
+	if (!poll_state_synchronize_rcu(oldstate))
 		synchronize_rcu();
-	else
-		smp_mb(); /* Ensure GP ends before subsequent accesses. */
 }
 EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
 
-- 
cgit v1.2.3


From 2d669ceb69c276f7637cf760287ca4187add082e Mon Sep 17 00:00:00 2001
From: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
Date: Tue, 16 Mar 2021 13:36:02 +0900
Subject: dm table: Fix zoned model check and zone sectors check

Commit 24f6b6036c9e ("dm table: fix zoned iterate_devices based device
capability checks") triggered dm table load failure when dm-zoned device
is set up for zoned block devices and a regular device for cache.

The commit inverted logic of two callback functions for iterate_devices:
device_is_zoned_model() and device_matches_zone_sectors(). The logic of
device_is_zoned_model() was inverted so that all destination devices of all
targets in the dm table are required to have the expected zoned model. This
is fine for dm-linear, dm-flakey and dm-crypt on zoned block devices
since each target has only one destination device. However, this results
in failure for dm-zoned with regular cache device since that target has
both regular block device and zoned block devices.

As for device_matches_zone_sectors(), the commit inverted the logic to
require all zoned block devices in each target have the specified
zone_sectors. This check also fails for regular block device which does
not have zones.

To avoid the check failures, fix the zone model check and the zone
sectors check. For zone model check, introduce the new feature flag
DM_TARGET_MIXED_ZONED_MODEL, and set it to dm-zoned target. When the
target has this flag, allow it to have destination devices with any
zoned model. For zone sectors check, skip the check if the destination
device is not a zoned block device. Also add comments and improve an
error message to clarify expectations to the two checks.

Fixes: 24f6b6036c9e ("dm table: fix zoned iterate_devices based device capability checks")
Signed-off-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-table.c         | 33 +++++++++++++++++++++++++--------
 drivers/md/dm-zoned-target.c  |  2 +-
 include/linux/device-mapper.h | 15 ++++++++++++++-
 3 files changed, 40 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 95391f78b8d5..e5f0f1703c5d 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1594,6 +1594,13 @@ static int device_not_zoned_model(struct dm_target *ti, struct dm_dev *dev,
 	return blk_queue_zoned_model(q) != *zoned_model;
 }
 
+/*
+ * Check the device zoned model based on the target feature flag. If the target
+ * has the DM_TARGET_ZONED_HM feature flag set, host-managed zoned devices are
+ * also accepted but all devices must have the same zoned model. If the target
+ * has the DM_TARGET_MIXED_ZONED_MODEL feature set, the devices can have any
+ * zoned model with all zoned devices having the same zone size.
+ */
 static bool dm_table_supports_zoned_model(struct dm_table *t,
 					  enum blk_zoned_model zoned_model)
 {
@@ -1603,13 +1610,15 @@ static bool dm_table_supports_zoned_model(struct dm_table *t,
 	for (i = 0; i < dm_table_get_num_targets(t); i++) {
 		ti = dm_table_get_target(t, i);
 
-		if (zoned_model == BLK_ZONED_HM &&
-		    !dm_target_supports_zoned_hm(ti->type))
-			return false;
-
-		if (!ti->type->iterate_devices ||
-		    ti->type->iterate_devices(ti, device_not_zoned_model, &zoned_model))
-			return false;
+		if (dm_target_supports_zoned_hm(ti->type)) {
+			if (!ti->type->iterate_devices ||
+			    ti->type->iterate_devices(ti, device_not_zoned_model,
+						      &zoned_model))
+				return false;
+		} else if (!dm_target_supports_mixed_zoned_model(ti->type)) {
+			if (zoned_model == BLK_ZONED_HM)
+				return false;
+		}
 	}
 
 	return true;
@@ -1621,9 +1630,17 @@ static int device_not_matches_zone_sectors(struct dm_target *ti, struct dm_dev *
 	struct request_queue *q = bdev_get_queue(dev->bdev);
 	unsigned int *zone_sectors = data;
 
+	if (!blk_queue_is_zoned(q))
+		return 0;
+
 	return blk_queue_zone_sectors(q) != *zone_sectors;
 }
 
+/*
+ * Check consistency of zoned model and zone sectors across all targets. For
+ * zone sectors, if the destination device is a zoned block device, it shall
+ * have the specified zone_sectors.
+ */
 static int validate_hardware_zoned_model(struct dm_table *table,
 					 enum blk_zoned_model zoned_model,
 					 unsigned int zone_sectors)
@@ -1642,7 +1659,7 @@ static int validate_hardware_zoned_model(struct dm_table *table,
 		return -EINVAL;
 
 	if (dm_table_any_dev_attr(table, device_not_matches_zone_sectors, &zone_sectors)) {
-		DMERR("%s: zone sectors is not consistent across all devices",
+		DMERR("%s: zone sectors is not consistent across all zoned devices",
 		      dm_device_name(table->md));
 		return -EINVAL;
 	}
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
index 697f9de37355..7e88df64d197 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -1143,7 +1143,7 @@ static int dmz_message(struct dm_target *ti, unsigned int argc, char **argv,
 static struct target_type dmz_type = {
 	.name		 = "zoned",
 	.version	 = {2, 0, 0},
-	.features	 = DM_TARGET_SINGLETON | DM_TARGET_ZONED_HM,
+	.features	 = DM_TARGET_SINGLETON | DM_TARGET_MIXED_ZONED_MODEL,
 	.module		 = THIS_MODULE,
 	.ctr		 = dmz_ctr,
 	.dtr		 = dmz_dtr,
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 7f4ac87c0b32..5c641f930caf 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -253,7 +253,11 @@ struct target_type {
 #define dm_target_passes_integrity(type) ((type)->features & DM_TARGET_PASSES_INTEGRITY)
 
 /*
- * Indicates that a target supports host-managed zoned block devices.
+ * Indicates support for zoned block devices:
+ * - DM_TARGET_ZONED_HM: the target also supports host-managed zoned
+ *   block devices but does not support combining different zoned models.
+ * - DM_TARGET_MIXED_ZONED_MODEL: the target supports combining multiple
+ *   devices with different zoned models.
  */
 #ifdef CONFIG_BLK_DEV_ZONED
 #define DM_TARGET_ZONED_HM		0x00000040
@@ -275,6 +279,15 @@ struct target_type {
 #define DM_TARGET_PASSES_CRYPTO		0x00000100
 #define dm_target_passes_crypto(type) ((type)->features & DM_TARGET_PASSES_CRYPTO)
 
+#ifdef CONFIG_BLK_DEV_ZONED
+#define DM_TARGET_MIXED_ZONED_MODEL	0x00000200
+#define dm_target_supports_mixed_zoned_model(type) \
+	((type)->features & DM_TARGET_MIXED_ZONED_MODEL)
+#else
+#define DM_TARGET_MIXED_ZONED_MODEL	0x00000000
+#define dm_target_supports_mixed_zoned_model(type) (false)
+#endif
+
 struct dm_target {
 	struct dm_table *table;
 	struct target_type *type;
-- 
cgit v1.2.3


From c558d47596867ff1082fd7475b63670f63f7f5cf Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 11 Mar 2021 18:32:30 -0500
Subject: svcrdma: Maintain a Receive water mark

Post more Receives when the number of pending Receives drops below
a water mark. The batch mechanism is disabled if the underlying
device cannot support a reasonably-sized Receive Queue.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h          |  2 ++
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c  | 10 ++++++++--
 net/sunrpc/xprtrdma/svc_rdma_transport.c |  5 ++++-
 3 files changed, 14 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 1e76ed688044..722fc7c48725 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -94,6 +94,8 @@ struct svcxprt_rdma {
 	spinlock_t	     sc_rw_ctxt_lock;
 	struct list_head     sc_rw_ctxts;
 
+	u32		     sc_pending_recvs;
+	u32		     sc_recv_batch;
 	struct list_head     sc_rq_dto_q;
 	spinlock_t	     sc_rq_dto_lock;
 	struct ib_qp         *sc_qp;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 1e7381ff948b..2571188ef7f2 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -285,6 +285,7 @@ static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma,
 		ctxt->rc_temp = temp;
 		ctxt->rc_recv_wr.next = recv_chain;
 		recv_chain = &ctxt->rc_recv_wr;
+		rdma->sc_pending_recvs++;
 	}
 	if (!recv_chain)
 		return false;
@@ -302,6 +303,8 @@ err_free:
 		bad_wr = bad_wr->next;
 		svc_rdma_recv_ctxt_put(rdma, ctxt);
 	}
+	/* Since we're destroying the xprt, no need to reset
+	 * sc_pending_recvs. */
 	return false;
 }
 
@@ -328,6 +331,8 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
 	struct ib_cqe *cqe = wc->wr_cqe;
 	struct svc_rdma_recv_ctxt *ctxt;
 
+	rdma->sc_pending_recvs--;
+
 	/* WARNING: Only wc->wr_cqe and wc->status are reliable */
 	ctxt = container_of(cqe, struct svc_rdma_recv_ctxt, rc_cqe);
 
@@ -344,8 +349,9 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
 	 * to reduce the likelihood of replayed requests once the
 	 * client reconnects.
 	 */
-	if (!svc_rdma_refresh_recvs(rdma, 1, false))
-		goto flushed;
+	if (rdma->sc_pending_recvs < rdma->sc_max_requests)
+		if (!svc_rdma_refresh_recvs(rdma, rdma->sc_recv_batch, false))
+			goto flushed;
 
 	/* All wc fields are now known to be valid */
 	ctxt->rc_byte_len = wc->byte_len;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 046a07da5cf9..e629eacfedfc 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -407,11 +407,14 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 	newxprt->sc_max_req_size = svcrdma_max_req_size;
 	newxprt->sc_max_requests = svcrdma_max_requests;
 	newxprt->sc_max_bc_requests = svcrdma_max_bc_requests;
-	rq_depth = newxprt->sc_max_requests + newxprt->sc_max_bc_requests;
+	newxprt->sc_recv_batch = RPCRDMA_MAX_RECV_BATCH;
+	rq_depth = newxprt->sc_max_requests + newxprt->sc_max_bc_requests +
+		   newxprt->sc_recv_batch;
 	if (rq_depth > dev->attrs.max_qp_wr) {
 		pr_warn("svcrdma: reducing receive depth to %d\n",
 			dev->attrs.max_qp_wr);
 		rq_depth = dev->attrs.max_qp_wr;
+		newxprt->sc_recv_batch = 1;
 		newxprt->sc_max_requests = rq_depth - 2;
 		newxprt->sc_max_bc_requests = 2;
 	}
-- 
cgit v1.2.3


From e844d307d46cfa7e09cdb671941bfd5f1be86773 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sat, 20 Feb 2021 18:53:40 -0500
Subject: svcrdma: Add a "deferred close" helper

Refactor a bit of commonly used logic so that every site that wants
a close deferred to an nfsd thread does all the right things
(set_bit(XPT_CLOSE) then enqueue).

Also, once XPT_CLOSE is set on a transport, it is never cleared. If
XPT_CLOSE is already set, then the close is already being handled
and the enqueue can be skipped.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_xprt.h          |  1 +
 net/sunrpc/svc_xprt.c                    | 14 ++++++++++++++
 net/sunrpc/svcsock.c                     | 15 ++++++---------
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c  |  3 +--
 net/sunrpc/xprtrdma/svc_rdma_rw.c        |  5 ++---
 net/sunrpc/xprtrdma/svc_rdma_sendto.c    | 10 ++++------
 net/sunrpc/xprtrdma/svc_rdma_transport.c |  6 ++----
 7 files changed, 30 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
index 92455e0d5244..34dacadfe517 100644
--- a/include/linux/sunrpc/svc_xprt.h
+++ b/include/linux/sunrpc/svc_xprt.h
@@ -143,6 +143,7 @@ struct	svc_xprt *svc_find_xprt(struct svc_serv *serv, const char *xcl_name,
 int	svc_xprt_names(struct svc_serv *serv, char *buf, const int buflen);
 void	svc_add_new_perm_xprt(struct svc_serv *serv, struct svc_xprt *xprt);
 void	svc_age_temp_xprts_now(struct svc_serv *, struct sockaddr *);
+void	svc_xprt_deferred_close(struct svc_xprt *xprt);
 
 static inline void svc_xprt_get(struct svc_xprt *xprt)
 {
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 3cdd71a8df1e..b134fc5f3b8d 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -139,6 +139,20 @@ int svc_print_xprts(char *buf, int maxlen)
 	return len;
 }
 
+/**
+ * svc_xprt_deferred_close - Close a transport
+ * @xprt: transport instance
+ *
+ * Used in contexts that need to defer the work of shutting down
+ * the transport to an nfsd thread.
+ */
+void svc_xprt_deferred_close(struct svc_xprt *xprt)
+{
+	if (!test_and_set_bit(XPT_CLOSE, &xprt->xpt_flags))
+		svc_xprt_enqueue(xprt);
+}
+EXPORT_SYMBOL_GPL(svc_xprt_deferred_close);
+
 static void svc_xprt_free(struct kref *kref)
 {
 	struct svc_xprt *xprt =
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 2e2f007dfc9f..22454b2df5be 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -728,10 +728,8 @@ static void svc_tcp_state_change(struct sock *sk)
 		rmb();
 		svsk->sk_ostate(sk);
 		trace_svcsock_tcp_state(&svsk->sk_xprt, svsk->sk_sock);
-		if (sk->sk_state != TCP_ESTABLISHED) {
-			set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
-			svc_xprt_enqueue(&svsk->sk_xprt);
-		}
+		if (sk->sk_state != TCP_ESTABLISHED)
+			svc_xprt_deferred_close(&svsk->sk_xprt);
 	}
 }
 
@@ -901,7 +899,7 @@ err_too_large:
 	net_notice_ratelimited("svc: %s %s RPC fragment too large: %d\n",
 			       __func__, svsk->sk_xprt.xpt_server->sv_name,
 			       svc_sock_reclen(svsk));
-	set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
+	svc_xprt_deferred_close(&svsk->sk_xprt);
 err_short:
 	return -EAGAIN;
 }
@@ -1057,7 +1055,7 @@ err_nuts:
 	svsk->sk_datalen = 0;
 err_delete:
 	trace_svcsock_tcp_recv_err(&svsk->sk_xprt, len);
-	set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
+	svc_xprt_deferred_close(&svsk->sk_xprt);
 err_noclose:
 	return 0;	/* record not complete */
 }
@@ -1188,8 +1186,7 @@ out_close:
 		  xprt->xpt_server->sv_name,
 		  (err < 0) ? "got error" : "sent",
 		  (err < 0) ? err : sent, xdr->len);
-	set_bit(XPT_CLOSE, &xprt->xpt_flags);
-	svc_xprt_enqueue(xprt);
+	svc_xprt_deferred_close(xprt);
 	atomic_dec(&svsk->sk_sendqlen);
 	mutex_unlock(&xprt->xpt_mutex);
 	return -EAGAIN;
@@ -1268,7 +1265,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
 		case TCP_ESTABLISHED:
 			break;
 		default:
-			set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
+			svc_xprt_deferred_close(&svsk->sk_xprt);
 		}
 	}
 }
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 2571188ef7f2..8d93d26e0318 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -367,8 +367,7 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
 
 flushed:
 	svc_rdma_recv_ctxt_put(rdma, ctxt);
-	set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
-	svc_xprt_enqueue(&rdma->sc_xprt);
+	svc_xprt_deferred_close(&rdma->sc_xprt);
 }
 
 /**
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index 693d139a8633..d7054e3a8e33 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -250,7 +250,7 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
 	wake_up(&rdma->sc_send_wait);
 
 	if (unlikely(wc->status != IB_WC_SUCCESS))
-		set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
+		svc_xprt_deferred_close(&rdma->sc_xprt);
 
 	svc_rdma_write_info_free(info);
 }
@@ -334,7 +334,6 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc)
 static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc)
 {
 	struct svcxprt_rdma *rdma = cc->cc_rdma;
-	struct svc_xprt *xprt = &rdma->sc_xprt;
 	struct ib_send_wr *first_wr;
 	const struct ib_send_wr *bad_wr;
 	struct list_head *tmp;
@@ -373,7 +372,7 @@ static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc)
 	} while (1);
 
 	trace_svcrdma_sq_post_err(rdma, ret);
-	set_bit(XPT_CLOSE, &xprt->xpt_flags);
+	svc_xprt_deferred_close(&rdma->sc_xprt);
 
 	/* If even one was posted, there will be a completion. */
 	if (bad_wr != first_wr)
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index e6fab5dd20d0..4471a0fcd3a3 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -285,10 +285,8 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
 
 	svc_rdma_send_ctxt_put(rdma, ctxt);
 
-	if (unlikely(wc->status != IB_WC_SUCCESS)) {
-		set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
-		svc_xprt_enqueue(&rdma->sc_xprt);
-	}
+	if (unlikely(wc->status != IB_WC_SUCCESS))
+		svc_xprt_deferred_close(&rdma->sc_xprt);
 }
 
 /**
@@ -334,7 +332,7 @@ int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt)
 	}
 
 	trace_svcrdma_sq_post_err(rdma, ret);
-	set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
+	svc_xprt_deferred_close(&rdma->sc_xprt);
 	wake_up(&rdma->sc_send_wait);
 	return ret;
 }
@@ -994,7 +992,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
 	svc_rdma_send_ctxt_put(rdma, sctxt);
  err0:
 	trace_svcrdma_send_err(rqstp, ret);
-	set_bit(XPT_CLOSE, &xprt->xpt_flags);
+	svc_xprt_deferred_close(&rdma->sc_xprt);
 	return -ENOTCONN;
 }
 
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index e629eacfedfc..3646216211c5 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -119,8 +119,7 @@ static void qp_event_handler(struct ib_event *event, void *context)
 	case IB_EVENT_QP_ACCESS_ERR:
 	case IB_EVENT_DEVICE_FATAL:
 	default:
-		set_bit(XPT_CLOSE, &xprt->xpt_flags);
-		svc_xprt_enqueue(xprt);
+		svc_xprt_deferred_close(xprt);
 		break;
 	}
 }
@@ -286,8 +285,7 @@ static int svc_rdma_cma_handler(struct rdma_cm_id *cma_id,
 		break;
 	case RDMA_CM_EVENT_DISCONNECTED:
 	case RDMA_CM_EVENT_DEVICE_REMOVAL:
-		set_bit(XPT_CLOSE, &xprt->xpt_flags);
-		svc_xprt_enqueue(xprt);
+		svc_xprt_deferred_close(xprt);
 		break;
 	default:
 		break;
-- 
cgit v1.2.3


From 2a1e4f21d84184f7ff5768ee3d3d0c30b1135867 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 13 Jan 2021 13:57:18 -0500
Subject: svcrdma: Normalize Send page handling

Currently svc_rdma_sendto() migrates xdr_buf pages into a separate
page list and NULLs out a bunch of entries in rq_pages while the
pages are under I/O. The Send completion handler then frees those
pages later.

Instead, let's wait for the Send completion, then handle page
releasing in the nfsd thread. I'd like to avoid the cost of 250+
put_page() calls in the Send completion handler, which is single-
threaded.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h            |  1 +
 net/sunrpc/xprtrdma/svc_rdma_backchannel.c |  8 +++++++-
 net/sunrpc/xprtrdma/svc_rdma_sendto.c      | 29 ++++++++++++++++-------------
 3 files changed, 24 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 722fc7c48725..5841978550c6 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -160,6 +160,7 @@ struct svc_rdma_send_ctxt {
 
 	struct ib_send_wr	sc_send_wr;
 	struct ib_cqe		sc_cqe;
+	struct completion	sc_done;
 	struct xdr_buf		sc_hdrbuf;
 	struct xdr_stream	sc_stream;
 	void			*sc_xprt_buf;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
index 9150df35fb6f..16897fcb659c 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -93,7 +93,13 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma,
 	 */
 	get_page(virt_to_page(rqst->rq_buffer));
 	sctxt->sc_send_wr.opcode = IB_WR_SEND;
-	return svc_rdma_send(rdma, sctxt);
+	ret = svc_rdma_send(rdma, sctxt);
+	if (ret < 0)
+		return ret;
+
+	ret = wait_for_completion_killable(&sctxt->sc_done);
+	svc_rdma_send_ctxt_put(rdma, sctxt);
+	return ret;
 }
 
 /* Server-side transport endpoint wants a whole page for its send
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 4471a0fcd3a3..62d55850ca54 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -155,6 +155,7 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma)
 	ctxt->sc_send_wr.wr_cqe = &ctxt->sc_cqe;
 	ctxt->sc_send_wr.sg_list = ctxt->sc_sges;
 	ctxt->sc_send_wr.send_flags = IB_SEND_SIGNALED;
+	init_completion(&ctxt->sc_done);
 	ctxt->sc_cqe.done = svc_rdma_wc_send;
 	ctxt->sc_xprt_buf = buffer;
 	xdr_buf_init(&ctxt->sc_hdrbuf, ctxt->sc_xprt_buf,
@@ -280,11 +281,11 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
 
 	trace_svcrdma_wc_send(wc, &ctxt->sc_cid);
 
+	complete(&ctxt->sc_done);
+
 	atomic_inc(&rdma->sc_sq_avail);
 	wake_up(&rdma->sc_send_wait);
 
-	svc_rdma_send_ctxt_put(rdma, ctxt);
-
 	if (unlikely(wc->status != IB_WC_SUCCESS))
 		svc_xprt_deferred_close(&rdma->sc_xprt);
 }
@@ -294,7 +295,7 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
  * @rdma: transport on which to post the WR
  * @ctxt: send ctxt with a Send WR ready to post
  *
- * Returns zero the Send WR was posted successfully. Otherwise, a
+ * Returns zero if the Send WR was posted successfully. Otherwise, a
  * negative errno is returned.
  */
 int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt)
@@ -302,7 +303,7 @@ int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt)
 	struct ib_send_wr *wr = &ctxt->sc_send_wr;
 	int ret;
 
-	might_sleep();
+	reinit_completion(&ctxt->sc_done);
 
 	/* Sync the transport header buffer */
 	ib_dma_sync_single_for_device(rdma->sc_pd->device,
@@ -795,7 +796,7 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
  * svc_rdma_sendto returns. Transfer pages under I/O to the ctxt
  * so they are released by the Send completion handler.
  */
-static void svc_rdma_save_io_pages(struct svc_rqst *rqstp,
+static inline void svc_rdma_save_io_pages(struct svc_rqst *rqstp,
 				   struct svc_rdma_send_ctxt *ctxt)
 {
 	int i, pages = rqstp->rq_next_page - rqstp->rq_respages;
@@ -839,15 +840,20 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma,
 	if (ret < 0)
 		return ret;
 
-	svc_rdma_save_io_pages(rqstp, sctxt);
-
 	if (rctxt->rc_inv_rkey) {
 		sctxt->sc_send_wr.opcode = IB_WR_SEND_WITH_INV;
 		sctxt->sc_send_wr.ex.invalidate_rkey = rctxt->rc_inv_rkey;
 	} else {
 		sctxt->sc_send_wr.opcode = IB_WR_SEND;
 	}
-	return svc_rdma_send(rdma, sctxt);
+
+	ret = svc_rdma_send(rdma, sctxt);
+	if (ret < 0)
+		return ret;
+
+	ret = wait_for_completion_killable(&sctxt->sc_done);
+	svc_rdma_send_ctxt_put(rdma, sctxt);
+	return ret;
 }
 
 /**
@@ -913,7 +919,8 @@ void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma,
 	sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len;
 	if (svc_rdma_send(rdma, sctxt))
 		goto put_ctxt;
-	return;
+
+	wait_for_completion_killable(&sctxt->sc_done);
 
 put_ctxt:
 	svc_rdma_send_ctxt_put(rdma, sctxt);
@@ -981,10 +988,6 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
 	if (ret != -E2BIG && ret != -EINVAL)
 		goto err1;
 
-	/* Send completion releases payload pages that were part
-	 * of previously posted RDMA Writes.
-	 */
-	svc_rdma_save_io_pages(rqstp, sctxt);
 	svc_rdma_send_error_msg(rdma, sctxt, rctxt, ret);
 	return 0;
 
-- 
cgit v1.2.3


From 579900670ac770a547ff607a60c02c56a7d27bd7 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 28 Jan 2021 16:47:56 -0500
Subject: svcrdma: Remove unused sc_pages field

Clean up. This significantly reduces the size of struct
svc_rdma_send_ctxt.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h       |  3 +--
 net/sunrpc/xprtrdma/svc_rdma_sendto.c | 25 -------------------------
 2 files changed, 1 insertion(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 5841978550c6..6e621e1f56b8 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -164,9 +164,8 @@ struct svc_rdma_send_ctxt {
 	struct xdr_buf		sc_hdrbuf;
 	struct xdr_stream	sc_stream;
 	void			*sc_xprt_buf;
-	int			sc_page_count;
 	int			sc_cur_sge_no;
-	struct page		*sc_pages[RPCSVC_MAXPAGES];
+
 	struct ib_sge		sc_sges[];
 };
 
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 62d55850ca54..f093c9b536ff 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -219,7 +219,6 @@ out:
 
 	ctxt->sc_send_wr.num_sge = 0;
 	ctxt->sc_cur_sge_no = 0;
-	ctxt->sc_page_count = 0;
 	return ctxt;
 
 out_empty:
@@ -234,8 +233,6 @@ out_empty:
  * svc_rdma_send_ctxt_put - Return send_ctxt to free list
  * @rdma: controlling svcxprt_rdma
  * @ctxt: object to return to the free list
- *
- * Pages left in sc_pages are DMA unmapped and released.
  */
 void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma,
 			    struct svc_rdma_send_ctxt *ctxt)
@@ -256,9 +253,6 @@ void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma,
 					     ctxt->sc_sges[i].length);
 	}
 
-	for (i = 0; i < ctxt->sc_page_count; ++i)
-		put_page(ctxt->sc_pages[i]);
-
 	spin_lock(&rdma->sc_send_lock);
 	list_add(&ctxt->sc_list, &rdma->sc_send_ctxts);
 	spin_unlock(&rdma->sc_send_lock);
@@ -792,25 +786,6 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
 				       svc_rdma_xb_dma_map, &args);
 }
 
-/* The svc_rqst and all resources it owns are released as soon as
- * svc_rdma_sendto returns. Transfer pages under I/O to the ctxt
- * so they are released by the Send completion handler.
- */
-static inline void svc_rdma_save_io_pages(struct svc_rqst *rqstp,
-				   struct svc_rdma_send_ctxt *ctxt)
-{
-	int i, pages = rqstp->rq_next_page - rqstp->rq_respages;
-
-	ctxt->sc_page_count += pages;
-	for (i = 0; i < pages; i++) {
-		ctxt->sc_pages[i] = rqstp->rq_respages[i];
-		rqstp->rq_respages[i] = NULL;
-	}
-
-	/* Prevent svc_xprt_release from releasing pages in rq_pages */
-	rqstp->rq_next_page = rqstp->rq_respages;
-}
-
 /* Prepare the portion of the RPC Reply that will be transmitted
  * via RDMA Send. The RPC-over-RDMA transport header is prepared
  * in sc_sges[0], and the RPC xdr_buf is prepared in following sges.
-- 
cgit v1.2.3


From 7dcfbd86adc45f6d6b37278efd22530cf80ab474 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 29 Jan 2021 13:04:04 -0500
Subject: SUNRPC: Export svc_xprt_received()

Prepare svc_xprt_received() to be called from transport code instead
of from generic RPC server code.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_xprt.h |  1 +
 include/trace/events/sunrpc.h   |  1 +
 net/sunrpc/svc_xprt.c           | 13 +++++++++----
 3 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
index 34dacadfe517..571f605bc91e 100644
--- a/include/linux/sunrpc/svc_xprt.h
+++ b/include/linux/sunrpc/svc_xprt.h
@@ -130,6 +130,7 @@ void	svc_xprt_init(struct net *, struct svc_xprt_class *, struct svc_xprt *,
 int	svc_create_xprt(struct svc_serv *, const char *, struct net *,
 			const int, const unsigned short, int,
 			const struct cred *);
+void	svc_xprt_received(struct svc_xprt *xprt);
 void	svc_xprt_do_enqueue(struct svc_xprt *xprt);
 void	svc_xprt_enqueue(struct svc_xprt *xprt);
 void	svc_xprt_put(struct svc_xprt *xprt);
diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index 036eb1f5c133..bda16e9e6ba7 100644
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -1781,6 +1781,7 @@ DECLARE_EVENT_CLASS(svc_xprt_event,
 			), \
 			TP_ARGS(xprt))
 
+DEFINE_SVC_XPRT_EVENT(received);
 DEFINE_SVC_XPRT_EVENT(no_write_space);
 DEFINE_SVC_XPRT_EVENT(close);
 DEFINE_SVC_XPRT_EVENT(detach);
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index b134fc5f3b8d..9d1374e82e90 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -247,21 +247,25 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl,
 	return xprt;
 }
 
-/*
- * svc_xprt_received conditionally queues the transport for processing
- * by another thread. The caller must hold the XPT_BUSY bit and must
+/**
+ * svc_xprt_received - start next receiver thread
+ * @xprt: controlling transport
+ *
+ * The caller must hold the XPT_BUSY bit and must
  * not thereafter touch transport data.
  *
  * Note: XPT_DATA only gets cleared when a read-attempt finds no (or
  * insufficient) data.
  */
-static void svc_xprt_received(struct svc_xprt *xprt)
+void svc_xprt_received(struct svc_xprt *xprt)
 {
 	if (!test_bit(XPT_BUSY, &xprt->xpt_flags)) {
 		WARN_ONCE(1, "xprt=0x%p already busy!", xprt);
 		return;
 	}
 
+	trace_svc_xprt_received(xprt);
+
 	/* As soon as we clear busy, the xprt could be closed and
 	 * 'put', so we need a reference to call svc_enqueue_xprt with:
 	 */
@@ -271,6 +275,7 @@ static void svc_xprt_received(struct svc_xprt *xprt)
 	xprt->xpt_server->sv_ops->svo_enqueue_xprt(xprt);
 	svc_xprt_put(xprt);
 }
+EXPORT_SYMBOL_GPL(svc_xprt_received);
 
 void svc_add_new_perm_xprt(struct svc_serv *serv, struct svc_xprt *new)
 {
-- 
cgit v1.2.3


From 1f7ea1cd6a3748427512ccc9582e18cd9efea966 Mon Sep 17 00:00:00 2001
From: Qi Zhang <qi.z.zhang@intel.com>
Date: Tue, 9 Mar 2021 11:08:04 +0800
Subject: ice: Enable FDIR Configure for AVF

The virtual channel is going to be extended to support FDIR and
RSS configuration from AVF. New data structures and OP codes will be
added; this patch enables the FDIR part.

To support above advanced AVF feature, we need to figure out
what kind of data structure should be passed from VF to PF to describe
an FDIR rule or RSS config rule. The common part of the requirement is
we need a data structure to represent the input set selection of a rule's
hash key.

An input set selection is a group of fields selected from one or more
network protocol layers that can be identified as a specific flow.
For example, select dst IP address from an IPv4 header combined with
dst port from the TCP header as the input set for an IPv4/TCP flow.

The patch adds a new data structure virtchnl_proto_hdrs to abstract
a network protocol headers group which is composed of layers of network
protocol header(virtchnl_proto_hdr).

A protocol header contains a 32 bits mask (field_selector) to describe
which fields are selected as input sets, as well as a header type
(enum virtchnl_proto_hdr_type). Each bit is mapped to a field in
enum virtchnl_proto_hdr_field guided by its header type.

+------------+-----------+------------------------------+
|            | Proto Hdr | Header Type A                |
|            |           +------------------------------+
|            |           | BIT 31 | ... | BIT 1 | BIT 0 |
|            |-----------+------------------------------+
|Proto Hdrs  | Proto Hdr | Header Type B                |
|            |           +------------------------------+
|            |           | BIT 31 | ... | BIT 1 | BIT 0 |
|            |-----------+------------------------------+
|            | Proto Hdr | Header Type C                |
|            |           +------------------------------+
|            |           | BIT 31 | ... | BIT 1 | BIT 0 |
|            |-----------+------------------------------+
|            |    ....                                  |
+-------------------------------------------------------+

All fields in enum virtchnl_proto_hdr_field are grouped by header type,
and the value of the first field of a header type is always 32-aligned.

enum proto_hdr_type {
        header_type_A = 0,
        header_type_B = 1,
        ....
};

enum proto_hdr_field {
        /* header type A */
        header_A_field_0 = 0,
        header_A_field_1 = 1,
        header_A_field_2 = 2,
        header_A_field_3 = 3,

        /* header type B */
        header_B_field_0 = 32, // = header_type_B << 5
        header_B_field_1 = 33,
        header_B_field_2 = 34,
        header_B_field_3 = 35,
        ....
};

So we have:
proto_hdr_type = proto_hdr_field / 32
bit offset = proto_hdr_field % 32

To simplify operations on protocol headers, a couple of helper macros are
added. For example, to select src IP and dst port as the input set for an
IPv4/UDP flow, we have:
struct virtchnl_proto_hdr hdr[2];

VIRTCHNL_SET_PROTO_HDR_TYPE(&hdr[0], IPV4)
VIRTCHNL_ADD_PROTO_HDR_FIELD(&hdr[0], IPV4, SRC)

VIRTCHNL_SET_PROTO_HDR_TYPE(&hdr[1], UDP)
VIRTCHNL_ADD_PROTO_HDR_FIELD(&hdr[1], UDP, DST)

The byte array is used to store the protocol header of a training packet.
The byte array must be in network byte order.

The patch adds virtual channel support for iAVF FDIR add/validate/delete
filter. iAVF FDIR is Flow Director for Intel Adaptive Virtual Function,
which can direct Ethernet packets to the queues of the Network Interface
Card. The add and delete commands add or delete one rule per virtual
channel message, while the validate command only verifies that the rule
is valid, without performing any other operation.

To add or delete one rule, the driver needs to configure the TCAM and
profile, build training packets which contain the input set value, and send
the training packets through the FDIR Tx queue. In addition, the driver
needs to manage the software context to avoid adding duplicated rules,
deleting non-existent rules, input set conflicts and other invalid cases.

NOTE:
Supported patterns/actions and their parse functions are not included in
this patch; they will be added in a separate one.

Signed-off-by: Jeff Guo <jia.guo@intel.com>
Signed-off-by: Yahui Cao <yahui.cao@intel.com>
Signed-off-by: Simei Su <simei.su@intel.com>
Signed-off-by: Beilei Xing <beilei.xing@intel.com>
Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
Tested-by: Chen Bo <BoX.C.Chen@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/ice/Makefile            |    2 +-
 drivers/net/ethernet/intel/ice/ice_ethtool_fdir.c  |    4 +
 drivers/net/ethernet/intel/ice/ice_fdir.c          |    6 +-
 drivers/net/ethernet/intel/ice/ice_fdir.h          |    5 +
 drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h     |    2 +
 drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.c | 1034 ++++++++++++++++++++
 drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.h |   24 +
 drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c   |   17 +-
 drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h   |    6 +
 include/linux/avf/virtchnl.h                       |  278 ++++++
 10 files changed, 1372 insertions(+), 6 deletions(-)
 create mode 100644 drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.c
 create mode 100644 drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.h

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/intel/ice/Makefile b/drivers/net/ethernet/intel/ice/Makefile
index 73da4f71f530..f391691e2c7e 100644
--- a/drivers/net/ethernet/intel/ice/Makefile
+++ b/drivers/net/ethernet/intel/ice/Makefile
@@ -26,7 +26,7 @@ ice-y := ice_main.o	\
 	 ice_fw_update.o \
 	 ice_lag.o	\
 	 ice_ethtool.o
-ice-$(CONFIG_PCI_IOV) += ice_virtchnl_pf.o ice_sriov.o
+ice-$(CONFIG_PCI_IOV) += ice_virtchnl_pf.o ice_sriov.o ice_virtchnl_fdir.o
 ice-$(CONFIG_DCB) += ice_dcb.o ice_dcb_nl.o ice_dcb_lib.o
 ice-$(CONFIG_RFS_ACCEL) += ice_arfs.o
 ice-$(CONFIG_XDP_SOCKETS) += ice_xsk.o
diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool_fdir.c b/drivers/net/ethernet/intel/ice/ice_ethtool_fdir.c
index 192729546bbf..440964defa4a 100644
--- a/drivers/net/ethernet/intel/ice/ice_ethtool_fdir.c
+++ b/drivers/net/ethernet/intel/ice/ice_ethtool_fdir.c
@@ -1679,6 +1679,10 @@ int ice_add_fdir_ethtool(struct ice_vsi *vsi, struct ethtool_rxnfc *cmd)
 		input->flex_offset = userdata.flex_offset;
 	}
 
+	input->cnt_ena = ICE_FXD_FLTR_QW0_STAT_ENA_PKTS;
+	input->fdid_prio = ICE_FXD_FLTR_QW1_FDID_PRI_THREE;
+	input->comp_report = ICE_FXD_FLTR_QW0_COMP_REPORT_SW_FAIL;
+
 	/* input struct is added to the HW filter list */
 	ice_fdir_update_list_entry(pf, input, fsp->location);
 
diff --git a/drivers/net/ethernet/intel/ice/ice_fdir.c b/drivers/net/ethernet/intel/ice/ice_fdir.c
index 0c2066c0ab1f..8f3e61c6bfd6 100644
--- a/drivers/net/ethernet/intel/ice/ice_fdir.c
+++ b/drivers/net/ethernet/intel/ice/ice_fdir.c
@@ -378,7 +378,7 @@ ice_fdir_get_prgm_desc(struct ice_hw *hw, struct ice_fdir_fltr *input,
 		fdir_fltr_ctx.drop = ICE_FXD_FLTR_QW0_DROP_NO;
 		fdir_fltr_ctx.qindex = input->q_index;
 	}
-	fdir_fltr_ctx.cnt_ena = ICE_FXD_FLTR_QW0_STAT_ENA_PKTS;
+	fdir_fltr_ctx.cnt_ena = input->cnt_ena;
 	fdir_fltr_ctx.cnt_index = input->cnt_index;
 	fdir_fltr_ctx.fd_vsi = ice_get_hw_vsi_num(hw, input->dest_vsi);
 	fdir_fltr_ctx.evict_ena = ICE_FXD_FLTR_QW0_EVICT_ENA_FALSE;
@@ -387,8 +387,8 @@ ice_fdir_get_prgm_desc(struct ice_hw *hw, struct ice_fdir_fltr *input,
 		ICE_FXD_FLTR_QW1_PCMD_REMOVE;
 	fdir_fltr_ctx.swap = ICE_FXD_FLTR_QW1_SWAP_NOT_SET;
 	fdir_fltr_ctx.comp_q = ICE_FXD_FLTR_QW0_COMP_Q_ZERO;
-	fdir_fltr_ctx.comp_report = ICE_FXD_FLTR_QW0_COMP_REPORT_SW_FAIL;
-	fdir_fltr_ctx.fdid_prio = 3;
+	fdir_fltr_ctx.comp_report = input->comp_report;
+	fdir_fltr_ctx.fdid_prio = input->fdid_prio;
 	fdir_fltr_ctx.desc_prof = 1;
 	fdir_fltr_ctx.desc_prof_prio = 3;
 	ice_set_fd_desc_val(&fdir_fltr_ctx, fdesc);
diff --git a/drivers/net/ethernet/intel/ice/ice_fdir.h b/drivers/net/ethernet/intel/ice/ice_fdir.h
index 84b40298a513..93f3f0d9d37b 100644
--- a/drivers/net/ethernet/intel/ice/ice_fdir.h
+++ b/drivers/net/ethernet/intel/ice/ice_fdir.h
@@ -31,6 +31,8 @@
 #define ICE_IPV6_HLIM_OFFSET		21
 #define ICE_IPV6_PROTO_OFFSET		20
 
+#define ICE_FDIR_MAX_FLTRS		16384
+
 /* IP v4 has 2 flag bits that enable fragment processing: DF and MF. DF
  * requests that the packet not be fragmented. MF indicates that a packet has
  * been fragmented.
@@ -138,9 +140,12 @@ struct ice_fdir_fltr {
 	u16 q_index;
 	u16 dest_vsi;
 	u8 dest_ctl;
+	u8 cnt_ena;
 	u8 fltr_status;
 	u16 cnt_index;
 	u32 fltr_id;
+	u8 fdid_prio;
+	u8 comp_report;
 };
 
 /* Dummy packet filter definition structure */
diff --git a/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h b/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h
index 4ec24c3e813f..b30c22358c0a 100644
--- a/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h
+++ b/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h
@@ -55,6 +55,7 @@ struct ice_fltr_desc {
 #define ICE_FXD_FLTR_QW0_COMP_REPORT_M	\
 				(0x3ULL << ICE_FXD_FLTR_QW0_COMP_REPORT_S)
 #define ICE_FXD_FLTR_QW0_COMP_REPORT_SW_FAIL	0x1ULL
+#define ICE_FXD_FLTR_QW0_COMP_REPORT_SW		0x2ULL
 
 #define ICE_FXD_FLTR_QW0_FD_SPACE_S	14
 #define ICE_FXD_FLTR_QW0_FD_SPACE_M	(0x3ULL << ICE_FXD_FLTR_QW0_FD_SPACE_S)
@@ -128,6 +129,7 @@ struct ice_fltr_desc {
 #define ICE_FXD_FLTR_QW1_FDID_PRI_S	25
 #define ICE_FXD_FLTR_QW1_FDID_PRI_M	(0x7ULL << ICE_FXD_FLTR_QW1_FDID_PRI_S)
 #define ICE_FXD_FLTR_QW1_FDID_PRI_ONE	0x1ULL
+#define ICE_FXD_FLTR_QW1_FDID_PRI_THREE	0x3ULL
 
 #define ICE_FXD_FLTR_QW1_FDID_MDID_S	28
 #define ICE_FXD_FLTR_QW1_FDID_MDID_M	(0xFULL << ICE_FXD_FLTR_QW1_FDID_MDID_S)
diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.c
new file mode 100644
index 000000000000..6e7e8531d6ec
--- /dev/null
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.c
@@ -0,0 +1,1034 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2021, Intel Corporation. */
+
+#include "ice.h"
+#include "ice_base.h"
+#include "ice_lib.h"
+#include "ice_flow.h"
+
+#define to_fltr_conf_from_desc(p) \
+	container_of(p, struct virtchnl_fdir_fltr_conf, input)
+
+#define ICE_FLOW_PROF_TYPE_S	0
+#define ICE_FLOW_PROF_TYPE_M	(0xFFFFFFFFULL << ICE_FLOW_PROF_TYPE_S)
+#define ICE_FLOW_PROF_VSI_S	32
+#define ICE_FLOW_PROF_VSI_M	(0xFFFFFFFFULL << ICE_FLOW_PROF_VSI_S)
+
+/* Flow profile ID format:
+ * [0:31] - flow type, flow + tun_offs
+ * [32:63] - VSI index
+ */
+#define ICE_FLOW_PROF_FD(vsi, flow, tun_offs) \
+	((u64)(((((flow) + (tun_offs)) & ICE_FLOW_PROF_TYPE_M)) | \
+	      (((u64)(vsi) << ICE_FLOW_PROF_VSI_S) & ICE_FLOW_PROF_VSI_M)))
+
+struct virtchnl_fdir_fltr_conf {
+	struct ice_fdir_fltr input;
+};
+
+struct virtchnl_fdir_inset_map {
+	enum virtchnl_proto_hdr_field field;
+	enum ice_flow_field fld;
+};
+
+static const struct virtchnl_fdir_inset_map fdir_inset_map[] = {
+	{VIRTCHNL_PROTO_HDR_IPV4_SRC, ICE_FLOW_FIELD_IDX_IPV4_SA},
+	{VIRTCHNL_PROTO_HDR_IPV4_DST, ICE_FLOW_FIELD_IDX_IPV4_DA},
+	{VIRTCHNL_PROTO_HDR_IPV4_DSCP, ICE_FLOW_FIELD_IDX_IPV4_DSCP},
+	{VIRTCHNL_PROTO_HDR_IPV4_TTL, ICE_FLOW_FIELD_IDX_IPV4_TTL},
+	{VIRTCHNL_PROTO_HDR_IPV4_PROT, ICE_FLOW_FIELD_IDX_IPV4_PROT},
+	{VIRTCHNL_PROTO_HDR_IPV6_SRC, ICE_FLOW_FIELD_IDX_IPV6_SA},
+	{VIRTCHNL_PROTO_HDR_IPV6_DST, ICE_FLOW_FIELD_IDX_IPV6_DA},
+	{VIRTCHNL_PROTO_HDR_IPV6_TC, ICE_FLOW_FIELD_IDX_IPV6_DSCP},
+	{VIRTCHNL_PROTO_HDR_IPV6_HOP_LIMIT, ICE_FLOW_FIELD_IDX_IPV6_TTL},
+	{VIRTCHNL_PROTO_HDR_IPV6_PROT, ICE_FLOW_FIELD_IDX_IPV6_PROT},
+	{VIRTCHNL_PROTO_HDR_UDP_SRC_PORT, ICE_FLOW_FIELD_IDX_UDP_SRC_PORT},
+	{VIRTCHNL_PROTO_HDR_UDP_DST_PORT, ICE_FLOW_FIELD_IDX_UDP_DST_PORT},
+	{VIRTCHNL_PROTO_HDR_TCP_SRC_PORT, ICE_FLOW_FIELD_IDX_TCP_SRC_PORT},
+	{VIRTCHNL_PROTO_HDR_TCP_DST_PORT, ICE_FLOW_FIELD_IDX_TCP_DST_PORT},
+	{VIRTCHNL_PROTO_HDR_SCTP_SRC_PORT, ICE_FLOW_FIELD_IDX_SCTP_SRC_PORT},
+	{VIRTCHNL_PROTO_HDR_SCTP_DST_PORT, ICE_FLOW_FIELD_IDX_SCTP_DST_PORT},
+};
+
+/**
+ * ice_vc_fdir_param_check
+ * @vf: pointer to the VF structure
+ * @vsi_id: VF relative VSI ID
+ *
+ * Check for the valid VSI ID, PF's state and VF's state
+ *
+ * Return: 0 on success, and -EINVAL on error.
+ */
+static int
+ice_vc_fdir_param_check(struct ice_vf *vf, u16 vsi_id)
+{
+	struct ice_pf *pf = vf->pf;
+
+	if (!test_bit(ICE_FLAG_FD_ENA, pf->flags))
+		return -EINVAL;
+
+	if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states))
+		return -EINVAL;
+
+	if (!(vf->driver_caps & VIRTCHNL_VF_OFFLOAD_FDIR_PF))
+		return -EINVAL;
+
+	if (vsi_id != vf->lan_vsi_num)
+		return -EINVAL;
+
+	if (!ice_vc_isvalid_vsi_id(vf, vsi_id))
+		return -EINVAL;
+
+	if (!pf->vsi[vf->lan_vsi_idx])
+		return -EINVAL;
+
+	return 0;
+}
+
+/**
+ * ice_vf_start_ctrl_vsi
+ * @vf: pointer to the VF structure
+ *
+ * Allocate ctrl_vsi for the first time and open the ctrl_vsi port for VF
+ *
+ * Return: 0 on success, and other on error.
+ */
+static int ice_vf_start_ctrl_vsi(struct ice_vf *vf)
+{
+	struct ice_pf *pf = vf->pf;
+	struct ice_vsi *ctrl_vsi;
+	struct device *dev;
+	int err;
+
+	dev = ice_pf_to_dev(pf);
+	if (vf->ctrl_vsi_idx != ICE_NO_VSI)
+		return -EEXIST;
+
+	ctrl_vsi = ice_vf_ctrl_vsi_setup(vf);
+	if (!ctrl_vsi) {
+		dev_dbg(dev, "Could not setup control VSI for VF %d\n",
+			vf->vf_id);
+		return -ENOMEM;
+	}
+
+	err = ice_vsi_open_ctrl(ctrl_vsi);
+	if (err) {
+		dev_dbg(dev, "Could not open control VSI for VF %d\n",
+			vf->vf_id);
+		goto err_vsi_open;
+	}
+
+	return 0;
+
+err_vsi_open:
+	ice_vsi_release(ctrl_vsi);
+	if (vf->ctrl_vsi_idx != ICE_NO_VSI) {
+		pf->vsi[vf->ctrl_vsi_idx] = NULL;
+		vf->ctrl_vsi_idx = ICE_NO_VSI;
+	}
+	return err;
+}
+
+/**
+ * ice_vc_fdir_alloc_prof - allocate profile for this filter flow type
+ * @vf: pointer to the VF structure
+ * @flow: filter flow type
+ *
+ * Return: 0 on success, and other on error.
+ */
+static int
+ice_vc_fdir_alloc_prof(struct ice_vf *vf, enum ice_fltr_ptype flow)
+{
+	struct ice_vf_fdir *fdir = &vf->fdir;
+
+	if (!fdir->fdir_prof) {
+		fdir->fdir_prof = devm_kcalloc(ice_pf_to_dev(vf->pf),
+					       ICE_FLTR_PTYPE_MAX,
+					       sizeof(*fdir->fdir_prof),
+					       GFP_KERNEL);
+		if (!fdir->fdir_prof)
+			return -ENOMEM;
+	}
+
+	if (!fdir->fdir_prof[flow]) {
+		fdir->fdir_prof[flow] = devm_kzalloc(ice_pf_to_dev(vf->pf),
+						     sizeof(**fdir->fdir_prof),
+						     GFP_KERNEL);
+		if (!fdir->fdir_prof[flow])
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/**
+ * ice_vc_fdir_free_prof - free profile for this filter flow type
+ * @vf: pointer to the VF structure
+ * @flow: filter flow type
+ */
+static void
+ice_vc_fdir_free_prof(struct ice_vf *vf, enum ice_fltr_ptype flow)
+{
+	struct ice_vf_fdir *fdir = &vf->fdir;
+
+	if (!fdir->fdir_prof)
+		return;
+
+	if (!fdir->fdir_prof[flow])
+		return;
+
+	devm_kfree(ice_pf_to_dev(vf->pf), fdir->fdir_prof[flow]);
+	fdir->fdir_prof[flow] = NULL;
+}
+
+/**
+ * ice_vc_fdir_free_prof_all - free all the profile for this VF
+ * @vf: pointer to the VF structure
+ */
+static void ice_vc_fdir_free_prof_all(struct ice_vf *vf)
+{
+	struct ice_vf_fdir *fdir = &vf->fdir;
+	enum ice_fltr_ptype flow;
+
+	if (!fdir->fdir_prof)
+		return;
+
+	for (flow = ICE_FLTR_PTYPE_NONF_NONE; flow < ICE_FLTR_PTYPE_MAX; flow++)
+		ice_vc_fdir_free_prof(vf, flow);
+
+	devm_kfree(ice_pf_to_dev(vf->pf), fdir->fdir_prof);
+	fdir->fdir_prof = NULL;
+}
+
+/**
+ * ice_vc_fdir_parse_flow_fld
+ * @proto_hdr: virtual channel protocol filter header
+ * @conf: FDIR configuration for each filter
+ * @fld: field type array
+ * @fld_cnt: field counter
+ *
+ * Parse the virtual channel filter header and store them into field type array
+ *
+ * Return: 0 on success, and other on error.
+ */
+static int
+ice_vc_fdir_parse_flow_fld(struct virtchnl_proto_hdr *proto_hdr,
+			   struct virtchnl_fdir_fltr_conf *conf,
+			   enum ice_flow_field *fld, int *fld_cnt)
+{
+	struct virtchnl_proto_hdr hdr;
+	u32 i;
+
+	memcpy(&hdr, proto_hdr, sizeof(hdr));
+
+	for (i = 0; (i < ARRAY_SIZE(fdir_inset_map)) &&
+	     VIRTCHNL_GET_PROTO_HDR_FIELD(&hdr); i++)
+		if (VIRTCHNL_TEST_PROTO_HDR(&hdr, fdir_inset_map[i].field)) {
+			fld[*fld_cnt] = fdir_inset_map[i].fld;
+			*fld_cnt += 1;
+			if (*fld_cnt >= ICE_FLOW_FIELD_IDX_MAX)
+				return -EINVAL;
+			VIRTCHNL_DEL_PROTO_HDR_FIELD(&hdr,
+						     fdir_inset_map[i].field);
+		}
+
+	return 0;
+}
+
+/**
+ * ice_vc_fdir_set_flow_fld
+ * @vf: pointer to the VF structure
+ * @fltr: virtual channel add cmd buffer
+ * @conf: FDIR configuration for each filter
+ * @seg: array of one or more packet segments that describe the flow
+ *
+ * Parse the virtual channel add msg buffer's field vector and store them into
+ * flow's packet segment field
+ *
+ * Return: 0 on success, and other on error.
+ */
+static int
+ice_vc_fdir_set_flow_fld(struct ice_vf *vf, struct virtchnl_fdir_add *fltr,
+			 struct virtchnl_fdir_fltr_conf *conf,
+			 struct ice_flow_seg_info *seg)
+{
+	struct virtchnl_fdir_rule *rule = &fltr->rule_cfg;
+	enum ice_flow_field fld[ICE_FLOW_FIELD_IDX_MAX];
+	struct device *dev = ice_pf_to_dev(vf->pf);
+	struct virtchnl_proto_hdrs *proto;
+	int fld_cnt = 0;
+	int i;
+
+	proto = &rule->proto_hdrs;
+	for (i = 0; i < proto->count; i++) {
+		struct virtchnl_proto_hdr *hdr = &proto->proto_hdr[i];
+		int ret;
+
+		ret = ice_vc_fdir_parse_flow_fld(hdr, conf, fld, &fld_cnt);
+		if (ret)
+			return ret;
+	}
+
+	if (fld_cnt == 0) {
+		dev_dbg(dev, "Empty input set for VF %d\n", vf->vf_id);
+		return -EINVAL;
+	}
+
+	for (i = 0; i < fld_cnt; i++)
+		ice_flow_set_fld(seg, fld[i],
+				 ICE_FLOW_FLD_OFF_INVAL,
+				 ICE_FLOW_FLD_OFF_INVAL,
+				 ICE_FLOW_FLD_OFF_INVAL, false);
+
+	return 0;
+}
+
+/**
+ * ice_vc_fdir_set_flow_hdr - config the flow's packet segment header
+ * @vf: pointer to the VF structure
+ * @conf: FDIR configuration for each filter
+ * @seg: array of one or more packet segments that describe the flow
+ *
+ * Return: 0 on success, and other on error.
+ */
+static int
+ice_vc_fdir_set_flow_hdr(struct ice_vf *vf,
+			 struct virtchnl_fdir_fltr_conf *conf,
+			 struct ice_flow_seg_info *seg)
+{
+	enum ice_fltr_ptype flow = conf->input.flow_type;
+	struct device *dev = ice_pf_to_dev(vf->pf);
+
+	switch (flow) {
+	case ICE_FLTR_PTYPE_NONF_IPV4_OTHER:
+		ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_IPV4 |
+				  ICE_FLOW_SEG_HDR_IPV_OTHER);
+		break;
+	case ICE_FLTR_PTYPE_NONF_IPV4_TCP:
+		ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_TCP |
+				  ICE_FLOW_SEG_HDR_IPV4 |
+				  ICE_FLOW_SEG_HDR_IPV_OTHER);
+		break;
+	case ICE_FLTR_PTYPE_NONF_IPV4_UDP:
+		ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_UDP |
+				  ICE_FLOW_SEG_HDR_IPV4 |
+				  ICE_FLOW_SEG_HDR_IPV_OTHER);
+		break;
+	case ICE_FLTR_PTYPE_NONF_IPV4_SCTP:
+		ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_SCTP |
+				  ICE_FLOW_SEG_HDR_IPV4 |
+				  ICE_FLOW_SEG_HDR_IPV_OTHER);
+		break;
+	case ICE_FLTR_PTYPE_NONF_IPV6_OTHER:
+		ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_IPV6 |
+				  ICE_FLOW_SEG_HDR_IPV_OTHER);
+		break;
+	case ICE_FLTR_PTYPE_NONF_IPV6_TCP:
+		ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_TCP |
+				  ICE_FLOW_SEG_HDR_IPV6 |
+				  ICE_FLOW_SEG_HDR_IPV_OTHER);
+		break;
+	case ICE_FLTR_PTYPE_NONF_IPV6_UDP:
+		ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_UDP |
+				  ICE_FLOW_SEG_HDR_IPV6 |
+				  ICE_FLOW_SEG_HDR_IPV_OTHER);
+		break;
+	case ICE_FLTR_PTYPE_NONF_IPV6_SCTP:
+		ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_SCTP |
+				  ICE_FLOW_SEG_HDR_IPV6 |
+				  ICE_FLOW_SEG_HDR_IPV_OTHER);
+		break;
+	default:
+		dev_dbg(dev, "Invalid flow type 0x%x for VF %d failed\n",
+			flow, vf->vf_id);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/**
+ * ice_vc_fdir_rem_prof - remove profile for this filter flow type
+ * @vf: pointer to the VF structure
+ * @flow: filter flow type
+ * @tun: 0 implies non-tunnel type filter, 1 implies tunnel type filter
+ */
+static void
+ice_vc_fdir_rem_prof(struct ice_vf *vf, enum ice_fltr_ptype flow, int tun)
+{
+	struct ice_vf_fdir *fdir = &vf->fdir;
+	struct ice_fd_hw_prof *vf_prof;
+	struct ice_pf *pf = vf->pf;
+	struct ice_vsi *vf_vsi;
+	struct device *dev;
+	struct ice_hw *hw;
+	u64 prof_id;
+	int i;
+
+	dev = ice_pf_to_dev(pf);
+	hw = &pf->hw;
+	if (!fdir->fdir_prof || !fdir->fdir_prof[flow])
+		return;
+
+	vf_prof = fdir->fdir_prof[flow];
+
+	vf_vsi = pf->vsi[vf->lan_vsi_idx];
+	if (!vf_vsi) {
+		dev_dbg(dev, "NULL vf %d vsi pointer\n", vf->vf_id);
+		return;
+	}
+
+	if (!fdir->prof_entry_cnt[flow][tun])
+		return;
+
+	prof_id = ICE_FLOW_PROF_FD(vf_vsi->vsi_num,
+				   flow, tun ? ICE_FLTR_PTYPE_MAX : 0);
+
+	for (i = 0; i < fdir->prof_entry_cnt[flow][tun]; i++)
+		if (vf_prof->entry_h[i][tun]) {
+			u16 vsi_num = ice_get_hw_vsi_num(hw, vf_prof->vsi_h[i]);
+
+			ice_rem_prof_id_flow(hw, ICE_BLK_FD, vsi_num, prof_id);
+			ice_flow_rem_entry(hw, ICE_BLK_FD,
+					   vf_prof->entry_h[i][tun]);
+			vf_prof->entry_h[i][tun] = 0;
+		}
+
+	ice_flow_rem_prof(hw, ICE_BLK_FD, prof_id);
+	devm_kfree(dev, vf_prof->fdir_seg[tun]);
+	vf_prof->fdir_seg[tun] = NULL;
+
+	for (i = 0; i < vf_prof->cnt; i++)
+		vf_prof->vsi_h[i] = 0;
+
+	fdir->prof_entry_cnt[flow][tun] = 0;
+}
+
+/**
+ * ice_vc_fdir_rem_prof_all - remove profile for this VF
+ * @vf: pointer to the VF structure
+ */
+static void ice_vc_fdir_rem_prof_all(struct ice_vf *vf)
+{
+	enum ice_fltr_ptype flow;
+
+	for (flow = ICE_FLTR_PTYPE_NONF_NONE;
+	     flow < ICE_FLTR_PTYPE_MAX; flow++) {
+		ice_vc_fdir_rem_prof(vf, flow, 0);
+		ice_vc_fdir_rem_prof(vf, flow, 1);
+	}
+}
+
+/**
+ * ice_vc_fdir_write_flow_prof
+ * @vf: pointer to the VF structure
+ * @flow: filter flow type
+ * @seg: array of one or more packet segments that describe the flow
+ * @tun: 0 implies non-tunnel type filter, 1 implies tunnel type filter
+ *
+ * Write the flow's profile config and packet segment into the hardware
+ *
+ * Return: 0 on success, and other on error.
+ */
+static int
+ice_vc_fdir_write_flow_prof(struct ice_vf *vf, enum ice_fltr_ptype flow,
+			    struct ice_flow_seg_info *seg, int tun)
+{
+	struct ice_vf_fdir *fdir = &vf->fdir;
+	struct ice_vsi *vf_vsi, *ctrl_vsi;
+	struct ice_flow_seg_info *old_seg;
+	struct ice_flow_prof *prof = NULL;
+	struct ice_fd_hw_prof *vf_prof;
+	enum ice_status status;
+	struct device *dev;
+	struct ice_pf *pf;
+	struct ice_hw *hw;
+	u64 entry1_h = 0;
+	u64 entry2_h = 0;
+	u64 prof_id;
+	int ret;
+
+	pf = vf->pf;
+	dev = ice_pf_to_dev(pf);
+	hw = &pf->hw;
+	vf_vsi = pf->vsi[vf->lan_vsi_idx];
+	if (!vf_vsi)
+		return -EINVAL;
+
+	ctrl_vsi = pf->vsi[vf->ctrl_vsi_idx];
+	if (!ctrl_vsi)
+		return -EINVAL;
+
+	vf_prof = fdir->fdir_prof[flow];
+	old_seg = vf_prof->fdir_seg[tun];
+	if (old_seg) {
+		if (!memcmp(old_seg, seg, sizeof(*seg))) {
+			dev_dbg(dev, "Duplicated profile for VF %d!\n",
+				vf->vf_id);
+			return -EEXIST;
+		}
+
+		if (fdir->fdir_fltr_cnt[flow][tun]) {
+			ret = -EINVAL;
+			dev_dbg(dev, "Input set conflicts for VF %d\n",
+				vf->vf_id);
+			goto err_exit;
+		}
+
+		/* remove previously allocated profile */
+		ice_vc_fdir_rem_prof(vf, flow, tun);
+	}
+
+	prof_id = ICE_FLOW_PROF_FD(vf_vsi->vsi_num, flow,
+				   tun ? ICE_FLTR_PTYPE_MAX : 0);
+
+	status = ice_flow_add_prof(hw, ICE_BLK_FD, ICE_FLOW_RX, prof_id, seg,
+				   tun + 1, &prof);
+	ret = ice_status_to_errno(status);
+	if (ret) {
+		dev_dbg(dev, "Could not add VSI flow 0x%x for VF %d\n",
+			flow, vf->vf_id);
+		goto err_exit;
+	}
+
+	status = ice_flow_add_entry(hw, ICE_BLK_FD, prof_id, vf_vsi->idx,
+				    vf_vsi->idx, ICE_FLOW_PRIO_NORMAL,
+				    seg, &entry1_h);
+	ret = ice_status_to_errno(status);
+	if (ret) {
+		dev_dbg(dev, "Could not add flow 0x%x VSI entry for VF %d\n",
+			flow, vf->vf_id);
+		goto err_prof;
+	}
+
+	status = ice_flow_add_entry(hw, ICE_BLK_FD, prof_id, vf_vsi->idx,
+				    ctrl_vsi->idx, ICE_FLOW_PRIO_NORMAL,
+				    seg, &entry2_h);
+	ret = ice_status_to_errno(status);
+	if (ret) {
+		dev_dbg(dev,
+			"Could not add flow 0x%x Ctrl VSI entry for VF %d\n",
+			flow, vf->vf_id);
+		goto err_entry_1;
+	}
+
+	vf_prof->fdir_seg[tun] = seg;
+	vf_prof->cnt = 0;
+	fdir->prof_entry_cnt[flow][tun] = 0;
+
+	vf_prof->entry_h[vf_prof->cnt][tun] = entry1_h;
+	vf_prof->vsi_h[vf_prof->cnt] = vf_vsi->idx;
+	vf_prof->cnt++;
+	fdir->prof_entry_cnt[flow][tun]++;
+
+	vf_prof->entry_h[vf_prof->cnt][tun] = entry2_h;
+	vf_prof->vsi_h[vf_prof->cnt] = ctrl_vsi->idx;
+	vf_prof->cnt++;
+	fdir->prof_entry_cnt[flow][tun]++;
+
+	return 0;
+
+err_entry_1:
+	ice_rem_prof_id_flow(hw, ICE_BLK_FD,
+			     ice_get_hw_vsi_num(hw, vf_vsi->idx), prof_id);
+	ice_flow_rem_entry(hw, ICE_BLK_FD, entry1_h);
+err_prof:
+	ice_flow_rem_prof(hw, ICE_BLK_FD, prof_id);
+err_exit:
+	return ret;
+}
+
+/**
+ * ice_vc_fdir_config_input_set
+ * @vf: pointer to the VF structure
+ * @fltr: virtual channel add cmd buffer
+ * @conf: FDIR configuration for each filter
+ * @tun: 0 implies non-tunnel type filter, 1 implies tunnel type filter
+ *
+ * Config the input set type and value for virtual channel add msg buffer
+ *
+ * Return: 0 on success, and other on error.
+ */
+static int
+ice_vc_fdir_config_input_set(struct ice_vf *vf, struct virtchnl_fdir_add *fltr,
+			     struct virtchnl_fdir_fltr_conf *conf, int tun)
+{
+	struct ice_fdir_fltr *input = &conf->input;
+	struct device *dev = ice_pf_to_dev(vf->pf);
+	struct ice_flow_seg_info *seg;
+	enum ice_fltr_ptype flow;
+	int ret;
+
+	flow = input->flow_type;
+	ret = ice_vc_fdir_alloc_prof(vf, flow);
+	if (ret) {
+		dev_dbg(dev, "Alloc flow prof for VF %d failed\n", vf->vf_id);
+		return ret;
+	}
+
+	seg = devm_kzalloc(dev, sizeof(*seg), GFP_KERNEL);
+	if (!seg)
+		return -ENOMEM;
+
+	ret = ice_vc_fdir_set_flow_fld(vf, fltr, conf, seg);
+	if (ret) {
+		dev_dbg(dev, "Set flow field for VF %d failed\n", vf->vf_id);
+		goto err_exit;
+	}
+
+	ret = ice_vc_fdir_set_flow_hdr(vf, conf, seg);
+	if (ret) {
+		dev_dbg(dev, "Set flow hdr for VF %d failed\n", vf->vf_id);
+		goto err_exit;
+	}
+
+	ret = ice_vc_fdir_write_flow_prof(vf, flow, seg, tun);
+	if (ret == -EEXIST) {
+		devm_kfree(dev, seg);
+	} else if (ret) {
+		dev_dbg(dev, "Write flow profile for VF %d failed\n",
+			vf->vf_id);
+		goto err_exit;
+	}
+
+	return 0;
+
+err_exit:
+	devm_kfree(dev, seg);
+	return ret;
+}
+
+/**
+ * ice_vc_validate_fdir_fltr - validate the virtual channel filter
+ * @vf: pointer to the VF info
+ * @fltr: virtual channel add cmd buffer
+ * @conf: FDIR configuration for each filter
+ *
+ * Return: 0 on success, and other on error.
+ */
+static int
+ice_vc_validate_fdir_fltr(struct ice_vf *vf, struct virtchnl_fdir_add *fltr,
+			  struct virtchnl_fdir_fltr_conf *conf)
+{
+	/* Todo: rule validation */
+	return -EINVAL;
+}
+
+/**
+ * ice_vc_fdir_comp_rules - compare if two filter rules have the same value
+ * @conf_a: FDIR configuration for filter a
+ * @conf_b: FDIR configuration for filter b
+ *
+ * Return: true if the two rules have the same value, false otherwise.
+ */
+static bool
+ice_vc_fdir_comp_rules(struct virtchnl_fdir_fltr_conf *conf_a,
+		       struct virtchnl_fdir_fltr_conf *conf_b)
+{
+	struct ice_fdir_fltr *a = &conf_a->input;
+	struct ice_fdir_fltr *b = &conf_b->input;
+
+	if (a->flow_type != b->flow_type)
+		return false;
+	if (memcmp(&a->ip, &b->ip, sizeof(a->ip)))
+		return false;
+	if (memcmp(&a->mask, &b->mask, sizeof(a->mask)))
+		return false;
+	if (memcmp(&a->ext_data, &b->ext_data, sizeof(a->ext_data)))
+		return false;
+	if (memcmp(&a->ext_mask, &b->ext_mask, sizeof(a->ext_mask)))
+		return false;
+
+	return true;
+}
+
+/**
+ * ice_vc_fdir_is_dup_fltr
+ * @vf: pointer to the VF info
+ * @conf: FDIR configuration for each filter
+ *
+ * Check if there is duplicated rule with same conf value
+ *
+ * Return: true if a duplicated rule is found, false otherwise.
+ */
+static bool
+ice_vc_fdir_is_dup_fltr(struct ice_vf *vf, struct virtchnl_fdir_fltr_conf *conf)
+{
+	struct ice_fdir_fltr *desc;
+	bool ret;
+
+	list_for_each_entry(desc, &vf->fdir.fdir_rule_list, fltr_node) {
+		struct virtchnl_fdir_fltr_conf *node =
+				to_fltr_conf_from_desc(desc);
+
+		ret = ice_vc_fdir_comp_rules(node, conf);
+		if (ret)
+			return true;
+	}
+
+	return false;
+}
+
+/**
+ * ice_vc_fdir_insert_entry
+ * @vf: pointer to the VF info
+ * @conf: FDIR configuration for each filter
+ * @id: pointer to ID value allocated by driver
+ *
+ * Insert FDIR conf entry into list and allocate ID for this filter
+ *
+ * Return: 0 on success, and other on error.
+ */
+static int
+ice_vc_fdir_insert_entry(struct ice_vf *vf,
+			 struct virtchnl_fdir_fltr_conf *conf, u32 *id)
+{
+	struct ice_fdir_fltr *input = &conf->input;
+	int i;
+
+	/* alloc ID corresponding with conf */
+	i = idr_alloc(&vf->fdir.fdir_rule_idr, conf, 0,
+		      ICE_FDIR_MAX_FLTRS, GFP_KERNEL);
+	if (i < 0)
+		return -EINVAL;
+	*id = i;
+
+	list_add(&input->fltr_node, &vf->fdir.fdir_rule_list);
+	return 0;
+}
+
+/**
+ * ice_vc_fdir_remove_entry - remove FDIR conf entry by ID value
+ * @vf: pointer to the VF info
+ * @conf: FDIR configuration for each filter
+ * @id: filter rule's ID
+ */
+static void
+ice_vc_fdir_remove_entry(struct ice_vf *vf,
+			 struct virtchnl_fdir_fltr_conf *conf, u32 id)
+{
+	struct ice_fdir_fltr *input = &conf->input;
+
+	idr_remove(&vf->fdir.fdir_rule_idr, id);
+	list_del(&input->fltr_node);
+}
+
+/**
+ * ice_vc_fdir_lookup_entry - lookup FDIR conf entry by ID value
+ * @vf: pointer to the VF info
+ * @id: filter rule's ID
+ *
+ * Return: NULL on error, and other on success.
+ */
+static struct virtchnl_fdir_fltr_conf *
+ice_vc_fdir_lookup_entry(struct ice_vf *vf, u32 id)
+{
+	return idr_find(&vf->fdir.fdir_rule_idr, id);
+}
+
+/**
+ * ice_vc_fdir_flush_entry - remove all FDIR conf entry
+ * @vf: pointer to the VF info
+ */
+static void ice_vc_fdir_flush_entry(struct ice_vf *vf)
+{
+	struct virtchnl_fdir_fltr_conf *conf;
+	struct ice_fdir_fltr *desc, *temp;
+
+	list_for_each_entry_safe(desc, temp,
+				 &vf->fdir.fdir_rule_list, fltr_node) {
+		conf = to_fltr_conf_from_desc(desc);
+		list_del(&desc->fltr_node);
+		devm_kfree(ice_pf_to_dev(vf->pf), conf);
+	}
+}
+
+/**
+ * ice_vc_fdir_write_fltr - write filter rule into hardware
+ * @vf: pointer to the VF info
+ * @conf: FDIR configuration for each filter
+ * @add: true implies add rule, false implies del rules
+ * @is_tun: false implies non-tunnel type filter, true implies tunnel filter
+ *
+ * Return: 0 on success, and other on error.
+ */
+static int ice_vc_fdir_write_fltr(struct ice_vf *vf,
+				  struct virtchnl_fdir_fltr_conf *conf,
+				  bool add, bool is_tun)
+{
+	struct ice_fdir_fltr *input = &conf->input;
+	struct ice_vsi *vsi, *ctrl_vsi;
+	struct ice_fltr_desc desc;
+	enum ice_status status;
+	struct device *dev;
+	struct ice_pf *pf;
+	struct ice_hw *hw;
+	int ret;
+	u8 *pkt;
+
+	pf = vf->pf;
+	dev = ice_pf_to_dev(pf);
+	hw = &pf->hw;
+	vsi = pf->vsi[vf->lan_vsi_idx];
+	if (!vsi) {
+		dev_dbg(dev, "Invalid vsi for VF %d\n", vf->vf_id);
+		return -EINVAL;
+	}
+
+	input->dest_vsi = vsi->idx;
+	input->comp_report = ICE_FXD_FLTR_QW0_COMP_REPORT_SW_FAIL;
+
+	ctrl_vsi = pf->vsi[vf->ctrl_vsi_idx];
+	if (!ctrl_vsi) {
+		dev_dbg(dev, "Invalid ctrl_vsi for VF %d\n", vf->vf_id);
+		return -EINVAL;
+	}
+
+	pkt = devm_kzalloc(dev, ICE_FDIR_MAX_RAW_PKT_SIZE, GFP_KERNEL);
+	if (!pkt)
+		return -ENOMEM;
+
+	ice_fdir_get_prgm_desc(hw, input, &desc, add);
+	status = ice_fdir_get_gen_prgm_pkt(hw, input, pkt, false, is_tun);
+	ret = ice_status_to_errno(status);
+	if (ret) {
+		dev_dbg(dev, "Gen training pkt for VF %d ptype %d failed\n",
+			vf->vf_id, input->flow_type);
+		goto err_free_pkt;
+	}
+
+	ret = ice_prgm_fdir_fltr(ctrl_vsi, &desc, pkt);
+	if (ret)
+		goto err_free_pkt;
+
+	return 0;
+
+err_free_pkt:
+	devm_kfree(dev, pkt);
+	return ret;
+}
+
+/**
+ * ice_vc_add_fdir_fltr - add a FDIR filter for VF by the msg buffer
+ * @vf: pointer to the VF info
+ * @msg: pointer to the msg buffer
+ *
+ * Return: 0 on success, and other on error.
+ */
+int ice_vc_add_fdir_fltr(struct ice_vf *vf, u8 *msg)
+{
+	struct virtchnl_fdir_add *fltr = (struct virtchnl_fdir_add *)msg;
+	struct virtchnl_fdir_add *stat = NULL;
+	struct virtchnl_fdir_fltr_conf *conf;
+	enum virtchnl_status_code v_ret;
+	struct device *dev;
+	struct ice_pf *pf;
+	int is_tun = 0;
+	int len = 0;
+	int ret;
+
+	pf = vf->pf;
+	dev = ice_pf_to_dev(pf);
+	ret = ice_vc_fdir_param_check(vf, fltr->vsi_id);
+	if (ret) {
+		v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+		dev_dbg(dev, "Parameter check for VF %d failed\n", vf->vf_id);
+		goto err_exit;
+	}
+
+	ret = ice_vf_start_ctrl_vsi(vf);
+	if (ret && (ret != -EEXIST)) {
+		v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+		dev_err(dev, "Init FDIR for VF %d failed, ret:%d\n",
+			vf->vf_id, ret);
+		goto err_exit;
+	}
+
+	stat = kzalloc(sizeof(*stat), GFP_KERNEL);
+	if (!stat) {
+		v_ret = VIRTCHNL_STATUS_ERR_NO_MEMORY;
+		dev_dbg(dev, "Alloc stat for VF %d failed\n", vf->vf_id);
+		goto err_exit;
+	}
+
+	conf = devm_kzalloc(dev, sizeof(*conf), GFP_KERNEL);
+	if (!conf) {
+		v_ret = VIRTCHNL_STATUS_ERR_NO_MEMORY;
+		dev_dbg(dev, "Alloc conf for VF %d failed\n", vf->vf_id);
+		goto err_exit;
+	}
+
+	len = sizeof(*stat);
+	ret = ice_vc_validate_fdir_fltr(vf, fltr, conf);
+	if (ret) {
+		v_ret = VIRTCHNL_STATUS_SUCCESS;
+		stat->status = VIRTCHNL_FDIR_FAILURE_RULE_INVALID;
+		dev_dbg(dev, "Invalid FDIR filter from VF %d\n", vf->vf_id);
+		goto err_free_conf;
+	}
+
+	if (fltr->validate_only) {
+		v_ret = VIRTCHNL_STATUS_SUCCESS;
+		stat->status = VIRTCHNL_FDIR_SUCCESS;
+		devm_kfree(dev, conf);
+		ret = ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ADD_FDIR_FILTER,
+					    v_ret, (u8 *)stat, len);
+		goto exit;
+	}
+
+	ret = ice_vc_fdir_config_input_set(vf, fltr, conf, is_tun);
+	if (ret) {
+		v_ret = VIRTCHNL_STATUS_SUCCESS;
+		stat->status = VIRTCHNL_FDIR_FAILURE_RULE_CONFLICT;
+		dev_err(dev, "VF %d: FDIR input set configure failed, ret:%d\n",
+			vf->vf_id, ret);
+		goto err_free_conf;
+	}
+
+	ret = ice_vc_fdir_is_dup_fltr(vf, conf);
+	if (ret) {
+		v_ret = VIRTCHNL_STATUS_SUCCESS;
+		stat->status = VIRTCHNL_FDIR_FAILURE_RULE_EXIST;
+		dev_dbg(dev, "VF %d: duplicated FDIR rule detected\n",
+			vf->vf_id);
+		goto err_free_conf;
+	}
+
+	ret = ice_vc_fdir_insert_entry(vf, conf, &stat->flow_id);
+	if (ret) {
+		v_ret = VIRTCHNL_STATUS_SUCCESS;
+		stat->status = VIRTCHNL_FDIR_FAILURE_RULE_NORESOURCE;
+		dev_dbg(dev, "VF %d: insert FDIR list failed\n", vf->vf_id);
+		goto err_free_conf;
+	}
+
+	ret = ice_vc_fdir_write_fltr(vf, conf, true, is_tun);
+	if (ret) {
+		v_ret = VIRTCHNL_STATUS_SUCCESS;
+		stat->status = VIRTCHNL_FDIR_FAILURE_RULE_NORESOURCE;
+		dev_err(dev, "VF %d: writing FDIR rule failed, ret:%d\n",
+			vf->vf_id, ret);
+		goto err_rem_entry;
+	}
+
+	vf->fdir.fdir_fltr_cnt[conf->input.flow_type][is_tun]++;
+
+	v_ret = VIRTCHNL_STATUS_SUCCESS;
+	stat->status = VIRTCHNL_FDIR_SUCCESS;
+exit:
+	ret = ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ADD_FDIR_FILTER, v_ret,
+				    (u8 *)stat, len);
+	kfree(stat);
+	return ret;
+
+err_rem_entry:
+	ice_vc_fdir_remove_entry(vf, conf, stat->flow_id);
+err_free_conf:
+	devm_kfree(dev, conf);
+err_exit:
+	ret = ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ADD_FDIR_FILTER, v_ret,
+				    (u8 *)stat, len);
+	kfree(stat);
+	return ret;
+}
+
+/**
+ * ice_vc_del_fdir_fltr - delete a FDIR filter for VF by the msg buffer
+ * @vf: pointer to the VF info
+ * @msg: pointer to the msg buffer
+ *
+ * Return: the result of sending the reply message to the VF.
+ */
+int ice_vc_del_fdir_fltr(struct ice_vf *vf, u8 *msg)
+{
+	struct virtchnl_fdir_del *fltr = (struct virtchnl_fdir_del *)msg;
+	struct virtchnl_fdir_del *stat = NULL;
+	struct virtchnl_fdir_fltr_conf *conf;
+	enum virtchnl_status_code v_ret;
+	struct device *dev;
+	struct ice_pf *pf;
+	int is_tun = 0;
+	int len = 0;
+	int ret;
+
+	pf = vf->pf;
+	dev = ice_pf_to_dev(pf);
+	ret = ice_vc_fdir_param_check(vf, fltr->vsi_id);
+	if (ret) {
+		v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+		dev_dbg(dev, "Parameter check for VF %d failed\n", vf->vf_id);
+		goto err_exit;
+	}
+
+	stat = kzalloc(sizeof(*stat), GFP_KERNEL);
+	if (!stat) {
+		v_ret = VIRTCHNL_STATUS_ERR_NO_MEMORY;
+		dev_dbg(dev, "Alloc stat for VF %d failed\n", vf->vf_id);
+		goto err_exit;
+	}
+
+	len = sizeof(*stat);
+
+	conf = ice_vc_fdir_lookup_entry(vf, fltr->flow_id);
+	if (!conf) {
+		v_ret = VIRTCHNL_STATUS_SUCCESS;
+		stat->status = VIRTCHNL_FDIR_FAILURE_RULE_NONEXIST;
+		dev_dbg(dev, "VF %d: FDIR invalid flow_id:0x%X\n",
+			vf->vf_id, fltr->flow_id);
+		goto err_exit;
+	}
+
+	/* Just return failure when ctrl_vsi idx is invalid */
+	if (vf->ctrl_vsi_idx == ICE_NO_VSI) {
+		v_ret = VIRTCHNL_STATUS_SUCCESS;
+		stat->status = VIRTCHNL_FDIR_FAILURE_RULE_NORESOURCE;
+		dev_err(dev, "Invalid FDIR ctrl_vsi for VF %d\n", vf->vf_id);
+		goto err_exit;
+	}
+
+	ret = ice_vc_fdir_write_fltr(vf, conf, false, is_tun);
+	if (ret) {
+		v_ret = VIRTCHNL_STATUS_SUCCESS;
+		stat->status = VIRTCHNL_FDIR_FAILURE_RULE_NORESOURCE;
+		dev_err(dev, "VF %d: writing FDIR rule failed, ret:%d\n",
+			vf->vf_id, ret);
+		goto err_exit;
+	}
+
+	ice_vc_fdir_remove_entry(vf, conf, fltr->flow_id);
+	/* update the count first: conf must not be read once it is freed */
+	vf->fdir.fdir_fltr_cnt[conf->input.flow_type][is_tun]--;
+	devm_kfree(dev, conf);
+	v_ret = VIRTCHNL_STATUS_SUCCESS;
+	stat->status = VIRTCHNL_FDIR_SUCCESS;
+
+err_exit:
+	ret = ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DEL_FDIR_FILTER, v_ret,
+				    (u8 *)stat, len);
+	kfree(stat);
+	return ret;
+}
+
+/**
+ * ice_vf_fdir_init - init FDIR resource for VF
+ * @vf: pointer to the VF info
+ */
+void ice_vf_fdir_init(struct ice_vf *vf)
+{
+	struct ice_vf_fdir *fdir = &vf->fdir;
+
+	idr_init(&fdir->fdir_rule_idr);
+	INIT_LIST_HEAD(&fdir->fdir_rule_list);
+}
+
+/**
+ * ice_vf_fdir_exit - destroy FDIR resource for VF
+ * @vf: pointer to the VF info
+ */
+void ice_vf_fdir_exit(struct ice_vf *vf)
+{
+	ice_vc_fdir_flush_entry(vf);
+	idr_destroy(&vf->fdir.fdir_rule_idr);
+	ice_vc_fdir_rem_prof_all(vf);
+	ice_vc_fdir_free_prof_all(vf);
+}
diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.h b/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.h
new file mode 100644
index 000000000000..2a2e0e598559
--- /dev/null
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2021, Intel Corporation. */
+
+#ifndef _ICE_VIRTCHNL_FDIR_H_
+#define _ICE_VIRTCHNL_FDIR_H_
+
+struct ice_vf;
+
+/* VF FDIR information structure */
+struct ice_vf_fdir {
+	u16 fdir_fltr_cnt[ICE_FLTR_PTYPE_MAX][ICE_FD_HW_SEG_MAX];
+	int prof_entry_cnt[ICE_FLTR_PTYPE_MAX][ICE_FD_HW_SEG_MAX];
+	struct ice_fd_hw_prof **fdir_prof;
+
+	struct idr fdir_rule_idr;
+	struct list_head fdir_rule_list;
+};
+
+int ice_vc_add_fdir_fltr(struct ice_vf *vf, u8 *msg);
+int ice_vc_del_fdir_fltr(struct ice_vf *vf, u8 *msg);
+void ice_vf_fdir_init(struct ice_vf *vf);
+void ice_vf_fdir_exit(struct ice_vf *vf);
+
+#endif /* _ICE_VIRTCHNL_FDIR_H_ */
diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
index fa72b7e2e433..20343a0fe726 100644
--- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
@@ -233,6 +233,7 @@ static void ice_free_vf_res(struct ice_vf *vf)
 	 * accessing the VF's VSI after it's freed or invalidated.
 	 */
 	clear_bit(ICE_VF_STATE_INIT, vf->vf_states);
+	ice_vf_fdir_exit(vf);
 	/* free VF control VSI */
 	if (vf->ctrl_vsi_idx != ICE_NO_VSI)
 		ice_vf_ctrl_vsi_release(vf);
@@ -1300,6 +1301,7 @@ bool ice_reset_all_vfs(struct ice_pf *pf, bool is_vflr)
 	ice_for_each_vf(pf, v) {
 		vf = &pf->vf[v];
 
+		ice_vf_fdir_exit(vf);
 		/* clean VF control VSI when resetting VFs since it should be
 		 * setup only when VF creates its first FDIR rule.
 		 */
@@ -1424,6 +1426,7 @@ bool ice_reset_vf(struct ice_vf *vf, bool is_vflr)
 			dev_err(dev, "disabling promiscuous mode failed\n");
 	}
 
+	ice_vf_fdir_exit(vf);
 	/* clean VF control VSI when resetting VF since it should be setup
 	 * only when VF creates its first FDIR rule.
 	 */
@@ -1610,6 +1613,7 @@ static void ice_set_dflt_settings_vfs(struct ice_pf *pf)
 		 * creates its first fdir rule.
 		 */
 		ice_vf_ctrl_invalidate_vsi(vf);
+		ice_vf_fdir_init(vf);
 	}
 }
 
@@ -1909,7 +1913,7 @@ ice_vf_lan_overflow_event(struct ice_pf *pf, struct ice_rq_event_info *event)
  *
  * send msg to VF
  */
-static int
+int
 ice_vc_send_msg_to_vf(struct ice_vf *vf, u32 v_opcode,
 		      enum virtchnl_status_code v_retval, u8 *msg, u16 msglen)
 {
@@ -2057,6 +2061,9 @@ static int ice_vc_get_vf_res_msg(struct ice_vf *vf, u8 *msg)
 			vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_RSS_REG;
 	}
 
+	if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_FDIR_PF)
+		vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_FDIR_PF;
+
 	if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_RSS_PCTYPE_V2)
 		vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_RSS_PCTYPE_V2;
 
@@ -2145,7 +2152,7 @@ static struct ice_vsi *ice_find_vsi_from_id(struct ice_pf *pf, u16 id)
  *
  * check for the valid VSI ID
  */
-static bool ice_vc_isvalid_vsi_id(struct ice_vf *vf, u16 vsi_id)
+bool ice_vc_isvalid_vsi_id(struct ice_vf *vf, u16 vsi_id)
 {
 	struct ice_pf *pf = vf->pf;
 	struct ice_vsi *vsi;
@@ -3877,6 +3884,12 @@ error_handler:
 	case VIRTCHNL_OP_DISABLE_VLAN_STRIPPING:
 		err = ice_vc_dis_vlan_stripping(vf);
 		break;
+	case VIRTCHNL_OP_ADD_FDIR_FILTER:
+		err = ice_vc_add_fdir_fltr(vf, msg);
+		break;
+	case VIRTCHNL_OP_DEL_FDIR_FILTER:
+		err = ice_vc_del_fdir_fltr(vf, msg);
+		break;
 	case VIRTCHNL_OP_UNKNOWN:
 	default:
 		dev_err(dev, "Unsupported opcode %d from VF %d\n", v_opcode,
diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h
index faa879d744a1..46abc5388fc7 100644
--- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h
@@ -4,6 +4,7 @@
 #ifndef _ICE_VIRTCHNL_PF_H_
 #define _ICE_VIRTCHNL_PF_H_
 #include "ice.h"
+#include "ice_virtchnl_fdir.h"
 
 /* Restrict number of MAC Addr and VLAN that non-trusted VF can programmed */
 #define ICE_MAX_VLAN_PER_VF		8
@@ -71,6 +72,7 @@ struct ice_vf {
 	u16 vf_id;			/* VF ID in the PF space */
 	u16 lan_vsi_idx;		/* index into PF struct */
 	u16 ctrl_vsi_idx;
+	struct ice_vf_fdir fdir;
 	/* first vector index of this VF in the PF space */
 	int first_vector_idx;
 	struct ice_sw *vf_sw_id;	/* switch ID the VF VSIs connect to */
@@ -140,6 +142,10 @@ ice_vf_lan_overflow_event(struct ice_pf *pf, struct ice_rq_event_info *event);
 void ice_print_vfs_mdd_events(struct ice_pf *pf);
 void ice_print_vf_rx_mdd_event(struct ice_vf *vf);
 struct ice_vsi *ice_vf_ctrl_vsi_setup(struct ice_vf *vf);
+int
+ice_vc_send_msg_to_vf(struct ice_vf *vf, u32 v_opcode,
+		      enum virtchnl_status_code v_retval, u8 *msg, u16 msglen);
+bool ice_vc_isvalid_vsi_id(struct ice_vf *vf, u16 vsi_id);
 #else /* CONFIG_PCI_IOV */
 #define ice_process_vflr_event(pf) do {} while (0)
 #define ice_free_vfs(pf) do {} while (0)
diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h
index 40bad71865ea..47482049f640 100644
--- a/include/linux/avf/virtchnl.h
+++ b/include/linux/avf/virtchnl.h
@@ -136,6 +136,9 @@ enum virtchnl_ops {
 	VIRTCHNL_OP_DISABLE_CHANNELS = 31,
 	VIRTCHNL_OP_ADD_CLOUD_FILTER = 32,
 	VIRTCHNL_OP_DEL_CLOUD_FILTER = 33,
+	/* opcode 34 - 46 are reserved */
+	VIRTCHNL_OP_ADD_FDIR_FILTER = 47,
+	VIRTCHNL_OP_DEL_FDIR_FILTER = 48,
 };
 
 /* These macros are used to generate compilation errors if a structure/union
@@ -247,6 +250,7 @@ VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vsi_resource);
 #define VIRTCHNL_VF_OFFLOAD_ENCAP_CSUM		0X00200000
 #define VIRTCHNL_VF_OFFLOAD_RX_ENCAP_CSUM	0X00400000
 #define VIRTCHNL_VF_OFFLOAD_ADQ			0X00800000
+#define VIRTCHNL_VF_OFFLOAD_FDIR_PF		0X10000000
 
 /* Define below the capability flags that are not offloads */
 #define VIRTCHNL_VF_CAP_ADV_LINK_SPEED		0x00000080
@@ -559,6 +563,11 @@ enum virtchnl_action {
 	/* action types */
 	VIRTCHNL_ACTION_DROP = 0,
 	VIRTCHNL_ACTION_TC_REDIRECT,
+	VIRTCHNL_ACTION_PASSTHRU,
+	VIRTCHNL_ACTION_QUEUE,
+	VIRTCHNL_ACTION_Q_REGION,
+	VIRTCHNL_ACTION_MARK,
+	VIRTCHNL_ACTION_COUNT,
 };
 
 enum virtchnl_flow_type {
@@ -668,6 +677,269 @@ enum virtchnl_vfr_states {
 	VIRTCHNL_VFR_VFACTIVE,
 };
 
+#define VIRTCHNL_MAX_NUM_PROTO_HDRS	32
+#define PROTO_HDR_SHIFT			5
+#define PROTO_HDR_FIELD_START(proto_hdr_type) ((proto_hdr_type) << PROTO_HDR_SHIFT)
+#define PROTO_HDR_FIELD_MASK ((1UL << PROTO_HDR_SHIFT) - 1)
+
+/* VF use these macros to configure each protocol header.
+ * Specify which protocol headers and protocol header fields based on
+ * virtchnl_proto_hdr_type and virtchnl_proto_hdr_field.
+ * @param hdr: a struct of virtchnl_proto_hdr
+ * @param hdr_type: ETH/IPV4/TCP, etc
+ * @param field: SRC/DST/TEID/SPI, etc
+ */
+#define VIRTCHNL_ADD_PROTO_HDR_FIELD(hdr, field) \
+	((hdr)->field_selector |= BIT((field) & PROTO_HDR_FIELD_MASK))
+#define VIRTCHNL_DEL_PROTO_HDR_FIELD(hdr, field) \
+	((hdr)->field_selector &= ~BIT((field) & PROTO_HDR_FIELD_MASK))
+#define VIRTCHNL_TEST_PROTO_HDR_FIELD(hdr, val) \
+	((hdr)->field_selector & BIT((val) & PROTO_HDR_FIELD_MASK))
+#define VIRTCHNL_GET_PROTO_HDR_FIELD(hdr)	((hdr)->field_selector)
+
+#define VIRTCHNL_ADD_PROTO_HDR_FIELD_BIT(hdr, hdr_type, field) \
+	(VIRTCHNL_ADD_PROTO_HDR_FIELD(hdr, \
+		VIRTCHNL_PROTO_HDR_ ## hdr_type ## _ ## field))
+#define VIRTCHNL_DEL_PROTO_HDR_FIELD_BIT(hdr, hdr_type, field) \
+	(VIRTCHNL_DEL_PROTO_HDR_FIELD(hdr, \
+		VIRTCHNL_PROTO_HDR_ ## hdr_type ## _ ## field))
+
+#define VIRTCHNL_SET_PROTO_HDR_TYPE(hdr, hdr_type) \
+	((hdr)->type = VIRTCHNL_PROTO_HDR_ ## hdr_type)
+#define VIRTCHNL_GET_PROTO_HDR_TYPE(hdr) \
+	(((hdr)->type) >> PROTO_HDR_SHIFT)
+#define VIRTCHNL_TEST_PROTO_HDR_TYPE(hdr, val) \
+	((hdr)->type == ((val) >> PROTO_HDR_SHIFT))
+#define VIRTCHNL_TEST_PROTO_HDR(hdr, val) \
+	(VIRTCHNL_TEST_PROTO_HDR_TYPE((hdr), (val)) && \
+	 VIRTCHNL_TEST_PROTO_HDR_FIELD((hdr), (val)))
+
+/* Protocol header type within a packet segment. A segment consists of one or
+ * more protocol headers that make up a logical group of protocol headers. Each
+ * logical group of protocol headers encapsulates or is encapsulated using/by
+ * tunneling or encapsulation protocols for network virtualization.
+ */
+enum virtchnl_proto_hdr_type {
+	VIRTCHNL_PROTO_HDR_NONE,
+	VIRTCHNL_PROTO_HDR_ETH,
+	VIRTCHNL_PROTO_HDR_S_VLAN,
+	VIRTCHNL_PROTO_HDR_C_VLAN,
+	VIRTCHNL_PROTO_HDR_IPV4,
+	VIRTCHNL_PROTO_HDR_IPV6,
+	VIRTCHNL_PROTO_HDR_TCP,
+	VIRTCHNL_PROTO_HDR_UDP,
+	VIRTCHNL_PROTO_HDR_SCTP,
+	VIRTCHNL_PROTO_HDR_GTPU_IP,
+	VIRTCHNL_PROTO_HDR_GTPU_EH,
+	VIRTCHNL_PROTO_HDR_GTPU_EH_PDU_DWN,
+	VIRTCHNL_PROTO_HDR_GTPU_EH_PDU_UP,
+	VIRTCHNL_PROTO_HDR_PPPOE,
+	VIRTCHNL_PROTO_HDR_L2TPV3,
+	VIRTCHNL_PROTO_HDR_ESP,
+	VIRTCHNL_PROTO_HDR_AH,
+	VIRTCHNL_PROTO_HDR_PFCP,
+};
+
+/* Protocol header field within a protocol header. */
+enum virtchnl_proto_hdr_field {
+	/* ETHER */
+	VIRTCHNL_PROTO_HDR_ETH_SRC =
+		PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_ETH),
+	VIRTCHNL_PROTO_HDR_ETH_DST,
+	VIRTCHNL_PROTO_HDR_ETH_ETHERTYPE,
+	/* S-VLAN */
+	VIRTCHNL_PROTO_HDR_S_VLAN_ID =
+		PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_S_VLAN),
+	/* C-VLAN */
+	VIRTCHNL_PROTO_HDR_C_VLAN_ID =
+		PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_C_VLAN),
+	/* IPV4 */
+	VIRTCHNL_PROTO_HDR_IPV4_SRC =
+		PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_IPV4),
+	VIRTCHNL_PROTO_HDR_IPV4_DST,
+	VIRTCHNL_PROTO_HDR_IPV4_DSCP,
+	VIRTCHNL_PROTO_HDR_IPV4_TTL,
+	VIRTCHNL_PROTO_HDR_IPV4_PROT,
+	/* IPV6 */
+	VIRTCHNL_PROTO_HDR_IPV6_SRC =
+		PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_IPV6),
+	VIRTCHNL_PROTO_HDR_IPV6_DST,
+	VIRTCHNL_PROTO_HDR_IPV6_TC,
+	VIRTCHNL_PROTO_HDR_IPV6_HOP_LIMIT,
+	VIRTCHNL_PROTO_HDR_IPV6_PROT,
+	/* TCP */
+	VIRTCHNL_PROTO_HDR_TCP_SRC_PORT =
+		PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_TCP),
+	VIRTCHNL_PROTO_HDR_TCP_DST_PORT,
+	/* UDP */
+	VIRTCHNL_PROTO_HDR_UDP_SRC_PORT =
+		PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_UDP),
+	VIRTCHNL_PROTO_HDR_UDP_DST_PORT,
+	/* SCTP */
+	VIRTCHNL_PROTO_HDR_SCTP_SRC_PORT =
+		PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_SCTP),
+	VIRTCHNL_PROTO_HDR_SCTP_DST_PORT,
+	/* GTPU_IP */
+	VIRTCHNL_PROTO_HDR_GTPU_IP_TEID =
+		PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_GTPU_IP),
+	/* GTPU_EH */
+	VIRTCHNL_PROTO_HDR_GTPU_EH_PDU =
+		PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_GTPU_EH),
+	VIRTCHNL_PROTO_HDR_GTPU_EH_QFI,
+	/* PPPOE */
+	VIRTCHNL_PROTO_HDR_PPPOE_SESS_ID =
+		PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_PPPOE),
+	/* L2TPV3 */
+	VIRTCHNL_PROTO_HDR_L2TPV3_SESS_ID =
+		PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_L2TPV3),
+	/* ESP */
+	VIRTCHNL_PROTO_HDR_ESP_SPI =
+		PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_ESP),
+	/* AH */
+	VIRTCHNL_PROTO_HDR_AH_SPI =
+		PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_AH),
+	/* PFCP */
+	VIRTCHNL_PROTO_HDR_PFCP_S_FIELD =
+		PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_PFCP),
+	VIRTCHNL_PROTO_HDR_PFCP_SEID,
+};
+
+struct virtchnl_proto_hdr {
+	enum virtchnl_proto_hdr_type type;
+	u32 field_selector; /* a bit mask to select field for header type */
+	u8 buffer[64];
+	/**
+	 * binary buffer in network order for specific header type.
+	 * For example, if type = VIRTCHNL_PROTO_HDR_IPV4, a IPv4
+	 * header is expected to be copied into the buffer.
+	 */
+};
+
+VIRTCHNL_CHECK_STRUCT_LEN(72, virtchnl_proto_hdr);
+
+struct virtchnl_proto_hdrs {
+	u8 tunnel_level;
+	/**
+	 * specify where protocol header start from.
+	 * 0 - from the outer layer
+	 * 1 - from the first inner layer
+	 * 2 - from the second inner layer
+	 * ....
+	 **/
+	int count; /* the proto layers must < VIRTCHNL_MAX_NUM_PROTO_HDRS */
+	struct virtchnl_proto_hdr proto_hdr[VIRTCHNL_MAX_NUM_PROTO_HDRS];
+};
+
+VIRTCHNL_CHECK_STRUCT_LEN(2312, virtchnl_proto_hdrs);
+
+/* action configuration for FDIR */
+struct virtchnl_filter_action {
+	enum virtchnl_action type;
+	union {
+		/* used for queue and qgroup action */
+		struct {
+			u16 index;
+			u8 region;
+		} queue;
+		/* used for count action */
+		struct {
+			/* share counter ID with other flow rules */
+			u8 shared;
+			u32 id; /* counter ID */
+		} count;
+		/* used for mark action */
+		u32 mark_id;
+		u8 reserve[32];
+	} act_conf;
+};
+
+VIRTCHNL_CHECK_STRUCT_LEN(36, virtchnl_filter_action);
+
+#define VIRTCHNL_MAX_NUM_ACTIONS  8
+
+struct virtchnl_filter_action_set {
+	/* action number must be less than VIRTCHNL_MAX_NUM_ACTIONS */
+	int count;
+	struct virtchnl_filter_action actions[VIRTCHNL_MAX_NUM_ACTIONS];
+};
+
+VIRTCHNL_CHECK_STRUCT_LEN(292, virtchnl_filter_action_set);
+
+/* pattern and action for FDIR rule */
+struct virtchnl_fdir_rule {
+	struct virtchnl_proto_hdrs proto_hdrs;
+	struct virtchnl_filter_action_set action_set;
+};
+
+VIRTCHNL_CHECK_STRUCT_LEN(2604, virtchnl_fdir_rule);
+
+/* Status returned to VF after VF requests FDIR commands
+ * VIRTCHNL_FDIR_SUCCESS
+ * VF FDIR related request is successfully done by PF
+ * The request can be OP_ADD/DEL.
+ *
+ * VIRTCHNL_FDIR_FAILURE_RULE_NORESOURCE
+ * OP_ADD_FDIR_FILTER request is failed due to no Hardware resource.
+ *
+ * VIRTCHNL_FDIR_FAILURE_RULE_EXIST
+ * OP_ADD_FDIR_FILTER request is failed due to the rule already existing.
+ *
+ * VIRTCHNL_FDIR_FAILURE_RULE_CONFLICT
+ * OP_ADD_FDIR_FILTER request is failed due to conflict with existing rule.
+ *
+ * VIRTCHNL_FDIR_FAILURE_RULE_NONEXIST
+ * OP_DEL_FDIR_FILTER request is failed due to this rule doesn't exist.
+ *
+ * VIRTCHNL_FDIR_FAILURE_RULE_INVALID
+ * OP_ADD_FDIR_FILTER request is failed due to parameters validation
+ * or HW doesn't support.
+ *
+ * VIRTCHNL_FDIR_FAILURE_RULE_TIMEOUT
+ * OP_ADD/DEL_FDIR_FILTER request is failed due to timing out
+ * for programming.
+ */
+enum virtchnl_fdir_prgm_status {
+	VIRTCHNL_FDIR_SUCCESS = 0,
+	VIRTCHNL_FDIR_FAILURE_RULE_NORESOURCE,
+	VIRTCHNL_FDIR_FAILURE_RULE_EXIST,
+	VIRTCHNL_FDIR_FAILURE_RULE_CONFLICT,
+	VIRTCHNL_FDIR_FAILURE_RULE_NONEXIST,
+	VIRTCHNL_FDIR_FAILURE_RULE_INVALID,
+	VIRTCHNL_FDIR_FAILURE_RULE_TIMEOUT,
+};
+
+/* VIRTCHNL_OP_ADD_FDIR_FILTER
+ * VF sends this request to PF by filling out vsi_id,
+ * validate_only and rule_cfg. PF will return flow_id
+ * if the request is successfully done and return add_status to VF.
+ */
+struct virtchnl_fdir_add {
+	u16 vsi_id;  /* INPUT */
+	/*
+	 * 1 for validating a fdir rule, 0 for creating a fdir rule.
+	 * Validate and create share one ops: VIRTCHNL_OP_ADD_FDIR_FILTER.
+	 */
+	u16 validate_only; /* INPUT */
+	u32 flow_id;       /* OUTPUT */
+	struct virtchnl_fdir_rule rule_cfg; /* INPUT */
+	enum virtchnl_fdir_prgm_status status; /* OUTPUT */
+};
+
+VIRTCHNL_CHECK_STRUCT_LEN(2616, virtchnl_fdir_add);
+
+/* VIRTCHNL_OP_DEL_FDIR_FILTER
+ * VF sends this request to PF by filling out vsi_id
+ * and flow_id. PF will return del_status to VF.
+ */
+struct virtchnl_fdir_del {
+	u16 vsi_id;  /* INPUT */
+	u16 pad;
+	u32 flow_id; /* INPUT */
+	enum virtchnl_fdir_prgm_status status; /* OUTPUT */
+};
+
+VIRTCHNL_CHECK_STRUCT_LEN(12, virtchnl_fdir_del);
+
 /**
  * virtchnl_vc_validate_vf_msg
  * @ver: Virtchnl version info
@@ -828,6 +1100,12 @@ virtchnl_vc_validate_vf_msg(struct virtchnl_version_info *ver, u32 v_opcode,
 	case VIRTCHNL_OP_DEL_CLOUD_FILTER:
 		valid_len = sizeof(struct virtchnl_filter);
 		break;
+	case VIRTCHNL_OP_ADD_FDIR_FILTER:
+		valid_len = sizeof(struct virtchnl_fdir_add);
+		break;
+	case VIRTCHNL_OP_DEL_FDIR_FILTER:
+		valid_len = sizeof(struct virtchnl_fdir_del);
+		break;
 	/* These are always errors coming from the VF. */
 	case VIRTCHNL_OP_EVENT:
 	case VIRTCHNL_OP_UNKNOWN:
-- 
cgit v1.2.3


From 69c4a42d72eb9b41e1c6e4bc9ab7f3650bf35f62 Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <kolga@netapp.com>
Date: Fri, 26 Feb 2021 22:37:55 -0500
Subject: lsm,selinux: add new hook to compare new mount to an existing mount

Add a new hook that takes an existing super block and a new mount
with new options and determines if new options conflict with an
existing mount or not.

A filesystem can use this new hook to determine if it can share
an existing superblock with a new superblock for the new mount.

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Acked-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
[PM: tweak the subject line, fix tab/space problems]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h |  1 +
 include/linux/lsm_hooks.h     |  6 +++++
 include/linux/security.h      |  8 +++++++
 security/security.c           |  7 ++++++
 security/selinux/hooks.c      | 56 +++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 78 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 477a597db013..1b61bc5dc215 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -62,6 +62,7 @@ LSM_HOOK(int, 0, sb_alloc_security, struct super_block *sb)
 LSM_HOOK(void, LSM_RET_VOID, sb_free_security, struct super_block *sb)
 LSM_HOOK(void, LSM_RET_VOID, sb_free_mnt_opts, void *mnt_opts)
 LSM_HOOK(int, 0, sb_eat_lsm_opts, char *orig, void **mnt_opts)
+LSM_HOOK(int, 0, sb_mnt_opts_compat, struct super_block *sb, void *mnt_opts)
 LSM_HOOK(int, 0, sb_remount, struct super_block *sb, void *mnt_opts)
 LSM_HOOK(int, 0, sb_kern_mount, struct super_block *sb)
 LSM_HOOK(int, 0, sb_show_options, struct seq_file *m, struct super_block *sb)
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index fb7f3193753d..97bb36d7e994 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -142,6 +142,12 @@
  *	@orig the original mount data copied from userspace.
  *	@copy copied data which will be passed to the security module.
  *	Returns 0 if the copy was successful.
+ * @sb_mnt_opts_compat:
+ *	Determine if the new mount options in @mnt_opts are allowed given
+ *	the existing mounted filesystem at @sb.
+ *	@sb superblock being compared
+ *	@mnt_opts new mount options
+ *	Return 0 if options are compatible.
  * @sb_remount:
  *	Extracts security system specific mount options and verifies no changes
  *	are being made to those options.
diff --git a/include/linux/security.h b/include/linux/security.h
index 8aeebd6646dc..f1e5833bfedc 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -294,6 +294,7 @@ int security_sb_alloc(struct super_block *sb);
 void security_sb_free(struct super_block *sb);
 void security_free_mnt_opts(void **mnt_opts);
 int security_sb_eat_lsm_opts(char *options, void **mnt_opts);
+int security_sb_mnt_opts_compat(struct super_block *sb, void *mnt_opts);
 int security_sb_remount(struct super_block *sb, void *mnt_opts);
 int security_sb_kern_mount(struct super_block *sb);
 int security_sb_show_options(struct seq_file *m, struct super_block *sb);
@@ -646,6 +647,13 @@ static inline int security_sb_remount(struct super_block *sb,
 	return 0;
 }
 
+static inline int security_sb_mnt_opts_compat(struct super_block *sb,
+					      void *mnt_opts)
+{
+	return 0;
+}
+
+
 static inline int security_sb_kern_mount(struct super_block *sb)
 {
 	return 0;
diff --git a/security/security.c b/security/security.c
index 5ac96b16f8fa..a4e7d50c3e39 100644
--- a/security/security.c
+++ b/security/security.c
@@ -890,6 +890,13 @@ int security_sb_eat_lsm_opts(char *options, void **mnt_opts)
 }
 EXPORT_SYMBOL(security_sb_eat_lsm_opts);
 
+int security_sb_mnt_opts_compat(struct super_block *sb,
+				void *mnt_opts)
+{
+	return call_int_hook(sb_mnt_opts_compat, 0, sb, mnt_opts);
+}
+EXPORT_SYMBOL(security_sb_mnt_opts_compat);
+
 int security_sb_remount(struct super_block *sb,
 			void *mnt_opts)
 {
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index eca9fc0ba764..07ca2ebf979e 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2685,6 +2685,61 @@ free_opt:
 	return rc;
 }
 
+static int selinux_sb_mnt_opts_compat(struct super_block *sb, void *mnt_opts)
+{
+	struct selinux_mnt_opts *opts = mnt_opts;
+	struct superblock_security_struct *sbsec = sb->s_security;
+	u32 sid;
+	int rc;
+
+	/*
+	 * Superblock not initialized (i.e. no options) - reject if any
+	 * options specified, otherwise accept.
+	 */
+	if (!(sbsec->flags & SE_SBINITIALIZED))
+		return opts ? 1 : 0;
+
+	/*
+	 * Superblock initialized and no options specified - reject if
+	 * superblock has any options set, otherwise accept.
+	 */
+	if (!opts)
+		return (sbsec->flags & SE_MNTMASK) ? 1 : 0;
+
+	if (opts->fscontext) {
+		rc = parse_sid(sb, opts->fscontext, &sid);
+		if (rc)
+			return 1;
+		if (bad_option(sbsec, FSCONTEXT_MNT, sbsec->sid, sid))
+			return 1;
+	}
+	if (opts->context) {
+		rc = parse_sid(sb, opts->context, &sid);
+		if (rc)
+			return 1;
+		if (bad_option(sbsec, CONTEXT_MNT, sbsec->mntpoint_sid, sid))
+			return 1;
+	}
+	if (opts->rootcontext) {
+		struct inode_security_struct *root_isec;
+
+		root_isec = backing_inode_security(sb->s_root);
+		rc = parse_sid(sb, opts->rootcontext, &sid);
+		if (rc)
+			return 1;
+		if (bad_option(sbsec, ROOTCONTEXT_MNT, root_isec->sid, sid))
+			return 1;
+	}
+	if (opts->defcontext) {
+		rc = parse_sid(sb, opts->defcontext, &sid);
+		if (rc)
+			return 1;
+		if (bad_option(sbsec, DEFCONTEXT_MNT, sbsec->def_sid, sid))
+			return 1;
+	}
+	return 0;
+}
+
 static int selinux_sb_remount(struct super_block *sb, void *mnt_opts)
 {
 	struct selinux_mnt_opts *opts = mnt_opts;
@@ -7078,6 +7133,7 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
 
 	LSM_HOOK_INIT(sb_free_security, selinux_sb_free_security),
 	LSM_HOOK_INIT(sb_free_mnt_opts, selinux_free_mnt_opts),
+	LSM_HOOK_INIT(sb_mnt_opts_compat, selinux_sb_mnt_opts_compat),
 	LSM_HOOK_INIT(sb_remount, selinux_sb_remount),
 	LSM_HOOK_INIT(sb_kern_mount, selinux_sb_kern_mount),
 	LSM_HOOK_INIT(sb_show_options, selinux_sb_show_options),
-- 
cgit v1.2.3


From ec1ade6a0448e3bfb07bb905aca1bc18836220c7 Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <kolga@netapp.com>
Date: Fri, 19 Feb 2021 17:22:33 -0500
Subject: nfs: account for selinux security context when deciding to share
 superblock

Keep track of whether or not there were LSM security context
options passed during mount (ie creation of the superblock).
Then, while deciding if the superblock can be shared for the new
mount, check if the newly passed in LSM security context options
are compatible with the existing superblock's ones by calling
security_sb_mnt_opts_compat().

Previously, with selinux enabled, NFS wasn't able to do the
following 2 mounts:
mount -o vers=4.2,sec=sys,context=system_u:object_r:root_t:s0
<serverip>:/ /mnt
mount -o vers=4.2,sec=sys,context=system_u:object_r:swapfile_t:s0
<serverip>:/scratch /scratch

2nd mount would fail with "mount.nfs: an incorrect mount option was
specified" and var log messages would have:
"SElinux: mount invalid. Same superblock, different security
settings for.."

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
[PM: tweak subject line]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 fs/nfs/fs_context.c       | 3 +++
 fs/nfs/internal.h         | 1 +
 fs/nfs/super.c            | 4 ++++
 include/linux/nfs_fs_sb.h | 1 +
 4 files changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
index 971a9251c1d9..a06d213d7689 100644
--- a/fs/nfs/fs_context.c
+++ b/fs/nfs/fs_context.c
@@ -463,6 +463,9 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
 	if (opt < 0)
 		return ctx->sloppy ? 1 : opt;
 
+	if (fc->security)
+		ctx->has_sec_mnt_opts = 1;
+
 	switch (opt) {
 	case Opt_source:
 		if (fc->source)
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 25fb43b69e5a..c3f57b423611 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -96,6 +96,7 @@ struct nfs_fs_context {
 	char			*fscache_uniq;
 	unsigned short		protofamily;
 	unsigned short		mountfamily;
+	bool			has_sec_mnt_opts;
 
 	struct {
 		union {
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index de83c61a6733..4aaa1f5dd381 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1077,6 +1077,7 @@ static void nfs_fill_super(struct super_block *sb, struct nfs_fs_context *ctx)
 						 &sb->s_blocksize_bits);
 
 	nfs_super_set_maxbytes(sb, server->maxfilesize);
+	server->has_sec_mnt_opts = ctx->has_sec_mnt_opts;
 }
 
 static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b,
@@ -1193,6 +1194,9 @@ static int nfs_compare_super(struct super_block *sb, struct fs_context *fc)
 		return 0;
 	if (!nfs_compare_userns(old, server))
 		return 0;
+	if ((old->has_sec_mnt_opts || fc->security) &&
+			security_sb_mnt_opts_compat(sb, fc->security))
+		return 0;
 	return nfs_compare_mount_options(sb, server, fc);
 }
 
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 6f76b32a0238..a28d71b45b5f 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -256,6 +256,7 @@ struct nfs_server {
 
 	/* User namespace info */
 	const struct cred	*cred;
+	bool			has_sec_mnt_opts;
 };
 
 /* Server capabilities */
-- 
cgit v1.2.3


From 4ebd7651bfc8992ba05b355a8036cb7fd0e8d7de Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Fri, 19 Feb 2021 14:26:21 -0500
Subject: lsm: separate security_task_getsecid() into subjective and objective
 variants

Of the three LSMs that implement the security_task_getsecid() LSM
hook, all three LSMs provide the task's objective security
credentials.  This turns out to be unfortunate as most of the hook's
callers seem to expect the task's subjective credentials, although
a small handful of callers do correctly expect the objective
credentials.

This patch is the first step towards fixing the problem: it splits
the existing security_task_getsecid() hook into two variants, one
for the subjective creds, one for the objective creds.

  void security_task_getsecid_subj(struct task_struct *p,
				   u32 *secid);
  void security_task_getsecid_obj(struct task_struct *p,
				  u32 *secid);

While this patch does fix all of the callers to use the correct
variant, in order to keep this patch focused on the callers and to
ease review, the LSMs continue to use the same implementation for
both hooks.  The net effect is that this patch should not change
the behavior of the kernel in any way, it will be up to the latter
LSM specific patches in this series to change the hook
implementations and return the correct credentials.

Acked-by: Mimi Zohar <zohar@linux.ibm.com> (IMA)
Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Reviewed-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 drivers/android/binder.c              | 11 ++++++++++-
 include/linux/cred.h                  |  2 +-
 include/linux/lsm_hook_defs.h         |  5 ++++-
 include/linux/lsm_hooks.h             | 12 +++++++++---
 include/linux/security.h              | 10 ++++++++--
 kernel/audit.c                        |  4 ++--
 kernel/auditfilter.c                  |  3 ++-
 kernel/auditsc.c                      |  8 ++++----
 kernel/bpf/bpf_lsm.c                  |  3 ++-
 net/netlabel/netlabel_unlabeled.c     |  2 +-
 net/netlabel/netlabel_user.h          |  2 +-
 security/apparmor/lsm.c               |  3 ++-
 security/integrity/ima/ima_appraise.c |  2 +-
 security/integrity/ima/ima_main.c     | 14 +++++++-------
 security/security.c                   | 13 ++++++++++---
 security/selinux/hooks.c              |  3 ++-
 security/smack/smack_lsm.c            |  3 ++-
 17 files changed, 68 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index c119736ca56a..61d235b6ccd8 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -2700,7 +2700,16 @@ static void binder_transaction(struct binder_proc *proc,
 		u32 secid;
 		size_t added_size;
 
-		security_task_getsecid(proc->tsk, &secid);
+		/*
+		 * Arguably this should be the task's subjective LSM secid but
+		 * we can't reliably access the subjective creds of a task
+		 * other than our own so we must use the objective creds, which
+		 * are safe to access.  The downside is that if a task is
+		 * temporarily overriding its creds it will not be reflected
+		 * here; however, it isn't clear that binder would handle that
+		 * case well anyway.
+		 */
+		security_task_getsecid_obj(proc->tsk, &secid);
 		ret = security_secid_to_secctx(secid, &secctx, &secctx_sz);
 		if (ret) {
 			return_error = BR_FAILED_REPLY;
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 4c6350503697..ac0e5f97d7d8 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -140,7 +140,7 @@ struct cred {
 	struct key	*request_key_auth; /* assumed request_key authority */
 #endif
 #ifdef CONFIG_SECURITY
-	void		*security;	/* subjective LSM security */
+	void		*security;	/* LSM security */
 #endif
 	struct user_struct *user;	/* real user ID subscription */
 	struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */
diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 1b61bc5dc215..61f04f7dc1a4 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -204,7 +204,10 @@ LSM_HOOK(int, 0, task_fix_setgid, struct cred *new, const struct cred * old,
 LSM_HOOK(int, 0, task_setpgid, struct task_struct *p, pid_t pgid)
 LSM_HOOK(int, 0, task_getpgid, struct task_struct *p)
 LSM_HOOK(int, 0, task_getsid, struct task_struct *p)
-LSM_HOOK(void, LSM_RET_VOID, task_getsecid, struct task_struct *p, u32 *secid)
+LSM_HOOK(void, LSM_RET_VOID, task_getsecid_subj,
+	 struct task_struct *p, u32 *secid)
+LSM_HOOK(void, LSM_RET_VOID, task_getsecid_obj,
+	 struct task_struct *p, u32 *secid)
 LSM_HOOK(int, 0, task_setnice, struct task_struct *p, int nice)
 LSM_HOOK(int, 0, task_setioprio, struct task_struct *p, int ioprio)
 LSM_HOOK(int, 0, task_getioprio, struct task_struct *p)
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 97bb36d7e994..ba2ccd950833 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -713,9 +713,15 @@
  *	@p.
  *	@p contains the task_struct for the process.
  *	Return 0 if permission is granted.
- * @task_getsecid:
- *	Retrieve the security identifier of the process @p.
- *	@p contains the task_struct for the process and place is into @secid.
+ * @task_getsecid_subj:
+ *	Retrieve the subjective security identifier of the task_struct in @p
+ *	and return it in @secid.  Special care must be taken to ensure that @p
+ *	is either the "current" task, or the caller has exclusive access
+ *	to @p.
+ *	In case of failure, @secid will be set to zero.
+ * @task_getsecid_obj:
+ *	Retrieve the objective security identifier of the task_struct in @p
+ *	and return it in @secid.
  *	In case of failure, @secid will be set to zero.
  *
  * @task_setnice:
diff --git a/include/linux/security.h b/include/linux/security.h
index f1e5833bfedc..9aeda3f9e838 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -415,7 +415,8 @@ int security_task_fix_setgid(struct cred *new, const struct cred *old,
 int security_task_setpgid(struct task_struct *p, pid_t pgid);
 int security_task_getpgid(struct task_struct *p);
 int security_task_getsid(struct task_struct *p);
-void security_task_getsecid(struct task_struct *p, u32 *secid);
+void security_task_getsecid_subj(struct task_struct *p, u32 *secid);
+void security_task_getsecid_obj(struct task_struct *p, u32 *secid);
 int security_task_setnice(struct task_struct *p, int nice);
 int security_task_setioprio(struct task_struct *p, int ioprio);
 int security_task_getioprio(struct task_struct *p);
@@ -1106,7 +1107,12 @@ static inline int security_task_getsid(struct task_struct *p)
 	return 0;
 }
 
-static inline void security_task_getsecid(struct task_struct *p, u32 *secid)
+static inline void security_task_getsecid_subj(struct task_struct *p, u32 *secid)
+{
+	*secid = 0;
+}
+
+static inline void security_task_getsecid_obj(struct task_struct *p, u32 *secid)
 {
 	*secid = 0;
 }
diff --git a/kernel/audit.c b/kernel/audit.c
index 551a394bc8f4..121d37e700a6 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -2132,7 +2132,7 @@ int audit_log_task_context(struct audit_buffer *ab)
 	int error;
 	u32 sid;
 
-	security_task_getsecid(current, &sid);
+	security_task_getsecid_subj(current, &sid);
 	if (!sid)
 		return 0;
 
@@ -2353,7 +2353,7 @@ int audit_signal_info(int sig, struct task_struct *t)
 			audit_sig_uid = auid;
 		else
 			audit_sig_uid = uid;
-		security_task_getsecid(current, &audit_sig_sid);
+		security_task_getsecid_subj(current, &audit_sig_sid);
 	}
 
 	return audit_signal_info_syscall(t);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 333b3bcfc545..db2c6b59dfc3 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1359,7 +1359,8 @@ int audit_filter(int msgtype, unsigned int listtype)
 			case AUDIT_SUBJ_SEN:
 			case AUDIT_SUBJ_CLR:
 				if (f->lsm_rule) {
-					security_task_getsecid(current, &sid);
+					security_task_getsecid_subj(current,
+								    &sid);
 					result = security_audit_rule_match(sid,
 						   f->type, f->op, f->lsm_rule);
 				}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 47fb48f42c93..9973865cbf13 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -667,7 +667,7 @@ static int audit_filter_rules(struct task_struct *tsk,
 			   logged upon error */
 			if (f->lsm_rule) {
 				if (need_sid) {
-					security_task_getsecid(tsk, &sid);
+					security_task_getsecid_subj(tsk, &sid);
 					need_sid = 0;
 				}
 				result = security_audit_rule_match(sid, f->type,
@@ -2400,7 +2400,7 @@ void __audit_ptrace(struct task_struct *t)
 	context->target_auid = audit_get_loginuid(t);
 	context->target_uid = task_uid(t);
 	context->target_sessionid = audit_get_sessionid(t);
-	security_task_getsecid(t, &context->target_sid);
+	security_task_getsecid_obj(t, &context->target_sid);
 	memcpy(context->target_comm, t->comm, TASK_COMM_LEN);
 }
 
@@ -2427,7 +2427,7 @@ int audit_signal_info_syscall(struct task_struct *t)
 		ctx->target_auid = audit_get_loginuid(t);
 		ctx->target_uid = t_uid;
 		ctx->target_sessionid = audit_get_sessionid(t);
-		security_task_getsecid(t, &ctx->target_sid);
+		security_task_getsecid_obj(t, &ctx->target_sid);
 		memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN);
 		return 0;
 	}
@@ -2448,7 +2448,7 @@ int audit_signal_info_syscall(struct task_struct *t)
 	axp->target_auid[axp->pid_count] = audit_get_loginuid(t);
 	axp->target_uid[axp->pid_count] = t_uid;
 	axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t);
-	security_task_getsecid(t, &axp->target_sid[axp->pid_count]);
+	security_task_getsecid_obj(t, &axp->target_sid[axp->pid_count]);
 	memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN);
 	axp->pid_count++;
 
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 1622a44d1617..0ff58259ccf8 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -209,7 +209,8 @@ BTF_ID(func, bpf_lsm_socket_socketpair)
 
 BTF_ID(func, bpf_lsm_syslog)
 BTF_ID(func, bpf_lsm_task_alloc)
-BTF_ID(func, bpf_lsm_task_getsecid)
+BTF_ID(func, bpf_lsm_task_getsecid_subj)
+BTF_ID(func, bpf_lsm_task_getsecid_obj)
 BTF_ID(func, bpf_lsm_task_prctl)
 BTF_ID(func, bpf_lsm_task_setscheduler)
 BTF_ID(func, bpf_lsm_task_to_inode)
diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c
index ccb491642811..3e6ac9b790b1 100644
--- a/net/netlabel/netlabel_unlabeled.c
+++ b/net/netlabel/netlabel_unlabeled.c
@@ -1539,7 +1539,7 @@ int __init netlbl_unlabel_defconf(void)
 	/* Only the kernel is allowed to call this function and the only time
 	 * it is called is at bootup before the audit subsystem is reporting
 	 * messages so don't worry to much about these values. */
-	security_task_getsecid(current, &audit_info.secid);
+	security_task_getsecid_subj(current, &audit_info.secid);
 	audit_info.loginuid = GLOBAL_ROOT_UID;
 	audit_info.sessionid = 0;
 
diff --git a/net/netlabel/netlabel_user.h b/net/netlabel/netlabel_user.h
index 3c67afce64f1..b9ba8112b3c5 100644
--- a/net/netlabel/netlabel_user.h
+++ b/net/netlabel/netlabel_user.h
@@ -34,7 +34,7 @@
 static inline void netlbl_netlink_auditinfo(struct sk_buff *skb,
 					    struct netlbl_audit *audit_info)
 {
-	security_task_getsecid(current, &audit_info->secid);
+	security_task_getsecid_subj(current, &audit_info->secid);
 	audit_info->loginuid = audit_get_loginuid(current);
 	audit_info->sessionid = audit_get_sessionid(current);
 }
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index 240a53387e6b..f72406fe1bf2 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -1252,7 +1252,8 @@ static struct security_hook_list apparmor_hooks[] __lsm_ro_after_init = {
 
 	LSM_HOOK_INIT(task_free, apparmor_task_free),
 	LSM_HOOK_INIT(task_alloc, apparmor_task_alloc),
-	LSM_HOOK_INIT(task_getsecid, apparmor_task_getsecid),
+	LSM_HOOK_INIT(task_getsecid_subj, apparmor_task_getsecid),
+	LSM_HOOK_INIT(task_getsecid_obj, apparmor_task_getsecid),
 	LSM_HOOK_INIT(task_setrlimit, apparmor_task_setrlimit),
 	LSM_HOOK_INIT(task_kill, apparmor_task_kill),
 
diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c
index 565e33ff19d0..4e5eb0236278 100644
--- a/security/integrity/ima/ima_appraise.c
+++ b/security/integrity/ima/ima_appraise.c
@@ -76,7 +76,7 @@ int ima_must_appraise(struct user_namespace *mnt_userns, struct inode *inode,
 	if (!ima_appraise)
 		return 0;
 
-	security_task_getsecid(current, &secid);
+	security_task_getsecid_subj(current, &secid);
 	return ima_match_policy(mnt_userns, inode, current_cred(), secid, func,
 				mask, IMA_APPRAISE | IMA_HASH, NULL, NULL, NULL);
 }
diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index 9ef748ea829f..b85d9e429426 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -391,7 +391,7 @@ int ima_file_mmap(struct file *file, unsigned long prot)
 	u32 secid;
 
 	if (file && (prot & PROT_EXEC)) {
-		security_task_getsecid(current, &secid);
+		security_task_getsecid_subj(current, &secid);
 		return process_measurement(file, current_cred(), secid, NULL,
 					   0, MAY_EXEC, MMAP_CHECK);
 	}
@@ -429,7 +429,7 @@ int ima_file_mprotect(struct vm_area_struct *vma, unsigned long prot)
 	    !(prot & PROT_EXEC) || (vma->vm_flags & VM_EXEC))
 		return 0;
 
-	security_task_getsecid(current, &secid);
+	security_task_getsecid_subj(current, &secid);
 	inode = file_inode(vma->vm_file);
 	action = ima_get_action(file_mnt_user_ns(vma->vm_file), inode,
 				current_cred(), secid, MAY_EXEC, MMAP_CHECK,
@@ -470,7 +470,7 @@ int ima_bprm_check(struct linux_binprm *bprm)
 	int ret;
 	u32 secid;
 
-	security_task_getsecid(current, &secid);
+	security_task_getsecid_subj(current, &secid);
 	ret = process_measurement(bprm->file, current_cred(), secid, NULL, 0,
 				  MAY_EXEC, BPRM_CHECK);
 	if (ret)
@@ -495,7 +495,7 @@ int ima_file_check(struct file *file, int mask)
 {
 	u32 secid;
 
-	security_task_getsecid(current, &secid);
+	security_task_getsecid_subj(current, &secid);
 	return process_measurement(file, current_cred(), secid, NULL, 0,
 				   mask & (MAY_READ | MAY_WRITE | MAY_EXEC |
 					   MAY_APPEND), FILE_CHECK);
@@ -686,7 +686,7 @@ int ima_read_file(struct file *file, enum kernel_read_file_id read_id,
 
 	/* Read entire file for all partial reads. */
 	func = read_idmap[read_id] ?: FILE_CHECK;
-	security_task_getsecid(current, &secid);
+	security_task_getsecid_subj(current, &secid);
 	return process_measurement(file, current_cred(), secid, NULL,
 				   0, MAY_READ, func);
 }
@@ -729,7 +729,7 @@ int ima_post_read_file(struct file *file, void *buf, loff_t size,
 	}
 
 	func = read_idmap[read_id] ?: FILE_CHECK;
-	security_task_getsecid(current, &secid);
+	security_task_getsecid_subj(current, &secid);
 	return process_measurement(file, current_cred(), secid, buf, size,
 				   MAY_READ, func);
 }
@@ -872,7 +872,7 @@ void process_buffer_measurement(struct user_namespace *mnt_userns,
 	 * buffer measurements.
 	 */
 	if (func) {
-		security_task_getsecid(current, &secid);
+		security_task_getsecid_subj(current, &secid);
 		action = ima_get_action(mnt_userns, inode, current_cred(),
 					secid, 0, func, &pcr, &template,
 					func_data);
diff --git a/security/security.c b/security/security.c
index a4e7d50c3e39..94383f83ba42 100644
--- a/security/security.c
+++ b/security/security.c
@@ -1769,12 +1769,19 @@ int security_task_getsid(struct task_struct *p)
 	return call_int_hook(task_getsid, 0, p);
 }
 
-void security_task_getsecid(struct task_struct *p, u32 *secid)
+void security_task_getsecid_subj(struct task_struct *p, u32 *secid)
 {
 	*secid = 0;
-	call_void_hook(task_getsecid, p, secid);
+	call_void_hook(task_getsecid_subj, p, secid);
 }
-EXPORT_SYMBOL(security_task_getsecid);
+EXPORT_SYMBOL(security_task_getsecid_subj);
+
+void security_task_getsecid_obj(struct task_struct *p, u32 *secid)
+{
+	*secid = 0;
+	call_void_hook(task_getsecid_obj, p, secid);
+}
+EXPORT_SYMBOL(security_task_getsecid_obj);
 
 int security_task_setnice(struct task_struct *p, int nice)
 {
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 07ca2ebf979e..327dbc3acefc 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -7205,7 +7205,8 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
 	LSM_HOOK_INIT(task_setpgid, selinux_task_setpgid),
 	LSM_HOOK_INIT(task_getpgid, selinux_task_getpgid),
 	LSM_HOOK_INIT(task_getsid, selinux_task_getsid),
-	LSM_HOOK_INIT(task_getsecid, selinux_task_getsecid),
+	LSM_HOOK_INIT(task_getsecid_subj, selinux_task_getsecid),
+	LSM_HOOK_INIT(task_getsecid_obj, selinux_task_getsecid),
 	LSM_HOOK_INIT(task_setnice, selinux_task_setnice),
 	LSM_HOOK_INIT(task_setioprio, selinux_task_setioprio),
 	LSM_HOOK_INIT(task_getioprio, selinux_task_getioprio),
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 12a45e61c1a5..f546fb832f30 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -4759,7 +4759,8 @@ static struct security_hook_list smack_hooks[] __lsm_ro_after_init = {
 	LSM_HOOK_INIT(task_setpgid, smack_task_setpgid),
 	LSM_HOOK_INIT(task_getpgid, smack_task_getpgid),
 	LSM_HOOK_INIT(task_getsid, smack_task_getsid),
-	LSM_HOOK_INIT(task_getsecid, smack_task_getsecid),
+	LSM_HOOK_INIT(task_getsecid_subj, smack_task_getsecid),
+	LSM_HOOK_INIT(task_getsecid_obj, smack_task_getsecid),
 	LSM_HOOK_INIT(task_setnice, smack_task_setnice),
 	LSM_HOOK_INIT(task_setioprio, smack_task_setioprio),
 	LSM_HOOK_INIT(task_getioprio, smack_task_getioprio),
-- 
cgit v1.2.3


From f57bac3c33e761fdd78fef159fdc677056c706d0 Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Date: Sun, 21 Mar 2021 22:48:49 +0900
Subject: netdev: add netdev_queue_set_dql_min_limit()

Add a function to set the dynamic queue limit minimum value.

Some specific drivers might have legitimate reasons to configure
dql.min_limit to a given value. Typically, this is the case when the
PDU of the protocol is smaller than the packet size used to
carry those frames to the device.

Concrete example: a CAN (Controller Area Network) device with a USB 2.0
interface.  The PDUs of the classical CAN protocol are roughly 16 bytes but
the USB packet size (which is used to carry the CAN frames to the
device) might be up to 512 bytes.  When a small traffic burst occurs, the BQL
algorithm is not able to immediately adjust and this would result in
having to send many small USB packets (i.e. a packet of 16 bytes for each
CAN frame). Filling up the USB packet with CAN frames is relatively
fast (small latency issue) but the gain of not having to send several
small USB packets is huge (big throughput increase). In this case,
forcing dql.min_limit to a given value that would allow to stuff the
USB packet is always a win.

This function is to be used by network drivers which are able to prove
through a rationale and through empirical tests on several environments
(with other applications, heavy context switching, virtualization...),
that they constantly reach better performances with a specific
predefined dql.min_limit value with no noticeable latency impact.

Signed-off-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 8f003955c485..33b8ea08996e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3446,6 +3446,24 @@ netif_xmit_frozen_or_drv_stopped(const struct netdev_queue *dev_queue)
 	return dev_queue->state & QUEUE_STATE_DRV_XOFF_OR_FROZEN;
 }
 
+/**
+ *	netdev_queue_set_dql_min_limit - set dql minimum limit
+ *	@dev_queue: pointer to transmit queue
+ *	@min_limit: dql minimum limit
+ *
+ * Forces xmit_more() to return true until the minimum threshold
+ * defined by @min_limit is reached (or until the tx queue is
+ * empty). Warning: to be used with care, misuse will impact the
+ * latency.
+ */
+static inline void netdev_queue_set_dql_min_limit(struct netdev_queue *dev_queue,
+						  unsigned int min_limit)
+{
+#ifdef CONFIG_BQL
+	dev_queue->dql.min_limit = min_limit;
+#endif
+}
+
 /**
  *	netdev_txq_bql_enqueue_prefetchw - prefetch bql data for write
  *	@dev_queue: pointer to transmit queue
-- 
cgit v1.2.3


From 405a129f59384c474343d6261a2e0a75650d29a8 Mon Sep 17 00:00:00 2001
From: Bhaskar Chowdhury <unixbhaskar@gmail.com>
Date: Mon, 22 Mar 2021 08:25:16 +0530
Subject: linux/qed: Mundane spelling fixes throughout the file

s/unrequired/"not required"/
s/consme/consume/ .....two different places
s/accros/across/

Signed-off-by: Bhaskar Chowdhury <unixbhaskar@gmail.com>
Acked-by: Igor Russkikh <irusskikh@marvell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/qed/qed_chain.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/qed/qed_chain.h b/include/linux/qed/qed_chain.h
index e339b48de32d..f34dbd0db795 100644
--- a/include/linux/qed/qed_chain.h
+++ b/include/linux/qed/qed_chain.h
@@ -19,7 +19,7 @@ enum qed_chain_mode {
 	/* Each Page contains a next pointer at its end */
 	QED_CHAIN_MODE_NEXT_PTR,
 
-	/* Chain is a single page (next ptr) is unrequired */
+	/* Chain is a single page (next ptr) is not required */
 	QED_CHAIN_MODE_SINGLE,
 
 	/* Page pointers are located in a side list */
@@ -56,13 +56,13 @@ struct qed_chain_pbl_u32 {
 };
 
 struct qed_chain_u16 {
-	/* Cyclic index of next element to produce/consme */
+	/* Cyclic index of next element to produce/consume */
 	u16						prod_idx;
 	u16						cons_idx;
 };
 
 struct qed_chain_u32 {
-	/* Cyclic index of next element to produce/consme */
+	/* Cyclic index of next element to produce/consume */
 	u32						prod_idx;
 	u32						cons_idx;
 };
@@ -270,7 +270,7 @@ static inline dma_addr_t qed_chain_get_pbl_phys(const struct qed_chain *chain)
 /**
  * @brief qed_chain_advance_page -
  *
- * Advance the next element accros pages for a linked chain
+ * Advance the next element across pages for a linked chain
  *
  * @param p_chain
  * @param p_next_elem
-- 
cgit v1.2.3


From 744b8376632208137fe4acc9967b93e2970732a3 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Mon, 22 Mar 2021 13:31:48 +0200
Subject: net: move the ptype_all and ptype_base declarations to
 include/linux/netdevice.h

ptype_all and ptype_base are declared in net/core/dev.c as non-static,
because they are used by net-procfs.c too. However, a "make W=1" build
complains that there was no previous declaration of ptype_all and
ptype_base in a header file, so this way of declaring things constitutes
a violation of coding style.

Let's move the extern declarations of ptype_all and ptype_base to the
linux/netdevice.h file, which is included by net-procfs.c too.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 3 +++
 net/core/net-procfs.c     | 3 ---
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 33b8ea08996e..e4a503288d9b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -5336,6 +5336,9 @@ do {								\
 #define PTYPE_HASH_SIZE	(16)
 #define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
 
+extern struct list_head ptype_all __read_mostly;
+extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
+
 extern struct net_device *blackhole_netdev;
 
 #endif	/* _LINUX_NETDEVICE_H */
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index c714e6a9dad4..d8b9dbabd4a4 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -10,9 +10,6 @@
 #define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
 
-extern struct list_head ptype_all __read_mostly;
-extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
-
 static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
 {
 	struct net *net = seq_file_net(seq);
-- 
cgit v1.2.3


From 4bf07f6562a01a488877e05267808da7147f44a5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Mon, 22 Mar 2021 22:39:03 +0100
Subject: timekeeping, clocksource: Fix various typos in comments

Fix ~56 single-word typos in timekeeping & clocksource code comments,
plus the misspelled TIMER_MODE_CONTINOUS macro name.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Stephen Boyd <sboyd@kernel.org>
Cc: Daniel Lezcano <daniel.lezcano@linaro.org>
Cc: linux-kernel@vger.kernel.org
---
 drivers/clocksource/clksrc-dbx500-prcmu.c           |  8 ++++----
 drivers/clocksource/dw_apb_timer_of.c               |  2 +-
 drivers/clocksource/hyperv_timer.c                  |  2 +-
 drivers/clocksource/timer-atmel-tcb.c               |  4 ++--
 drivers/clocksource/timer-fsl-ftm.c                 |  2 +-
 drivers/clocksource/timer-microchip-pit64b.c        |  2 +-
 drivers/clocksource/timer-of.c                      |  4 ++--
 drivers/clocksource/timer-ti-dm-systimer.c          |  2 +-
 drivers/clocksource/timer-vf-pit.c                  |  2 +-
 include/linux/clocksource.h                         |  2 +-
 include/linux/timex.h                               |  2 +-
 kernel/time/alarmtimer.c                            |  6 +++---
 kernel/time/clocksource.c                           |  4 ++--
 kernel/time/hrtimer.c                               | 18 +++++++++---------
 kernel/time/jiffies.c                               |  2 +-
 kernel/time/ntp.c                                   |  2 +-
 kernel/time/posix-cpu-timers.c                      |  6 +++---
 kernel/time/tick-broadcast-hrtimer.c                |  2 +-
 kernel/time/tick-broadcast.c                        |  4 ++--
 kernel/time/tick-oneshot.c                          |  2 +-
 kernel/time/tick-sched.c                            |  2 +-
 kernel/time/tick-sched.h                            |  2 +-
 kernel/time/time.c                                  |  2 +-
 kernel/time/timekeeping.c                           | 10 +++++-----
 kernel/time/timer.c                                 |  4 ++--
 kernel/time/vsyscall.c                              |  2 +-
 tools/testing/selftests/timers/clocksource-switch.c |  4 ++--
 tools/testing/selftests/timers/leap-a-day.c         |  2 +-
 tools/testing/selftests/timers/leapcrash.c          |  4 ++--
 tools/testing/selftests/timers/threadtest.c         |  2 +-
 30 files changed, 56 insertions(+), 56 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clocksource/clksrc-dbx500-prcmu.c b/drivers/clocksource/clksrc-dbx500-prcmu.c
index 996900d017c6..2fc93e46cea3 100644
--- a/drivers/clocksource/clksrc-dbx500-prcmu.c
+++ b/drivers/clocksource/clksrc-dbx500-prcmu.c
@@ -18,7 +18,7 @@
 
 #define RATE_32K		32768
 
-#define TIMER_MODE_CONTINOUS	0x1
+#define TIMER_MODE_CONTINUOUS	0x1
 #define TIMER_DOWNCOUNT_VAL	0xffffffff
 
 #define PRCMU_TIMER_REF		0
@@ -55,13 +55,13 @@ static int __init clksrc_dbx500_prcmu_init(struct device_node *node)
 
 	/*
 	 * The A9 sub system expects the timer to be configured as
-	 * a continous looping timer.
+	 * a continuous looping timer.
 	 * The PRCMU should configure it but if it for some reason
 	 * don't we do it here.
 	 */
 	if (readl(clksrc_dbx500_timer_base + PRCMU_TIMER_MODE) !=
-	    TIMER_MODE_CONTINOUS) {
-		writel(TIMER_MODE_CONTINOUS,
+	    TIMER_MODE_CONTINUOUS) {
+		writel(TIMER_MODE_CONTINUOUS,
 		       clksrc_dbx500_timer_base + PRCMU_TIMER_MODE);
 		writel(TIMER_DOWNCOUNT_VAL,
 		       clksrc_dbx500_timer_base + PRCMU_TIMER_REF);
diff --git a/drivers/clocksource/dw_apb_timer_of.c b/drivers/clocksource/dw_apb_timer_of.c
index 42e7e43b8fcd..2b2c3b586987 100644
--- a/drivers/clocksource/dw_apb_timer_of.c
+++ b/drivers/clocksource/dw_apb_timer_of.c
@@ -38,7 +38,7 @@ static int __init timer_get_base_and_rate(struct device_node *np,
 	}
 
 	/*
-	 * Not all implementations use a periphal clock, so don't panic
+	 * Not all implementations use a peripheral clock, so don't panic
 	 * if it's not present
 	 */
 	pclk = of_clk_get_by_name(np, "pclk");
diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c
index 269a691bd2c4..a02b0a224807 100644
--- a/drivers/clocksource/hyperv_timer.c
+++ b/drivers/clocksource/hyperv_timer.c
@@ -457,7 +457,7 @@ void __init hv_init_clocksource(void)
 {
 	/*
 	 * Try to set up the TSC page clocksource. If it succeeds, we're
-	 * done. Otherwise, set up the MSR clocksoruce.  At least one of
+	 * done. Otherwise, set up the MSR clocksource.  At least one of
 	 * these will always be available except on very old versions of
 	 * Hyper-V on x86.  In that case we won't have a Hyper-V
 	 * clocksource, but Linux will still run with a clocksource based
diff --git a/drivers/clocksource/timer-atmel-tcb.c b/drivers/clocksource/timer-atmel-tcb.c
index 787dbebbb432..27af17c99590 100644
--- a/drivers/clocksource/timer-atmel-tcb.c
+++ b/drivers/clocksource/timer-atmel-tcb.c
@@ -455,9 +455,9 @@ static int __init tcb_clksrc_init(struct device_node *node)
 	tcaddr = tc.regs;
 
 	if (bits == 32) {
-		/* use apropriate function to read 32 bit counter */
+		/* use appropriate function to read 32 bit counter */
 		clksrc.read = tc_get_cycles32;
-		/* setup ony channel 0 */
+		/* setup only channel 0 */
 		tcb_setup_single_chan(&tc, best_divisor_idx);
 		tc_sched_clock = tc_sched_clock_read32;
 		tc_delay_timer.read_current_timer = tc_delay_timer_read32;
diff --git a/drivers/clocksource/timer-fsl-ftm.c b/drivers/clocksource/timer-fsl-ftm.c
index 12a2ed7cfaff..93f336ec875a 100644
--- a/drivers/clocksource/timer-fsl-ftm.c
+++ b/drivers/clocksource/timer-fsl-ftm.c
@@ -116,7 +116,7 @@ static int ftm_set_next_event(unsigned long delta,
 	 * to the MOD register latches the value into a buffer. The MOD
 	 * register is updated with the value of its write buffer with
 	 * the following scenario:
-	 * a, the counter source clock is diabled.
+	 * a, the counter source clock is disabled.
 	 */
 	ftm_counter_disable(priv->clkevt_base);
 
diff --git a/drivers/clocksource/timer-microchip-pit64b.c b/drivers/clocksource/timer-microchip-pit64b.c
index ab623b25a47b..cfa4ec7ef396 100644
--- a/drivers/clocksource/timer-microchip-pit64b.c
+++ b/drivers/clocksource/timer-microchip-pit64b.c
@@ -237,7 +237,7 @@ static void __init mchp_pit64b_pres_compute(u32 *pres, u32 clk_rate,
 			break;
 	}
 
-	/* Use the bigest prescaler if we didn't match one. */
+	/* Use the biggest prescaler if we didn't match one. */
 	if (*pres == MCHP_PIT64B_PRES_MAX)
 		*pres = MCHP_PIT64B_PRES_MAX - 1;
 }
diff --git a/drivers/clocksource/timer-of.c b/drivers/clocksource/timer-of.c
index 572da477c6d3..529cc6a51cdb 100644
--- a/drivers/clocksource/timer-of.c
+++ b/drivers/clocksource/timer-of.c
@@ -211,10 +211,10 @@ out_fail:
 }
 
 /**
- * timer_of_cleanup - release timer_of ressources
+ * timer_of_cleanup - release timer_of resources
  * @to: timer_of structure
  *
- * Release the ressources that has been used in timer_of_init().
+ * Release the resources that has been used in timer_of_init().
  * This function should be called in init error cases
  */
 void __init timer_of_cleanup(struct timer_of *to)
diff --git a/drivers/clocksource/timer-ti-dm-systimer.c b/drivers/clocksource/timer-ti-dm-systimer.c
index 33b3e8aa2cc5..614c8380f3e9 100644
--- a/drivers/clocksource/timer-ti-dm-systimer.c
+++ b/drivers/clocksource/timer-ti-dm-systimer.c
@@ -589,7 +589,7 @@ static int __init dmtimer_clockevent_init(struct device_node *np)
 		"always-on " : "", t->rate, np->parent);
 
 	clockevents_config_and_register(dev, t->rate,
-					3, /* Timer internal resynch latency */
+					3, /* Timer internal resync latency */
 					0xffffffff);
 
 	if (of_machine_is_compatible("ti,am33xx") ||
diff --git a/drivers/clocksource/timer-vf-pit.c b/drivers/clocksource/timer-vf-pit.c
index 1a86a4e7e344..911c92146eca 100644
--- a/drivers/clocksource/timer-vf-pit.c
+++ b/drivers/clocksource/timer-vf-pit.c
@@ -136,7 +136,7 @@ static int __init pit_clockevent_init(unsigned long rate, int irq)
 	/*
 	 * The value for the LDVAL register trigger is calculated as:
 	 * LDVAL trigger = (period / clock period) - 1
-	 * The pit is a 32-bit down count timer, when the conter value
+	 * The pit is a 32-bit down count timer, when the counter value
 	 * reaches 0, it will generate an interrupt, thus the minimal
 	 * LDVAL trigger value is 1. And then the min_delta is
 	 * minimal LDVAL trigger value + 1, and the max_delta is full 32-bit.
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 86d143db6523..a247b089ca78 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -70,7 +70,7 @@ struct module;
  * @mark_unstable:	Optional function to inform the clocksource driver that
  *			the watchdog marked the clocksource unstable
  * @tick_stable:        Optional function called periodically from the watchdog
- *			code to provide stable syncrhonization points
+ *			code to provide stable synchronization points
  * @wd_list:		List head to enqueue into the watchdog list (internal)
  * @cs_last:		Last clocksource value for clocksource watchdog
  * @wd_last:		Last watchdog value corresponding to @cs_last
diff --git a/include/linux/timex.h b/include/linux/timex.h
index 9c2e54faf9b7..059b18eb1f1f 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -133,7 +133,7 @@
 
 /*
  * kernel variables
- * Note: maximum error = NTP synch distance = dispersion + delay / 2;
+ * Note: maximum error = NTP sync distance = dispersion + delay / 2;
  * estimated error = NTP dispersion.
  */
 extern unsigned long tick_usec;		/* USER_HZ period (usec) */
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 98d7a15e8cf6..e9af8fae0bfb 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -2,13 +2,13 @@
 /*
  * Alarmtimer interface
  *
- * This interface provides a timer which is similarto hrtimers,
+ * This interface provides a timer which is similar to hrtimers,
  * but triggers a RTC alarm if the box is suspend.
  *
  * This interface is influenced by the Android RTC Alarm timer
  * interface.
  *
- * Copyright (C) 2010 IBM Corperation
+ * Copyright (C) 2010 IBM Corporation
  *
  * Author: John Stultz <john.stultz@linaro.org>
  */
@@ -811,7 +811,7 @@ static long __sched alarm_timer_nsleep_restart(struct restart_block *restart)
 /**
  * alarm_timer_nsleep - alarmtimer nanosleep
  * @which_clock: clockid
- * @flags: determins abstime or relative
+ * @flags: determines abstime or relative
  * @tsreq: requested sleep time (abs or rel)
  *
  * Handles clock_nanosleep calls against _ALARM clockids
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index cce484a2cc7c..1d1a61371b5a 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -38,7 +38,7 @@
  * calculated mult and shift factors. This guarantees that no 64bit
  * overflow happens when the input value of the conversion is
  * multiplied with the calculated mult factor. Larger ranges may
- * reduce the conversion accuracy by chosing smaller mult and shift
+ * reduce the conversion accuracy by choosing smaller mult and shift
  * factors.
  */
 void
@@ -518,7 +518,7 @@ static void clocksource_suspend_select(bool fallback)
  * the suspend time when resuming system.
  *
  * This function is called late in the suspend process from timekeeping_suspend(),
- * that means processes are freezed, non-boot cpus and interrupts are disabled
+ * that means processes are frozen, non-boot cpus and interrupts are disabled
  * now. It is therefore possible to start the suspend timer without taking the
  * clocksource mutex.
  */
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 788b9d137de4..30b356c93c78 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -683,7 +683,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
 	 * T1 is removed, so this code is called and would reprogram
 	 * the hardware to 5s from now. Any hrtimer_start after that
 	 * will not reprogram the hardware due to hang_detected being
-	 * set. So we'd effectivly block all timers until the T2 event
+	 * set. So we'd effectively block all timers until the T2 event
 	 * fires.
 	 */
 	if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
@@ -1019,7 +1019,7 @@ static void __remove_hrtimer(struct hrtimer *timer,
 	 * cpu_base->next_timer. This happens when we remove the first
 	 * timer on a remote cpu. No harm as we never dereference
 	 * cpu_base->next_timer. So the worst thing what can happen is
-	 * an superflous call to hrtimer_force_reprogram() on the
+	 * an superfluous call to hrtimer_force_reprogram() on the
 	 * remote cpu later on if the same timer gets enqueued again.
 	 */
 	if (reprogram && timer == cpu_base->next_timer)
@@ -1212,7 +1212,7 @@ static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base)
  * The counterpart to hrtimer_cancel_wait_running().
  *
  * If there is a waiter for cpu_base->expiry_lock, then it was waiting for
- * the timer callback to finish. Drop expiry_lock and reaquire it. That
+ * the timer callback to finish. Drop expiry_lock and reacquire it. That
  * allows the waiter to acquire the lock and make progress.
  */
 static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base,
@@ -1398,7 +1398,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
 	int base;
 
 	/*
-	 * On PREEMPT_RT enabled kernels hrtimers which are not explicitely
+	 * On PREEMPT_RT enabled kernels hrtimers which are not explicitly
 	 * marked for hard interrupt expiry mode are moved into soft
 	 * interrupt context for latency reasons and because the callbacks
 	 * can invoke functions which might sleep on RT, e.g. spin_lock().
@@ -1430,7 +1430,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
  * hrtimer_init - initialize a timer to the given clock
  * @timer:	the timer to be initialized
  * @clock_id:	the clock to be used
- * @mode:       The modes which are relevant for intitialization:
+ * @mode:       The modes which are relevant for initialization:
  *              HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT,
  *              HRTIMER_MODE_REL_SOFT
  *
@@ -1487,7 +1487,7 @@ EXPORT_SYMBOL_GPL(hrtimer_active);
  * insufficient for that.
  *
  * The sequence numbers are required because otherwise we could still observe
- * a false negative if the read side got smeared over multiple consequtive
+ * a false negative if the read side got smeared over multiple consecutive
  * __run_hrtimer() invocations.
  */
 
@@ -1588,7 +1588,7 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
 			 * minimizing wakeups, not running timers at the
 			 * earliest interrupt after their soft expiration.
 			 * This allows us to avoid using a Priority Search
-			 * Tree, which can answer a stabbing querry for
+			 * Tree, which can answer a stabbing query for
 			 * overlapping intervals and instead use the simple
 			 * BST we already have.
 			 * We don't add extra wakeups by delaying timers that
@@ -1822,7 +1822,7 @@ static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
 				   clockid_t clock_id, enum hrtimer_mode mode)
 {
 	/*
-	 * On PREEMPT_RT enabled kernels hrtimers which are not explicitely
+	 * On PREEMPT_RT enabled kernels hrtimers which are not explicitly
 	 * marked for hard interrupt expiry mode are moved into soft
 	 * interrupt context either for latency reasons or because the
 	 * hrtimer callback takes regular spinlocks or invokes other
@@ -1835,7 +1835,7 @@ static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
 	 * the same CPU. That causes a latency spike due to the wakeup of
 	 * a gazillion threads.
 	 *
-	 * OTOH, priviledged real-time user space applications rely on the
+	 * OTOH, privileged real-time user space applications rely on the
 	 * low latency of hard interrupt wakeups. If the current task is in
 	 * a real-time scheduling class, mark the mode for hard interrupt
 	 * expiry.
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index a5cffe2a1770..a492e4da69ba 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -44,7 +44,7 @@ static u64 jiffies_read(struct clocksource *cs)
  * the timer interrupt frequency HZ and it suffers
  * inaccuracies caused by missed or lost timer
  * interrupts and the inability for the timer
- * interrupt hardware to accuratly tick at the
+ * interrupt hardware to accurately tick at the
  * requested HZ value. It is also not recommended
  * for "tick-less" systems.
  */
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 5247afd7f345..406dccb79c2b 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -544,7 +544,7 @@ static inline bool rtc_tv_nsec_ok(unsigned long set_offset_nsec,
 				  struct timespec64 *to_set,
 				  const struct timespec64 *now)
 {
-	/* Allowed error in tv_nsec, arbitarily set to 5 jiffies in ns. */
+	/* Allowed error in tv_nsec, arbitrarily set to 5 jiffies in ns. */
 	const unsigned long TIME_SET_NSEC_FUZZ = TICK_NSEC * 5;
 	struct timespec64 delay = {.tv_sec = -1,
 				   .tv_nsec = set_offset_nsec};
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index a71758e34e45..b145e6835e34 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -279,7 +279,7 @@ void thread_group_sample_cputime(struct task_struct *tsk, u64 *samples)
  * @tsk:	Task for which cputime needs to be started
  * @samples:	Storage for time samples
  *
- * The thread group cputime accouting is avoided when there are no posix
+ * The thread group cputime accounting is avoided when there are no posix
  * CPU timers armed. Before starting a timer it's required to check whether
  * the time accounting is active. If not, a full update of the atomic
  * accounting store needs to be done and the accounting enabled.
@@ -390,7 +390,7 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer)
 	/*
 	 * If posix timer expiry is handled in task work context then
 	 * timer::it_lock can be taken without disabling interrupts as all
-	 * other locking happens in task context. This requires a seperate
+	 * other locking happens in task context. This requires a separate
 	 * lock class key otherwise regular posix timer expiry would record
 	 * the lock class being taken in interrupt context and generate a
 	 * false positive warning.
@@ -1216,7 +1216,7 @@ static void handle_posix_cpu_timers(struct task_struct *tsk)
 		check_process_timers(tsk, &firing);
 
 		/*
-		 * The above timer checks have updated the exipry cache and
+		 * The above timer checks have updated the expiry cache and
 		 * because nothing can have queued or modified timers after
 		 * sighand lock was taken above it is guaranteed to be
 		 * consistent. So the next timer interrupt fastpath check
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index b5a65e212df2..797eb93103ad 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -53,7 +53,7 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
 	 * reasons.
 	 *
 	 * Each caller tries to arm the hrtimer on its own CPU, but if the
-	 * hrtimer callbback function is currently running, then
+	 * hrtimer callback function is currently running, then
 	 * hrtimer_start() cannot move it and the timer stays on the CPU on
 	 * which it is assigned at the moment.
 	 *
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 5a23829372c7..6ec7855ab88d 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -157,7 +157,7 @@ static void tick_device_setup_broadcast_func(struct clock_event_device *dev)
 }
 
 /*
- * Check, if the device is disfunctional and a place holder, which
+ * Check, if the device is dysfunctional and a placeholder, which
  * needs to be handled by the broadcast device.
  */
 int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
@@ -391,7 +391,7 @@ void tick_broadcast_control(enum tick_broadcast_mode mode)
 			 * - the broadcast device exists
 			 * - the broadcast device is not a hrtimer based one
 			 * - the broadcast device is in periodic mode to
-			 *   avoid a hickup during switch to oneshot mode
+			 *   avoid a hiccup during switch to oneshot mode
 			 */
 			if (bc && !(bc->features & CLOCK_EVT_FEAT_HRTIMER) &&
 			    tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index f9745d47425a..475ecceda768 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -45,7 +45,7 @@ int tick_program_event(ktime_t expires, int force)
 }
 
 /**
- * tick_resume_onshot - resume oneshot mode
+ * tick_resume_oneshot - resume oneshot mode
  */
 void tick_resume_oneshot(void)
 {
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index e10a4af88737..128735e3e77e 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -751,7 +751,7 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
 	 * Aside of that check whether the local timer softirq is
 	 * pending. If so its a bad idea to call get_next_timer_interrupt()
 	 * because there is an already expired timer, so it will request
-	 * immeditate expiry, which rearms the hardware timer with a
+	 * immediate expiry, which rearms the hardware timer with a
 	 * minimal delta which brings us back to this place
 	 * immediately. Lather, rinse and repeat...
 	 */
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index 4fb06527cf64..d952ae393423 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -29,7 +29,7 @@ enum tick_nohz_mode {
  * @inidle:		Indicator that the CPU is in the tick idle mode
  * @tick_stopped:	Indicator that the idle tick has been stopped
  * @idle_active:	Indicator that the CPU is actively in the tick idle mode;
- *			it is resetted during irq handling phases.
+ *			it is reset during irq handling phases.
  * @do_timer_lst:	CPU was the last one doing do_timer before going idle
  * @got_idle_tick:	Tick timer function has run with @inidle set
  * @last_tick:		Store the last tick expiry time when the tick
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 3985b2b32d08..29923b20e0e4 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -571,7 +571,7 @@ EXPORT_SYMBOL(__usecs_to_jiffies);
 /*
  * The TICK_NSEC - 1 rounds up the value to the next resolution.  Note
  * that a remainder subtract here would not do the right thing as the
- * resolution values don't fall on second boundries.  I.e. the line:
+ * resolution values don't fall on second boundaries.  I.e. the line:
  * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding.
  * Note that due to the small error in the multiplier here, this
  * rounding is incorrect for sufficiently large values of tv_nsec, but
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 6aee5768c86f..77bafd8c8df2 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -596,14 +596,14 @@ EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns);
  * careful cache layout of the timekeeper because the sequence count and
  * struct tk_read_base would then need two cache lines instead of one.
  *
- * Access to the time keeper clock source is disabled accross the innermost
+ * Access to the time keeper clock source is disabled across the innermost
  * steps of suspend/resume. The accessors still work, but the timestamps
  * are frozen until time keeping is resumed which happens very early.
  *
  * For regular suspend/resume there is no observable difference vs. sched
  * clock, but it might affect some of the nasty low level debug printks.
  *
- * OTOH, access to sched clock is not guaranteed accross suspend/resume on
+ * OTOH, access to sched clock is not guaranteed across suspend/resume on
  * all systems either so it depends on the hardware in use.
  *
  * If that turns out to be a real problem then this could be mitigated by
@@ -899,7 +899,7 @@ ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs)
 EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset);
 
 /**
- * ktime_mono_to_any() - convert mononotic time to any other time
+ * ktime_mono_to_any() - convert monotonic time to any other time
  * @tmono:	time to convert.
  * @offs:	which offset to use
  */
@@ -1948,7 +1948,7 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
 	 *	xtime_nsec_1 = offset + xtime_nsec_2
 	 * Which gives us:
 	 *	xtime_nsec_2 = xtime_nsec_1 - offset
-	 * Which simplfies to:
+	 * Which simplifies to:
 	 *	xtime_nsec -= offset
 	 */
 	if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) {
@@ -2336,7 +2336,7 @@ static int timekeeping_validate_timex(const struct __kernel_timex *txc)
 
 		/*
 		 * Validate if a timespec/timeval used to inject a time
-		 * offset is valid.  Offsets can be postive or negative, so
+		 * offset is valid.  Offsets can be positive or negative, so
 		 * we don't check tv_sec. The value of the timeval/timespec
 		 * is the sum of its fields,but *NOTE*:
 		 * The field tv_usec/tv_nsec must always be non-negative and
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index f475f1a027c8..d111adf4a0cb 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -894,7 +894,7 @@ static inline void forward_timer_base(struct timer_base *base)
 	/*
 	 * No need to forward if we are close enough below jiffies.
 	 * Also while executing timers, base->clk is 1 offset ahead
-	 * of jiffies to avoid endless requeuing to current jffies.
+	 * of jiffies to avoid endless requeuing to current jiffies.
 	 */
 	if ((long)(jnow - base->clk) < 1)
 		return;
@@ -1271,7 +1271,7 @@ static inline void timer_base_unlock_expiry(struct timer_base *base)
  * The counterpart to del_timer_wait_running().
  *
  * If there is a waiter for base->expiry_lock, then it was waiting for the
- * timer callback to finish. Drop expiry_lock and reaquire it. That allows
+ * timer callback to finish. Drop expiry_lock and reacquire it. That allows
  * the waiter to acquire the lock and make progress.
  */
 static void timer_sync_wait_running(struct timer_base *base)
diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c
index 88e6b8ed6ca5..f0d5062d9cbc 100644
--- a/kernel/time/vsyscall.c
+++ b/kernel/time/vsyscall.c
@@ -108,7 +108,7 @@ void update_vsyscall(struct timekeeper *tk)
 
 	/*
 	 * If the current clocksource is not VDSO capable, then spare the
-	 * update of the high reolution parts.
+	 * update of the high resolution parts.
 	 */
 	if (clock_mode != VDSO_CLOCKMODE_NONE)
 		update_vdso_data(vdata, tk);
diff --git a/tools/testing/selftests/timers/clocksource-switch.c b/tools/testing/selftests/timers/clocksource-switch.c
index bfc974b4572d..ef8eb3604595 100644
--- a/tools/testing/selftests/timers/clocksource-switch.c
+++ b/tools/testing/selftests/timers/clocksource-switch.c
@@ -3,7 +3,7 @@
  *		(C) Copyright IBM 2012
  *		Licensed under the GPLv2
  *
- *  NOTE: This is a meta-test which quickly changes the clocksourc and
+ *  NOTE: This is a meta-test which quickly changes the clocksource and
  *  then uses other tests to detect problems. Thus this test requires
  *  that the inconsistency-check and nanosleep tests be present in the
  *  same directory it is run from.
@@ -134,7 +134,7 @@ int main(int argv, char **argc)
 		return -1;
 	}
 
-	/* Check everything is sane before we start switching asyncrhonously */
+	/* Check everything is sane before we start switching asynchronously */
 	for (i = 0; i < count; i++) {
 		printf("Validating clocksource %s\n", clocksource_list[i]);
 		if (change_clocksource(clocksource_list[i])) {
diff --git a/tools/testing/selftests/timers/leap-a-day.c b/tools/testing/selftests/timers/leap-a-day.c
index 19e46ed5dfb5..23eb398c8140 100644
--- a/tools/testing/selftests/timers/leap-a-day.c
+++ b/tools/testing/selftests/timers/leap-a-day.c
@@ -5,7 +5,7 @@
  *              Licensed under the GPLv2
  *
  *  This test signals the kernel to insert a leap second
- *  every day at midnight GMT. This allows for stessing the
+ *  every day at midnight GMT. This allows for stressing the
  *  kernel's leap-second behavior, as well as how well applications
  *  handle the leap-second discontinuity.
  *
diff --git a/tools/testing/selftests/timers/leapcrash.c b/tools/testing/selftests/timers/leapcrash.c
index dc80728ed191..f70802c5dd0d 100644
--- a/tools/testing/selftests/timers/leapcrash.c
+++ b/tools/testing/selftests/timers/leapcrash.c
@@ -4,10 +4,10 @@
  *              (C) Copyright 2013, 2015 Linaro Limited
  *              Licensed under the GPL
  *
- * This test demonstrates leapsecond deadlock that is possibe
+ * This test demonstrates leapsecond deadlock that is possible
  * on kernels from 2.6.26 to 3.3.
  *
- * WARNING: THIS WILL LIKELY HARDHANG SYSTEMS AND MAY LOSE DATA
+ * WARNING: THIS WILL LIKELY HARD HANG SYSTEMS AND MAY LOSE DATA
  * RUN AT YOUR OWN RISK!
  *  To build:
  *	$ gcc leapcrash.c -o leapcrash -lrt
diff --git a/tools/testing/selftests/timers/threadtest.c b/tools/testing/selftests/timers/threadtest.c
index cf3e48919874..80aed4bf06fb 100644
--- a/tools/testing/selftests/timers/threadtest.c
+++ b/tools/testing/selftests/timers/threadtest.c
@@ -76,7 +76,7 @@ void checklist(struct timespec *list, int size)
 
 /* The shared thread shares a global list
  * that each thread fills while holding the lock.
- * This stresses clock syncronization across cpus.
+ * This stresses clock synchronization across cpus.
  */
 void *shared_thread(void *arg)
 {
-- 
cgit v1.2.3


From add2d73631070c951b0de81a01d1463a15cfbd47 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 22 Mar 2021 11:21:45 -0700
Subject: net: set initial device refcount to 1

When adding CONFIG_PCPU_DEV_REFCNT, I forgot that the
initial net device refcount was 0.

When CONFIG_PCPU_DEV_REFCNT is not set, this means
the first dev_hold() triggers an illegal refcount
operation (addition on 0):

refcount_t: addition on 0; use-after-free.
WARNING: CPU: 0 PID: 1 at lib/refcount.c:25 refcount_warn_saturate+0x128/0x1a4

Fix is to change initial (and final) refcount to be 1.

Also add a missing kerneldoc piece, as reported by
Stephen Rothwell.

Fixes: 919067cc845f ("net: add CONFIG_PCPU_DEV_REFCNT")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Guenter Roeck <groeck@google.com>
Tested-by: Guenter Roeck <groeck@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 1 +
 net/core/dev.c            | 9 ++++++---
 2 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e4a503288d9b..7005ad80e8d1 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1792,6 +1792,7 @@ enum netdev_ml_priv_type {
  *
  *	@proto_down_reason:	reason a netdev interface is held down
  *	@pcpu_refcnt:		Number of references to this device
+ *	@dev_refcnt:		Number of references to this device
  *	@todo_list:		Delayed register/unregister
  *	@link_watch_list:	XXX: need comments on this one
  *
diff --git a/net/core/dev.c b/net/core/dev.c
index ffab3928eeeb..c9a496f5e687 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -10346,7 +10346,7 @@ static void netdev_wait_allrefs(struct net_device *dev)
 	rebroadcast_time = warning_time = jiffies;
 	refcnt = netdev_refcnt_read(dev);
 
-	while (refcnt != 0) {
+	while (refcnt != 1) {
 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
 			rtnl_lock();
 
@@ -10383,7 +10383,7 @@ static void netdev_wait_allrefs(struct net_device *dev)
 
 		refcnt = netdev_refcnt_read(dev);
 
-		if (refcnt && time_after(jiffies, warning_time + 10 * HZ)) {
+		if (refcnt != 1 && time_after(jiffies, warning_time + 10 * HZ)) {
 			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
 				 dev->name, refcnt);
 			warning_time = jiffies;
@@ -10459,7 +10459,7 @@ void netdev_run_todo(void)
 		netdev_wait_allrefs(dev);
 
 		/* paranoia */
-		BUG_ON(netdev_refcnt_read(dev));
+		BUG_ON(netdev_refcnt_read(dev) != 1);
 		BUG_ON(!list_empty(&dev->ptype_all));
 		BUG_ON(!list_empty(&dev->ptype_specific));
 		WARN_ON(rcu_access_pointer(dev->ip_ptr));
@@ -10680,6 +10680,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 	dev->pcpu_refcnt = alloc_percpu(int);
 	if (!dev->pcpu_refcnt)
 		goto free_dev;
+	dev_hold(dev);
+#else
+	refcount_set(&dev->dev_refcnt, 1);
 #endif
 
 	if (dev_addr_init(dev))
-- 
cgit v1.2.3


From 1ab568e92bf8f6a359c977869dc546a23a6b5f13 Mon Sep 17 00:00:00 2001
From: Kurt Kanzenbach <kurt@kmk-computers.de>
Date: Mon, 22 Mar 2021 19:51:13 +0100
Subject: net: dsa: hellcreek: Report switch name and ID

Report the driver name, ASIC ID and the switch name via devlink. This is
useful information for user space tooling.

Signed-off-by: Kurt Kanzenbach <kurt@kmk-computers.de>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Vladimir Oltean <olteanv@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/hirschmann/hellcreek.c             | 18 ++++++++++++++++++
 include/linux/platform_data/hirschmann-hellcreek.h |  1 +
 2 files changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/dsa/hirschmann/hellcreek.c b/drivers/net/dsa/hirschmann/hellcreek.c
index 64a73dd045c0..918be7eb626f 100644
--- a/drivers/net/dsa/hirschmann/hellcreek.c
+++ b/drivers/net/dsa/hirschmann/hellcreek.c
@@ -1082,6 +1082,22 @@ out:
 	return ret;
 }
 
+static int hellcreek_devlink_info_get(struct dsa_switch *ds,
+				      struct devlink_info_req *req,
+				      struct netlink_ext_ack *extack)
+{
+	struct hellcreek *hellcreek = ds->priv;
+	int ret;
+
+	ret = devlink_info_driver_name_put(req, "hellcreek");
+	if (ret)
+		return ret;
+
+	return devlink_info_version_fixed_put(req,
+					      DEVLINK_INFO_VERSION_GENERIC_ASIC_ID,
+					      hellcreek->pdata->name);
+}
+
 static u64 hellcreek_devlink_vlan_table_get(void *priv)
 {
 	struct hellcreek *hellcreek = priv;
@@ -1732,6 +1748,7 @@ static int hellcreek_port_setup_tc(struct dsa_switch *ds, int port,
 }
 
 static const struct dsa_switch_ops hellcreek_ds_ops = {
+	.devlink_info_get      = hellcreek_devlink_info_get,
 	.get_ethtool_stats     = hellcreek_get_ethtool_stats,
 	.get_sset_count	       = hellcreek_get_sset_count,
 	.get_strings	       = hellcreek_get_strings,
@@ -1909,6 +1926,7 @@ static int hellcreek_remove(struct platform_device *pdev)
 }
 
 static const struct hellcreek_platform_data de1soc_r1_pdata = {
+	.name		 = "r4c30",
 	.num_ports	 = 4,
 	.is_100_mbits	 = 1,
 	.qbv_support	 = 1,
diff --git a/include/linux/platform_data/hirschmann-hellcreek.h b/include/linux/platform_data/hirschmann-hellcreek.h
index 388846766bb2..6a000df5541f 100644
--- a/include/linux/platform_data/hirschmann-hellcreek.h
+++ b/include/linux/platform_data/hirschmann-hellcreek.h
@@ -12,6 +12,7 @@
 #include <linux/types.h>
 
 struct hellcreek_platform_data {
+	const char *name;	/* Switch name */
 	int num_ports;		/* Amount of switch ports */
 	int is_100_mbits;	/* Is it configured to 100 or 1000 mbit/s */
 	int qbv_support;	/* Qbv support on front TSN ports */
-- 
cgit v1.2.3


From 5dd5f9347a927c169205f7385e5cf4e18c41e21a Mon Sep 17 00:00:00 2001
From: Bhaskar Chowdhury <unixbhaskar@gmail.com>
Date: Sun, 21 Mar 2021 01:42:40 +0530
Subject: driver core: Trivial typo fix

s/subsytem/subsystem/

Acked-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Bhaskar Chowdhury <unixbhaskar@gmail.com>
Link: https://lore.kernel.org/r/20210320201240.23745-1-unixbhaskar@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/device.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/device.h b/include/linux/device.h
index ba660731bd25..a8ce0dc3b758 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -49,7 +49,7 @@ struct dev_iommu;
 /**
  * struct subsys_interface - interfaces to device functions
  * @name:       name of the device function
- * @subsys:     subsytem of the devices to attach to
+ * @subsys:     subsystem of the devices to attach to
  * @node:       the list of functions registered at the subsystem
  * @add_dev:    device hookup to device function handler
  * @remove_dev: device hookup to device function handler
-- 
cgit v1.2.3


From 1bd66c1a32ca8e5148eaba2675321637e89a49af Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Sat, 20 Mar 2021 13:26:21 +0100
Subject: fs: document mapping helpers

Document new helpers we introduced this cycle.

Link: https://lore.kernel.org/r/20210320122623.599086-2-christian.brauner@ubuntu.com
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Suggested-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 include/linux/fs.h | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index ec8f3ddf4a6a..33873531ffa6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1574,36 +1574,84 @@ static inline void i_gid_write(struct inode *inode, gid_t gid)
 	inode->i_gid = make_kgid(inode->i_sb->s_user_ns, gid);
 }
 
+/**
+ * kuid_into_mnt - map a kuid down into a mnt_userns
+ * @mnt_userns: user namespace of the relevant mount
+ * @kuid: kuid to be mapped
+ *
+ * Return: @kuid mapped according to @mnt_userns.
+ * If @kuid has no mapping INVALID_UID is returned.
+ */
 static inline kuid_t kuid_into_mnt(struct user_namespace *mnt_userns,
 				   kuid_t kuid)
 {
 	return make_kuid(mnt_userns, __kuid_val(kuid));
 }
 
+/**
+ * kgid_into_mnt - map a kgid down into a mnt_userns
+ * @mnt_userns: user namespace of the relevant mount
+ * @kgid: kgid to be mapped
+ *
+ * Return: @kgid mapped according to @mnt_userns.
+ * If @kgid has no mapping INVALID_GID is returned.
+ */
 static inline kgid_t kgid_into_mnt(struct user_namespace *mnt_userns,
 				   kgid_t kgid)
 {
 	return make_kgid(mnt_userns, __kgid_val(kgid));
 }
 
+/**
+ * i_uid_into_mnt - map an inode's i_uid down into a mnt_userns
+ * @mnt_userns: user namespace of the mount the inode was found from
+ * @inode: inode to map
+ *
+ * Return: the inode's i_uid mapped down according to @mnt_userns.
+ * If the inode's i_uid has no mapping INVALID_UID is returned.
+ */
 static inline kuid_t i_uid_into_mnt(struct user_namespace *mnt_userns,
 				    const struct inode *inode)
 {
 	return kuid_into_mnt(mnt_userns, inode->i_uid);
 }
 
+/**
+ * i_gid_into_mnt - map an inode's i_gid down into a mnt_userns
+ * @mnt_userns: user namespace of the mount the inode was found from
+ * @inode: inode to map
+ *
+ * Return: the inode's i_gid mapped down according to @mnt_userns.
+ * If the inode's i_gid has no mapping INVALID_GID is returned.
+ */
 static inline kgid_t i_gid_into_mnt(struct user_namespace *mnt_userns,
 				    const struct inode *inode)
 {
 	return kgid_into_mnt(mnt_userns, inode->i_gid);
 }
 
+/**
+ * kuid_from_mnt - map a kuid up into a mnt_userns
+ * @mnt_userns: user namespace of the relevant mount
+ * @kuid: kuid to be mapped
+ *
+ * Return: @kuid mapped up according to @mnt_userns.
+ * If @kuid has no mapping INVALID_UID is returned.
+ */
 static inline kuid_t kuid_from_mnt(struct user_namespace *mnt_userns,
 				   kuid_t kuid)
 {
 	return KUIDT_INIT(from_kuid(mnt_userns, kuid));
 }
 
+/**
+ * kgid_from_mnt - map a kgid up into a mnt_userns
+ * @mnt_userns: user namespace of the relevant mount
+ * @kgid: kgid to be mapped
+ *
+ * Return: @kgid mapped up according to @mnt_userns.
+ * If @kgid has no mapping INVALID_GID is returned.
+ */
 static inline kgid_t kgid_from_mnt(struct user_namespace *mnt_userns,
 				   kgid_t kgid)
 {
-- 
cgit v1.2.3


From a65e58e791a1690da8de731c8391816a22f5555c Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Sat, 20 Mar 2021 13:26:22 +0100
Subject: fs: document and rename fsid helpers

Vivek pointed out that the fs{g,u}id_into_mnt() naming scheme can be
misleading as it could be understood as implying they do the exact same
thing as i_{g,u}id_into_mnt(). The original motivation for this naming
scheme was to signal to callers that the helpers will always take care
to map the k{g,u}id such that the ownership is expressed in terms of the
mnt_userns.
Get rid of the confusion by renaming those helpers to something more
sensible. Al suggested mapped_fs{g,u}id() which seems a really good fit.
Usually filesystems don't need to bother with these helpers directly,
except in some cases where they allocate objects that carry {g,u}ids which
are either filesystem specific (e.g. xfs quota objects) or don't have a
clean set of helpers as inodes have.

Link: https://lore.kernel.org/r/20210320122623.599086-3-christian.brauner@ubuntu.com
Inspired-by: Vivek Goyal <vgoyal@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Darrick J. Wong <djwong@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 fs/ext4/ialloc.c     |  2 +-
 fs/inode.c           |  4 ++--
 fs/namei.c           |  8 ++++----
 fs/xfs/xfs_inode.c   | 10 +++++-----
 fs/xfs/xfs_symlink.c |  4 ++--
 include/linux/fs.h   | 28 ++++++++++++++++++++++++++--
 6 files changed, 40 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 633ae7becd61..d0dc12197346 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -970,7 +970,7 @@ struct inode *__ext4_new_inode(struct user_namespace *mnt_userns,
 		i_gid_write(inode, owner[1]);
 	} else if (test_opt(sb, GRPID)) {
 		inode->i_mode = mode;
-		inode->i_uid = fsuid_into_mnt(mnt_userns);
+		inode->i_uid = mapped_fsuid(mnt_userns);
 		inode->i_gid = dir->i_gid;
 	} else
 		inode_init_owner(mnt_userns, inode, dir, mode);
diff --git a/fs/inode.c b/fs/inode.c
index a047ab306f9a..81a6a59b7dd3 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2148,7 +2148,7 @@ EXPORT_SYMBOL(init_special_inode);
 void inode_init_owner(struct user_namespace *mnt_userns, struct inode *inode,
 		      const struct inode *dir, umode_t mode)
 {
-	inode->i_uid = fsuid_into_mnt(mnt_userns);
+	inode->i_uid = mapped_fsuid(mnt_userns);
 	if (dir && dir->i_mode & S_ISGID) {
 		inode->i_gid = dir->i_gid;
 
@@ -2160,7 +2160,7 @@ void inode_init_owner(struct user_namespace *mnt_userns, struct inode *inode,
 			 !capable_wrt_inode_uidgid(mnt_userns, dir, CAP_FSETID))
 			mode &= ~S_ISGID;
 	} else
-		inode->i_gid = fsgid_into_mnt(mnt_userns);
+		inode->i_gid = mapped_fsgid(mnt_userns);
 	inode->i_mode = mode;
 }
 EXPORT_SYMBOL(inode_init_owner);
diff --git a/fs/namei.c b/fs/namei.c
index 216f16e74351..6b5424d34962 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2830,8 +2830,8 @@ static inline int may_create(struct user_namespace *mnt_userns,
 	if (IS_DEADDIR(dir))
 		return -ENOENT;
 	s_user_ns = dir->i_sb->s_user_ns;
-	if (!kuid_has_mapping(s_user_ns, fsuid_into_mnt(mnt_userns)) ||
-	    !kgid_has_mapping(s_user_ns, fsgid_into_mnt(mnt_userns)))
+	if (!kuid_has_mapping(s_user_ns, mapped_fsuid(mnt_userns)) ||
+	    !kgid_has_mapping(s_user_ns, mapped_fsgid(mnt_userns)))
 		return -EOVERFLOW;
 	return inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
 }
@@ -3040,8 +3040,8 @@ static int may_o_create(struct user_namespace *mnt_userns,
 		return error;
 
 	s_user_ns = dir->dentry->d_sb->s_user_ns;
-	if (!kuid_has_mapping(s_user_ns, fsuid_into_mnt(mnt_userns)) ||
-	    !kgid_has_mapping(s_user_ns, fsgid_into_mnt(mnt_userns)))
+	if (!kuid_has_mapping(s_user_ns, mapped_fsuid(mnt_userns)) ||
+	    !kgid_has_mapping(s_user_ns, mapped_fsgid(mnt_userns)))
 		return -EOVERFLOW;
 
 	error = inode_permission(mnt_userns, dir->dentry->d_inode,
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index f93370bd7b1e..dc91f8c34d35 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -812,7 +812,7 @@ xfs_init_new_inode(
 
 	if (dir && !(dir->i_mode & S_ISGID) &&
 	    (mp->m_flags & XFS_MOUNT_GRPID)) {
-		inode->i_uid = fsuid_into_mnt(mnt_userns);
+		inode->i_uid = mapped_fsuid(mnt_userns);
 		inode->i_gid = dir->i_gid;
 		inode->i_mode = mode;
 	} else {
@@ -1007,8 +1007,8 @@ xfs_create(
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
 	 */
-	error = xfs_qm_vop_dqalloc(dp, fsuid_into_mnt(mnt_userns),
-			fsgid_into_mnt(mnt_userns), prid,
+	error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns),
+			mapped_fsgid(mnt_userns), prid,
 			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
 			&udqp, &gdqp, &pdqp);
 	if (error)
@@ -1158,8 +1158,8 @@ xfs_create_tmpfile(
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
 	 */
-	error = xfs_qm_vop_dqalloc(dp, fsuid_into_mnt(mnt_userns),
-			fsgid_into_mnt(mnt_userns), prid,
+	error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns),
+			mapped_fsgid(mnt_userns), prid,
 			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
 			&udqp, &gdqp, &pdqp);
 	if (error)
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 7f368b10ded1..63edb4dbed4a 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -182,8 +182,8 @@ xfs_symlink(
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
 	 */
-	error = xfs_qm_vop_dqalloc(dp, fsuid_into_mnt(mnt_userns),
-			fsgid_into_mnt(mnt_userns), prid,
+	error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns),
+			mapped_fsgid(mnt_userns), prid,
 			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
 			&udqp, &gdqp, &pdqp);
 	if (error)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 33873531ffa6..e34967829183 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1658,12 +1658,36 @@ static inline kgid_t kgid_from_mnt(struct user_namespace *mnt_userns,
 	return KGIDT_INIT(from_kgid(mnt_userns, kgid));
 }
 
-static inline kuid_t fsuid_into_mnt(struct user_namespace *mnt_userns)
+/**
+ * mapped_fsuid - return caller's fsuid mapped up into a mnt_userns
+ * @mnt_userns: user namespace of the relevant mount
+ *
+ * Use this helper to initialize a new vfs or filesystem object based on
+ * the caller's fsuid. A common example is initializing the i_uid field of
+ * a newly allocated inode triggered by a creation event such as mkdir or
+ * O_CREAT. Other examples include the allocation of quotas for a specific
+ * user.
+ *
+ * Return: the caller's current fsuid mapped up according to @mnt_userns.
+ */
+static inline kuid_t mapped_fsuid(struct user_namespace *mnt_userns)
 {
 	return kuid_from_mnt(mnt_userns, current_fsuid());
 }
 
-static inline kgid_t fsgid_into_mnt(struct user_namespace *mnt_userns)
+/**
+ * mapped_fsgid - return caller's fsgid mapped up into a mnt_userns
+ * @mnt_userns: user namespace of the relevant mount
+ *
+ * Use this helper to initialize a new vfs or filesystem object based on
+ * the caller's fsgid. A common example is initializing the i_gid field of
+ * a newly allocated inode triggered by a creation event such as mkdir or
+ * O_CREAT. Other examples include the allocation of quotas for a specific
+ * user.
+ *
+ * Return: the caller's current fsgid mapped up according to @mnt_userns.
+ */
+static inline kgid_t mapped_fsgid(struct user_namespace *mnt_userns)
 {
 	return kgid_from_mnt(mnt_userns, current_fsgid());
 }
-- 
cgit v1.2.3


From 8e5389132ab429604c1a2459b52f0c849a71cc61 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Sat, 20 Mar 2021 13:26:23 +0100
Subject: fs: introduce fsuidgid_has_mapping() helper

Don't open-code the checks and instead move them into a clean little
helper we can call. This also reduces the risk that if we ever change
something we forget to change all locations.

Link: https://lore.kernel.org/r/20210320122623.599086-4-christian.brauner@ubuntu.com
Inspired-by: Vivek Goyal <vgoyal@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 fs/namei.c         | 11 +++--------
 include/linux/fs.h | 20 ++++++++++++++++++++
 2 files changed, 23 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index 6b5424d34962..bc03cbc37ba7 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2823,16 +2823,14 @@ static int may_delete(struct user_namespace *mnt_userns, struct inode *dir,
 static inline int may_create(struct user_namespace *mnt_userns,
 			     struct inode *dir, struct dentry *child)
 {
-	struct user_namespace *s_user_ns;
 	audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
 	if (child->d_inode)
 		return -EEXIST;
 	if (IS_DEADDIR(dir))
 		return -ENOENT;
-	s_user_ns = dir->i_sb->s_user_ns;
-	if (!kuid_has_mapping(s_user_ns, mapped_fsuid(mnt_userns)) ||
-	    !kgid_has_mapping(s_user_ns, mapped_fsgid(mnt_userns)))
+	if (!fsuidgid_has_mapping(dir->i_sb, mnt_userns))
 		return -EOVERFLOW;
+
 	return inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
 }
 
@@ -3034,14 +3032,11 @@ static int may_o_create(struct user_namespace *mnt_userns,
 			const struct path *dir, struct dentry *dentry,
 			umode_t mode)
 {
-	struct user_namespace *s_user_ns;
 	int error = security_path_mknod(dir, dentry, mode, 0);
 	if (error)
 		return error;
 
-	s_user_ns = dir->dentry->d_sb->s_user_ns;
-	if (!kuid_has_mapping(s_user_ns, mapped_fsuid(mnt_userns)) ||
-	    !kgid_has_mapping(s_user_ns, mapped_fsgid(mnt_userns)))
+	if (!fsuidgid_has_mapping(dir->dentry->d_sb, mnt_userns))
 		return -EOVERFLOW;
 
 	error = inode_permission(mnt_userns, dir->dentry->d_inode,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e34967829183..e9e7e799425e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1692,6 +1692,26 @@ static inline kgid_t mapped_fsgid(struct user_namespace *mnt_userns)
 	return kgid_from_mnt(mnt_userns, current_fsgid());
 }
 
+/**
+ * fsuidgid_has_mapping() - check whether caller's fsuid/fsgid is mapped
+ * @sb: the superblock we want a mapping in
+ * @mnt_userns: user namespace of the relevant mount
+ *
+ * Check whether the caller's fsuid and fsgid have a valid mapping in the
+ * s_user_ns of the superblock @sb. If the caller is on an idmapped mount map
+ * the caller's fsuid and fsgid according to the @mnt_userns first.
+ *
+ * Return: true if fsuid and fsgid is mapped, false if not.
+ */
+static inline bool fsuidgid_has_mapping(struct super_block *sb,
+					struct user_namespace *mnt_userns)
+{
+	struct user_namespace *s_user_ns = sb->s_user_ns;
+
+	return kuid_has_mapping(s_user_ns, mapped_fsuid(mnt_userns)) &&
+	       kgid_has_mapping(s_user_ns, mapped_fsgid(mnt_userns));
+}
+
 extern struct timespec64 current_time(struct inode *inode);
 
 /*
-- 
cgit v1.2.3


From db998553cf11dd697485ac6142adbb35d21fff10 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Sat, 20 Mar 2021 13:26:24 +0100
Subject: fs: introduce two inode i_{u,g}id initialization helpers

Give filesystems two little helpers that do the right thing when
initializing the i_uid and i_gid fields on idmapped and non-idmapped
mounts. Filesystems shouldn't have to be concerned with too many
details.

Link: https://lore.kernel.org/r/20210320122623.599086-5-christian.brauner@ubuntu.com
Inspired-by: Vivek Goyal <vgoyal@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 fs/ext4/ialloc.c   |  2 +-
 fs/inode.c         |  4 ++--
 fs/xfs/xfs_inode.c |  2 +-
 include/linux/fs.h | 28 ++++++++++++++++++++++++++++
 4 files changed, 32 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index d0dc12197346..755a68bb7e22 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -970,7 +970,7 @@ struct inode *__ext4_new_inode(struct user_namespace *mnt_userns,
 		i_gid_write(inode, owner[1]);
 	} else if (test_opt(sb, GRPID)) {
 		inode->i_mode = mode;
-		inode->i_uid = mapped_fsuid(mnt_userns);
+		inode_fsuid_set(inode, mnt_userns);
 		inode->i_gid = dir->i_gid;
 	} else
 		inode_init_owner(mnt_userns, inode, dir, mode);
diff --git a/fs/inode.c b/fs/inode.c
index 81a6a59b7dd3..21c5a620ca89 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2148,7 +2148,7 @@ EXPORT_SYMBOL(init_special_inode);
 void inode_init_owner(struct user_namespace *mnt_userns, struct inode *inode,
 		      const struct inode *dir, umode_t mode)
 {
-	inode->i_uid = mapped_fsuid(mnt_userns);
+	inode_fsuid_set(inode, mnt_userns);
 	if (dir && dir->i_mode & S_ISGID) {
 		inode->i_gid = dir->i_gid;
 
@@ -2160,7 +2160,7 @@ void inode_init_owner(struct user_namespace *mnt_userns, struct inode *inode,
 			 !capable_wrt_inode_uidgid(mnt_userns, dir, CAP_FSETID))
 			mode &= ~S_ISGID;
 	} else
-		inode->i_gid = mapped_fsgid(mnt_userns);
+		inode_fsgid_set(inode, mnt_userns);
 	inode->i_mode = mode;
 }
 EXPORT_SYMBOL(inode_init_owner);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index dc91f8c34d35..2a8bdf33e6c4 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -812,7 +812,7 @@ xfs_init_new_inode(
 
 	if (dir && !(dir->i_mode & S_ISGID) &&
 	    (mp->m_flags & XFS_MOUNT_GRPID)) {
-		inode->i_uid = mapped_fsuid(mnt_userns);
+		inode_fsuid_set(inode, mnt_userns);
 		inode->i_gid = dir->i_gid;
 		inode->i_mode = mode;
 	} else {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e9e7e799425e..3eaf5f27a0e4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1692,6 +1692,34 @@ static inline kgid_t mapped_fsgid(struct user_namespace *mnt_userns)
 	return kgid_from_mnt(mnt_userns, current_fsgid());
 }
 
+/**
+ * inode_fsuid_set - initialize inode's i_uid field with callers fsuid
+ * @inode: inode to initialize
+ * @mnt_userns: user namespace of the mount the inode was found from
+ *
+ * Initialize the i_uid field of @inode. If the inode was found/created via
+ * an idmapped mount map the caller's fsuid according to @mnt_users.
+ */
+static inline void inode_fsuid_set(struct inode *inode,
+				   struct user_namespace *mnt_userns)
+{
+	inode->i_uid = mapped_fsuid(mnt_userns);
+}
+
+/**
+ * inode_fsgid_set - initialize inode's i_gid field with callers fsgid
+ * @inode: inode to initialize
+ * @mnt_userns: user namespace of the mount the inode was found from
+ *
+ * Initialize the i_gid field of @inode. If the inode was found/created via
+ * an idmapped mount map the caller's fsgid according to @mnt_users.
+ */
+static inline void inode_fsgid_set(struct inode *inode,
+				   struct user_namespace *mnt_userns)
+{
+	inode->i_gid = mapped_fsgid(mnt_userns);
+}
+
 /**
  * fsuidgid_has_mapping() - check whether caller's fsuid/fsgid is mapped
  * @sb: the superblock we want a mapping in
-- 
cgit v1.2.3


From 39015399a849843ff8f840b68d16c6ff7c58e0f0 Mon Sep 17 00:00:00 2001
From: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Date: Thu, 4 Feb 2021 18:00:55 +0000
Subject: fs: turn some comments into kernel-doc

While reviewing ./include/linux/fs.h, I noticed that three comments can
actually be turned into kernel-doc comments. This allows checking the
consistency between the descriptions and the functions' signatures in
case they may change in the future.

A quick validation with the consistency check:

  ./scripts/kernel-doc -none include/linux/fs.h

currently reports no issues in this file.

Link: https://lore.kernel.org/r/20210204180059.28360-2-lukas.bulwahn@gmail.com
Signed-off-by: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Acked-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 include/linux/fs.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index ec8f3ddf4a6a..644ccef39014 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1739,7 +1739,7 @@ static inline void sb_start_pagefault(struct super_block *sb)
 	__sb_start_write(sb, SB_FREEZE_PAGEFAULT);
 }
 
-/*
+/**
  * sb_start_intwrite - get write access to a superblock for internal fs purposes
  * @sb: the super we write to
  *
@@ -3161,7 +3161,7 @@ static inline ssize_t blockdev_direct_IO(struct kiocb *iocb,
 
 void inode_dio_wait(struct inode *inode);
 
-/*
+/**
  * inode_dio_begin - signal start of a direct I/O requests
  * @inode: inode the direct I/O happens on
  *
@@ -3173,7 +3173,7 @@ static inline void inode_dio_begin(struct inode *inode)
 	atomic_inc(&inode->i_dio_count);
 }
 
-/*
+/**
  * inode_dio_end - signal finish of a direct I/O requests
  * @inode: inode the direct I/O happens on
  *
-- 
cgit v1.2.3


From 92cb01c74ef13ca01e1af836236b140634967b82 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Mon, 1 Mar 2021 13:45:19 +0100
Subject: fs: update kernel-doc for vfs_rename()

Commit 9fe61450972d ("namei: introduce struct renamedata") introduces a
new struct for vfs_rename() and makes the vfs_rename() kernel-doc argument
description out of sync.

Move the description of arguments for vfs_rename() to a new kernel-doc for
the struct renamedata to make these descriptions checkable against the
actual implementation.

Link: https://lore.kernel.org/r/20210204180059.28360-3-lukas.bulwahn@gmail.com
Signed-off-by: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Acked-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 include/linux/fs.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 644ccef39014..e83c0bbc6454 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1782,6 +1782,17 @@ int vfs_rmdir(struct user_namespace *, struct inode *, struct dentry *);
 int vfs_unlink(struct user_namespace *, struct inode *, struct dentry *,
 	       struct inode **);
 
+/**
+ * struct renamedata - contains all information required for renaming
+ * @old_mnt_userns:    old user namespace of the mount the inode was found from
+ * @old_dir:           parent of source
+ * @old_dentry:                source
+ * @new_mnt_userns:    new user namespace of the mount the inode was found from
+ * @new_dir:           parent of destination
+ * @new_dentry:                destination
+ * @delegated_inode:   returns an inode needing a delegation break
+ * @flags:             rename flags
+ */
 struct renamedata {
 	struct user_namespace *old_mnt_userns;
 	struct inode *old_dir;
-- 
cgit v1.2.3


From 291da9d4a9eb3a1cb0610b7f4480f5b52b1825e7 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 22 Mar 2021 09:46:13 +0100
Subject: locking/mutex: Fix non debug version of mutex_lock_io_nested()

If CONFIG_DEBUG_LOCK_ALLOC=n then mutex_lock_io_nested() maps to
mutex_lock() which is clearly wrong because mutex_lock() lacks the
io_schedule_prepare()/finish() invocations.

Map it to mutex_lock_io().

Fixes: f21860bac05b ("locking/mutex, sched/wait: Fix the mutex_lock_io_nested() define")
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/878s6fshii.fsf@nanos.tec.linutronix.de
---
 include/linux/mutex.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 0cd631a19727..515cff77a4f4 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -185,7 +185,7 @@ extern void mutex_lock_io(struct mutex *lock);
 # define mutex_lock_interruptible_nested(lock, subclass) mutex_lock_interruptible(lock)
 # define mutex_lock_killable_nested(lock, subclass) mutex_lock_killable(lock)
 # define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock)
-# define mutex_lock_io_nested(lock, subclass) mutex_lock(lock)
+# define mutex_lock_io_nested(lock, subclass) mutex_lock_io(lock)
 #endif
 
 /*
-- 
cgit v1.2.3


From 2d5ba37461013253d2ff0a3641b727fd32ea97a9 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <florian@openwrt.org>
Date: Tue, 23 Feb 2021 18:44:53 +0100
Subject: usb: ehci: add spurious flag to disable overcurrent checking
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch adds a spurious_oc flag which can be set by EHCI controllers
not supporting or wanting to disable overcurrent checking. The EHCI
platform data in include/linux/usb/ehci_pdriver.h is also augmented to
take advantage of this new flag.

Signed-off-by: Florian Fainelli <florian@openwrt.org>
Signed-off-by: Álvaro Fernández Rojas <noltari@gmail.com>
Link: https://lore.kernel.org/r/20210223174455.1378-2-noltari@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/host/ehci-hcd.c      | 2 +-
 drivers/usb/host/ehci-hub.c      | 4 ++--
 drivers/usb/host/ehci-platform.c | 2 ++
 drivers/usb/host/ehci.h          | 1 +
 include/linux/usb/ehci_pdriver.h | 1 +
 5 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/host/ehci-hcd.c b/drivers/usb/host/ehci-hcd.c
index 1926b328b6aa..2237d22d292a 100644
--- a/drivers/usb/host/ehci-hcd.c
+++ b/drivers/usb/host/ehci-hcd.c
@@ -651,7 +651,7 @@ static int ehci_run (struct usb_hcd *hcd)
 		"USB %x.%x started, EHCI %x.%02x%s\n",
 		((ehci->sbrn & 0xf0)>>4), (ehci->sbrn & 0x0f),
 		temp >> 8, temp & 0xff,
-		ignore_oc ? ", overcurrent ignored" : "");
+		(ignore_oc || ehci->spurious_oc) ? ", overcurrent ignored" : "");
 
 	ehci_writel(ehci, INTR_MASK,
 		    &ehci->regs->intr_enable); /* Turn On Interrupts */
diff --git a/drivers/usb/host/ehci-hub.c b/drivers/usb/host/ehci-hub.c
index 9f9ab5ccea88..159cc27b1a36 100644
--- a/drivers/usb/host/ehci-hub.c
+++ b/drivers/usb/host/ehci-hub.c
@@ -643,7 +643,7 @@ ehci_hub_status_data (struct usb_hcd *hcd, char *buf)
 	 * always set, seem to clear PORT_OCC and PORT_CSC when writing to
 	 * PORT_POWER; that's surprising, but maybe within-spec.
 	 */
-	if (!ignore_oc)
+	if (!ignore_oc && !ehci->spurious_oc)
 		mask = PORT_CSC | PORT_PEC | PORT_OCC;
 	else
 		mask = PORT_CSC | PORT_PEC;
@@ -1013,7 +1013,7 @@ int ehci_hub_control(
 		if (temp & PORT_PEC)
 			status |= USB_PORT_STAT_C_ENABLE << 16;
 
-		if ((temp & PORT_OCC) && !ignore_oc){
+		if ((temp & PORT_OCC) && (!ignore_oc && !ehci->spurious_oc)){
 			status |= USB_PORT_STAT_C_OVERCURRENT << 16;
 
 			/*
diff --git a/drivers/usb/host/ehci-platform.c b/drivers/usb/host/ehci-platform.c
index a48dd3fac153..4d7b17f4f82b 100644
--- a/drivers/usb/host/ehci-platform.c
+++ b/drivers/usb/host/ehci-platform.c
@@ -327,6 +327,8 @@ static int ehci_platform_probe(struct platform_device *dev)
 		hcd->has_tt = 1;
 	if (pdata->reset_on_resume)
 		priv->reset_on_resume = true;
+	if (pdata->spurious_oc)
+		ehci->spurious_oc = 1;
 
 #ifndef CONFIG_USB_EHCI_BIG_ENDIAN_MMIO
 	if (ehci->big_endian_mmio) {
diff --git a/drivers/usb/host/ehci.h b/drivers/usb/host/ehci.h
index eabf22a78eae..80bb823aa9fe 100644
--- a/drivers/usb/host/ehci.h
+++ b/drivers/usb/host/ehci.h
@@ -218,6 +218,7 @@ struct ehci_hcd {			/* one per controller */
 	unsigned		frame_index_bug:1; /* MosChip (AKA NetMos) */
 	unsigned		need_oc_pp_cycle:1; /* MPC834X port power */
 	unsigned		imx28_write_fix:1; /* For Freescale i.MX28 */
+	unsigned		spurious_oc:1;
 
 	/* required for usb32 quirk */
 	#define OHCI_CTRL_HCFS          (3 << 6)
diff --git a/include/linux/usb/ehci_pdriver.h b/include/linux/usb/ehci_pdriver.h
index dd742afdc03f..89fc901e778f 100644
--- a/include/linux/usb/ehci_pdriver.h
+++ b/include/linux/usb/ehci_pdriver.h
@@ -50,6 +50,7 @@ struct usb_ehci_pdata {
 	unsigned	no_io_watchdog:1;
 	unsigned	reset_on_resume:1;
 	unsigned	dma_mask_64:1;
+	unsigned	spurious_oc:1;
 
 	/* Turn on all power and clocks */
 	int (*power_on)(struct platform_device *pdev);
-- 
cgit v1.2.3


From aaadc6aea6935e2f36c57056ff756fba0bbc4975 Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan@kernel.org>
Date: Thu, 18 Mar 2021 16:54:06 +0100
Subject: USB: core: rename usb_driver_claim_interface() data parameter

It's been almost twenty years since the interface "private data" pointer
was removed in favour of using the driver-data pointer of struct device.

Let's rename the driver-data parameter of usb_driver_claim_interface()
so that it better reflects how it's used.

Signed-off-by: Johan Hovold <johan@kernel.org>
Link: https://lore.kernel.org/r/20210318155406.22399-2-johan@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/core/driver.c | 6 +++---
 include/linux/usb.h       | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c
index a1013d9da08d..072968c40ade 100644
--- a/drivers/usb/core/driver.c
+++ b/drivers/usb/core/driver.c
@@ -519,7 +519,7 @@ static int usb_unbind_interface(struct device *dev)
  * @driver: the driver to be bound
  * @iface: the interface to which it will be bound; must be in the
  *	usb device's active configuration
- * @priv: driver data associated with that interface
+ * @data: driver data associated with that interface
  *
  * This is used by usb device drivers that need to claim more than one
  * interface on a device when probing (audio and acm are current examples).
@@ -533,7 +533,7 @@ static int usb_unbind_interface(struct device *dev)
  * Return: 0 on success.
  */
 int usb_driver_claim_interface(struct usb_driver *driver,
-				struct usb_interface *iface, void *priv)
+				struct usb_interface *iface, void *data)
 {
 	struct device *dev;
 	int retval = 0;
@@ -550,7 +550,7 @@ int usb_driver_claim_interface(struct usb_driver *driver,
 		return -ENODEV;
 
 	dev->driver = &driver->drvwrap.driver;
-	usb_set_intfdata(iface, priv);
+	usb_set_intfdata(iface, data);
 	iface->needs_binding = 0;
 
 	iface->condition = USB_INTERFACE_BOUND;
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 57c1e0ce5eba..b07e90d07ab6 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -841,7 +841,7 @@ extern int usb_free_streams(struct usb_interface *interface,
 
 /* used these for multi-interface device registration */
 extern int usb_driver_claim_interface(struct usb_driver *driver,
-			struct usb_interface *iface, void *priv);
+			struct usb_interface *iface, void *data);
 
 /**
  * usb_interface_claimed - returns true iff an interface is claimed
-- 
cgit v1.2.3


From 0299809be415567366b66f248eed93848b8dc9f3 Mon Sep 17 00:00:00 2001
From: Thinh Nguyen <Thinh.Nguyen@synopsys.com>
Date: Wed, 10 Mar 2021 19:42:44 -0800
Subject: usb: core: Track SuperSpeed Plus GenXxY

Introduce ssp_rate field to usb_device structure to capture the
connected SuperSpeed Plus signaling rate generation and lane count with
the corresponding usb_ssp_rate enum.

Signed-off-by: Thinh Nguyen <Thinh.Nguyen@synopsys.com>
Link: https://lore.kernel.org/r/b7805d121e5ae4ad5ae144bd860b6ac04ee47436.1615432770.git.Thinh.Nguyen@synopsys.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/core/hcd.c |  6 +++-
 drivers/usb/core/hub.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/usb.h    |  2 ++
 3 files changed, 85 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
index 3f0381344221..6119fb41d736 100644
--- a/drivers/usb/core/hcd.c
+++ b/drivers/usb/core/hcd.c
@@ -2721,6 +2721,7 @@ int usb_add_hcd(struct usb_hcd *hcd,
 
 	rhdev->rx_lanes = 1;
 	rhdev->tx_lanes = 1;
+	rhdev->ssp_rate = USB_SSP_GEN_UNKNOWN;
 
 	switch (hcd->speed) {
 	case HCD_USB11:
@@ -2738,8 +2739,11 @@ int usb_add_hcd(struct usb_hcd *hcd,
 	case HCD_USB32:
 		rhdev->rx_lanes = 2;
 		rhdev->tx_lanes = 2;
-		fallthrough;
+		rhdev->ssp_rate = USB_SSP_GEN_2x2;
+		rhdev->speed = USB_SPEED_SUPER_PLUS;
+		break;
 	case HCD_USB31:
+		rhdev->ssp_rate = USB_SSP_GEN_2x1;
 		rhdev->speed = USB_SPEED_SUPER_PLUS;
 		break;
 	default:
diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index 7f71218cc1e5..e78b2dd7801a 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -31,6 +31,7 @@
 #include <linux/pm_qos.h>
 #include <linux/kobject.h>
 
+#include <linux/bitfield.h>
 #include <linux/uaccess.h>
 #include <asm/byteorder.h>
 
@@ -2668,6 +2669,81 @@ out_authorized:
 	return result;
 }
 
+/**
+ * get_port_ssp_rate - Match the extended port status to SSP rate
+ * @hdev: The hub device
+ * @ext_portstatus: extended port status
+ *
+ * Match the extended port status speed id to the SuperSpeed Plus sublink speed
+ * capability attributes. Based on the number of connected lanes and speed,
+ * return the corresponding enum usb_ssp_rate.
+ */
+static enum usb_ssp_rate get_port_ssp_rate(struct usb_device *hdev,
+					   u32 ext_portstatus)
+{
+	struct usb_ssp_cap_descriptor *ssp_cap = hdev->bos->ssp_cap;
+	u32 attr;
+	u8 speed_id;
+	u8 ssac;
+	u8 lanes;
+	int i;
+
+	if (!ssp_cap)
+		goto out;
+
+	speed_id = ext_portstatus & USB_EXT_PORT_STAT_RX_SPEED_ID;
+	lanes = USB_EXT_PORT_RX_LANES(ext_portstatus) + 1;
+
+	ssac = le32_to_cpu(ssp_cap->bmAttributes) &
+		USB_SSP_SUBLINK_SPEED_ATTRIBS;
+
+	for (i = 0; i <= ssac; i++) {
+		u8 ssid;
+
+		attr = le32_to_cpu(ssp_cap->bmSublinkSpeedAttr[i]);
+		ssid = FIELD_GET(USB_SSP_SUBLINK_SPEED_SSID, attr);
+		if (speed_id == ssid) {
+			u16 mantissa;
+			u8 lse;
+			u8 type;
+
+			/*
+			 * Note: currently asymmetric lane types are only
+			 * applicable for SSIC operate in SuperSpeed protocol
+			 */
+			type = FIELD_GET(USB_SSP_SUBLINK_SPEED_ST, attr);
+			if (type == USB_SSP_SUBLINK_SPEED_ST_ASYM_RX ||
+			    type == USB_SSP_SUBLINK_SPEED_ST_ASYM_TX)
+				goto out;
+
+			if (FIELD_GET(USB_SSP_SUBLINK_SPEED_LP, attr) !=
+			    USB_SSP_SUBLINK_SPEED_LP_SSP)
+				goto out;
+
+			lse = FIELD_GET(USB_SSP_SUBLINK_SPEED_LSE, attr);
+			mantissa = FIELD_GET(USB_SSP_SUBLINK_SPEED_LSM, attr);
+
+			/* Convert to Gbps */
+			for (; lse < USB_SSP_SUBLINK_SPEED_LSE_GBPS; lse++)
+				mantissa /= 1000;
+
+			if (mantissa >= 10 && lanes == 1)
+				return USB_SSP_GEN_2x1;
+
+			if (mantissa >= 10 && lanes == 2)
+				return USB_SSP_GEN_2x2;
+
+			if (mantissa >= 5 && lanes == 2)
+				return USB_SSP_GEN_1x2;
+
+			goto out;
+		}
+	}
+
+out:
+	return USB_SSP_GEN_UNKNOWN;
+}
+
 /*
  * Return 1 if port speed is SuperSpeedPlus, 0 otherwise
  * check it from the link protocol field of the current speed ID attribute.
@@ -2850,9 +2926,11 @@ static int hub_port_wait_reset(struct usb_hub *hub, int port1,
 		/* extended portstatus Rx and Tx lane count are zero based */
 		udev->rx_lanes = USB_EXT_PORT_RX_LANES(ext_portstatus) + 1;
 		udev->tx_lanes = USB_EXT_PORT_TX_LANES(ext_portstatus) + 1;
+		udev->ssp_rate = get_port_ssp_rate(hub->hdev, ext_portstatus);
 	} else {
 		udev->rx_lanes = 1;
 		udev->tx_lanes = 1;
+		udev->ssp_rate = USB_SSP_GEN_UNKNOWN;
 	}
 	if (hub_is_wusb(hub))
 		udev->speed = USB_SPEED_WIRELESS;
diff --git a/include/linux/usb.h b/include/linux/usb.h
index b07e90d07ab6..ddd2f5b2a282 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -560,6 +560,7 @@ struct usb3_lpm_parameters {
  * @speed: device speed: high/full/low (or error)
  * @rx_lanes: number of rx lanes in use, USB 3.2 adds dual-lane support
  * @tx_lanes: number of tx lanes in use, USB 3.2 adds dual-lane support
+ * @ssp_rate: SuperSpeed Plus phy signaling rate and lane count
  * @tt: Transaction Translator info; used with low/full speed dev, highspeed hub
  * @ttport: device port on that tt hub
  * @toggle: one bit for each endpoint, with ([0] = IN, [1] = OUT) endpoints
@@ -636,6 +637,7 @@ struct usb_device {
 	enum usb_device_speed	speed;
 	unsigned int		rx_lanes;
 	unsigned int		tx_lanes;
+	enum usb_ssp_rate	ssp_rate;
 
 	struct usb_tt	*tt;
 	int		ttport;
-- 
cgit v1.2.3


From f2db85b64f0af1410ccb8ebcc9d7fa38e99feee9 Mon Sep 17 00:00:00 2001
From: Saravana Kannan <saravanak@google.com>
Date: Tue, 2 Mar 2021 13:11:30 -0800
Subject: driver core: Avoid pointless deferred probe attempts

There's no point in adding a device to the deferred probe list if we
know for sure that it doesn't have a matching driver. So, check if a
device can match with a driver before adding it to the deferred probe
list.

Signed-off-by: Saravana Kannan <saravanak@google.com>
Link: https://lore.kernel.org/r/20210302211133.2244281-2-saravanak@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/dd.c      | 6 ++++++
 include/linux/device.h | 4 ++++
 2 files changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/dd.c b/drivers/base/dd.c
index 66c31cda5462..83a68e980d04 100644
--- a/drivers/base/dd.c
+++ b/drivers/base/dd.c
@@ -122,6 +122,9 @@ static DECLARE_WORK(deferred_probe_work, deferred_probe_work_func);
 
 void driver_deferred_probe_add(struct device *dev)
 {
+	if (!dev->can_match)
+		return;
+
 	mutex_lock(&deferred_probe_mutex);
 	if (list_empty(&dev->p->deferred_probe)) {
 		dev_dbg(dev, "Added to deferred list\n");
@@ -725,6 +728,7 @@ static int driver_probe_device(struct device_driver *drv, struct device *dev)
 	if (!device_is_registered(dev))
 		return -ENODEV;
 
+	dev->can_match = true;
 	pr_debug("bus: '%s': %s: matched device %s with driver %s\n",
 		 drv->bus->name, __func__, dev_name(dev), drv->name);
 
@@ -828,6 +832,7 @@ static int __device_attach_driver(struct device_driver *drv, void *_data)
 		return 0;
 	} else if (ret == -EPROBE_DEFER) {
 		dev_dbg(dev, "Device match requests probe deferral\n");
+		dev->can_match = true;
 		driver_deferred_probe_add(dev);
 	} else if (ret < 0) {
 		dev_dbg(dev, "Bus failed to match device: %d\n", ret);
@@ -1063,6 +1068,7 @@ static int __driver_attach(struct device *dev, void *data)
 		return 0;
 	} else if (ret == -EPROBE_DEFER) {
 		dev_dbg(dev, "Device match requests probe deferral\n");
+		dev->can_match = true;
 		driver_deferred_probe_add(dev);
 	} else if (ret < 0) {
 		dev_dbg(dev, "Bus failed to match device: %d\n", ret);
diff --git a/include/linux/device.h b/include/linux/device.h
index a8ce0dc3b758..38a2071cf776 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -439,6 +439,9 @@ struct dev_links_info {
  * @state_synced: The hardware state of this device has been synced to match
  *		  the software state of this device by calling the driver/bus
  *		  sync_state() callback.
+ * @can_match:	The device has matched with a driver at least once or it is in
+ *		a bus (like AMBA) which can't check for matching drivers until
+ *		other devices probe successfully.
  * @dma_coherent: this particular device is dma coherent, even if the
  *		architecture supports non-coherent devices.
  * @dma_ops_bypass: If set to %true then the dma_ops are bypassed for the
@@ -545,6 +548,7 @@ struct device {
 	bool			offline:1;
 	bool			of_node_reused:1;
 	bool			state_synced:1;
+	bool			can_match:1;
 #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
     defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
     defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
-- 
cgit v1.2.3


From 0341ce5443949588e93581b49b934cdde2befbf8 Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Date: Tue, 23 Mar 2021 15:56:17 +0200
Subject: workqueue: Add resource managed version of delayed work init

A few drivers which need a delayed work-queue must cancel work at driver
detach. Some of those implement remove() solely for this purpose. Help
drivers to avoid unnecessary remove and error-branch implementation by
adding a managed version of delayed work initialization. This will also
help drivers to avoid mixing manual and devm based unwinding when other
resources are handled by devm.

Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Link: https://lore.kernel.org/r/51769ea4668198deb798fe47fcfb5f5288d61586.1616506559.git.matti.vaittinen@fi.rohmeurope.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/devm-helpers.h | 53 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)
 create mode 100644 include/linux/devm-helpers.h

(limited to 'include/linux')

diff --git a/include/linux/devm-helpers.h b/include/linux/devm-helpers.h
new file mode 100644
index 000000000000..f64e0c9f3763
--- /dev/null
+++ b/include/linux/devm-helpers.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __LINUX_DEVM_HELPERS_H
+#define __LINUX_DEVM_HELPERS_H
+
+/*
+ * Functions which do automatically cancel operations or release resources upon
+ * driver detach.
+ *
+ * These should be helpful to avoid mixing the manual and devm-based resource
+ * management which can be a source of annoying, rarely occurring,
+ * hard-to-reproduce bugs.
+ *
+ * Please take into account that devm based cancellation may be performed some
+ * time after remove() has run.
+ *
+ * Thus mixing devm and manual resource management can easily cause problems
+ * when unwinding operations with dependencies. IRQ scheduling a work in a queue
+ * is a typical example where IRQs are often devm-managed and WQs are manually
+ * cleaned at remove(). If IRQs are not manually freed at remove() (and this is
+ * often the case when we use devm for IRQs) we have a period of time after
+ * remove() - and before devm managed IRQs are freed - where a new IRQ may fire
+ * and schedule a work item which won't be cancelled because remove() has
+ * already run.
+ */
+
+#include <linux/device.h>
+#include <linux/workqueue.h>
+
+static inline void devm_delayed_work_drop(void *res)
+{
+	cancel_delayed_work_sync(res);
+}
+
+/**
+ * devm_delayed_work_autocancel - Resource-managed work allocation
+ * @dev: Device which lifetime work is bound to
+ * @w: Work to be initialized; cancelled when driver is detached
+ * @worker: Worker function
+ *
+ * Initialize work which is automatically cancelled when driver is detached.
+ * A few drivers need delayed work which must be cancelled before driver is
+ * detached to avoid accessing removed resources. This helper can be used to
+ * omit the explicit cancellation when driver is detached.
+ */
+static inline int devm_delayed_work_autocancel(struct device *dev,
+					       struct delayed_work *w,
+					       work_func_t worker)
+{
+	INIT_DELAYED_WORK(w, worker);
+	return devm_add_action(dev, devm_delayed_work_drop, w);
+}
+
+#endif
-- 
cgit v1.2.3


From f2cc020d7876de7583feb52ec939a32419cf9468 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Tue, 23 Mar 2021 18:49:35 +0100
Subject: tracing: Fix various typos in comments

Fix ~59 single-word typos in the tracing code comments, and fix
the grammar in a handful of places.

Link: https://lore.kernel.org/r/20210322224546.GA1981273@gmail.com
Link: https://lkml.kernel.org/r/20210323174935.GA4176821@gmail.com

Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 arch/microblaze/include/asm/ftrace.h |  2 +-
 arch/nds32/kernel/ftrace.c           |  2 +-
 arch/powerpc/include/asm/ftrace.h    |  4 ++--
 arch/sh/kernel/ftrace.c              |  2 +-
 arch/sparc/include/asm/ftrace.h      |  2 +-
 fs/tracefs/inode.c                   |  2 +-
 include/linux/ftrace.h               |  4 ++--
 include/linux/trace_events.h         |  2 +-
 include/linux/tracepoint.h           |  2 +-
 include/trace/events/io_uring.h      |  2 +-
 include/trace/events/rcu.h           |  2 +-
 include/trace/events/sched.h         |  2 +-
 include/trace/events/timer.h         |  2 +-
 kernel/trace/bpf_trace.c             |  5 +++--
 kernel/trace/fgraph.c                |  4 ++--
 kernel/trace/ftrace.c                |  8 ++++----
 kernel/trace/ring_buffer.c           |  2 +-
 kernel/trace/synth_event_gen_test.c  |  2 +-
 kernel/trace/trace.c                 | 18 +++++++++---------
 kernel/trace/trace.h                 |  4 ++--
 kernel/trace/trace_event_perf.c      |  2 +-
 kernel/trace/trace_events.c          |  4 ++--
 kernel/trace/trace_events_filter.c   |  4 ++--
 kernel/trace/trace_events_synth.c    |  2 +-
 kernel/trace/trace_functions_graph.c |  2 +-
 kernel/trace/trace_hwlat.c           |  4 ++--
 kernel/trace/trace_kprobe.c          |  2 +-
 kernel/trace/trace_probe.c           |  6 +++---
 kernel/trace/trace_probe.h           |  2 +-
 kernel/trace/trace_probe_tmpl.h      |  2 +-
 kernel/trace/trace_selftest.c        |  4 ++--
 kernel/trace/trace_seq.c             | 12 ++++++------
 32 files changed, 60 insertions(+), 59 deletions(-)

(limited to 'include/linux')

diff --git a/arch/microblaze/include/asm/ftrace.h b/arch/microblaze/include/asm/ftrace.h
index 5db7f4489f05..6a92bed37794 100644
--- a/arch/microblaze/include/asm/ftrace.h
+++ b/arch/microblaze/include/asm/ftrace.h
@@ -13,7 +13,7 @@ extern void ftrace_call_graph(void);
 #endif
 
 #ifdef CONFIG_DYNAMIC_FTRACE
-/* reloction of mcount call site is the same as the address */
+/* relocation of mcount call site is the same as the address */
 static inline unsigned long ftrace_call_adjust(unsigned long addr)
 {
 	return addr;
diff --git a/arch/nds32/kernel/ftrace.c b/arch/nds32/kernel/ftrace.c
index 414f8a780cc3..0e23e3a8df6b 100644
--- a/arch/nds32/kernel/ftrace.c
+++ b/arch/nds32/kernel/ftrace.c
@@ -236,7 +236,7 @@ void __naked return_to_handler(void)
 		"bal ftrace_return_to_handler\n\t"
 		"move $lp, $r0               \n\t"
 
-		/* restore state nedded by the ABI  */
+		/* restore state needed by the ABI  */
 		"lmw.bim $r0,[$sp],$r1,#0x0  \n\t");
 }
 
diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h
index bc76970b6ee5..debe8c4f7062 100644
--- a/arch/powerpc/include/asm/ftrace.h
+++ b/arch/powerpc/include/asm/ftrace.h
@@ -12,7 +12,7 @@
 
 #ifdef __ASSEMBLY__
 
-/* Based off of objdump optput from glibc */
+/* Based off of objdump output from glibc */
 
 #define MCOUNT_SAVE_FRAME			\
 	stwu	r1,-48(r1);			\
@@ -52,7 +52,7 @@ extern void _mcount(void);
 
 static inline unsigned long ftrace_call_adjust(unsigned long addr)
 {
-       /* reloction of mcount call site is the same as the address */
+       /* relocation of mcount call site is the same as the address */
        return addr;
 }
 
diff --git a/arch/sh/kernel/ftrace.c b/arch/sh/kernel/ftrace.c
index 0646c5961846..295c43315bbe 100644
--- a/arch/sh/kernel/ftrace.c
+++ b/arch/sh/kernel/ftrace.c
@@ -67,7 +67,7 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
  * Modifying code must take extra care. On an SMP machine, if
  * the code being modified is also being executed on another CPU
  * that CPU will have undefined results and possibly take a GPF.
- * We use kstop_machine to stop other CPUS from exectuing code.
+ * We use kstop_machine to stop other CPUS from executing code.
  * But this does not stop NMIs from happening. We still need
  * to protect against that. We separate out the modification of
  * the code to take care of this.
diff --git a/arch/sparc/include/asm/ftrace.h b/arch/sparc/include/asm/ftrace.h
index d3aa1a524431..e284394cb3aa 100644
--- a/arch/sparc/include/asm/ftrace.h
+++ b/arch/sparc/include/asm/ftrace.h
@@ -17,7 +17,7 @@ void _mcount(void);
 #endif
 
 #ifdef CONFIG_DYNAMIC_FTRACE
-/* reloction of mcount call site is the same as the address */
+/* relocation of mcount call site is the same as the address */
 static inline unsigned long ftrace_call_adjust(unsigned long addr)
 {
 	return addr;
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 4b83cbded559..1261e8b41edb 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -477,7 +477,7 @@ struct dentry *tracefs_create_dir(const char *name, struct dentry *parent)
  *
  * The instances directory is special as it allows for mkdir and rmdir to
  * to be done by userspace. When a mkdir or rmdir is performed, the inode
- * locks are released and the methhods passed in (@mkdir and @rmdir) are
+ * locks are released and the methods passed in (@mkdir and @rmdir) are
  * called without locks and with the name of the directory being created
  * within the instances directory.
  *
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 86e5028bfa20..a69f363b61bf 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -33,7 +33,7 @@
 /*
  * If the arch's mcount caller does not support all of ftrace's
  * features, then it must call an indirect function that
- * does. Or at least does enough to prevent any unwelcomed side effects.
+ * does. Or at least does enough to prevent any unwelcome side effects.
  */
 #if !ARCH_SUPPORTS_FTRACE_OPS
 # define FTRACE_FORCE_LIST_FUNC 1
@@ -389,7 +389,7 @@ DECLARE_PER_CPU(int, disable_stack_tracer);
  */
 static inline void stack_tracer_disable(void)
 {
-	/* Preemption or interupts must be disabled */
+	/* Preemption or interrupts must be disabled */
 	if (IS_ENABLED(CONFIG_DEBUG_PREEMPT))
 		WARN_ON_ONCE(!preempt_count() || !irqs_disabled());
 	this_cpu_inc(disable_stack_tracer);
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 8cba64ce23a4..36e27c1f42e0 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -206,7 +206,7 @@ static inline unsigned int tracing_gen_ctx_dec(void)
 
 	trace_ctx = tracing_gen_ctx();
 	/*
-	 * Subtract one from the preeption counter if preemption is enabled,
+	 * Subtract one from the preemption counter if preemption is enabled,
 	 * see trace_event_buffer_reserve()for details.
 	 */
 	if (IS_ENABLED(CONFIG_PREEMPTION))
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 9cfb099da58f..13f65420f188 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -465,7 +465,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
  *	*
  *	* The declared 'local variable' is called '__entry'
  *	*
- *	* __field(pid_t, prev_prid) is equivalent to a standard declariton:
+ *	* __field(pid_t, prev_prid) is equivalent to a standard declaration:
  *	*
  *	*	pid_t	prev_pid;
  *	*
diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index 9f0d3b7d56b0..ba78a5602cd1 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -49,7 +49,7 @@ TRACE_EVENT(io_uring_create,
 );
 
 /**
- * io_uring_register - called after a buffer/file/eventfd was succesfully
+ * io_uring_register - called after a buffer/file/eventfd was successfully
  * 					   registered for a ring
  *
  * @ctx:			pointer to a ring context structure
diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index 5fc29400e1a2..97177c10bf64 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -48,7 +48,7 @@ TRACE_EVENT(rcu_utilization,
  * RCU flavor, the grace-period number, and a string identifying the
  * grace-period-related event as follows:
  *
- *	"AccReadyCB": CPU acclerates new callbacks to RCU_NEXT_READY_TAIL.
+ *	"AccReadyCB": CPU accelerates new callbacks to RCU_NEXT_READY_TAIL.
  *	"AccWaitCB": CPU accelerates new callbacks to RCU_WAIT_TAIL.
  *	"newreq": Request a new grace period.
  *	"start": Start a grace period.
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index cbe3e152d24c..1eca2305ca42 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -174,7 +174,7 @@ DEFINE_EVENT(sched_wakeup_template, sched_waking,
 	     TP_ARGS(p));
 
 /*
- * Tracepoint called when the task is actually woken; p->state == TASK_RUNNNG.
+ * Tracepoint called when the task is actually woken; p->state == TASK_RUNNING.
  * It is not always called from the waking context.
  */
 DEFINE_EVENT(sched_wakeup_template, sched_wakeup,
diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h
index 19abb6c3eb73..6ad031c71be7 100644
--- a/include/trace/events/timer.h
+++ b/include/trace/events/timer.h
@@ -119,7 +119,7 @@ TRACE_EVENT(timer_expire_entry,
  * When used in combination with the timer_expire_entry tracepoint we can
  * determine the runtime of the timer callback function.
  *
- * NOTE: Do NOT derefernce timer in TP_fast_assign. The pointer might
+ * NOTE: Do NOT dereference timer in TP_fast_assign. The pointer might
  * be invalid. We solely track the pointer.
  */
 DEFINE_EVENT(timer_class, timer_expire_exit,
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index b0c45d923f0f..49f7ddb36149 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -658,7 +658,7 @@ BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size,
 		/* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */
 		i++;
 
-		/* skip optional "[0 +-][num]" width formating field */
+		/* skip optional "[0 +-][num]" width formatting field */
 		while (fmt[i] == '0' || fmt[i] == '+'  || fmt[i] == '-' ||
 		       fmt[i] == ' ')
 			i++;
@@ -748,7 +748,8 @@ BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size,
 		fmt_cnt++;
 	}
 
-	/* Maximumly we can have MAX_SEQ_PRINTF_VARARGS parameter, just give
+	/*
+	 * The maximum we can have is MAX_SEQ_PRINTF_VARARGS parameters, so just give
 	 * all of them to seq_printf().
 	 */
 	seq_printf(m, fmt, params[0], params[1], params[2], params[3],
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 29a6ebeebc9e..b8a0d1d564fb 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -42,7 +42,7 @@ bool ftrace_graph_is_dead(void)
 }
 
 /**
- * ftrace_graph_stop - set to permanently disable function graph tracincg
+ * ftrace_graph_stop - set to permanently disable function graph tracing
  *
  * In case of an error int function graph tracing, this is called
  * to try to keep function graph tracing from causing any more harm.
@@ -117,7 +117,7 @@ int function_graph_enter(unsigned long ret, unsigned long func,
 
 	/*
 	 * Skip graph tracing if the return location is served by direct trampoline,
-	 * since call sequence and return addresses is unpredicatable anymore.
+	 * since call sequence and return addresses are unpredictable anyway.
 	 * Ex: BPF trampoline may call original function and may skip frame
 	 * depending on type of BPF programs attached.
 	 */
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4d8e35575549..eb7d677cb466 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1045,7 +1045,7 @@ struct ftrace_ops global_ops = {
 };
 
 /*
- * Used by the stack undwinder to know about dynamic ftrace trampolines.
+ * Used by the stack unwinder to know about dynamic ftrace trampolines.
  */
 struct ftrace_ops *ftrace_ops_trampoline(unsigned long addr)
 {
@@ -3000,7 +3000,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command)
 		 * When the kernel is preemptive, tasks can be preempted
 		 * while on a ftrace trampoline. Just scheduling a task on
 		 * a CPU is not good enough to flush them. Calling
-		 * synchornize_rcu_tasks() will wait for those tasks to
+		 * synchronize_rcu_tasks() will wait for those tasks to
 		 * execute and either schedule voluntarily or enter user space.
 		 */
 		if (IS_ENABLED(CONFIG_PREEMPTION))
@@ -5373,7 +5373,7 @@ EXPORT_SYMBOL_GPL(modify_ftrace_direct);
  * @reset - non zero to reset all filters before applying this filter.
  *
  * Filters denote which functions should be enabled when tracing is enabled
- * If @ip is NULL, it failes to update filter.
+ * If @ip is NULL, it fails to update filter.
  */
 int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip,
 			 int remove, int reset)
@@ -6325,7 +6325,7 @@ clear_mod_from_hash(struct ftrace_page *pg, struct ftrace_hash *hash)
 	}
 }
 
-/* Clear any records from hashs */
+/* Clear any records from hashes */
 static void clear_mod_from_hashes(struct ftrace_page *pg)
 {
 	struct trace_array *tr;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 1c61a8cd7b99..f4216df58e31 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3154,7 +3154,7 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
  * is called before preempt_count() is updated, since the check will
  * be on the NORMAL bit, the TRANSITION bit will then be set. If an
  * NMI then comes in, it will set the NMI bit, but when the NMI code
- * does the trace_recursive_unlock() it will clear the TRANSTION bit
+ * does the trace_recursive_unlock() it will clear the TRANSITION bit
  * and leave the NMI bit set. But this is fine, because the interrupt
  * code that set the TRANSITION bit will then clear the NMI bit when it
  * calls trace_recursive_unlock(). If another NMI comes in, it will
diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c
index a4b4bbf8c3bf..0b15e975d2c2 100644
--- a/kernel/trace/synth_event_gen_test.c
+++ b/kernel/trace/synth_event_gen_test.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * Test module for in-kernel sythetic event creation and generation.
+ * Test module for in-kernel synthetic event creation and generation.
  *
  * Copyright (C) 2019 Tom Zanussi <zanussi@kernel.org>
  */
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e32f5a49f1cf..c8e54b674d3e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -514,7 +514,7 @@ void trace_free_pid_list(struct trace_pid_list *pid_list)
  * @filtered_pids: The list of pids to check
  * @search_pid: The PID to find in @filtered_pids
  *
- * Returns true if @search_pid is fonud in @filtered_pids, and false otherwis.
+ * Returns true if @search_pid is found in @filtered_pids, and false otherwise.
  */
 bool
 trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
@@ -545,7 +545,7 @@ trace_ignore_this_task(struct trace_pid_list *filtered_pids,
 		       struct task_struct *task)
 {
 	/*
-	 * If filterd_no_pids is not empty, and the task's pid is listed
+	 * If filtered_no_pids is not empty, and the task's pid is listed
 	 * in filtered_no_pids, then return true.
 	 * Otherwise, if filtered_pids is empty, that means we can
 	 * trace all tasks. If it has content, then only trace pids
@@ -612,7 +612,7 @@ void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos)
 
 	(*pos)++;
 
-	/* pid already is +1 of the actual prevous bit */
+	/* pid already is +1 of the actual previous bit */
 	pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid);
 
 	/* Return pid + 1 to allow zero to be represented */
@@ -834,7 +834,7 @@ DEFINE_MUTEX(trace_types_lock);
  * The content of events may become garbage if we allow other process consumes
  * these events concurrently:
  *   A) the page of the consumed events may become a normal page
- *      (not reader page) in ring buffer, and this page will be rewrited
+ *      (not reader page) in ring buffer, and this page will be rewritten
  *      by events producer.
  *   B) The page of the consumed events may become a page for splice_read,
  *      and this page will be returned to system.
@@ -1520,7 +1520,7 @@ unsigned long nsecs_to_usecs(unsigned long nsecs)
 #undef C
 #define C(a, b) b
 
-/* These must match the bit postions in trace_iterator_flags */
+/* These must match the bit positions in trace_iterator_flags */
 static const char *trace_options[] = {
 	TRACE_FLAGS
 	NULL
@@ -3368,7 +3368,7 @@ int trace_array_vprintk(struct trace_array *tr,
  * buffer (use trace_printk() for that), as writing into the top level
  * buffer should only have events that can be individually disabled.
  * trace_printk() is only used for debugging a kernel, and should not
- * be ever encorporated in normal use.
+ * be ever incorporated in normal use.
  *
  * trace_array_printk() can be used, as it will not add noise to the
  * top level tracing buffer.
@@ -7533,11 +7533,11 @@ static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr)
  * @cmd: The tracing command that caused the error
  * @str: The string to position the caret at within @cmd
  *
- * Finds the position of the first occurence of @str within @cmd.  The
+ * Finds the position of the first occurrence of @str within @cmd.  The
  * return value can be passed to tracing_log_err() for caret placement
  * within @cmd.
  *
- * Returns the index within @cmd of the first occurence of @str or 0
+ * Returns the index within @cmd of the first occurrence of @str or 0
  * if @str was not found.
  */
 unsigned int err_pos(char *cmd, const char *str)
@@ -9320,7 +9320,7 @@ int tracing_init_dentry(void)
 	 * As there may still be users that expect the tracing
 	 * files to exist in debugfs/tracing, we must automount
 	 * the tracefs file system there, so older tools still
-	 * work with the newer kerenl.
+	 * work with the newer kernel.
 	 */
 	tr->dir = debugfs_create_automount("tracing", NULL,
 					   trace_automount, NULL);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2952bd92bc62..5506424eae2a 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1347,7 +1347,7 @@ __event_trigger_test_discard(struct trace_event_file *file,
 
 /**
  * event_trigger_unlock_commit - handle triggers and finish event commit
- * @file: The file pointer assoctiated to the event
+ * @file: The file pointer associated with the event
  * @buffer: The ring buffer that the event is being written to
  * @event: The event meta data in the ring buffer
  * @entry: The event itself
@@ -1374,7 +1374,7 @@ event_trigger_unlock_commit(struct trace_event_file *file,
 
 /**
  * event_trigger_unlock_commit_regs - handle triggers and finish event commit
- * @file: The file pointer assoctiated to the event
+ * @file: The file pointer associated with the event
  * @buffer: The ring buffer that the event is being written to
  * @event: The event meta data in the ring buffer
  * @entry: The event itself
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 288ad2c274fb..03be4435d103 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -16,7 +16,7 @@ static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];
 
 /*
  * Force it to be aligned to unsigned long to avoid misaligned accesses
- * suprises
+ * surprises
  */
 typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
 	perf_trace_t;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f58106eaf8cb..80e96989770e 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2646,7 +2646,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
 		}
 
 		/*
-		 * Since calls are grouped by systems, the likelyhood that the
+		 * Since calls are grouped by systems, the likelihood that the
 		 * next call in the iteration belongs to the same system as the
 		 * previous call is high. As an optimization, we skip searching
 		 * for a map[] that matches the call's system if the last call
@@ -2706,7 +2706,7 @@ __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr)
 }
 
 /*
- * Just create a decriptor for early init. A descriptor is required
+ * Just create a descriptor for early init. A descriptor is required
  * for enabling events at boot. We want to enable events before
  * the filesystem is initialized.
  */
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index e91259f6a722..9730acf3c03e 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -296,7 +296,7 @@ enum {
  * and "FALSE" the program entry after that, we are now done with the first
  * pass.
  *
- * Making the above "a || b && c" have a progam of:
+ * Making the above "a || b && c" have a program of:
  *  prog[0] = { "a", 1, 2 }
  *  prog[1] = { "b", 0, 2 }
  *  prog[2] = { "c", 0, 3 }
@@ -390,7 +390,7 @@ enum {
  * F: return FALSE
  *
  * As "r = a; if (!r) goto n5;" is obviously the same as
- * "if (!a) goto n5;" without doing anything we can interperate the
+ * "if (!a) goto n5;" without doing anything we can interpret the
  * program as:
  * n1: if (!a) goto n5;
  * n2: if (!b) goto n5;
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 8d71e6c83f10..2ac75eb6aa86 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -1385,7 +1385,7 @@ static int destroy_synth_event(struct synth_event *se)
 
 /**
  * synth_event_delete - Delete a synthetic event
- * @event_name: The name of the new sythetic event
+ * @event_name: The name of the new synthetic event
  *
  * Delete a synthetic event that was created with synth_event_create().
  *
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 0aa6e6faa943..0de6837722da 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -764,7 +764,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
  *  - we are inside irq code
  *  - we just entered irq code
  *
- * retunns 0 if
+ * returns 0 if
  *  - funcgraph-interrupts option is set
  *  - we are not inside irq code
  */
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index 34dc1a712dcb..632ef88131a9 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -83,7 +83,7 @@ struct hwlat_sample {
 	u64			nmi_total_ts;	/* Total time spent in NMIs */
 	struct timespec64	timestamp;	/* wall time */
 	int			nmi_count;	/* # NMIs during this sample */
-	int			count;		/* # of iteratons over threash */
+	int			count;		/* # of iterations over thresh */
 };
 
 /* keep the global state somewhere. */
@@ -389,7 +389,7 @@ static int start_kthread(struct trace_array *tr)
 }
 
 /**
- * stop_kthread - Inform the hardware latency samping/detector kthread to stop
+ * stop_kthread - Inform the hardware latency sampling/detector kthread to stop
  *
  * This kicks the running hardware latency sampling/detector kernel thread and
  * tells it to stop sampling now. Use this on unload and at system shutdown.
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 6fe770d86dc3..ea6178cb5e33 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1748,7 +1748,7 @@ kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
 	if (trace_probe_test_flag(&tk->tp, TP_FLAG_PROFILE))
 		kretprobe_perf_func(tk, ri, regs);
 #endif
-	return 0;	/* We don't tweek kernel, so just return 0 */
+	return 0;	/* We don't tweak kernel, so just return 0 */
 }
 NOKPROBE_SYMBOL(kretprobe_dispatcher);
 
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index ec589a4612df..15413ad7cef2 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -168,7 +168,7 @@ void __trace_probe_log_err(int offset, int err_type)
 	if (!trace_probe_log.argv)
 		return;
 
-	/* Recalcurate the length and allocate buffer */
+	/* Recalculate the length and allocate buffer */
 	for (i = 0; i < trace_probe_log.argc; i++) {
 		if (i == trace_probe_log.index)
 			pos = len;
@@ -182,7 +182,7 @@ void __trace_probe_log_err(int offset, int err_type)
 		/**
 		 * Set the error position is next to the last arg + space.
 		 * Note that len includes the terminal null and the cursor
-		 * appaers at pos + 1.
+		 * appears at pos + 1.
 		 */
 		pos = len;
 		offset = 0;
@@ -592,7 +592,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
 	}
 
 	/*
-	 * Since $comm and immediate string can not be dereferred,
+	 * Since $comm and immediate string can not be dereferenced,
 	 * we can find those by strcmp.
 	 */
 	if (strcmp(arg, "$comm") == 0 || strncmp(arg, "\\\"", 2) == 0) {
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 7ce4027089ee..227d518e5ba5 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -134,7 +134,7 @@ struct fetch_type {
 	size_t			size;		/* Byte size of type */
 	int			is_signed;	/* Signed flag */
 	print_type_func_t	print;		/* Print functions */
-	const char		*fmt;		/* Fromat string */
+	const char		*fmt;		/* Format string */
 	const char		*fmttype;	/* Name in format file */
 };
 
diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h
index e5282828f4a6..f003c5d02a3a 100644
--- a/kernel/trace/trace_probe_tmpl.h
+++ b/kernel/trace/trace_probe_tmpl.h
@@ -167,7 +167,7 @@ array:
 	return code->op == FETCH_OP_END ? ret : -EILSEQ;
 }
 
-/* Sum up total data length for dynamic arraies (strings) */
+/* Sum up total data length for dynamic arrays (strings) */
 static nokprobe_inline int
 __get_data_size(struct trace_probe *tp, struct pt_regs *regs)
 {
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 73ef12092250..adf7ef194005 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -878,7 +878,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
 	int ret;
 
 	/*
-	 * Now that the big kernel lock is no longer preemptable,
+	 * Now that the big kernel lock is no longer preemptible,
 	 * and this is called with the BKL held, it will always
 	 * fail. If preemption is already disabled, simply
 	 * pass the test. When the BKL is removed, or becomes
@@ -940,7 +940,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
 	int ret;
 
 	/*
-	 * Now that the big kernel lock is no longer preemptable,
+	 * Now that the big kernel lock is no longer preemptible,
 	 * and this is called with the BKL held, it will always
 	 * fail. If preemption is already disabled, simply
 	 * pass the test. When the BKL is removed, or becomes
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
index 1d84fcc78e3e..9c90b3a7dce2 100644
--- a/kernel/trace/trace_seq.c
+++ b/kernel/trace/trace_seq.c
@@ -16,7 +16,7 @@
  * The buffer size is currently PAGE_SIZE, although it may become dynamic
  * in the future.
  *
- * A write to the buffer will either succed or fail. That is, unlike
+ * A write to the buffer will either succeed or fail. That is, unlike
  * sprintf() there will not be a partial write (well it may write into
  * the buffer but it wont update the pointers). This allows users to
  * try to write something into the trace_seq buffer and if it fails
@@ -73,7 +73,7 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s)
  * @fmt: printf format string
  *
  * The tracer may use either sequence operations or its own
- * copy to user routines. To simplify formating of a trace
+ * copy to user routines. To simplify formatting of a trace
  * trace_seq_printf() is used to store strings into a special
  * buffer (@s). Then the output may be either used by
  * the sequencer or pulled into another buffer.
@@ -133,7 +133,7 @@ EXPORT_SYMBOL_GPL(trace_seq_bitmask);
  * @fmt: printf format string
  *
  * The tracer may use either sequence operations or its own
- * copy to user routines. To simplify formating of a trace
+ * copy to user routines. To simplify formatting of a trace
  * trace_seq_printf is used to store strings into a special
  * buffer (@s). Then the output may be either used by
  * the sequencer or pulled into another buffer.
@@ -226,7 +226,7 @@ EXPORT_SYMBOL_GPL(trace_seq_puts);
  * @c: simple character to record
  *
  * The tracer may use either the sequence operations or its own
- * copy to user routines. This function records a simple charater
+ * copy to user routines. This function records a simple character
  * into a special buffer (@s) for later retrieval by a sequencer
  * or other mechanism.
  */
@@ -348,7 +348,7 @@ int trace_seq_path(struct trace_seq *s, const struct path *path)
 EXPORT_SYMBOL_GPL(trace_seq_path);
 
 /**
- * trace_seq_to_user - copy the squence buffer to user space
+ * trace_seq_to_user - copy the sequence buffer to user space
  * @s: trace sequence descriptor
  * @ubuf: The userspace memory location to copy to
  * @cnt: The amount to copy
@@ -363,7 +363,7 @@ EXPORT_SYMBOL_GPL(trace_seq_path);
  *
  * On failure it returns -EBUSY if all of the content in the
  * sequence has been already read, which includes nothing in the
- * sequenc (@s->len == @s->readpos).
+ * sequence (@s->len == @s->readpos).
  *
  * Returns -EFAULT if the copy to userspace fails.
  */
-- 
cgit v1.2.3


From 39f985c8f667c80a3d1eb19d31138032fa36b09e Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Sat, 20 Mar 2021 05:40:38 +0000
Subject: fs/cachefiles: Remove wait_bit_key layout dependency

Cachefiles was relying on wait_page_key and wait_bit_key being the
same layout, which is fragile.  Now that wait_page_key is exposed in
the pagemap.h header, we can remove that fragility.

A comment on the need to maintain structure layout equivalence was added by
Linus[1] and that is no longer applicable.

Fixes: 62906027091f ("mm: add PageWaiters indicating tasks are waiting for a page bit")
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: kafs-testing@auristor.com
cc: linux-cachefs@redhat.com
cc: linux-mm@kvack.org
Link: https://lore.kernel.org/r/20210320054104.1300774-2-willy@infradead.org/
Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3510ca20ece0150af6b10c77a74ff1b5c198e3e2 [1]
---
 fs/cachefiles/rdwr.c    | 7 +++----
 include/linux/pagemap.h | 1 -
 2 files changed, 3 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index e027c718ca01..8ffc40e84a59 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -24,17 +24,16 @@ static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode,
 		container_of(wait, struct cachefiles_one_read, monitor);
 	struct cachefiles_object *object;
 	struct fscache_retrieval *op = monitor->op;
-	struct wait_bit_key *key = _key;
+	struct wait_page_key *key = _key;
 	struct page *page = wait->private;
 
 	ASSERT(key);
 
 	_enter("{%lu},%u,%d,{%p,%u}",
 	       monitor->netfs_page->index, mode, sync,
-	       key->flags, key->bit_nr);
+	       key->page, key->bit_nr);
 
-	if (key->flags != &page->flags ||
-	    key->bit_nr != PG_locked)
+	if (key->page != page || key->bit_nr != PG_locked)
 		return 0;
 
 	_debug("--- monitor %p %lx ---", page, page->flags);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 20225b067583..8f4daac6eb4b 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -559,7 +559,6 @@ static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
 	return pgoff;
 }
 
-/* This has the same layout as wait_bit_key - see fs/cachefiles/rdwr.c */
 struct wait_page_key {
 	struct page *page;
 	int bit_nr;
-- 
cgit v1.2.3


From e5dbd33218bd8d87ab69f730ab90aed5fab7eb26 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Sat, 20 Mar 2021 05:40:39 +0000
Subject: mm/writeback: Add wait_on_page_writeback_killable

This is the killable version of wait_on_page_writeback.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: kafs-testing@auristor.com
cc: linux-afs@lists.infradead.org
cc: linux-mm@kvack.org
Link: https://lore.kernel.org/r/20210320054104.1300774-3-willy@infradead.org
---
 include/linux/pagemap.h |  1 +
 mm/page-writeback.c     | 16 ++++++++++++++++
 2 files changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 8f4daac6eb4b..8c9947fd62f3 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -682,6 +682,7 @@ static inline int wait_on_page_locked_killable(struct page *page)
 
 int put_and_wait_on_page_locked(struct page *page, int state);
 void wait_on_page_writeback(struct page *page);
+int wait_on_page_writeback_killable(struct page *page);
 extern void end_page_writeback(struct page *page);
 void wait_for_stable_page(struct page *page);
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index eb34d204d4ee..9e35b636a393 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2833,6 +2833,22 @@ void wait_on_page_writeback(struct page *page)
 }
 EXPORT_SYMBOL_GPL(wait_on_page_writeback);
 
+/*
+ * Wait for a page to complete writeback.  Returns -EINTR if we get a
+ * fatal signal while waiting.
+ */
+int wait_on_page_writeback_killable(struct page *page)
+{
+	while (PageWriteback(page)) {
+		trace_wait_on_page_writeback(page, page_mapping(page));
+		if (wait_on_page_bit_killable(page, PG_writeback))
+			return -EINTR;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(wait_on_page_writeback_killable);
+
 /**
  * wait_for_stable_page() - wait for writeback to finish, if necessary.
  * @page:	The page to wait on.
-- 
cgit v1.2.3


From c0e715bbd50e57319f76d0b757dc282893f2d476 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 23 Mar 2021 01:51:42 +0200
Subject: net: bridge: add helper for retrieving the current bridge port STP
 state

It may happen that we have the following topology with DSA or any other
switchdev driver with LAG offload:

ip link add br0 type bridge stp_state 1
ip link add bond0 type bond
ip link set bond0 master br0
ip link set swp0 master bond0
ip link set swp1 master bond0

STP decides that it should put bond0 into the BLOCKING state, and
that's that. The ports that are actively listening for the switchdev
port attributes emitted for the bond0 bridge port (because they are
offloading it), and therefore get to see that switchdev port attribute,
can react to it, so we can program swp0 and swp1 into the BLOCKING
state.

But if then we do:

ip link set swp2 master bond0

then as far as the bridge is concerned, nothing has changed: it still
has one bridge port. But the new port, swp2, will not see any STP state
change notification and will remain FORWARDING, which is the state the
standalone code leaves it in.

We need a function in the bridge driver which retrieves the current STP
state, such that drivers can synchronize to it when they may have missed
switchdev events.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Tobias Waldekranz <tobias@waldekranz.com>
Acked-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_bridge.h |  6 ++++++
 net/bridge/br_stp.c       | 14 ++++++++++++++
 2 files changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index b979005ea39c..920d3a02cc68 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -136,6 +136,7 @@ struct net_device *br_fdb_find_port(const struct net_device *br_dev,
 				    __u16 vid);
 void br_fdb_clear_offload(const struct net_device *dev, u16 vid);
 bool br_port_flag_is_set(const struct net_device *dev, unsigned long flag);
+u8 br_port_get_stp_state(const struct net_device *dev);
 #else
 static inline struct net_device *
 br_fdb_find_port(const struct net_device *br_dev,
@@ -154,6 +155,11 @@ br_port_flag_is_set(const struct net_device *dev, unsigned long flag)
 {
 	return false;
 }
+
+static inline u8 br_port_get_stp_state(const struct net_device *dev)
+{
+	return BR_STATE_DISABLED;
+}
 #endif
 
 #endif
diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
index 21c6781906aa..86b5e05d3f21 100644
--- a/net/bridge/br_stp.c
+++ b/net/bridge/br_stp.c
@@ -64,6 +64,20 @@ void br_set_state(struct net_bridge_port *p, unsigned int state)
 	}
 }
 
+u8 br_port_get_stp_state(const struct net_device *dev)
+{
+	struct net_bridge_port *p;
+
+	ASSERT_RTNL();
+
+	p = br_port_get_rtnl(dev);
+	if (!p)
+		return BR_STATE_DISABLED;
+
+	return p->state;
+}
+EXPORT_SYMBOL_GPL(br_port_get_stp_state);
+
 /* called under bridge lock */
 struct net_bridge_port *br_get_port(struct net_bridge *br, u16 port_no)
 {
-- 
cgit v1.2.3


From f1d42ea10056b9050d1c5b8e19995f66c30aeded Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 23 Mar 2021 01:51:43 +0200
Subject: net: bridge: add helper to retrieve the current ageing time

The SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME attribute is only emitted from:

sysfs/ioctl/netlink
-> br_set_ageing_time
   -> __set_ageing_time

therefore not at bridge port creation time, so:
(a) switchdev drivers have to hardcode the initial value for the address
    ageing time, because they didn't get any notification
(b) that hardcoded value can be out of sync, if the user changes the
    ageing time before enslaving the port to the bridge

We need a helper in the bridge, such that switchdev drivers can query
the current value of the bridge ageing time when they start offloading
it.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Tobias Waldekranz <tobias@waldekranz.com>
Acked-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_bridge.h |  6 ++++++
 net/bridge/br_stp.c       | 13 +++++++++++++
 2 files changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index 920d3a02cc68..ebd16495459c 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -137,6 +137,7 @@ struct net_device *br_fdb_find_port(const struct net_device *br_dev,
 void br_fdb_clear_offload(const struct net_device *dev, u16 vid);
 bool br_port_flag_is_set(const struct net_device *dev, unsigned long flag);
 u8 br_port_get_stp_state(const struct net_device *dev);
+clock_t br_get_ageing_time(struct net_device *br_dev);
 #else
 static inline struct net_device *
 br_fdb_find_port(const struct net_device *br_dev,
@@ -160,6 +161,11 @@ static inline u8 br_port_get_stp_state(const struct net_device *dev)
 {
 	return BR_STATE_DISABLED;
 }
+
+static inline clock_t br_get_ageing_time(struct net_device *br_dev)
+{
+	return 0;
+}
 #endif
 
 #endif
diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
index 86b5e05d3f21..3dafb6143cff 100644
--- a/net/bridge/br_stp.c
+++ b/net/bridge/br_stp.c
@@ -639,6 +639,19 @@ int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time)
 	return 0;
 }
 
+clock_t br_get_ageing_time(struct net_device *br_dev)
+{
+	struct net_bridge *br;
+
+	if (!netif_is_bridge_master(br_dev))
+		return 0;
+
+	br = netdev_priv(br_dev);
+
+	return jiffies_to_clock_t(br->ageing_time);
+}
+EXPORT_SYMBOL_GPL(br_get_ageing_time);
+
 /* called under bridge lock */
 void __br_set_topology_change(struct net_bridge *br, unsigned char val)
 {
-- 
cgit v1.2.3


From 4f2673b3a2b6246729a1ff13b8945a040839dbd3 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 23 Mar 2021 01:51:44 +0200
Subject: net: bridge: add helper to replay port and host-joined mdb entries

I have a system with DSA ports, and udhcpcd is configured to bring
interfaces up as soon as they are created.

I create a bridge as follows:

ip link add br0 type bridge

As soon as I create the bridge and udhcpcd brings it up, I also have
avahi which automatically starts sending IPv6 packets to advertise some
local services, and because of that, the br0 bridge joins the following
IPv6 groups due to the code path detailed below:

33:33:ff:6d:c1:9c vid 0
33:33:00:00:00:6a vid 0
33:33:00:00:00:fb vid 0

br_dev_xmit
-> br_multicast_rcv
   -> br_ip6_multicast_add_group
      -> __br_multicast_add_group
         -> br_multicast_host_join
            -> br_mdb_notify

This is all fine, but inside br_mdb_notify we have br_mdb_switchdev_host
hooked up, and switchdev will attempt to offload the host joined groups
to an empty list of ports. Of course nobody offloads them.

Then when we add a port to br0:

ip link set swp0 master br0

the bridge doesn't replay the host-joined MDB entries from br_add_if,
and eventually the host joined addresses expire, and a switchdev
notification for deleting them is emitted, but surprise, the original
addition was already completely missed.

The strategy to address this problem is to replay the MDB entries (both
the port ones and the host joined ones) when the new port joins the
bridge, similar to what vxlan_fdb_replay does (in that case, its FDB can
be populated and only then attached to a bridge that you offload).
However there are 2 possibilities: the addresses can be 'pushed' by the
bridge into the port, or the port can 'pull' them from the bridge.

Considering that in the general case, the new port can be really late to
the party, and there may have been many other switchdev ports that
already received the initial notification, we would like to avoid
delivering duplicate events to them, since they might misbehave. And
currently, the bridge calls the entire switchdev notifier chain, whereas
for replaying it should just call the notifier block of the new guy.
But the bridge doesn't know what the new guy's notifier block is; it
just knows where the switchdev notifier chain is. So for simplification,
we make this a driver-initiated pull for now, and the notifier block is
passed as an argument.

To emulate the calling context for mdb objects (deferred and put on the
blocking notifier chain), we must iterate under RCU protection through
the bridge's mdb entries, queue them, and only call them once we're out
of the RCU read-side critical section.

There was some opportunity for reuse between br_mdb_switchdev_host_port,
br_mdb_notify and the newly added br_mdb_queue_one in how the switchdev
mdb object is created, so a helper was created.

Suggested-by: Ido Schimmel <idosch@idosch.org>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Acked-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_bridge.h |   9 +++
 include/net/switchdev.h   |   1 +
 net/bridge/br_mdb.c       | 148 ++++++++++++++++++++++++++++++++++++++++------
 3 files changed, 141 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index ebd16495459c..f6472969bb44 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -69,6 +69,8 @@ bool br_multicast_has_querier_anywhere(struct net_device *dev, int proto);
 bool br_multicast_has_querier_adjacent(struct net_device *dev, int proto);
 bool br_multicast_enabled(const struct net_device *dev);
 bool br_multicast_router(const struct net_device *dev);
+int br_mdb_replay(struct net_device *br_dev, struct net_device *dev,
+		  struct notifier_block *nb, struct netlink_ext_ack *extack);
 #else
 static inline int br_multicast_list_adjacent(struct net_device *dev,
 					     struct list_head *br_ip_list)
@@ -93,6 +95,13 @@ static inline bool br_multicast_router(const struct net_device *dev)
 {
 	return false;
 }
+static inline int br_mdb_replay(struct net_device *br_dev,
+				struct net_device *dev,
+				struct notifier_block *nb,
+				struct netlink_ext_ack *extack)
+{
+	return -EOPNOTSUPP;
+}
 #endif
 
 #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_BRIDGE_VLAN_FILTERING)
diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index b7fc7d0f54e2..8c3218177136 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -68,6 +68,7 @@ enum switchdev_obj_id {
 };
 
 struct switchdev_obj {
+	struct list_head list;
 	struct net_device *orig_dev;
 	enum switchdev_obj_id id;
 	u32 flags;
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 8846c5bcd075..95fa4af0e8dd 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -506,6 +506,134 @@ err:
 	kfree(priv);
 }
 
+static void br_switchdev_mdb_populate(struct switchdev_obj_port_mdb *mdb,
+				      const struct net_bridge_mdb_entry *mp)
+{
+	if (mp->addr.proto == htons(ETH_P_IP))
+		ip_eth_mc_map(mp->addr.dst.ip4, mdb->addr);
+#if IS_ENABLED(CONFIG_IPV6)
+	else if (mp->addr.proto == htons(ETH_P_IPV6))
+		ipv6_eth_mc_map(&mp->addr.dst.ip6, mdb->addr);
+#endif
+	else
+		ether_addr_copy(mdb->addr, mp->addr.dst.mac_addr);
+
+	mdb->vid = mp->addr.vid;
+}
+
+static int br_mdb_replay_one(struct notifier_block *nb, struct net_device *dev,
+			     struct switchdev_obj_port_mdb *mdb,
+			     struct netlink_ext_ack *extack)
+{
+	struct switchdev_notifier_port_obj_info obj_info = {
+		.info = {
+			.dev = dev,
+			.extack = extack,
+		},
+		.obj = &mdb->obj,
+	};
+	int err;
+
+	err = nb->notifier_call(nb, SWITCHDEV_PORT_OBJ_ADD, &obj_info);
+	return notifier_to_errno(err);
+}
+
+static int br_mdb_queue_one(struct list_head *mdb_list,
+			    enum switchdev_obj_id id,
+			    const struct net_bridge_mdb_entry *mp,
+			    struct net_device *orig_dev)
+{
+	struct switchdev_obj_port_mdb *mdb;
+
+	mdb = kzalloc(sizeof(*mdb), GFP_ATOMIC);
+	if (!mdb)
+		return -ENOMEM;
+
+	mdb->obj.id = id;
+	mdb->obj.orig_dev = orig_dev;
+	br_switchdev_mdb_populate(mdb, mp);
+	list_add_tail(&mdb->obj.list, mdb_list);
+
+	return 0;
+}
+
+int br_mdb_replay(struct net_device *br_dev, struct net_device *dev,
+		  struct notifier_block *nb, struct netlink_ext_ack *extack)
+{
+	struct net_bridge_mdb_entry *mp;
+	struct switchdev_obj *obj, *tmp;
+	struct net_bridge *br;
+	LIST_HEAD(mdb_list);
+	int err = 0;
+
+	ASSERT_RTNL();
+
+	if (!netif_is_bridge_master(br_dev) || !netif_is_bridge_port(dev))
+		return -EINVAL;
+
+	br = netdev_priv(br_dev);
+
+	if (!br_opt_get(br, BROPT_MULTICAST_ENABLED))
+		return 0;
+
+	/* We cannot walk over br->mdb_list protected just by the rtnl_mutex,
+	 * because the write-side protection is br->multicast_lock. But we
+	 * need to emulate the [ blocking ] calling context of a regular
+	 * switchdev event, so since both br->multicast_lock and RCU read side
+	 * critical sections are atomic, we have no choice but to pick the RCU
+	 * read side lock, queue up all our events, leave the critical section
+	 * and notify switchdev from blocking context.
+	 */
+	rcu_read_lock();
+
+	hlist_for_each_entry_rcu(mp, &br->mdb_list, mdb_node) {
+		struct net_bridge_port_group __rcu **pp;
+		struct net_bridge_port_group *p;
+
+		if (mp->host_joined) {
+			err = br_mdb_queue_one(&mdb_list,
+					       SWITCHDEV_OBJ_ID_HOST_MDB,
+					       mp, br_dev);
+			if (err) {
+				rcu_read_unlock();
+				goto out_free_mdb;
+			}
+		}
+
+		for (pp = &mp->ports; (p = rcu_dereference(*pp)) != NULL;
+		     pp = &p->next) {
+			if (p->key.port->dev != dev)
+				continue;
+
+			err = br_mdb_queue_one(&mdb_list,
+					       SWITCHDEV_OBJ_ID_PORT_MDB,
+					       mp, dev);
+			if (err) {
+				rcu_read_unlock();
+				goto out_free_mdb;
+			}
+		}
+	}
+
+	rcu_read_unlock();
+
+	list_for_each_entry(obj, &mdb_list, list) {
+		err = br_mdb_replay_one(nb, dev, SWITCHDEV_OBJ_PORT_MDB(obj),
+					extack);
+		if (err)
+			goto out_free_mdb;
+	}
+
+out_free_mdb:
+	list_for_each_entry_safe(obj, tmp, &mdb_list, list) {
+		list_del(&obj->list);
+		kfree(SWITCHDEV_OBJ_PORT_MDB(obj));
+	}
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(br_mdb_replay);
+
 static void br_mdb_switchdev_host_port(struct net_device *dev,
 				       struct net_device *lower_dev,
 				       struct net_bridge_mdb_entry *mp,
@@ -515,18 +643,12 @@ static void br_mdb_switchdev_host_port(struct net_device *dev,
 		.obj = {
 			.id = SWITCHDEV_OBJ_ID_HOST_MDB,
 			.flags = SWITCHDEV_F_DEFER,
+			.orig_dev = dev,
 		},
-		.vid = mp->addr.vid,
 	};
 
-	if (mp->addr.proto == htons(ETH_P_IP))
-		ip_eth_mc_map(mp->addr.dst.ip4, mdb.addr);
-#if IS_ENABLED(CONFIG_IPV6)
-	else
-		ipv6_eth_mc_map(&mp->addr.dst.ip6, mdb.addr);
-#endif
+	br_switchdev_mdb_populate(&mdb, mp);
 
-	mdb.obj.orig_dev = dev;
 	switch (type) {
 	case RTM_NEWMDB:
 		switchdev_port_obj_add(lower_dev, &mdb.obj, NULL);
@@ -558,21 +680,13 @@ void br_mdb_notify(struct net_device *dev,
 			.id = SWITCHDEV_OBJ_ID_PORT_MDB,
 			.flags = SWITCHDEV_F_DEFER,
 		},
-		.vid = mp->addr.vid,
 	};
 	struct net *net = dev_net(dev);
 	struct sk_buff *skb;
 	int err = -ENOBUFS;
 
 	if (pg) {
-		if (mp->addr.proto == htons(ETH_P_IP))
-			ip_eth_mc_map(mp->addr.dst.ip4, mdb.addr);
-#if IS_ENABLED(CONFIG_IPV6)
-		else if (mp->addr.proto == htons(ETH_P_IPV6))
-			ipv6_eth_mc_map(&mp->addr.dst.ip6, mdb.addr);
-#endif
-		else
-			ether_addr_copy(mdb.addr, mp->addr.dst.mac_addr);
+		br_switchdev_mdb_populate(&mdb, mp);
 
 		mdb.obj.orig_dev = pg->key.port->dev;
 		switch (type) {
-- 
cgit v1.2.3


From 04846f903b53b32d29453e865646309db29f255a Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 23 Mar 2021 01:51:45 +0200
Subject: net: bridge: add helper to replay port and local fdb entries

When a switchdev port starts offloading a LAG that is already in a
bridge and has an FDB entry pointing to it:

ip link set bond0 master br0
bridge fdb add dev bond0 00:01:02:03:04:05 master static
ip link set swp0 master bond0

the switchdev driver will have no idea that this FDB entry is there,
because it missed the switchdev event emitted at its creation.

Ido Schimmel pointed this out during a discussion about challenges with
switchdev offloading of stacked interfaces between the physical port and
the bridge, and recommended to just catch that condition and deny the
CHANGEUPPER event:
https://lore.kernel.org/netdev/20210210105949.GB287766@shredder.lan/

But in fact, we might need to deal with the hard thing anyway, which is
to replay all FDB addresses relevant to this port, because it isn't just
static FDB entries, but also local addresses (ones that are not
forwarded but terminated by the bridge). There, we can't just say 'oh
yeah, there was an upper already so I'm not joining that'.

So, similar to the logic for replaying MDB entries, add a function that
must be called by individual switchdev drivers and replays local FDB
entries as well as ones pointing towards a bridge port. This time, we
use the atomic switchdev notifier block, since that's what FDB entries
expect for some reason.

Reported-by: Ido Schimmel <idosch@idosch.org>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Acked-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_bridge.h |  9 +++++++++
 net/bridge/br_fdb.c       | 50 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index f6472969bb44..b564c4486a45 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -147,6 +147,8 @@ void br_fdb_clear_offload(const struct net_device *dev, u16 vid);
 bool br_port_flag_is_set(const struct net_device *dev, unsigned long flag);
 u8 br_port_get_stp_state(const struct net_device *dev);
 clock_t br_get_ageing_time(struct net_device *br_dev);
+int br_fdb_replay(struct net_device *br_dev, struct net_device *dev,
+		  struct notifier_block *nb);
 #else
 static inline struct net_device *
 br_fdb_find_port(const struct net_device *br_dev,
@@ -175,6 +177,13 @@ static inline clock_t br_get_ageing_time(struct net_device *br_dev)
 {
 	return 0;
 }
+
+static inline int br_fdb_replay(struct net_device *br_dev,
+				struct net_device *dev,
+				struct notifier_block *nb)
+{
+	return -EOPNOTSUPP;
+}
 #endif
 
 #endif
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index b7490237f3fc..698b79747d32 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -726,6 +726,56 @@ static inline size_t fdb_nlmsg_size(void)
 		+ nla_total_size(sizeof(u8)); /* NFEA_ACTIVITY_NOTIFY */
 }
 
+static int br_fdb_replay_one(struct notifier_block *nb,
+			     struct net_bridge_fdb_entry *fdb,
+			     struct net_device *dev)
+{
+	struct switchdev_notifier_fdb_info item;
+	int err;
+
+	item.addr = fdb->key.addr.addr;
+	item.vid = fdb->key.vlan_id;
+	item.added_by_user = test_bit(BR_FDB_ADDED_BY_USER, &fdb->flags);
+	item.offloaded = test_bit(BR_FDB_OFFLOADED, &fdb->flags);
+	item.info.dev = dev;
+
+	err = nb->notifier_call(nb, SWITCHDEV_FDB_ADD_TO_DEVICE, &item);
+	return notifier_to_errno(err);
+}
+
+int br_fdb_replay(struct net_device *br_dev, struct net_device *dev,
+		  struct notifier_block *nb)
+{
+	struct net_bridge_fdb_entry *fdb;
+	struct net_bridge *br;
+	int err = 0;
+
+	if (!netif_is_bridge_master(br_dev) || !netif_is_bridge_port(dev))
+		return -EINVAL;
+
+	br = netdev_priv(br_dev);
+
+	rcu_read_lock();
+
+	hlist_for_each_entry_rcu(fdb, &br->fdb_list, fdb_node) {
+		struct net_bridge_port *dst = READ_ONCE(fdb->dst);
+		struct net_device *dst_dev;
+
+		dst_dev = dst ? dst->dev : br->dev;
+		if (dst_dev != br_dev && dst_dev != dev)
+			continue;
+
+		err = br_fdb_replay_one(nb, fdb, dst_dev);
+		if (err)
+			break;
+	}
+
+	rcu_read_unlock();
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(br_fdb_replay);
+
 static void fdb_notify(struct net_bridge *br,
 		       const struct net_bridge_fdb_entry *fdb, int type,
 		       bool swdev_notify)
-- 
cgit v1.2.3


From 22f67cdfae6aaa7e841ced17207391fb368c8e9e Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 23 Mar 2021 01:51:46 +0200
Subject: net: bridge: add helper to replay VLANs installed on port

Currently this simple setup with DSA:

ip link add br0 type bridge vlan_filtering 1
ip link add bond0 type bond
ip link set bond0 master br0
ip link set swp0 master bond0

will not work because the bridge has created the PVID in br_add_if ->
nbp_vlan_init, and it has notified switchdev of the existence of VLAN 1,
but that was too early, since swp0 was not yet a lower of bond0, so it
had no reason to act upon that notification.

We need a helper in the bridge to replay the switchdev VLAN objects that
were notified since the bridge port creation, because some of them may
have been missed.

As opposed to the br_mdb_replay function, the vg->vlan_list write side
protection is offered by the rtnl_mutex which is sleepable, so we don't
need to queue up the objects in atomic context, we can replay them right
away.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Acked-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_bridge.h | 10 +++++++
 net/bridge/br_vlan.c      | 73 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 83 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index b564c4486a45..2cc35038a8ca 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -111,6 +111,8 @@ int br_vlan_get_pvid_rcu(const struct net_device *dev, u16 *p_pvid);
 int br_vlan_get_proto(const struct net_device *dev, u16 *p_proto);
 int br_vlan_get_info(const struct net_device *dev, u16 vid,
 		     struct bridge_vlan_info *p_vinfo);
+int br_vlan_replay(struct net_device *br_dev, struct net_device *dev,
+		   struct notifier_block *nb, struct netlink_ext_ack *extack);
 #else
 static inline bool br_vlan_enabled(const struct net_device *dev)
 {
@@ -137,6 +139,14 @@ static inline int br_vlan_get_info(const struct net_device *dev, u16 vid,
 {
 	return -EINVAL;
 }
+
+static inline int br_vlan_replay(struct net_device *br_dev,
+				 struct net_device *dev,
+				 struct notifier_block *nb,
+				 struct netlink_ext_ack *extack)
+{
+	return -EOPNOTSUPP;
+}
 #endif
 
 #if IS_ENABLED(CONFIG_BRIDGE)
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 8829f621b8ec..ca8daccff217 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -1751,6 +1751,79 @@ out_kfree:
 	kfree_skb(skb);
 }
 
+static int br_vlan_replay_one(struct notifier_block *nb,
+			      struct net_device *dev,
+			      struct switchdev_obj_port_vlan *vlan,
+			      struct netlink_ext_ack *extack)
+{
+	struct switchdev_notifier_port_obj_info obj_info = {
+		.info = {
+			.dev = dev,
+			.extack = extack,
+		},
+		.obj = &vlan->obj,
+	};
+	int err;
+
+	err = nb->notifier_call(nb, SWITCHDEV_PORT_OBJ_ADD, &obj_info);
+	return notifier_to_errno(err);
+}
+
+int br_vlan_replay(struct net_device *br_dev, struct net_device *dev,
+		   struct notifier_block *nb, struct netlink_ext_ack *extack)
+{
+	struct net_bridge_vlan_group *vg;
+	struct net_bridge_vlan *v;
+	struct net_bridge_port *p;
+	struct net_bridge *br;
+	int err = 0;
+	u16 pvid;
+
+	ASSERT_RTNL();
+
+	if (!netif_is_bridge_master(br_dev))
+		return -EINVAL;
+
+	if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev))
+		return -EINVAL;
+
+	if (netif_is_bridge_master(dev)) {
+		br = netdev_priv(dev);
+		vg = br_vlan_group(br);
+		p = NULL;
+	} else {
+		p = br_port_get_rtnl(dev);
+		if (WARN_ON(!p))
+			return -EINVAL;
+		vg = nbp_vlan_group(p);
+		br = p->br;
+	}
+
+	if (!vg)
+		return 0;
+
+	pvid = br_get_pvid(vg);
+
+	list_for_each_entry(v, &vg->vlan_list, vlist) {
+		struct switchdev_obj_port_vlan vlan = {
+			.obj.orig_dev = dev,
+			.obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
+			.flags = br_vlan_flags(v, pvid),
+			.vid = v->vid,
+		};
+
+		if (!br_vlan_should_use(v))
+			continue;
+
+		err = br_vlan_replay_one(nb, dev, &vlan, extack);
+		if (err)
+			return err;
+	}
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(br_vlan_replay);
+
 /* check if v_curr can enter a range ending in range_end */
 bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr,
 			     const struct net_bridge_vlan *range_end)
-- 
cgit v1.2.3


From 5aa3afe107d9099fc0dea2acf82c3e3c8f0f20e2 Mon Sep 17 00:00:00 2001
From: Dmitry Vyukov <dvyukov@google.com>
Date: Tue, 23 Mar 2021 07:49:23 +0100
Subject: net: make unregister netdev warning timeout configurable

netdev_wait_allrefs() issues a warning if refcount does not drop to 0
after 10 seconds. While 10 second wait generally should not happen
under normal workload in normal environment, it seems to fire falsely
very often during fuzzing and/or in qemu emulation (~10x slower).
At least it's not possible to understand if it's really a false
positive or not. Automated testing generally bumps all timeouts
to very high values to avoid flake failures.
Add net.core.netdev_unregister_timeout_secs sysctl to make
the timeout configurable for automated testing systems.
Lowering the timeout may also be useful for e.g. manual bisection.
The default value matches the current behavior.

Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=211877
Cc: netdev@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/admin-guide/sysctl/net.rst | 11 +++++++++++
 include/linux/netdevice.h                |  1 +
 net/core/dev.c                           |  6 +++++-
 net/core/sysctl_net_core.c               | 10 ++++++++++
 4 files changed, 27 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst
index f2ab8a5b6a4b..2090bfc69aa5 100644
--- a/Documentation/admin-guide/sysctl/net.rst
+++ b/Documentation/admin-guide/sysctl/net.rst
@@ -311,6 +311,17 @@ permit to distribute the load on several cpus.
 If set to 1 (default), timestamps are sampled as soon as possible, before
 queueing.
 
+netdev_unregister_timeout_secs
+------------------------------
+
+Unregister network device timeout in seconds.
+This option controls the timeout (in seconds) used to issue a warning while
+waiting for a network device refcount to drop to 0 during device
+unregistration. A lower value may be useful during bisection to detect
+a leaked reference faster. A larger value may be useful to prevent false
+warnings on slow/loaded systems.
+Default value is 10, minimum 0, maximum 3600.
+
 optmem_max
 ----------
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 7005ad80e8d1..5fa66db0cb5d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4661,6 +4661,7 @@ void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s);
 
 extern int		netdev_max_backlog;
 extern int		netdev_tstamp_prequeue;
+extern int		netdev_unregister_timeout_secs;
 extern int		weight_p;
 extern int		dev_weight_rx_bias;
 extern int		dev_weight_tx_bias;
diff --git a/net/core/dev.c b/net/core/dev.c
index c9a496f5e687..515309573cb8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -10322,6 +10322,8 @@ int netdev_refcnt_read(const struct net_device *dev)
 }
 EXPORT_SYMBOL(netdev_refcnt_read);
 
+int netdev_unregister_timeout_secs __read_mostly = 10;
+
 #define WAIT_REFS_MIN_MSECS 1
 #define WAIT_REFS_MAX_MSECS 250
 /**
@@ -10383,7 +10385,9 @@ static void netdev_wait_allrefs(struct net_device *dev)
 
 		refcnt = netdev_refcnt_read(dev);
 
-		if (refcnt != 1 && time_after(jiffies, warning_time + 10 * HZ)) {
+		if (refcnt != 1 &&
+		    time_after(jiffies, warning_time +
+			       netdev_unregister_timeout_secs * HZ)) {
 			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
 				 dev->name, refcnt);
 			warning_time = jiffies;
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 4567de519603..d84c8a1b280e 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -24,6 +24,7 @@
 
 static int two = 2;
 static int three = 3;
+static int int_3600 = 3600;
 static int min_sndbuf = SOCK_MIN_SNDBUF;
 static int min_rcvbuf = SOCK_MIN_RCVBUF;
 static int max_skb_frags = MAX_SKB_FRAGS;
@@ -570,6 +571,15 @@ static struct ctl_table net_core_table[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= SYSCTL_ONE,
 	},
+	{
+		.procname	= "netdev_unregister_timeout_secs",
+		.data		= &netdev_unregister_timeout_secs,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= &int_3600,
+	},
 	{ }
 };
 
-- 
cgit v1.2.3


From ee89646619ba07d054348c0240da4b953cd1e72f Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 18 Mar 2021 15:03:18 +0200
Subject: pps: clients: gpio: Get rid of legacy platform data

Platform data is a legacy interface to supply device properties
to the driver. In this case we even don't have in-kernel users
for it. Just remove it for good.

Acked-by: Rodolfo Giometti <giometti@enneenne.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20210318130321.24227-4-andriy.shevchenko@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/pps/clients/pps-gpio.c | 17 +++--------------
 include/linux/pps-gpio.h       | 19 -------------------
 2 files changed, 3 insertions(+), 33 deletions(-)
 delete mode 100644 include/linux/pps-gpio.h

(limited to 'include/linux')

diff --git a/drivers/pps/clients/pps-gpio.c b/drivers/pps/clients/pps-gpio.c
index dc9ed6fc3dae..291240dce79e 100644
--- a/drivers/pps/clients/pps-gpio.c
+++ b/drivers/pps/clients/pps-gpio.c
@@ -16,7 +16,6 @@
 #include <linux/platform_device.h>
 #include <linux/slab.h>
 #include <linux/pps_kernel.h>
-#include <linux/pps-gpio.h>
 #include <linux/gpio/consumer.h>
 #include <linux/list.h>
 #include <linux/of_device.h>
@@ -164,7 +163,6 @@ static int pps_gpio_probe(struct platform_device *pdev)
 	struct pps_gpio_device_data *data;
 	int ret;
 	int pps_default_params;
-	const struct pps_gpio_platform_data *pdata = pdev->dev.platform_data;
 
 	/* allocate space for device info */
 	data = devm_kzalloc(&pdev->dev, sizeof(*data), GFP_KERNEL);
@@ -173,18 +171,9 @@ static int pps_gpio_probe(struct platform_device *pdev)
 	platform_set_drvdata(pdev, data);
 
 	/* GPIO setup */
-	if (pdata) {
-		data->gpio_pin = pdata->gpio_pin;
-		data->echo_pin = pdata->echo_pin;
-
-		data->assert_falling_edge = pdata->assert_falling_edge;
-		data->capture_clear = pdata->capture_clear;
-		data->echo_active_ms = pdata->echo_active_ms;
-	} else {
-		ret = pps_gpio_setup(pdev);
-		if (ret)
-			return -EINVAL;
-	}
+	ret = pps_gpio_setup(pdev);
+	if (ret)
+		return -EINVAL;
 
 	/* IRQ setup */
 	ret = gpiod_to_irq(data->gpio_pin);
diff --git a/include/linux/pps-gpio.h b/include/linux/pps-gpio.h
deleted file mode 100644
index 7bf49908be06..000000000000
--- a/include/linux/pps-gpio.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * pps-gpio.h -- PPS client for GPIOs
- *
- * Copyright (C) 2011 James Nuss <jamesnuss@nanometrics.ca>
- */
-
-#ifndef _PPS_GPIO_H
-#define _PPS_GPIO_H
-
-struct pps_gpio_platform_data {
-	struct gpio_desc *gpio_pin;
-	struct gpio_desc *echo_pin;
-	bool assert_falling_edge;
-	bool capture_clear;
-	unsigned int echo_active_ms;
-};
-
-#endif /* _PPS_GPIO_H */
-- 
cgit v1.2.3


From 6bbdc3db76ccc6d9ff1c3d6ad36c8ae5bd67ee1f Mon Sep 17 00:00:00 2001
From: Bhaskar Chowdhury <unixbhaskar@gmail.com>
Date: Mon, 22 Mar 2021 05:01:08 +0530
Subject: hv: hyperv.h: a few mundane typo fixes

s/sructure/structure/
s/extention/extension/
s/offerred/offered/
s/adversley/adversely/

Signed-off-by: Bhaskar Chowdhury <unixbhaskar@gmail.com>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://lore.kernel.org/r/20210321233108.3885240-1-unixbhaskar@gmail.com
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 include/linux/hyperv.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index f1d74dcf0353..2c18c8e768ef 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -284,7 +284,7 @@ struct vmbus_channel_offer {
 
 		/*
 		 * Pipes:
-		 * The following sructure is an integrated pipe protocol, which
+		 * The following structure is an integrated pipe protocol, which
 		 * is implemented on top of standard user-defined data. Pipe
 		 * clients have MAX_PIPE_USER_DEFINED_BYTES left for their own
 		 * use.
@@ -883,11 +883,11 @@ struct vmbus_channel {
 	 * Support for sub-channels. For high performance devices,
 	 * it will be useful to have multiple sub-channels to support
 	 * a scalable communication infrastructure with the host.
-	 * The support for sub-channels is implemented as an extention
+	 * The support for sub-channels is implemented as an extension
 	 * to the current infrastructure.
 	 * The initial offer is considered the primary channel and this
 	 * offer message will indicate if the host supports sub-channels.
-	 * The guest is free to ask for sub-channels to be offerred and can
+	 * The guest is free to ask for sub-channels to be offered and can
 	 * open these sub-channels as a normal "primary" channel. However,
 	 * all sub-channels will have the same type and instance guids as the
 	 * primary channel. Requests sent on a given channel will result in a
@@ -951,7 +951,7 @@ struct vmbus_channel {
 	 * Clearly, these optimizations improve throughput at the expense of
 	 * latency. Furthermore, since the channel is shared for both
 	 * control and data messages, control messages currently suffer
-	 * unnecessary latency adversley impacting performance and boot
+	 * unnecessary latency adversely impacting performance and boot
 	 * time. To fix this issue, permit tagging the channel as being
 	 * in "low latency" mode. In this mode, we will bypass the monitor
 	 * mechanism.
-- 
cgit v1.2.3


From 54443ef6f5d10d9c6bb17f1dbeea7eb8d5c9a839 Mon Sep 17 00:00:00 2001
From: JC Kuo <jckuo@nvidia.com>
Date: Wed, 20 Jan 2021 15:34:01 +0800
Subject: clk: tegra: Add PLLE HW power sequencer control

PLLE has a hardware power sequencer logic which is a state machine
that can power on/off PLLE without any software intervention. The
sequencer has two inputs, one from XUSB UPHY PLL and the other from
SATA UPHY PLL. PLLE provides reference clock to XUSB and SATA UPHY
PLLs. When both of the downstream PLLs are powered-off, PLLE hardware
power sequencer will automatically power off PLLE for power saving.

XUSB and SATA UPHY PLLs also have their own hardware power sequencer
logic. XUSB UPHY PLL is shared between XUSB SuperSpeed ports and PCIE
controllers. The XUSB UPHY PLL hardware power sequencer has inputs
from XUSB and PCIE. When all of the XUSB SuperSpeed ports and PCIE
controllers are in low power state, XUSB UPHY PLL hardware power
sequencer automatically power off PLL and flags idle to PLLE hardware
power sequencer. Similar applies to SATA UPHY PLL.

PLLE hardware power sequencer has to be enabled after both downstream
sequencers are enabled.

This commit adds two helper functions:
1. tegra210_plle_hw_sequence_start() for XUSB PADCTL driver to enable
   PLLE hardware sequencer at proper time.

2. tegra210_plle_hw_sequence_is_enabled() for XUSB PADCTL driver to
   check whether PLLE hardware sequencer has been enabled or not.

Signed-off-by: JC Kuo <jckuo@nvidia.com>
Acked-by: Thierry Reding <treding@nvidia.com>
Acked-by: Stephen Boyd <sboyd@kernel.org>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/clk/tegra/clk-tegra210.c | 53 +++++++++++++++++++++++++++++++++++++++-
 include/linux/clk/tegra.h        |  4 ++-
 2 files changed, 55 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/tegra/clk-tegra210.c b/drivers/clk/tegra/clk-tegra210.c
index 68cbb98af567..b9099012dc7b 100644
--- a/drivers/clk/tegra/clk-tegra210.c
+++ b/drivers/clk/tegra/clk-tegra210.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
- * Copyright (c) 2012-2014 NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2012-2020 NVIDIA CORPORATION.  All rights reserved.
  */
 
 #include <linux/io.h>
@@ -403,6 +403,14 @@ static unsigned long tegra210_input_freq[] = {
 #define PLLRE_BASE_DEFAULT_MASK		0x1c000000
 #define PLLRE_MISC0_WRITE_MASK		0x67ffffff
 
+/* PLLE */
+#define PLLE_MISC_IDDQ_SW_CTRL		(1 << 14)
+#define PLLE_AUX_USE_LOCKDET		(1 << 3)
+#define PLLE_AUX_SS_SEQ_INCLUDE		(1 << 31)
+#define PLLE_AUX_ENABLE_SWCTL		(1 << 4)
+#define PLLE_AUX_SS_SWCTL		(1 << 6)
+#define PLLE_AUX_SEQ_ENABLE		(1 << 24)
+
 /* PLLX */
 #define PLLX_USE_DYN_RAMP		1
 #define PLLX_BASE_LOCK			(1 << 27)
@@ -489,6 +497,49 @@ static unsigned long tegra210_input_freq[] = {
 #define PLLU_MISC0_WRITE_MASK		0xbfffffff
 #define PLLU_MISC1_WRITE_MASK		0x00000007
 
+bool tegra210_plle_hw_sequence_is_enabled(void)
+{
+	u32 value;
+
+	value = readl_relaxed(clk_base + PLLE_AUX);
+	if (value & PLLE_AUX_SEQ_ENABLE)
+		return true;
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(tegra210_plle_hw_sequence_is_enabled);
+
+int tegra210_plle_hw_sequence_start(void)
+{
+	u32 value;
+
+	if (tegra210_plle_hw_sequence_is_enabled())
+		return 0;
+
+	/* skip if PLLE is not enabled yet */
+	value = readl_relaxed(clk_base + PLLE_MISC0);
+	if (!(value & PLLE_MISC_LOCK))
+		return -EIO;
+
+	value &= ~PLLE_MISC_IDDQ_SW_CTRL;
+	writel_relaxed(value, clk_base + PLLE_MISC0);
+
+	value = readl_relaxed(clk_base + PLLE_AUX);
+	value |= (PLLE_AUX_USE_LOCKDET | PLLE_AUX_SS_SEQ_INCLUDE);
+	value &= ~(PLLE_AUX_ENABLE_SWCTL | PLLE_AUX_SS_SWCTL);
+	writel_relaxed(value, clk_base + PLLE_AUX);
+
+	fence_udelay(1, clk_base);
+
+	value |= PLLE_AUX_SEQ_ENABLE;
+	writel_relaxed(value, clk_base + PLLE_AUX);
+
+	fence_udelay(1, clk_base);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tegra210_plle_hw_sequence_start);
+
 void tegra210_xusb_pll_hw_control_enable(void)
 {
 	u32 val;
diff --git a/include/linux/clk/tegra.h b/include/linux/clk/tegra.h
index eb016fc9cc0b..f7ff722a03dd 100644
--- a/include/linux/clk/tegra.h
+++ b/include/linux/clk/tegra.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
- * Copyright (c) 2012, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2012-2020, NVIDIA CORPORATION.  All rights reserved.
  */
 
 #ifndef __LINUX_CLK_TEGRA_H_
@@ -123,6 +123,8 @@ static inline void tegra_cpu_clock_resume(void)
 }
 #endif
 
+extern int tegra210_plle_hw_sequence_start(void);
+extern bool tegra210_plle_hw_sequence_is_enabled(void);
 extern void tegra210_xusb_pll_hw_control_enable(void);
 extern void tegra210_xusb_pll_hw_sequence_start(void);
 extern void tegra210_sata_pll_hw_control_enable(void);
-- 
cgit v1.2.3


From ddb94eafab8b597b05904c8277194ea2d6357fa9 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 24 Mar 2021 02:30:32 +0100
Subject: net: resolve forwarding path from virtual netdevice and HW
 destination address

This patch adds dev_fill_forward_path() which resolves the path to reach
the real netdevice from the IP forwarding side. This function takes as
input the netdevice and the destination hardware address and it walks
down the devices calling .ndo_fill_forward_path() for each device until
the real device is found.

For instance, assuming the following topology:

               IP forwarding
              /             \
           br0              eth0
           / \
       eth1  eth2
        .
        .
        .
       ethX
 ab:cd:ef:ab:cd:ef

where eth1 and eth2 are bridge ports and eth0 provides WAN connectivity.
ethX is the interface in another box which is connected to the eth1
bridge port.

For packets going through IP forwarding to br0 whose destination MAC
address is ab:cd:ef:ab:cd:ef, dev_fill_forward_path() provides the
following path:

	br0 -> eth1

.ndo_fill_forward_path for br0 looks up at the FDB for the bridge port
from the destination MAC address to get the bridge port eth1.

This information allows to create a fast path that bypasses the classic
bridge and IP forwarding paths, so packets go directly from the bridge
port eth1 to eth0 (wan interface) and vice versa.

             fast path
      .------------------------.
     /                          \
    |           IP forwarding   |
    |          /             \  \/
    |       br0               eth0
    .       / \
     -> eth1  eth2
        .
        .
        .
       ethX
 ab:cd:ef:ab:cd:ef

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 27 +++++++++++++++++++++++++++
 net/core/dev.c            | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5fa66db0cb5d..03cff88c7292 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -848,6 +848,27 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
 				       struct sk_buff *skb,
 				       struct net_device *sb_dev);
 
+enum net_device_path_type {
+	DEV_PATH_ETHERNET = 0,
+};
+
+struct net_device_path {
+	enum net_device_path_type	type;
+	const struct net_device		*dev;
+};
+
+#define NET_DEVICE_PATH_STACK_MAX	5
+
+struct net_device_path_stack {
+	int			num_paths;
+	struct net_device_path	path[NET_DEVICE_PATH_STACK_MAX];
+};
+
+struct net_device_path_ctx {
+	const struct net_device *dev;
+	const u8		*daddr;
+};
+
 enum tc_setup_type {
 	TC_SETUP_QDISC_MQPRIO,
 	TC_SETUP_CLSU32,
@@ -1282,6 +1303,8 @@ struct netdev_net_notifier {
  * struct net_device *(*ndo_get_peer_dev)(struct net_device *dev);
  *	If a device is paired with a peer device, return the peer instance.
  *	The caller must be under RCU read context.
+ * int (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx, struct net_device_path *path);
+ *     Get the forwarding path to reach the real device from the HW destination address
  */
 struct net_device_ops {
 	int			(*ndo_init)(struct net_device *dev);
@@ -1488,6 +1511,8 @@ struct net_device_ops {
 	int			(*ndo_tunnel_ctl)(struct net_device *dev,
 						  struct ip_tunnel_parm *p, int cmd);
 	struct net_device *	(*ndo_get_peer_dev)(struct net_device *dev);
+	int                     (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx,
+                                                         struct net_device_path *path);
 };
 
 /**
@@ -2870,6 +2895,8 @@ void dev_remove_offload(struct packet_offload *po);
 
 int dev_get_iflink(const struct net_device *dev);
 int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb);
+int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
+			  struct net_device_path_stack *stack);
 struct net_device *__dev_get_by_flags(struct net *net, unsigned short flags,
 				      unsigned short mask);
 struct net_device *dev_get_by_name(struct net *net, const char *name);
diff --git a/net/core/dev.c b/net/core/dev.c
index 515309573cb8..4bb6dcdbed8b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -848,6 +848,52 @@ int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 }
 EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
 
+static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack)
+{
+	int k = stack->num_paths++;
+
+	if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX))
+		return NULL;
+
+	return &stack->path[k];
+}
+
+int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
+			  struct net_device_path_stack *stack)
+{
+	const struct net_device *last_dev;
+	struct net_device_path_ctx ctx = {
+		.dev	= dev,
+		.daddr	= daddr,
+	};
+	struct net_device_path *path;
+	int ret = 0;
+
+	stack->num_paths = 0;
+	while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) {
+		last_dev = ctx.dev;
+		path = dev_fwd_path(stack);
+		if (!path)
+			return -1;
+
+		memset(path, 0, sizeof(struct net_device_path));
+		ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path);
+		if (ret < 0)
+			return -1;
+
+		if (WARN_ON_ONCE(last_dev == ctx.dev))
+			return -1;
+	}
+	path = dev_fwd_path(stack);
+	if (!path)
+		return -1;
+	path->type = DEV_PATH_ETHERNET;
+	path->dev = ctx.dev;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dev_fill_forward_path);
+
 /**
  *	__dev_get_by_name	- find a device by its name
  *	@net: the applicable net namespace
-- 
cgit v1.2.3


From e4417d6950b06fe6c520e937b337daff093220ff Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 24 Mar 2021 02:30:33 +0100
Subject: net: 8021q: resolve forwarding path for vlan devices

Add .ndo_fill_forward_path for vlan devices.

For instance, assuming the following topology:

                   IP forwarding
                  /             \
            eth0.100             eth0
            |
            eth0
            .
            .
            .
           ethX
     ab:cd:ef:ab:cd:ef

For packets going through IP forwarding to eth0.100 whose destination
MAC address is ab:cd:ef:ab:cd:ef, dev_fill_forward_path() provides the
following path:

        eth0.100 -> eth0

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  7 +++++++
 net/8021q/vlan_dev.c      | 15 +++++++++++++++
 2 files changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 03cff88c7292..8823a56744f1 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -850,11 +850,18 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
 
 enum net_device_path_type {
 	DEV_PATH_ETHERNET = 0,
+	DEV_PATH_VLAN,
 };
 
 struct net_device_path {
 	enum net_device_path_type	type;
 	const struct net_device		*dev;
+	union {
+		struct {
+			u16		id;
+			__be16		proto;
+		} encap;
+	};
 };
 
 #define NET_DEVICE_PATH_STACK_MAX	5
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index dc1a197792e6..1b1955a63f7f 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -776,6 +776,20 @@ static int vlan_dev_get_iflink(const struct net_device *dev)
 	return real_dev->ifindex;
 }
 
+static int vlan_dev_fill_forward_path(struct net_device_path_ctx *ctx,
+				      struct net_device_path *path)
+{
+	struct vlan_dev_priv *vlan = vlan_dev_priv(ctx->dev);
+
+	path->type = DEV_PATH_VLAN;
+	path->encap.id = vlan->vlan_id;
+	path->encap.proto = vlan->vlan_proto;
+	path->dev = ctx->dev;
+	ctx->dev = vlan->real_dev;
+
+	return 0;
+}
+
 static const struct ethtool_ops vlan_ethtool_ops = {
 	.get_link_ksettings	= vlan_ethtool_get_link_ksettings,
 	.get_drvinfo	        = vlan_ethtool_get_drvinfo,
@@ -814,6 +828,7 @@ static const struct net_device_ops vlan_netdev_ops = {
 #endif
 	.ndo_fix_features	= vlan_dev_fix_features,
 	.ndo_get_iflink		= vlan_dev_get_iflink,
+	.ndo_fill_forward_path	= vlan_dev_fill_forward_path,
 };
 
 static void vlan_dev_free(struct net_device *dev)
-- 
cgit v1.2.3


From ec9d16bab615ceda8ac22a7b4d2c7601bbe172cb Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 24 Mar 2021 02:30:34 +0100
Subject: net: bridge: resolve forwarding path for bridge devices

Add .ndo_fill_forward_path for bridge devices.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  1 +
 net/bridge/br_device.c    | 27 +++++++++++++++++++++++++++
 2 files changed, 28 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 8823a56744f1..a24270b0d200 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -851,6 +851,7 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
 enum net_device_path_type {
 	DEV_PATH_ETHERNET = 0,
 	DEV_PATH_VLAN,
+	DEV_PATH_BRIDGE,
 };
 
 struct net_device_path {
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 3f2f06b4dd27..c241719013f4 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -385,6 +385,32 @@ static int br_del_slave(struct net_device *dev, struct net_device *slave_dev)
 	return br_del_if(br, slave_dev);
 }
 
+static int br_fill_forward_path(struct net_device_path_ctx *ctx,
+				struct net_device_path *path)
+{
+	struct net_bridge_fdb_entry *f;
+	struct net_bridge_port *dst;
+	struct net_bridge *br;
+
+	if (netif_is_bridge_port(ctx->dev))
+		return -1;
+
+	br = netdev_priv(ctx->dev);
+	f = br_fdb_find_rcu(br, ctx->daddr, 0);
+	if (!f || !f->dst)
+		return -1;
+
+	dst = READ_ONCE(f->dst);
+	if (!dst)
+		return -1;
+
+	path->type = DEV_PATH_BRIDGE;
+	path->dev = dst->br->dev;
+	ctx->dev = dst->dev;
+
+	return 0;
+}
+
 static const struct ethtool_ops br_ethtool_ops = {
 	.get_drvinfo		 = br_getinfo,
 	.get_link		 = ethtool_op_get_link,
@@ -419,6 +445,7 @@ static const struct net_device_ops br_netdev_ops = {
 	.ndo_bridge_setlink	 = br_setlink,
 	.ndo_bridge_dellink	 = br_dellink,
 	.ndo_features_check	 = passthru_features_check,
+	.ndo_fill_forward_path	 = br_fill_forward_path,
 };
 
 static struct device_type br_type = {
-- 
cgit v1.2.3


From bcf2766b1377421b7c9259865b25c1b62a7fa686 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Wed, 24 Mar 2021 02:30:35 +0100
Subject: net: bridge: resolve forwarding path for VLAN tag actions in bridge
 devices

Depending on the VLAN settings of the bridge and the port, the bridge can
either add or remove a tag. When VLAN filtering is enabled, the fdb lookup
also needs to know the VLAN tag/proto for the destination address.
To provide this, keep track of the stack of VLAN tags for the path in the
lookup context.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 16 ++++++++++++++
 net/8021q/vlan_dev.c      |  6 ++++++
 net/bridge/br_device.c    | 23 +++++++++++++++++++-
 net/bridge/br_private.h   | 20 ++++++++++++++++++
 net/bridge/br_vlan.c      | 53 +++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 117 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a24270b0d200..344d9c0c9b22 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -862,10 +862,20 @@ struct net_device_path {
 			u16		id;
 			__be16		proto;
 		} encap;
+		struct {
+			enum {
+				DEV_PATH_BR_VLAN_KEEP,
+				DEV_PATH_BR_VLAN_TAG,
+				DEV_PATH_BR_VLAN_UNTAG,
+			}		vlan_mode;
+			u16		vlan_id;
+			__be16		vlan_proto;
+		} bridge;
 	};
 };
 
 #define NET_DEVICE_PATH_STACK_MAX	5
+#define NET_DEVICE_PATH_VLAN_MAX	2
 
 struct net_device_path_stack {
 	int			num_paths;
@@ -875,6 +885,12 @@ struct net_device_path_stack {
 struct net_device_path_ctx {
 	const struct net_device *dev;
 	const u8		*daddr;
+
+	int			num_vlans;
+	struct {
+		u16		id;
+		__be16		proto;
+	} vlan[NET_DEVICE_PATH_VLAN_MAX];
 };
 
 enum tc_setup_type {
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 1b1955a63f7f..4db3f0621959 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -786,6 +786,12 @@ static int vlan_dev_fill_forward_path(struct net_device_path_ctx *ctx,
 	path->encap.proto = vlan->vlan_proto;
 	path->dev = ctx->dev;
 	ctx->dev = vlan->real_dev;
+	if (ctx->num_vlans >= ARRAY_SIZE(ctx->vlan))
+		return -ENOSPC;
+
+	ctx->vlan[ctx->num_vlans].id = vlan->vlan_id;
+	ctx->vlan[ctx->num_vlans].proto = vlan->vlan_proto;
+	ctx->num_vlans++;
 
 	return 0;
 }
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index c241719013f4..0c72503e0d39 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -396,7 +396,10 @@ static int br_fill_forward_path(struct net_device_path_ctx *ctx,
 		return -1;
 
 	br = netdev_priv(ctx->dev);
-	f = br_fdb_find_rcu(br, ctx->daddr, 0);
+
+	br_vlan_fill_forward_path_pvid(br, ctx, path);
+
+	f = br_fdb_find_rcu(br, ctx->daddr, path->bridge.vlan_id);
 	if (!f || !f->dst)
 		return -1;
 
@@ -404,10 +407,28 @@ static int br_fill_forward_path(struct net_device_path_ctx *ctx,
 	if (!dst)
 		return -1;
 
+	if (br_vlan_fill_forward_path_mode(br, dst, path))
+		return -1;
+
 	path->type = DEV_PATH_BRIDGE;
 	path->dev = dst->br->dev;
 	ctx->dev = dst->dev;
 
+	switch (path->bridge.vlan_mode) {
+	case DEV_PATH_BR_VLAN_TAG:
+		if (ctx->num_vlans >= ARRAY_SIZE(ctx->vlan))
+			return -ENOSPC;
+		ctx->vlan[ctx->num_vlans].id = path->bridge.vlan_id;
+		ctx->vlan[ctx->num_vlans].proto = path->bridge.vlan_proto;
+		ctx->num_vlans++;
+		break;
+	case DEV_PATH_BR_VLAN_UNTAG:
+		ctx->num_vlans--;
+		break;
+	case DEV_PATH_BR_VLAN_KEEP:
+		break;
+	}
+
 	return 0;
 }
 
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index d7d167e10b70..50747990188e 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -1118,6 +1118,13 @@ void br_vlan_notify(const struct net_bridge *br,
 bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr,
 			     const struct net_bridge_vlan *range_end);
 
+void br_vlan_fill_forward_path_pvid(struct net_bridge *br,
+				    struct net_device_path_ctx *ctx,
+				    struct net_device_path *path);
+int br_vlan_fill_forward_path_mode(struct net_bridge *br,
+				   struct net_bridge_port *dst,
+				   struct net_device_path *path);
+
 static inline struct net_bridge_vlan_group *br_vlan_group(
 					const struct net_bridge *br)
 {
@@ -1277,6 +1284,19 @@ static inline int nbp_get_num_vlan_infos(struct net_bridge_port *p,
 	return 0;
 }
 
+static inline void br_vlan_fill_forward_path_pvid(struct net_bridge *br,
+						  struct net_device_path_ctx *ctx,
+						  struct net_device_path *path)
+{
+}
+
+static inline int br_vlan_fill_forward_path_mode(struct net_bridge *br,
+						 struct net_bridge_port *dst,
+						 struct net_device_path *path)
+{
+	return 0;
+}
+
 static inline struct net_bridge_vlan_group *br_vlan_group(
 					const struct net_bridge *br)
 {
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 6f961cb5f771..c92240b21c4a 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -1339,6 +1339,59 @@ int br_vlan_get_pvid_rcu(const struct net_device *dev, u16 *p_pvid)
 }
 EXPORT_SYMBOL_GPL(br_vlan_get_pvid_rcu);
 
+void br_vlan_fill_forward_path_pvid(struct net_bridge *br,
+				    struct net_device_path_ctx *ctx,
+				    struct net_device_path *path)
+{
+	struct net_bridge_vlan_group *vg;
+	int idx = ctx->num_vlans - 1;
+	u16 vid;
+
+	path->bridge.vlan_mode = DEV_PATH_BR_VLAN_KEEP;
+
+	if (!br_opt_get(br, BROPT_VLAN_ENABLED))
+		return;
+
+	vg = br_vlan_group(br);
+
+	if (idx >= 0 &&
+	    ctx->vlan[idx].proto == br->vlan_proto) {
+		vid = ctx->vlan[idx].id;
+	} else {
+		path->bridge.vlan_mode = DEV_PATH_BR_VLAN_TAG;
+		vid = br_get_pvid(vg);
+	}
+
+	path->bridge.vlan_id = vid;
+	path->bridge.vlan_proto = br->vlan_proto;
+}
+
+int br_vlan_fill_forward_path_mode(struct net_bridge *br,
+				   struct net_bridge_port *dst,
+				   struct net_device_path *path)
+{
+	struct net_bridge_vlan_group *vg;
+	struct net_bridge_vlan *v;
+
+	if (!br_opt_get(br, BROPT_VLAN_ENABLED))
+		return 0;
+
+	vg = nbp_vlan_group_rcu(dst);
+	v = br_vlan_find(vg, path->bridge.vlan_id);
+	if (!v || !br_vlan_should_use(v))
+		return -EINVAL;
+
+	if (!(v->flags & BRIDGE_VLAN_INFO_UNTAGGED))
+		return 0;
+
+	if (path->bridge.vlan_mode == DEV_PATH_BR_VLAN_TAG)
+		path->bridge.vlan_mode = DEV_PATH_BR_VLAN_KEEP;
+	else
+		path->bridge.vlan_mode = DEV_PATH_BR_VLAN_UNTAG;
+
+	return 0;
+}
+
 int br_vlan_get_info(const struct net_device *dev, u16 vid,
 		     struct bridge_vlan_info *p_vinfo)
 {
-- 
cgit v1.2.3


From f6efc675c9dd8d93f826b79ae7e33e03301db609 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Wed, 24 Mar 2021 02:30:36 +0100
Subject: net: ppp: resolve forwarding path for bridge pppoe devices

Pass on the PPPoE session ID, destination hardware address and the real
device.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ppp/ppp_generic.c | 22 ++++++++++++++++++++++
 drivers/net/ppp/pppoe.c       | 23 +++++++++++++++++++++++
 include/linux/netdevice.h     |  2 ++
 include/linux/ppp_channel.h   |  3 +++
 4 files changed, 50 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c
index d445ecb1d0c7..930e49ef15f6 100644
--- a/drivers/net/ppp/ppp_generic.c
+++ b/drivers/net/ppp/ppp_generic.c
@@ -1560,12 +1560,34 @@ static void ppp_dev_priv_destructor(struct net_device *dev)
 		ppp_destroy_interface(ppp);
 }
 
+static int ppp_fill_forward_path(struct net_device_path_ctx *ctx,
+				 struct net_device_path *path)
+{
+	struct ppp *ppp = netdev_priv(ctx->dev);
+	struct ppp_channel *chan;
+	struct channel *pch;
+
+	if (ppp->flags & SC_MULTILINK)
+		return -EOPNOTSUPP;
+
+	if (list_empty(&ppp->channels))
+		return -ENODEV;
+
+	pch = list_first_entry(&ppp->channels, struct channel, clist);
+	chan = pch->chan;
+	if (!chan->ops->fill_forward_path)
+		return -EOPNOTSUPP;
+
+	return chan->ops->fill_forward_path(ctx, path, chan);
+}
+
 static const struct net_device_ops ppp_netdev_ops = {
 	.ndo_init	 = ppp_dev_init,
 	.ndo_uninit      = ppp_dev_uninit,
 	.ndo_start_xmit  = ppp_start_xmit,
 	.ndo_do_ioctl    = ppp_net_ioctl,
 	.ndo_get_stats64 = ppp_get_stats64,
+	.ndo_fill_forward_path = ppp_fill_forward_path,
 };
 
 static struct device_type ppp_type = {
diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c
index 9dc7f4b93d51..3619520340b7 100644
--- a/drivers/net/ppp/pppoe.c
+++ b/drivers/net/ppp/pppoe.c
@@ -972,8 +972,31 @@ static int pppoe_xmit(struct ppp_channel *chan, struct sk_buff *skb)
 	return __pppoe_xmit(sk, skb);
 }
 
+static int pppoe_fill_forward_path(struct net_device_path_ctx *ctx,
+				   struct net_device_path *path,
+				   const struct ppp_channel *chan)
+{
+	struct sock *sk = (struct sock *)chan->private;
+	struct pppox_sock *po = pppox_sk(sk);
+	struct net_device *dev = po->pppoe_dev;
+
+	if (sock_flag(sk, SOCK_DEAD) ||
+	    !(sk->sk_state & PPPOX_CONNECTED) || !dev)
+		return -1;
+
+	path->type = DEV_PATH_PPPOE;
+	path->encap.proto = htons(ETH_P_PPP_SES);
+	path->encap.id = be16_to_cpu(po->num);
+	memcpy(path->encap.h_dest, po->pppoe_pa.remote, ETH_ALEN);
+	path->dev = ctx->dev;
+	ctx->dev = dev;
+
+	return 0;
+}
+
 static const struct ppp_channel_ops pppoe_chan_ops = {
 	.start_xmit = pppoe_xmit,
+	.fill_forward_path = pppoe_fill_forward_path,
 };
 
 static int pppoe_recvmsg(struct socket *sock, struct msghdr *m,
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 344d9c0c9b22..dd54f7cc3f12 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -852,6 +852,7 @@ enum net_device_path_type {
 	DEV_PATH_ETHERNET = 0,
 	DEV_PATH_VLAN,
 	DEV_PATH_BRIDGE,
+	DEV_PATH_PPPOE,
 };
 
 struct net_device_path {
@@ -861,6 +862,7 @@ struct net_device_path {
 		struct {
 			u16		id;
 			__be16		proto;
+			u8		h_dest[ETH_ALEN];
 		} encap;
 		struct {
 			enum {
diff --git a/include/linux/ppp_channel.h b/include/linux/ppp_channel.h
index 98966064ee68..91f9a928344e 100644
--- a/include/linux/ppp_channel.h
+++ b/include/linux/ppp_channel.h
@@ -28,6 +28,9 @@ struct ppp_channel_ops {
 	int	(*start_xmit)(struct ppp_channel *, struct sk_buff *);
 	/* Handle an ioctl call that has come in via /dev/ppp. */
 	int	(*ioctl)(struct ppp_channel *, unsigned int, unsigned long);
+	int	(*fill_forward_path)(struct net_device_path_ctx *,
+				     struct net_device_path *,
+				     const struct ppp_channel *);
 };
 
 struct ppp_channel {
-- 
cgit v1.2.3


From 0994d492a1b78dff96671ccf6ad8294cc2bd909e Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Wed, 24 Mar 2021 02:30:37 +0100
Subject: net: dsa: resolve forwarding path for dsa slave ports

Add .ndo_fill_forward_path for DSA slave port devices.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  5 +++++
 net/dsa/slave.c           | 16 ++++++++++++++++
 2 files changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index dd54f7cc3f12..90db74132090 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -853,6 +853,7 @@ enum net_device_path_type {
 	DEV_PATH_VLAN,
 	DEV_PATH_BRIDGE,
 	DEV_PATH_PPPOE,
+	DEV_PATH_DSA,
 };
 
 struct net_device_path {
@@ -873,6 +874,10 @@ struct net_device_path {
 			u16		vlan_id;
 			__be16		vlan_proto;
 		} bridge;
+		struct {
+			int port;
+			u16 proto;
+		} dsa;
 	};
 };
 
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index c51e52418a62..7453ceca2c7e 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -1654,6 +1654,21 @@ static void dsa_slave_get_stats64(struct net_device *dev,
 		dev_get_tstats64(dev, s);
 }
 
+static int dsa_slave_fill_forward_path(struct net_device_path_ctx *ctx,
+				       struct net_device_path *path)
+{
+	struct dsa_port *dp = dsa_slave_to_port(ctx->dev);
+	struct dsa_port *cpu_dp = dp->cpu_dp;
+
+	path->dev = ctx->dev;
+	path->type = DEV_PATH_DSA;
+	path->dsa.proto = cpu_dp->tag_ops->proto;
+	path->dsa.port = dp->index;
+	ctx->dev = cpu_dp->master;
+
+	return 0;
+}
+
 static const struct net_device_ops dsa_slave_netdev_ops = {
 	.ndo_open	 	= dsa_slave_open,
 	.ndo_stop		= dsa_slave_close,
@@ -1679,6 +1694,7 @@ static const struct net_device_ops dsa_slave_netdev_ops = {
 	.ndo_vlan_rx_kill_vid	= dsa_slave_vlan_rx_kill_vid,
 	.ndo_get_devlink_port	= dsa_slave_get_devlink_port,
 	.ndo_change_mtu		= dsa_slave_change_mtu,
+	.ndo_fill_forward_path	= dsa_slave_fill_forward_path,
 };
 
 static struct device_type dsa_type = {
-- 
cgit v1.2.3


From 26267bf9bb57d504c785d8659adc8e02b6629c95 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Wed, 24 Mar 2021 02:30:48 +0100
Subject: netfilter: flowtable: bridge vlan hardware offload and switchdev

The switch might have already added the VLAN tag through PVID hardware
offload. Keep this extra VLAN in the flowtable but skip it on egress.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h             | 1 +
 include/net/netfilter/nf_flow_table.h | 8 +++++---
 net/bridge/br_device.c                | 1 +
 net/bridge/br_vlan.c                  | 2 ++
 net/netfilter/nf_flow_table_core.c    | 2 ++
 net/netfilter/nf_flow_table_offload.c | 6 +++++-
 net/netfilter/nft_flow_offload.c      | 5 +++++
 7 files changed, 21 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 90db74132090..02fa1da8cd22 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -870,6 +870,7 @@ struct net_device_path {
 				DEV_PATH_BR_VLAN_KEEP,
 				DEV_PATH_BR_VLAN_TAG,
 				DEV_PATH_BR_VLAN_UNTAG,
+				DEV_PATH_BR_VLAN_UNTAG_HW,
 			}		vlan_mode;
 			u16		vlan_id;
 			__be16		vlan_proto;
diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index 52afcee6e999..4d991c1e93ef 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -123,9 +123,10 @@ struct flow_offload_tuple {
 	/* All members above are keys for lookups, see flow_offload_hash(). */
 	struct { }			__hash;
 
-	u8				dir:4,
+	u8				dir:2,
 					xmit_type:2,
-					encap_num:2;
+					encap_num:2,
+					in_vlan_ingress:2;
 	u16				mtu;
 	union {
 		struct dst_entry	*dst_cache;
@@ -185,7 +186,8 @@ struct nf_flow_route {
 				u16		id;
 				__be16		proto;
 			} encap[NF_FLOW_TABLE_ENCAP_MAX];
-			u8			num_encaps;
+			u8			num_encaps:2,
+						ingress_vlans:2;
 		} in;
 		struct {
 			u32			ifindex;
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 0c72503e0d39..e8b626cc6bfd 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -422,6 +422,7 @@ static int br_fill_forward_path(struct net_device_path_ctx *ctx,
 		ctx->vlan[ctx->num_vlans].proto = path->bridge.vlan_proto;
 		ctx->num_vlans++;
 		break;
+	case DEV_PATH_BR_VLAN_UNTAG_HW:
 	case DEV_PATH_BR_VLAN_UNTAG:
 		ctx->num_vlans--;
 		break;
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index c92240b21c4a..da3256a3eed0 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -1386,6 +1386,8 @@ int br_vlan_fill_forward_path_mode(struct net_bridge *br,
 
 	if (path->bridge.vlan_mode == DEV_PATH_BR_VLAN_TAG)
 		path->bridge.vlan_mode = DEV_PATH_BR_VLAN_KEEP;
+	else if (v->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV)
+		path->bridge.vlan_mode = DEV_PATH_BR_VLAN_UNTAG_HW;
 	else
 		path->bridge.vlan_mode = DEV_PATH_BR_VLAN_UNTAG;
 
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index f728c955b1dc..8fa7bf9d5f3f 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -95,6 +95,8 @@ static int flow_offload_fill_route(struct flow_offload *flow,
 	for (i = route->tuple[dir].in.num_encaps - 1; i >= 0; i--) {
 		flow_tuple->encap[j].id = route->tuple[dir].in.encap[i].id;
 		flow_tuple->encap[j].proto = route->tuple[dir].in.encap[i].proto;
+		if (route->tuple[dir].in.ingress_vlans & BIT(i))
+			flow_tuple->in_vlan_ingress |= BIT(j);
 		j++;
 	}
 	flow_tuple->encap_num = route->tuple[dir].in.num_encaps;
diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c
index e0d079601fcb..9326ba74745e 100644
--- a/net/netfilter/nf_flow_table_offload.c
+++ b/net/netfilter/nf_flow_table_offload.c
@@ -594,8 +594,12 @@ nf_flow_rule_route_common(struct net *net, const struct flow_offload *flow,
 	other_tuple = &flow->tuplehash[!dir].tuple;
 
 	for (i = 0; i < other_tuple->encap_num; i++) {
-		struct flow_action_entry *entry = flow_action_entry_next(flow_rule);
+		struct flow_action_entry *entry;
 
+		if (other_tuple->in_vlan_ingress & BIT(i))
+			continue;
+
+		entry = flow_action_entry_next(flow_rule);
 		entry->id = FLOW_ACTION_VLAN_PUSH;
 		entry->vlan.vid = other_tuple->encap[i].id;
 		entry->vlan.proto = other_tuple->encap[i].proto;
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index d25b4b109e25..4843dd2b410c 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -72,6 +72,7 @@ struct nft_forward_info {
 		__be16	proto;
 	} encap[NF_FLOW_TABLE_ENCAP_MAX];
 	u8 num_encaps;
+	u8 ingress_vlans;
 	u8 h_source[ETH_ALEN];
 	u8 h_dest[ETH_ALEN];
 	enum flow_offload_xmit_type xmit_type;
@@ -130,6 +131,9 @@ static void nft_dev_path_info(const struct net_device_path_stack *stack,
 				memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN);
 
 			switch (path->bridge.vlan_mode) {
+			case DEV_PATH_BR_VLAN_UNTAG_HW:
+				info->ingress_vlans |= BIT(info->num_encaps - 1);
+				break;
 			case DEV_PATH_BR_VLAN_TAG:
 				info->encap[info->num_encaps].id = path->bridge.vlan_id;
 				info->encap[info->num_encaps].proto = path->bridge.vlan_proto;
@@ -198,6 +202,7 @@ static void nft_dev_forward_path(struct nf_flow_route *route,
 		route->tuple[!dir].in.encap[i].proto = info.encap[i].proto;
 	}
 	route->tuple[!dir].in.num_encaps = info.num_encaps;
+	route->tuple[!dir].in.ingress_vlans = info.ingress_vlans;
 
 	if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) {
 		memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN);
-- 
cgit v1.2.3


From b0b8b689d78c9666a991e1cc87c3dad4c261007f Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Mon, 15 Mar 2021 11:56:24 +0000
Subject: genirq: Allow architectures to override set_handle_irq() fallback

Some architectures want to provide the generic set_handle_irq() API, but
for structural reasons need to provide their own implementation. For
example, arm64 needs to do this to provide uniform set_handle_irq() and
set_handle_fiq() registration functions.

Make this possible by allowing architectures to provide their own
implementation of set_handle_irq when CONFIG_GENERIC_IRQ_MULTI_HANDLER
is not selected.

Signed-off-by: Marc Zyngier <maz@kernel.org>
[Mark: expand commit message]
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Hector Martin <marcan@marcan.st>
Cc: James Morse <james.morse@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20210315115629.57191-2-mark.rutland@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 include/linux/irq.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 2efde6a79b7e..9890180b84fd 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -1258,11 +1258,13 @@ int __init set_handle_irq(void (*handle_irq)(struct pt_regs *));
  */
 extern void (*handle_arch_irq)(struct pt_regs *) __ro_after_init;
 #else
+#ifndef set_handle_irq
 #define set_handle_irq(handle_irq)		\
 	do {					\
 		(void)handle_irq;		\
 		WARN_ON(1);			\
 	} while (0)
 #endif
+#endif
 
 #endif /* _LINUX_IRQ_H */
-- 
cgit v1.2.3


From 341f67e424e572bfc034daa534c6fa667533e6a4 Mon Sep 17 00:00:00 2001
From: Tan Tee Min <tee.min.tan@intel.com>
Date: Tue, 23 Mar 2021 19:07:34 +0800
Subject: net: stmmac: Add hardware supported cross-timestamp

Cross-timestamping is supported on the Integrated Ethernet Controller in
Intel SoCs such as EHL and TGL with the Always Running Timer (ART).

The hardware cross-timestamp result is made available to
applications through the PTP_SYS_OFFSET_PRECISE ioctl which calls
stmmac_getcrosststamp().

Device time is stored in the MAC Auxiliary register. The 64-bit System
time (ART timestamp) is stored in registers that are only addressable
by using MDIO space.

Signed-off-by: Tan Tee Min <tee.min.tan@intel.com>
Co-developed-by: Wong Vee Khee <vee.khee.wong@linux.intel.com>
Signed-off-by: Wong Vee Khee <vee.khee.wong@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/common.h       |   2 +
 drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c  | 108 +++++++++++++++++++++
 drivers/net/ethernet/stmicro/stmmac/dwmac4.h       |   8 ++
 drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c   |   2 +
 drivers/net/ethernet/stmicro/stmmac/hwif.h         |   3 +
 .../net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c  |  11 +++
 drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c   |  32 ++++++
 drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h   |  23 +++++
 include/linux/stmmac.h                             |   4 +
 9 files changed, 193 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h
index 1c0c60bdf854..95469059dca1 100644
--- a/drivers/net/ethernet/stmicro/stmmac/common.h
+++ b/drivers/net/ethernet/stmicro/stmmac/common.h
@@ -388,6 +388,8 @@ struct dma_features {
 	unsigned int estsel;
 	unsigned int fpesel;
 	unsigned int tbssel;
+	/* Numbers of Auxiliary Snapshot Inputs */
+	unsigned int aux_snapshot_n;
 };
 
 /* RX Buffer size must be multiple of 4/8/16 bytes */
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
index 763b549e3c2d..992294d25706 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
@@ -8,6 +8,7 @@
 #include "dwmac-intel.h"
 #include "dwmac4.h"
 #include "stmmac.h"
+#include "stmmac_ptp.h"
 
 #define INTEL_MGBE_ADHOC_ADDR	0x15
 #define INTEL_MGBE_XPCS_ADDR	0x16
@@ -240,6 +241,108 @@ static void intel_mgbe_ptp_clk_freq_config(void *npriv)
 	writel(gpio_value, priv->ioaddr + GMAC_GPIO_STATUS);
 }
 
+static void get_arttime(struct mii_bus *mii, int intel_adhoc_addr,
+			u64 *art_time)
+{
+	u64 ns;
+
+	ns = mdiobus_read(mii, intel_adhoc_addr, PMC_ART_VALUE3);
+	ns <<= GMAC4_ART_TIME_SHIFT;
+	ns |= mdiobus_read(mii, intel_adhoc_addr, PMC_ART_VALUE2);
+	ns <<= GMAC4_ART_TIME_SHIFT;
+	ns |= mdiobus_read(mii, intel_adhoc_addr, PMC_ART_VALUE1);
+	ns <<= GMAC4_ART_TIME_SHIFT;
+	ns |= mdiobus_read(mii, intel_adhoc_addr, PMC_ART_VALUE0);
+
+	*art_time = ns;
+}
+
+static int intel_crosststamp(ktime_t *device,
+			     struct system_counterval_t *system,
+			     void *ctx)
+{
+	struct intel_priv_data *intel_priv;
+
+	struct stmmac_priv *priv = (struct stmmac_priv *)ctx;
+	void __iomem *ptpaddr = priv->ptpaddr;
+	void __iomem *ioaddr = priv->hw->pcsr;
+	unsigned long flags;
+	u64 art_time = 0;
+	u64 ptp_time = 0;
+	u32 num_snapshot;
+	u32 gpio_value;
+	u32 acr_value;
+	int ret;
+	u32 v;
+	int i;
+
+	if (!boot_cpu_has(X86_FEATURE_ART))
+		return -EOPNOTSUPP;
+
+	intel_priv = priv->plat->bsp_priv;
+
+	/* Enable Internal snapshot trigger */
+	acr_value = readl(ptpaddr + PTP_ACR);
+	acr_value &= ~PTP_ACR_MASK;
+	switch (priv->plat->int_snapshot_num) {
+	case AUX_SNAPSHOT0:
+		acr_value |= PTP_ACR_ATSEN0;
+		break;
+	case AUX_SNAPSHOT1:
+		acr_value |= PTP_ACR_ATSEN1;
+		break;
+	case AUX_SNAPSHOT2:
+		acr_value |= PTP_ACR_ATSEN2;
+		break;
+	case AUX_SNAPSHOT3:
+		acr_value |= PTP_ACR_ATSEN3;
+		break;
+	default:
+		return -EINVAL;
+	}
+	writel(acr_value, ptpaddr + PTP_ACR);
+
+	/* Clear FIFO */
+	acr_value = readl(ptpaddr + PTP_ACR);
+	acr_value |= PTP_ACR_ATSFC;
+	writel(acr_value, ptpaddr + PTP_ACR);
+
+	/* Trigger Internal snapshot signal
+	 * Create a rising edge by just toggle the GPO1 to low
+	 * and back to high.
+	 */
+	gpio_value = readl(ioaddr + GMAC_GPIO_STATUS);
+	gpio_value &= ~GMAC_GPO1;
+	writel(gpio_value, ioaddr + GMAC_GPIO_STATUS);
+	gpio_value |= GMAC_GPO1;
+	writel(gpio_value, ioaddr + GMAC_GPIO_STATUS);
+
+	/* Poll for time sync operation done */
+	ret = readl_poll_timeout(priv->ioaddr + GMAC_INT_STATUS, v,
+				 (v & GMAC_INT_TSIE), 100, 10000);
+
+	if (ret == -ETIMEDOUT) {
+		pr_err("%s: Wait for time sync operation timeout\n", __func__);
+		return ret;
+	}
+
+	num_snapshot = (readl(ioaddr + GMAC_TIMESTAMP_STATUS) &
+			GMAC_TIMESTAMP_ATSNS_MASK) >>
+			GMAC_TIMESTAMP_ATSNS_SHIFT;
+
+	/* Repeat until the timestamps are from the FIFO last segment */
+	for (i = 0; i < num_snapshot; i++) {
+		spin_lock_irqsave(&priv->ptp_lock, flags);
+		stmmac_get_ptptime(priv, ptpaddr, &ptp_time);
+		*device = ns_to_ktime(ptp_time);
+		spin_unlock_irqrestore(&priv->ptp_lock, flags);
+		get_arttime(priv->mii, intel_priv->mdio_adhoc_addr, &art_time);
+		*system = convert_art_to_tsc(art_time);
+	}
+
+	return 0;
+}
+
 static void common_default_data(struct plat_stmmacenet_data *plat)
 {
 	plat->clk_csr = 2;	/* clk_csr_i = 20-35MHz & MDC = clk_csr_i/16 */
@@ -384,6 +487,11 @@ static int intel_mgbe_common_data(struct pci_dev *pdev,
 	plat->mdio_bus_data->phy_mask = 1 << INTEL_MGBE_ADHOC_ADDR;
 	plat->mdio_bus_data->phy_mask |= 1 << INTEL_MGBE_XPCS_ADDR;
 
+	plat->int_snapshot_num = AUX_SNAPSHOT1;
+
+	plat->has_crossts = true;
+	plat->crosststamp = intel_crosststamp;
+
 	return 0;
 }
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
index ef8502d2b6e6..462ca7ed095a 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
@@ -50,6 +50,7 @@
 #define GMAC_L4_ADDR(reg)		(0x904 + (reg) * 0x30)
 #define GMAC_L3_ADDR0(reg)		(0x910 + (reg) * 0x30)
 #define GMAC_L3_ADDR1(reg)		(0x914 + (reg) * 0x30)
+#define GMAC_TIMESTAMP_STATUS		0x00000b20
 
 /* RX Queues Routing */
 #define GMAC_RXQCTRL_AVCPQ_MASK		GENMASK(2, 0)
@@ -144,6 +145,7 @@
 #define GMAC_INT_PCS_PHYIS		BIT(3)
 #define GMAC_INT_PMT_EN			BIT(4)
 #define GMAC_INT_LPI_EN			BIT(5)
+#define GMAC_INT_TSIE			BIT(12)
 
 #define	GMAC_PCS_IRQ_DEFAULT	(GMAC_INT_RGSMIIS | GMAC_INT_PCS_LINK |	\
 				 GMAC_INT_PCS_ANE)
@@ -260,6 +262,7 @@ enum power_event {
 #define GMAC_HW_RXFIFOSIZE		GENMASK(4, 0)
 
 /* MAC HW features2 bitmap */
+#define GMAC_HW_FEAT_AUXSNAPNUM		GENMASK(30, 28)
 #define GMAC_HW_FEAT_PPSOUTNUM		GENMASK(26, 24)
 #define GMAC_HW_FEAT_TXCHCNT		GENMASK(21, 18)
 #define GMAC_HW_FEAT_RXCHCNT		GENMASK(15, 12)
@@ -305,6 +308,11 @@ enum power_event {
 #define GMAC_L4DP0_SHIFT		16
 #define GMAC_L4SP0			GENMASK(15, 0)
 
+/* MAC Timestamp Status */
+#define GMAC_TIMESTAMP_AUXTSTRIG	BIT(2)
+#define GMAC_TIMESTAMP_ATSNS_MASK	GENMASK(29, 25)
+#define GMAC_TIMESTAMP_ATSNS_SHIFT	25
+
 /*  MTL registers */
 #define MTL_OPERATION_MODE		0x00000c00
 #define MTL_FRPE			BIT(15)
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
index 8958778d16b7..8954b85eb850 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
@@ -412,6 +412,8 @@ static void dwmac4_get_hw_feature(void __iomem *ioaddr,
 
 	/* IEEE 1588-2002 */
 	dma_cap->time_stamp = 0;
+	/* Number of Auxiliary Snapshot Inputs */
+	dma_cap->aux_snapshot_n = (hw_cap & GMAC_HW_FEAT_AUXSNAPNUM) >> 28;
 
 	/* MAC HW feature3 */
 	hw_cap = readl(ioaddr + GMAC_HW_FEATURE3);
diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h
index 692541c7b419..59bf7078a754 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.h
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h
@@ -508,6 +508,7 @@ struct stmmac_hwtimestamp {
 	int (*adjust_systime) (void __iomem *ioaddr, u32 sec, u32 nsec,
 			       int add_sub, int gmac4);
 	void (*get_systime) (void __iomem *ioaddr, u64 *systime);
+	void (*get_ptptime)(void __iomem *ioaddr, u64 *ptp_time);
 };
 
 #define stmmac_config_hw_tstamping(__priv, __args...) \
@@ -522,6 +523,8 @@ struct stmmac_hwtimestamp {
 	stmmac_do_callback(__priv, ptp, adjust_systime, __args)
 #define stmmac_get_systime(__priv, __args...) \
 	stmmac_do_void_callback(__priv, ptp, get_systime, __args)
+#define stmmac_get_ptptime(__priv, __args...) \
+	stmmac_do_void_callback(__priv, ptp, get_ptptime, __args)
 
 /* Helpers to manage the descriptors for chain and ring modes */
 struct stmmac_mode_ops {
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c
index d291612eeafb..113c51bcc0b5 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c
@@ -153,6 +153,16 @@ static void get_systime(void __iomem *ioaddr, u64 *systime)
 		*systime = ns;
 }
 
+static void get_ptptime(void __iomem *ptpaddr, u64 *ptp_time)
+{
+	/* Auxiliary timestamp: PTP_ATNR (nanoseconds) + PTP_ATSR (seconds). */
+	u64 ns = readl(ptpaddr + PTP_ATNR);
+
+	/* Widen first: a u32 * long product wraps where long is 32 bits. */
+	ns += (u64)readl(ptpaddr + PTP_ATSR) * NSEC_PER_SEC;
+	*ptp_time = ns;
+}
+
 const struct stmmac_hwtimestamp stmmac_ptp = {
 	.config_hw_tstamping = config_hw_tstamping,
 	.init_systime = init_systime,
@@ -160,4 +170,5 @@ const struct stmmac_hwtimestamp stmmac_ptp = {
 	.config_addend = config_addend,
 	.adjust_systime = adjust_systime,
 	.get_systime = get_systime,
+	.get_ptptime = get_ptptime,
 };
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
index 8b10fd10446f..b164ae22e35f 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
@@ -9,6 +9,7 @@
 *******************************************************************************/
 #include "stmmac.h"
 #include "stmmac_ptp.h"
+#include "dwmac4.h"
 
 /**
  * stmmac_adjust_freq
@@ -165,6 +166,36 @@ static int stmmac_enable(struct ptp_clock_info *ptp,
 	return ret;
 }
 
+/**
+ * stmmac_get_syncdevicetime
+ * @device: current device time
+ * @system: system counter value read synchronously with device time
+ * @ctx: context provided by timekeeping code (the driver's stmmac_priv)
+ * Description: Read device and system clock simultaneously through the
+ * platform crosststamp hook; returns -EOPNOTSUPP when no hook is set.
+ **/
+static int stmmac_get_syncdevicetime(ktime_t *device,
+				     struct system_counterval_t *system,
+				     void *ctx)
+{
+	struct stmmac_priv *priv = (struct stmmac_priv *)ctx;
+
+	if (priv->plat->crosststamp)
+		return priv->plat->crosststamp(device, system, ctx);
+	else
+		return -EOPNOTSUPP;
+}
+
+static int stmmac_getcrosststamp(struct ptp_clock_info *ptp,
+				 struct system_device_crosststamp *xtstamp)
+{
+	struct stmmac_priv *priv =	/* owner of the embedded ptp_clock_ops */
+		container_of(ptp, struct stmmac_priv, ptp_clock_ops);
+
+	return get_device_system_crosststamp(stmmac_get_syncdevicetime,
+					     priv, NULL, xtstamp); /* priv: ctx */
+}
+
 /* structure describing a PTP hardware clock */
 static struct ptp_clock_info stmmac_ptp_clock_ops = {
 	.owner = THIS_MODULE,
@@ -180,6 +211,7 @@ static struct ptp_clock_info stmmac_ptp_clock_ops = {
 	.gettime64 = stmmac_get_time,
 	.settime64 = stmmac_set_time,
 	.enable = stmmac_enable,
+	.getcrosststamp = stmmac_getcrosststamp,
 };
 
 /**
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h
index 7abb1d47e7da..f88727ce4d30 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h
@@ -23,6 +23,9 @@
 #define	PTP_STSUR	0x10	/* System Time – Seconds Update Reg */
 #define	PTP_STNSUR	0x14	/* System Time – Nanoseconds Update Reg */
 #define	PTP_TAR		0x18	/* Timestamp Addend Reg */
+#define	PTP_ACR		0x40	/* Auxiliary Control Reg */
+#define	PTP_ATNR	0x48	/* Auxiliary Timestamp - Nanoseconds Reg */
+#define	PTP_ATSR	0x4c	/* Auxiliary Timestamp - Seconds Reg */
 
 #define	PTP_STNSUR_ADDSUB_SHIFT	31
 #define	PTP_DIGITAL_ROLLOVER_MODE	0x3B9ACA00	/* 10e9-1 ns */
@@ -64,4 +67,24 @@
 #define	PTP_SSIR_SSINC_MASK		0xff
 #define	GMAC4_PTP_SSIR_SSINC_SHIFT	16
 
+/* Auxiliary Control defines */
+#define	PTP_ACR_ATSFC		BIT(0)	/* Auxiliary Snapshot FIFO Clear */
+#define	PTP_ACR_ATSEN0		BIT(4)	/* Auxiliary Snapshot 0 Enable */
+#define	PTP_ACR_ATSEN1		BIT(5)	/* Auxiliary Snapshot 1 Enable */
+#define	PTP_ACR_ATSEN2		BIT(6)	/* Auxiliary Snapshot 2 Enable */
+#define	PTP_ACR_ATSEN3		BIT(7)	/* Auxiliary Snapshot 3 Enable */
+#define	PTP_ACR_MASK		GENMASK(7, 4)	/* Aux Snapshot Mask */
+#define	PMC_ART_VALUE0		0x01	/* PMC_ART[15:0] timer value */
+#define	PMC_ART_VALUE1		0x02	/* PMC_ART[31:16] timer value */
+#define	PMC_ART_VALUE2		0x03	/* PMC_ART[47:32] timer value */
+#define	PMC_ART_VALUE3		0x04	/* PMC_ART[63:48] timer value */
+#define	GMAC4_ART_TIME_SHIFT	16	/* ART TIME 16-bits shift */
+
+enum aux_snapshot {	/* selector stored in plat->int_snapshot_num */
+	AUX_SNAPSHOT0 = 0x10,	/* same value as PTP_ACR_ATSEN0, BIT(4) */
+	AUX_SNAPSHOT1 = 0x20,	/* same value as PTP_ACR_ATSEN1, BIT(5) */
+	AUX_SNAPSHOT2 = 0x40,	/* same value as PTP_ACR_ATSEN2, BIT(6) */
+	AUX_SNAPSHOT3 = 0x80,	/* same value as PTP_ACR_ATSEN3, BIT(7) */
+};
+
 #endif	/* __STMMAC_PTP_H__ */
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 10abc80b601e..5134e802f39a 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -186,6 +186,8 @@ struct plat_stmmacenet_data {
 	void (*exit)(struct platform_device *pdev, void *priv);
 	struct mac_device_info *(*setup)(void *priv);
 	int (*clks_config)(void *priv, bool enabled);
+	int (*crosststamp)(ktime_t *device, struct system_counterval_t *system,
+			   void *ctx);
 	void *bsp_priv;
 	struct clk *stmmac_clk;
 	struct clk *pclk;
@@ -206,5 +208,7 @@ struct plat_stmmacenet_data {
 	u8 vlan_fail_q;
 	unsigned int eee_usecs_rate;
 	struct pci_dev *pdev;
+	bool has_crossts;
+	int int_snapshot_num;
 };
 #endif
-- 
cgit v1.2.3


From 0ef25ed104ac17fa0586fbb076f24a5e8940b966 Mon Sep 17 00:00:00 2001
From: Wong Vee Khee <vee.khee.wong@linux.intel.com>
Date: Wed, 24 Mar 2021 00:46:40 +0800
Subject: net: phy: add genphy_c45_loopback

Add generic code to enable C45 PHY loopback into the common phy-c45.c
file. This will allow C45 PHY drivers to access this by setting
.set_loopback.

Suggested-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: Wong Vee Khee <vee.khee.wong@linux.intel.com>
Reviewed-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy-c45.c | 8 ++++++++
 include/linux/phy.h       | 1 +
 2 files changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phy-c45.c b/drivers/net/phy/phy-c45.c
index 077f2929c45e..91e3acb9e397 100644
--- a/drivers/net/phy/phy-c45.c
+++ b/drivers/net/phy/phy-c45.c
@@ -560,6 +560,14 @@ int gen10g_config_aneg(struct phy_device *phydev)
 }
 EXPORT_SYMBOL_GPL(gen10g_config_aneg);
 
+int genphy_c45_loopback(struct phy_device *phydev, bool enable)
+{
+	return phy_modify_mmd(phydev, MDIO_MMD_PCS, MDIO_CTRL1,
+			      MDIO_PCS_CTRL1_LOOPBACK,	/* bit being modified */
+			      enable ? MDIO_PCS_CTRL1_LOOPBACK : 0); /* set/clear */
+}
+EXPORT_SYMBOL_GPL(genphy_c45_loopback);
+
 struct phy_driver genphy_c45_driver = {
 	.phy_id         = 0xffffffff,
 	.phy_id_mask    = 0xffffffff,
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 1a12e4436b5b..8e2cf84b2318 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1532,6 +1532,7 @@ int genphy_c45_read_mdix(struct phy_device *phydev);
 int genphy_c45_pma_read_abilities(struct phy_device *phydev);
 int genphy_c45_read_status(struct phy_device *phydev);
 int genphy_c45_config_aneg(struct phy_device *phydev);
+int genphy_c45_loopback(struct phy_device *phydev, bool enable);
 
 /* Generic C45 PHY driver */
 extern struct phy_driver genphy_c45_driver;
-- 
cgit v1.2.3


From 0909fc2b2c41aae50a18a36ac2858d156f521871 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 25 Feb 2021 17:36:06 -0800
Subject: rcu: Provide polling interfaces for Tiny RCU grace periods

There is a need for a non-blocking polling interface for RCU grace
periods, so this commit supplies start_poll_synchronize_rcu() and
poll_state_synchronize_rcu() for this purpose.  Note that the existing
get_state_synchronize_rcu() may be used if future grace periods are
inevitable (perhaps due to a later call_rcu() invocation).  The new
start_poll_synchronize_rcu() is to be used if future grace periods
might not otherwise happen.  Finally, poll_state_synchronize_rcu()
provides a lockless check for a grace period having elapsed since
the corresponding call to either of the get_state_synchronize_rcu()
or start_poll_synchronize_rcu().

As with get_state_synchronize_rcu(), the return value from either
get_state_synchronize_rcu() or start_poll_synchronize_rcu() is passed in
to a later call to either poll_state_synchronize_rcu() or the existing
(might_sleep) cond_synchronize_rcu().

[ paulmck: Revert cond_synchronize_rcu() to might_sleep() per Frederic Weisbecker feedback. ]
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcutiny.h |  7 +++----
 kernel/rcu/tiny.c       | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 2a97334eb786..35e0be326ffc 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -17,10 +17,9 @@
 /* Never flag non-existent other CPUs! */
 static inline bool rcu_eqs_special_set(int cpu) { return false; }
 
-static inline unsigned long get_state_synchronize_rcu(void)
-{
-	return 0;
-}
+unsigned long get_state_synchronize_rcu(void);
+unsigned long start_poll_synchronize_rcu(void);
+bool poll_state_synchronize_rcu(unsigned long oldstate);
 
 static inline void cond_synchronize_rcu(unsigned long oldstate)
 {
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index aa897c3f2e92..c8a029fbb114 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -32,12 +32,14 @@ struct rcu_ctrlblk {
 	struct rcu_head *rcucblist;	/* List of pending callbacks (CBs). */
 	struct rcu_head **donetail;	/* ->next pointer of last "done" CB. */
 	struct rcu_head **curtail;	/* ->next pointer of last CB. */
+	unsigned long gp_seq;		/* Grace-period counter. */
 };
 
 /* Definition for rcupdate control block. */
 static struct rcu_ctrlblk rcu_ctrlblk = {
 	.donetail	= &rcu_ctrlblk.rcucblist,
 	.curtail	= &rcu_ctrlblk.rcucblist,
+	.gp_seq		= 0 - 300UL,
 };
 
 void rcu_barrier(void)
@@ -56,6 +58,7 @@ void rcu_qs(void)
 		rcu_ctrlblk.donetail = rcu_ctrlblk.curtail;
 		raise_softirq_irqoff(RCU_SOFTIRQ);
 	}
+	WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 1);
 	local_irq_restore(flags);
 }
 
@@ -177,6 +180,43 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
 }
 EXPORT_SYMBOL_GPL(call_rcu);
 
+/*
+ * Return the current rcu_ctrlblk.gp_seq value as a grace-period-counter
+ * "cookie".  For more information, see the Tree RCU header comment.
+ */
+unsigned long get_state_synchronize_rcu(void)
+{
+	return READ_ONCE(rcu_ctrlblk.gp_seq);
+}
+EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
+
+/*
+ * Return a grace-period-counter "cookie" and ensure that a future grace
+ * period completes.  For more information, see the Tree RCU header comment.
+ */
+unsigned long start_poll_synchronize_rcu(void)
+{
+	unsigned long gp_seq = get_state_synchronize_rcu();
+
+	if (unlikely(is_idle_task(current))) {
+		/* Called from the idle task: force scheduling for rcu_qs() */
+		resched_cpu(0);
+	}
+	return gp_seq;	/* cookie sampled before the resched request */
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
+
+/*
+ * Return true if the grace period corresponding to oldstate has completed
+ * and false otherwise, judged by gp_seq having moved away from oldstate.
+ * For more information, see the Tree RCU header comment.
+ */
+bool poll_state_synchronize_rcu(unsigned long oldstate)
+{
+	return READ_ONCE(rcu_ctrlblk.gp_seq) != oldstate;
+}
+EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
+
 void __init rcu_init(void)
 {
 	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
-- 
cgit v1.2.3


From 5a5586112b929546e16029261a987c9197bfdfa2 Mon Sep 17 00:00:00 2001
From: Ong Boon Leong <boon.leong.ong@intel.com>
Date: Wed, 24 Mar 2021 17:07:42 +0800
Subject: net: stmmac: support FPE link partner hand-shaking procedure

In order to discover whether remote station supports frame preemption,
local station sends verify mPacket and expects response mPacket in
return from the remote station.

So, we add the functions to send and handle event when verify mPacket
and response mPacket are exchanged between the networked stations.

The mechanism to handle different FPE states between local and remote
station (link partner) is implemented using a workqueue which starts a
task each time there is some sign of verify & response mPacket exchange,
as checked in the FPE IRQ event. The task retries a couple of times to
spot the state where both stations are ready to enter FPE ON. This allows
different end points to enable FPE at different times, and the
verify-response mPacket exchange can happen asynchronously. Ultimately,
the task will only turn FPE ON when the local station has exchanged
response mPackets in both directions.

Thanks to Voon Weifeng for implementing the core functions for detecting
FPE events and send mPacket and phylink related change.

Signed-off-by: Ong Boon Leong <boon.leong.ong@intel.com>
Co-developed-by: Voon Weifeng <weifeng.voon@intel.com>
Signed-off-by: Voon Weifeng <weifeng.voon@intel.com>
Co-developed-by: Tan Tee Min <tee.min.tan@intel.com>
Signed-off-by: Tan Tee Min <tee.min.tan@intel.com>
Co-developed-by: Mohammad Athari Bin Ismail <mohammad.athari.ismail@intel.com>
Signed-off-by: Mohammad Athari Bin Ismail <mohammad.athari.ismail@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/common.h      |   7 +
 drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c |   8 +
 drivers/net/ethernet/stmicro/stmmac/dwmac5.c      |  49 ++++++
 drivers/net/ethernet/stmicro/stmmac/dwmac5.h      |  11 ++
 drivers/net/ethernet/stmicro/stmmac/hwif.h        |   7 +
 drivers/net/ethernet/stmicro/stmmac/stmmac.h      |   7 +
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 183 ++++++++++++++++++++++
 drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c   |  39 ++++-
 include/linux/stmmac.h                            |  27 ++++
 9 files changed, 331 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h
index 95469059dca1..d065b11b7b10 100644
--- a/drivers/net/ethernet/stmicro/stmmac/common.h
+++ b/drivers/net/ethernet/stmicro/stmmac/common.h
@@ -315,6 +315,13 @@ enum dma_irq_status {
 #define	CORE_IRQ_RX_PATH_IN_LPI_MODE	(1 << 2)
 #define	CORE_IRQ_RX_PATH_EXIT_LPI_MODE	(1 << 3)
 
+/* FPE defines */
+#define FPE_EVENT_UNKNOWN		0
+#define FPE_EVENT_TRSP			BIT(0)
+#define FPE_EVENT_TVER			BIT(1)
+#define FPE_EVENT_RRSP			BIT(2)
+#define FPE_EVENT_RVER			BIT(3)
+
 #define CORE_IRQ_MTL_RX_OVERFLOW	BIT(8)
 
 /* Physical Coding Sublayer */
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
index 29f765a246a0..95864f014ffa 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
@@ -53,6 +53,10 @@ static void dwmac4_core_init(struct mac_device_info *hw,
 	if (hw->pcs)
 		value |= GMAC_PCS_IRQ_DEFAULT;
 
+	/* Enable FPE interrupt */
+	if ((GMAC_HW_FEAT_FPESEL & readl(ioaddr + GMAC_HW_FEATURE3)) >> 26)
+		value |= GMAC_INT_FPE_EN;
+
 	writel(value, ioaddr + GMAC_INT_EN);
 }
 
@@ -1245,6 +1249,8 @@ const struct stmmac_ops dwmac410_ops = {
 	.config_l4_filter = dwmac4_config_l4_filter,
 	.est_configure = dwmac5_est_configure,
 	.fpe_configure = dwmac5_fpe_configure,
+	.fpe_send_mpacket = dwmac5_fpe_send_mpacket,
+	.fpe_irq_status = dwmac5_fpe_irq_status,
 	.add_hw_vlan_rx_fltr = dwmac4_add_hw_vlan_rx_fltr,
 	.del_hw_vlan_rx_fltr = dwmac4_del_hw_vlan_rx_fltr,
 	.restore_hw_vlan_rx_fltr = dwmac4_restore_hw_vlan_rx_fltr,
@@ -1294,6 +1300,8 @@ const struct stmmac_ops dwmac510_ops = {
 	.config_l4_filter = dwmac4_config_l4_filter,
 	.est_configure = dwmac5_est_configure,
 	.fpe_configure = dwmac5_fpe_configure,
+	.fpe_send_mpacket = dwmac5_fpe_send_mpacket,
+	.fpe_irq_status = dwmac5_fpe_irq_status,
 	.add_hw_vlan_rx_fltr = dwmac4_add_hw_vlan_rx_fltr,
 	.del_hw_vlan_rx_fltr = dwmac4_del_hw_vlan_rx_fltr,
 	.restore_hw_vlan_rx_fltr = dwmac4_restore_hw_vlan_rx_fltr,
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac5.c b/drivers/net/ethernet/stmicro/stmmac/dwmac5.c
index 0ae85f8adf67..5b010ebfede9 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac5.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac5.c
@@ -707,3 +707,52 @@ void dwmac5_fpe_configure(void __iomem *ioaddr, u32 num_txq, u32 num_rxq,
 	value |= EFPE;
 	writel(value, ioaddr + MAC_FPE_CTRL_STS);
 }
+
+int dwmac5_fpe_irq_status(void __iomem *ioaddr, struct net_device *dev)
+{
+	u32 value;
+	int status;
+
+	status = FPE_EVENT_UNKNOWN;	/* start with no event flag set */
+
+	value = readl(ioaddr + MAC_FPE_CTRL_STS);	/* read once, test below */
+
+	if (value & TRSP) {
+		status |= FPE_EVENT_TRSP;
+		netdev_info(dev, "FPE: Respond mPacket is transmitted\n");
+	}
+
+	if (value & TVER) {
+		status |= FPE_EVENT_TVER;
+		netdev_info(dev, "FPE: Verify mPacket is transmitted\n");
+	}
+
+	if (value & RRSP) {
+		status |= FPE_EVENT_RRSP;
+		netdev_info(dev, "FPE: Respond mPacket is received\n");
+	}
+
+	if (value & RVER) {
+		status |= FPE_EVENT_RVER;
+		netdev_info(dev, "FPE: Verify mPacket is received\n");
+	}
+
+	return status;	/* OR of the FPE_EVENT_* flags found above */
+}
+
+void dwmac5_fpe_send_mpacket(void __iomem *ioaddr, enum stmmac_mpacket_type type)
+{
+	u32 value;
+
+	value = readl(ioaddr + MAC_FPE_CTRL_STS);	/* read-modify-write */
+
+	if (type == MPACKET_VERIFY) {	/* verify: set SVER, clear SRSP */
+		value &= ~SRSP;
+		value |= SVER;
+	} else {	/* any other type: set SRSP, clear SVER */
+		value &= ~SVER;
+		value |= SRSP;
+	}
+
+	writel(value, ioaddr + MAC_FPE_CTRL_STS);
+}
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac5.h b/drivers/net/ethernet/stmicro/stmmac/dwmac5.h
index 709bbfc9ae61..ff555d8b0cdf 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac5.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac5.h
@@ -12,6 +12,12 @@
 #define TMOUTEN				BIT(0)
 
 #define MAC_FPE_CTRL_STS		0x00000234
+#define TRSP				BIT(19)
+#define TVER				BIT(18)
+#define RRSP				BIT(17)
+#define RVER				BIT(16)
+#define SRSP				BIT(2)
+#define SVER				BIT(1)
 #define EFPE				BIT(0)
 
 #define MAC_PPS_CONTROL			0x00000b70
@@ -128,6 +134,8 @@
 #define GMAC_RXQCTRL_VFFQ_SHIFT		17
 #define GMAC_RXQCTRL_VFFQE		BIT(16)
 
+#define GMAC_INT_FPE_EN			BIT(17)
+
 int dwmac5_safety_feat_config(void __iomem *ioaddr, unsigned int asp);
 int dwmac5_safety_feat_irq_status(struct net_device *ndev,
 		void __iomem *ioaddr, unsigned int asp,
@@ -145,5 +153,8 @@ void dwmac5_est_irq_status(void __iomem *ioaddr, struct net_device *dev,
 			   struct stmmac_extra_stats *x, u32 txqcnt);
 void dwmac5_fpe_configure(void __iomem *ioaddr, u32 num_txq, u32 num_rxq,
 			  bool enable);
+void dwmac5_fpe_send_mpacket(void __iomem *ioaddr,
+			     enum stmmac_mpacket_type type);
+int dwmac5_fpe_irq_status(void __iomem *ioaddr, struct net_device *dev);
 
 #endif /* __DWMAC5_H__ */
diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h
index 59bf7078a754..45edac5f60db 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.h
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h
@@ -397,6 +397,9 @@ struct stmmac_ops {
 			       struct stmmac_extra_stats *x, u32 txqcnt);
 	void (*fpe_configure)(void __iomem *ioaddr, u32 num_txq, u32 num_rxq,
 			      bool enable);
+	void (*fpe_send_mpacket)(void __iomem *ioaddr,
+				 enum stmmac_mpacket_type type);
+	int (*fpe_irq_status)(void __iomem *ioaddr, struct net_device *dev);
 };
 
 #define stmmac_core_init(__priv, __args...) \
@@ -497,6 +500,10 @@ struct stmmac_ops {
 	stmmac_do_void_callback(__priv, mac, est_irq_status, __args)
 #define stmmac_fpe_configure(__priv, __args...) \
 	stmmac_do_void_callback(__priv, mac, fpe_configure, __args)
+#define stmmac_fpe_send_mpacket(__priv, __args...) \
+	stmmac_do_void_callback(__priv, mac, fpe_send_mpacket, __args)
+#define stmmac_fpe_irq_status(__priv, __args...) \
+	stmmac_do_callback(__priv, mac, fpe_irq_status, __args)
 
 /* PTP and HW Timer helpers */
 struct stmmac_hwtimestamp {
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
index 375c503d2df8..4faad331a4ca 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
@@ -234,6 +234,12 @@ struct stmmac_priv {
 	struct workqueue_struct *wq;
 	struct work_struct service_task;
 
+	/* Workqueue for handling FPE hand-shaking */
+	unsigned long fpe_task_state;
+	struct workqueue_struct *fpe_wq;
+	struct work_struct fpe_task;
+	char wq_name[IFNAMSIZ + 4];
+
 	/* TC Handling */
 	unsigned int tc_entries_max;
 	unsigned int tc_off_max;
@@ -273,6 +279,7 @@ bool stmmac_eee_init(struct stmmac_priv *priv);
 int stmmac_reinit_queues(struct net_device *dev, u32 rx_cnt, u32 tx_cnt);
 int stmmac_reinit_ringparam(struct net_device *dev, u32 rx_size, u32 tx_size);
 int stmmac_bus_clks_config(struct stmmac_priv *priv, bool enabled);
+void stmmac_fpe_handshake(struct stmmac_priv *priv, bool enable);
 
 #if IS_ENABLED(CONFIG_STMMAC_SELFTESTS)
 void stmmac_selftest_run(struct net_device *dev,
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 8d7015d3a537..170296820af0 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -971,6 +971,21 @@ static void stmmac_mac_an_restart(struct phylink_config *config)
 	/* Not Supported */
 }
 
+static void stmmac_fpe_link_state_handle(struct stmmac_priv *priv, bool is_up)
+{
+	struct stmmac_fpe_cfg *fpe_cfg = priv->plat->fpe_cfg;
+	enum stmmac_fpe_state *lo_state = &fpe_cfg->lo_fpe_state;
+	enum stmmac_fpe_state *lp_state = &fpe_cfg->lp_fpe_state;
+	bool *hs_enable = &fpe_cfg->hs_enable;
+
+	if (is_up && *hs_enable) {	/* link up with handshake requested */
+		stmmac_fpe_send_mpacket(priv, priv->ioaddr, MPACKET_VERIFY);
+	} else {	/* otherwise reset both stations' handshake state */
+		*lo_state = FPE_STATE_OFF;
+		*lp_state = FPE_STATE_OFF;
+	}
+}
+
 static void stmmac_mac_link_down(struct phylink_config *config,
 				 unsigned int mode, phy_interface_t interface)
 {
@@ -981,6 +996,8 @@ static void stmmac_mac_link_down(struct phylink_config *config,
 	priv->tx_lpi_enabled = false;
 	stmmac_eee_init(priv);
 	stmmac_set_eee_pls(priv, priv->hw, false);
+
+	stmmac_fpe_link_state_handle(priv, false);
 }
 
 static void stmmac_mac_link_up(struct phylink_config *config,
@@ -1079,6 +1096,8 @@ static void stmmac_mac_link_up(struct phylink_config *config,
 		priv->tx_lpi_enabled = priv->eee_enabled;
 		stmmac_set_eee_pls(priv, priv->hw, true);
 	}
+
+	stmmac_fpe_link_state_handle(priv, true);
 }
 
 static const struct phylink_mac_ops stmmac_phylink_mac_ops = {
@@ -2793,6 +2812,26 @@ static void stmmac_safety_feat_configuration(struct stmmac_priv *priv)
 	}
 }
 
+static int stmmac_fpe_start_wq(struct stmmac_priv *priv)
+{
+	char *name = priv->wq_name;
+
+	snprintf(name, sizeof(priv->wq_name), "%s-fpe", priv->dev->name);
+
+	priv->fpe_wq = create_singlethread_workqueue(name);
+	if (!priv->fpe_wq) {
+		netdev_err(priv->dev, "%s: Failed to create workqueue\n", name);
+
+		return -ENOMEM;
+	}
+	/* Re-arm only once fpe_wq is valid; stop_wq left __FPE_REMOVING set. */
+	clear_bit(__FPE_TASK_SCHED, &priv->fpe_task_state);
+	clear_bit(__FPE_REMOVING, &priv->fpe_task_state);
+	netdev_info(priv->dev, "FPE workqueue start");
+
+	return 0;
+}
+
 /**
  * stmmac_hw_setup - setup mac in a usable state.
  *  @dev : pointer to the device structure.
@@ -2929,6 +2968,13 @@ static int stmmac_hw_setup(struct net_device *dev, bool init_ptp)
 	/* Start the ball rolling... */
 	stmmac_start_all_dma(priv);
 
+	if (priv->dma_cap.fpesel) {
+		stmmac_fpe_start_wq(priv);
+
+		if (priv->plat->fpe_cfg->enable)
+			stmmac_fpe_handshake(priv, true);
+	}
+
 	return 0;
 }
 
@@ -3090,6 +3136,16 @@ init_phy_error:
 	return ret;
 }
 
+static void stmmac_fpe_stop_wq(struct stmmac_priv *priv)
+{
+	set_bit(__FPE_REMOVING, &priv->fpe_task_state);	/* blocks new queue_work() */
+	if (priv->fpe_wq) {
+		destroy_workqueue(priv->fpe_wq);
+		priv->fpe_wq = NULL;	/* leave no stale handle behind */
+	}
+	netdev_info(priv->dev, "FPE workqueue stop");
+}
+
 /**
  *  stmmac_release - close entry point of the driver
  *  @dev : device pointer.
@@ -3139,6 +3195,9 @@ static int stmmac_release(struct net_device *dev)
 
 	pm_runtime_put(priv->device);
 
+	if (priv->dma_cap.fpesel)
+		stmmac_fpe_stop_wq(priv);
+
 	return 0;
 }
 
@@ -4280,6 +4339,48 @@ static int stmmac_set_features(struct net_device *netdev,
 	return 0;
 }
 
+static void stmmac_fpe_event_status(struct stmmac_priv *priv, int status)
+{
+	struct stmmac_fpe_cfg *fpe_cfg = priv->plat->fpe_cfg;
+	enum stmmac_fpe_state *lo_state = &fpe_cfg->lo_fpe_state;
+	enum stmmac_fpe_state *lp_state = &fpe_cfg->lp_fpe_state;
+	bool *hs_enable = &fpe_cfg->hs_enable;
+
+	if (status == FPE_EVENT_UNKNOWN || !*hs_enable)
+		return;	/* no FPE event reported, or handshake not enabled */
+
+	/* If LP has sent verify mPacket, LP is FPE capable */
+	if ((status & FPE_EVENT_RVER) == FPE_EVENT_RVER) {
+		if (*lp_state < FPE_STATE_CAPABLE)
+			*lp_state = FPE_STATE_CAPABLE;
+
+		/* Respond right away (hs_enable was already checked above) */
+		if (*hs_enable)
+			stmmac_fpe_send_mpacket(priv, priv->ioaddr,
+						MPACKET_RESPONSE);
+	}
+
+	/* If Local has sent verify mPacket, Local is FPE capable */
+	if ((status & FPE_EVENT_TVER) == FPE_EVENT_TVER) {
+		if (*lo_state < FPE_STATE_CAPABLE)
+			*lo_state = FPE_STATE_CAPABLE;
+	}
+
+	/* If LP has sent response mPacket, LP is entering FPE ON */
+	if ((status & FPE_EVENT_RRSP) == FPE_EVENT_RRSP)
+		*lp_state = FPE_STATE_ENTERING_ON;
+
+	/* If Local has sent response mPacket, Local is entering FPE ON */
+	if ((status & FPE_EVENT_TRSP) == FPE_EVENT_TRSP)
+		*lo_state = FPE_STATE_ENTERING_ON;
+
+	if (!test_bit(__FPE_REMOVING, &priv->fpe_task_state) && /* not stopping */
+	    !test_and_set_bit(__FPE_TASK_SCHED, &priv->fpe_task_state) && /* idle */
+	    priv->fpe_wq) {
+		queue_work(priv->fpe_wq, &priv->fpe_task);
+	}
+}
+
 /**
  *  stmmac_interrupt - main ISR
  *  @irq: interrupt number.
@@ -4318,6 +4419,13 @@ static irqreturn_t stmmac_interrupt(int irq, void *dev_id)
 		stmmac_est_irq_status(priv, priv->ioaddr, priv->dev,
 				      &priv->xstats, tx_cnt);
 
+	if (priv->dma_cap.fpesel) {
+		int status = stmmac_fpe_irq_status(priv, priv->ioaddr,
+						   priv->dev);
+
+		stmmac_fpe_event_status(priv, status);
+	}
+
 	/* To handle GMAC own interrupts */
 	if ((priv->plat->has_gmac) || xmac) {
 		int status = stmmac_host_irq_status(priv, priv->hw, &priv->xstats);
@@ -5065,6 +5173,68 @@ int stmmac_reinit_ringparam(struct net_device *dev, u32 rx_size, u32 tx_size)
 	return ret;
 }
 
+#define SEND_VERIFY_MPAKCET_FMT "Send Verify mPacket lo_state=%d lp_state=%d\n"
+static void stmmac_fpe_lp_task(struct work_struct *work)
+{
+	struct stmmac_priv *priv = container_of(work, struct stmmac_priv,
+						fpe_task);
+	struct stmmac_fpe_cfg *fpe_cfg = priv->plat->fpe_cfg;
+	enum stmmac_fpe_state *lo_state = &fpe_cfg->lo_fpe_state;
+	enum stmmac_fpe_state *lp_state = &fpe_cfg->lp_fpe_state;
+	bool *hs_enable = &fpe_cfg->hs_enable;
+	bool *enable = &fpe_cfg->enable;
+	int retries = 20;	/* 20 passes with msleep(500): about 10 s at most */
+
+	while (retries-- > 0) {
+		/* Bail out immediately if FPE handshake is OFF */
+		if (*lo_state == FPE_STATE_OFF || !*hs_enable)
+			break;
+
+		if (*lo_state == FPE_STATE_ENTERING_ON &&
+		    *lp_state == FPE_STATE_ENTERING_ON) {	/* both sides ready */
+			stmmac_fpe_configure(priv, priv->ioaddr,
+					     priv->plat->tx_queues_to_use,
+					     priv->plat->rx_queues_to_use,
+					     *enable);
+
+			netdev_info(priv->dev, "configured FPE\n");
+
+			*lo_state = FPE_STATE_ON;
+			*lp_state = FPE_STATE_ON;
+			netdev_info(priv->dev, "!!! BOTH FPE stations ON\n");
+			break;
+		}
+
+		if ((*lo_state == FPE_STATE_CAPABLE ||
+		     *lo_state == FPE_STATE_ENTERING_ON) &&
+		     *lp_state != FPE_STATE_ON) {	/* LP not ON yet: verify again */
+			netdev_info(priv->dev, SEND_VERIFY_MPAKCET_FMT,
+				    *lo_state, *lp_state);
+			stmmac_fpe_send_mpacket(priv, priv->ioaddr,
+						MPACKET_VERIFY);
+		}
+		/* Sleep then retry */
+		msleep(500);
+	}
+
+	clear_bit(__FPE_TASK_SCHED, &priv->fpe_task_state); /* may be queued again */
+}
+
+void stmmac_fpe_handshake(struct stmmac_priv *priv, bool enable)
+{
+	if (priv->plat->fpe_cfg->hs_enable != enable) {	/* act only on a change */
+		if (enable) {	/* enabling: open with a verify mPacket */
+			stmmac_fpe_send_mpacket(priv, priv->ioaddr,
+						MPACKET_VERIFY);
+		} else {	/* disabling: drop both stations to OFF */
+			priv->plat->fpe_cfg->lo_fpe_state = FPE_STATE_OFF;
+			priv->plat->fpe_cfg->lp_fpe_state = FPE_STATE_OFF;
+		}
+
+		priv->plat->fpe_cfg->hs_enable = enable;
+	}
+}
+
 /**
  * stmmac_dvr_probe
  * @device: device pointer
@@ -5122,6 +5292,9 @@ int stmmac_dvr_probe(struct device *device,
 
 	INIT_WORK(&priv->service_task, stmmac_service_task);
 
+	/* Initialize Link Partner FPE workqueue */
+	INIT_WORK(&priv->fpe_task, stmmac_fpe_lp_task);
+
 	/* Override with kernel parameters if supplied XXX CRS XXX
 	 * this needs to have multiple instances
 	 */
@@ -5435,8 +5608,18 @@ int stmmac_suspend(struct device *dev)
 		if (ret)
 			return ret;
 	}
+
 	mutex_unlock(&priv->lock);
 
+	if (priv->dma_cap.fpesel) {
+		/* Disable FPE */
+		stmmac_fpe_configure(priv, priv->ioaddr,
+				     priv->plat->tx_queues_to_use,
+				     priv->plat->rx_queues_to_use, false);
+
+		stmmac_fpe_handshake(priv, false);
+	}
+
 	priv->speed = SPEED_UNKNOWN;
 	return 0;
 }
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
index b80cb2985b39..1d84ee359808 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
@@ -297,6 +297,17 @@ static int tc_init(struct stmmac_priv *priv)
 
 	dev_info(priv->device, "Enabling HW TC (entries=%d, max_off=%d)\n",
 			priv->tc_entries_max, priv->tc_off_max);
+
+	if (!priv->plat->fpe_cfg) {
+		priv->plat->fpe_cfg = devm_kzalloc(priv->device,
+						   sizeof(*priv->plat->fpe_cfg),
+						   GFP_KERNEL);
+		if (!priv->plat->fpe_cfg)
+			return -ENOMEM;
+	} else {
+		memset(priv->plat->fpe_cfg, 0, sizeof(*priv->plat->fpe_cfg));
+	}
+
 	return 0;
 }
 
@@ -829,13 +840,10 @@ static int tc_setup_taprio(struct stmmac_priv *priv,
 	if (fpe && !priv->dma_cap.fpesel)
 		return -EOPNOTSUPP;
 
-	ret = stmmac_fpe_configure(priv, priv->ioaddr,
-				   priv->plat->tx_queues_to_use,
-				   priv->plat->rx_queues_to_use, fpe);
-	if (ret && fpe) {
-		netdev_err(priv->dev, "failed to enable Frame Preemption\n");
-		return ret;
-	}
+	/* The actual FPE register configuration is done once the FPE
+	 * handshake has succeeded.
+	 */
+	priv->plat->fpe_cfg->enable = fpe;
 
 	ret = stmmac_est_configure(priv, priv->ioaddr, priv->plat->est,
 				   priv->plat->clk_ptp_rate);
@@ -845,12 +853,29 @@ static int tc_setup_taprio(struct stmmac_priv *priv,
 	}
 
 	netdev_info(priv->dev, "configured EST\n");
+
+	if (fpe) {
+		stmmac_fpe_handshake(priv, true);
+		netdev_info(priv->dev, "start FPE handshake\n");
+	}
+
 	return 0;
 
 disable:
 	priv->plat->est->enable = false;
 	stmmac_est_configure(priv, priv->ioaddr, priv->plat->est,
 			     priv->plat->clk_ptp_rate);
+
+	priv->plat->fpe_cfg->enable = false;
+	stmmac_fpe_configure(priv, priv->ioaddr,
+			     priv->plat->tx_queues_to_use,
+			     priv->plat->rx_queues_to_use,
+			     false);
+	netdev_info(priv->dev, "disabled FPE\n");
+
+	stmmac_fpe_handshake(priv, false);
+	netdev_info(priv->dev, "stop FPE handshake\n");
+
 	return ret;
 }
 
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 5134e802f39a..febdb43d27e5 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -144,6 +144,32 @@ struct stmmac_txq_cfg {
 	int tbs_en;
 };
 
+/* FPE link state */
+enum stmmac_fpe_state {
+	FPE_STATE_OFF = 0,
+	FPE_STATE_CAPABLE = 1,
+	FPE_STATE_ENTERING_ON = 2,
+	FPE_STATE_ON = 3,
+};
+
+/* FPE link-partner hand-shaking mPacket type */
+enum stmmac_mpacket_type {
+	MPACKET_VERIFY = 0,
+	MPACKET_RESPONSE = 1,
+};
+
+enum stmmac_fpe_task_state_t {
+	__FPE_REMOVING,
+	__FPE_TASK_SCHED,
+};
+
+struct stmmac_fpe_cfg {
+	bool enable;				/* FPE enable */
+	bool hs_enable;				/* FPE handshake enable */
+	enum stmmac_fpe_state lp_fpe_state;	/* Link Partner FPE state */
+	enum stmmac_fpe_state lo_fpe_state;	/* Local station FPE state */
+};
+
 struct plat_stmmacenet_data {
 	int bus_id;
 	int phy_addr;
@@ -155,6 +181,7 @@ struct plat_stmmacenet_data {
 	struct device_node *mdio_node;
 	struct stmmac_dma_cfg *dma_cfg;
 	struct stmmac_est *est;
+	struct stmmac_fpe_cfg *fpe_cfg;
 	int clk_csr;
 	int has_gmac;
 	int enh_desc;
-- 
cgit v1.2.3


From 8af856d18bfbe89676ade38caa2a5d06f75f211d Mon Sep 17 00:00:00 2001
From: Shaokun Zhang <zhangshaokun@hisilicon.com>
Date: Wed, 24 Mar 2021 13:40:40 +0800
Subject: locking/mutex: Remove repeated declaration

Commit 0cd39f4600ed ("locking/seqlock, headers: Untangle the spaghetti monster")
introduces 'struct ww_acquire_ctx' again, remove the repeated declaration and move
the pre-declarations to the top.

Signed-off-by: Shaokun Zhang <zhangshaokun@hisilicon.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Waiman Long <longman@redhat.com>
Link: https://lore.kernel.org/r/1616564440-61318-1-git-send-email-zhangshaokun@hisilicon.com
---
 include/linux/mutex.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 0cd631a19727..e7a126796937 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -20,6 +20,7 @@
 #include <linux/osq_lock.h>
 #include <linux/debug_locks.h>
 
+struct ww_class;
 struct ww_acquire_ctx;
 
 /*
@@ -65,9 +66,6 @@ struct mutex {
 #endif
 };
 
-struct ww_class;
-struct ww_acquire_ctx;
-
 struct ww_mutex {
 	struct mutex base;
 	struct ww_acquire_ctx *ctx;
-- 
cgit v1.2.3


From 3bf30882c3c7b6e376d9d6d04082c9aa2d2ac30a Mon Sep 17 00:00:00 2001
From: Shaokun Zhang <zhangshaokun@hisilicon.com>
Date: Mon, 8 Mar 2021 14:50:35 +0800
Subject: drivers/perf: hisi: Add support for HiSilicon SLLC PMU driver

HiSilicon's Hip09 comprises multiple dies that can be connected by the SLLC
module (Skyros Link Layer Controller). It has separate PMU registers that
the driver can program freely, and an interrupt is supported to handle
counter overflow. Let's support it under the framework of the HiSilicon
uncore PMU driver.

SLLC PMU supports the following filter functions:
* tracetag_en: allows user to count data according to tt_req or
tt_core set in L3C PMU.

* srcid_cmd & srcid_msk: allows user to filter statistics that come from
specific CCL/ICL by configuration source ID.

* tgtid_min & tgtid_max: allows user to count only the operations that
go to a specific CCL/ICL, selected by a target ID or a target ID
range. Like the source ID, the target ID is 11 bits wide in
the SoC. More introduction is added in documentation:
Documentation/admin-guide/perf/hisi-pmu.rst

Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: John Garry <john.garry@huawei.com>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: John Garry <john.garry@huawei.com>
Co-developed-by: Qi Liu <liuqi115@huawei.com>
Signed-off-by: Qi Liu <liuqi115@huawei.com>
Signed-off-by: Shaokun Zhang <zhangshaokun@hisilicon.com>
Link: https://lore.kernel.org/r/1615186237-22263-8-git-send-email-zhangshaokun@hisilicon.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/hisilicon/Makefile               |   2 +-
 drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c | 530 ++++++++++++++++++++++++++
 include/linux/cpuhotplug.h                    |   1 +
 3 files changed, 532 insertions(+), 1 deletion(-)
 create mode 100644 drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c

(limited to 'include/linux')

diff --git a/drivers/perf/hisilicon/Makefile b/drivers/perf/hisilicon/Makefile
index e8377061845f..6600a9d45dd8 100644
--- a/drivers/perf/hisilicon/Makefile
+++ b/drivers/perf/hisilicon/Makefile
@@ -1,3 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0-only
 obj-$(CONFIG_HISI_PMU) += hisi_uncore_pmu.o hisi_uncore_l3c_pmu.o \
-			  hisi_uncore_hha_pmu.o hisi_uncore_ddrc_pmu.o
+			  hisi_uncore_hha_pmu.o hisi_uncore_ddrc_pmu.o hisi_uncore_sllc_pmu.o
diff --git a/drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c b/drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c
new file mode 100644
index 000000000000..46be312fa126
--- /dev/null
+++ b/drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c
@@ -0,0 +1,530 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * HiSilicon SLLC uncore Hardware event counters support
+ *
+ * Copyright (C) 2020 Hisilicon Limited
+ * Author: Shaokun Zhang <zhangshaokun@hisilicon.com>
+ *
+ * This code is based on the uncore PMUs like arm-cci and arm-ccn.
+ */
+#include <linux/acpi.h>
+#include <linux/cpuhotplug.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/list.h>
+#include <linux/smp.h>
+
+#include "hisi_uncore_pmu.h"
+
+/* SLLC register definition */
+#define SLLC_INT_MASK			0x0814
+#define SLLC_INT_STATUS			0x0818
+#define SLLC_INT_CLEAR			0x081c
+#define SLLC_PERF_CTRL			0x1c00
+#define SLLC_SRCID_CTRL			0x1c04
+#define SLLC_TGTID_CTRL			0x1c08
+#define SLLC_EVENT_CTRL			0x1c14
+#define SLLC_EVENT_TYPE0		0x1c18
+#define SLLC_VERSION			0x1cf0
+#define SLLC_EVENT_CNT0_L		0x1d00
+
+#define SLLC_EVTYPE_MASK		0xff
+#define SLLC_PERF_CTRL_EN		BIT(0)
+#define SLLC_FILT_EN			BIT(1)
+#define SLLC_TRACETAG_EN		BIT(2)
+#define SLLC_SRCID_EN			BIT(4)
+#define SLLC_SRCID_NONE			0x0
+#define SLLC_TGTID_EN			BIT(5)
+#define SLLC_TGTID_NONE			0x0
+#define SLLC_TGTID_MIN_SHIFT		1
+#define SLLC_TGTID_MAX_SHIFT		12
+#define SLLC_SRCID_CMD_SHIFT		1
+#define SLLC_SRCID_MSK_SHIFT		12
+#define SLLC_NR_EVENTS			0x80
+
+HISI_PMU_EVENT_ATTR_EXTRACTOR(tgtid_min, config1, 10, 0);
+HISI_PMU_EVENT_ATTR_EXTRACTOR(tgtid_max, config1, 21, 11);
+HISI_PMU_EVENT_ATTR_EXTRACTOR(srcid_cmd, config1, 32, 22);
+HISI_PMU_EVENT_ATTR_EXTRACTOR(srcid_msk, config1, 43, 33);
+HISI_PMU_EVENT_ATTR_EXTRACTOR(tracetag_en, config1, 44, 44);
+
+static bool tgtid_is_valid(u32 max, u32 min)
+{
+	return max > 0 && max >= min;
+}
+
+/* Turn on tracetag filtering when the event's tracetag_en field is set. */
+static void hisi_sllc_pmu_enable_tracetag(struct perf_event *event)
+{
+	struct hisi_pmu *sllc_pmu = to_hisi_pmu(event->pmu);
+	u32 ctrl;
+
+	if (!hisi_get_tracetag_en(event))
+		return;
+
+	ctrl = readl(sllc_pmu->base + SLLC_PERF_CTRL);
+	ctrl |= SLLC_TRACETAG_EN | SLLC_FILT_EN;
+	writel(ctrl, sllc_pmu->base + SLLC_PERF_CTRL);
+}
+
+/* Turn off tracetag filtering when the event's tracetag_en field is set. */
+static void hisi_sllc_pmu_disable_tracetag(struct perf_event *event)
+{
+	struct hisi_pmu *sllc_pmu = to_hisi_pmu(event->pmu);
+	u32 ctrl;
+
+	if (!hisi_get_tracetag_en(event))
+		return;
+
+	ctrl = readl(sllc_pmu->base + SLLC_PERF_CTRL);
+	ctrl &= ~(SLLC_TRACETAG_EN | SLLC_FILT_EN);
+	writel(ctrl, sllc_pmu->base + SLLC_PERF_CTRL);
+}
+
+/* Program a valid target ID range and switch on target ID filtering. */
+static void hisi_sllc_pmu_config_tgtid(struct perf_event *event)
+{
+	struct hisi_pmu *sllc_pmu = to_hisi_pmu(event->pmu);
+	u32 min = hisi_get_tgtid_min(event);
+	u32 max = hisi_get_tgtid_max(event);
+	u32 ctrl;
+
+	if (!tgtid_is_valid(max, min))
+		return;
+	writel((max << SLLC_TGTID_MAX_SHIFT) | (min << SLLC_TGTID_MIN_SHIFT),
+	       sllc_pmu->base + SLLC_TGTID_CTRL);
+	ctrl = readl(sllc_pmu->base + SLLC_PERF_CTRL);
+	ctrl |= SLLC_TGTID_EN | SLLC_FILT_EN;
+	writel(ctrl, sllc_pmu->base + SLLC_PERF_CTRL);
+}
+
+/* Reset the target ID range and switch off target ID filtering. */
+static void hisi_sllc_pmu_clear_tgtid(struct perf_event *event)
+{
+	struct hisi_pmu *sllc_pmu = to_hisi_pmu(event->pmu);
+	u32 min = hisi_get_tgtid_min(event);
+	u32 max = hisi_get_tgtid_max(event);
+	u32 ctrl;
+
+	if (!tgtid_is_valid(max, min))
+		return;
+
+	writel(SLLC_TGTID_NONE, sllc_pmu->base + SLLC_TGTID_CTRL);
+	ctrl = readl(sllc_pmu->base + SLLC_PERF_CTRL);
+	ctrl &= ~(SLLC_TGTID_EN | SLLC_FILT_EN);
+	writel(ctrl, sllc_pmu->base + SLLC_PERF_CTRL);
+}
+
+/* Program srcid cmd/mask and enable srcid filtering; no-op if cmd is zero. */
+static void hisi_sllc_pmu_config_srcid(struct perf_event *event)
+{
+	struct hisi_pmu *sllc_pmu = to_hisi_pmu(event->pmu);
+	u32 cmd = hisi_get_srcid_cmd(event);
+	u32 msk, ctrl;
+
+	if (!cmd)
+		return;
+
+	msk = hisi_get_srcid_msk(event);
+	writel((cmd << SLLC_SRCID_CMD_SHIFT) | (msk << SLLC_SRCID_MSK_SHIFT),
+	       sllc_pmu->base + SLLC_SRCID_CTRL);
+	ctrl = readl(sllc_pmu->base + SLLC_PERF_CTRL);
+	ctrl |= SLLC_SRCID_EN | SLLC_FILT_EN;
+	writel(ctrl, sllc_pmu->base + SLLC_PERF_CTRL);
+}
+
+/* Undo hisi_sllc_pmu_config_srcid(); does nothing when srcid_cmd is zero. */
+static void hisi_sllc_pmu_clear_srcid(struct perf_event *event)
+{
+	struct hisi_pmu *sllc_pmu = to_hisi_pmu(event->pmu);
+	u32 ctrl;
+
+	if (!hisi_get_srcid_cmd(event))
+		return;
+
+	writel(SLLC_SRCID_NONE, sllc_pmu->base + SLLC_SRCID_CTRL);
+	/* Drop the srcid and filter enable bits */
+	ctrl = readl(sllc_pmu->base + SLLC_PERF_CTRL);
+	ctrl &= ~(SLLC_SRCID_EN | SLLC_FILT_EN);
+	writel(ctrl, sllc_pmu->base + SLLC_PERF_CTRL);
+}
+
+static void hisi_sllc_pmu_enable_filter(struct perf_event *event)
+{
+	if (event->attr.config1 != 0x0) {
+		hisi_sllc_pmu_enable_tracetag(event);
+		hisi_sllc_pmu_config_srcid(event);
+		hisi_sllc_pmu_config_tgtid(event);
+	}
+}
+
+static void hisi_sllc_pmu_clear_filter(struct perf_event *event)
+{
+	if (event->attr.config1 != 0x0) {
+		hisi_sllc_pmu_disable_tracetag(event);
+		hisi_sllc_pmu_clear_srcid(event);
+		hisi_sllc_pmu_clear_tgtid(event);
+	}
+}
+
+static u32 hisi_sllc_pmu_get_counter_offset(int idx)
+{
+	return (SLLC_EVENT_CNT0_L + idx * 8);
+}
+
+static u64 hisi_sllc_pmu_read_counter(struct hisi_pmu *sllc_pmu,
+				      struct hw_perf_event *hwc)
+{
+	return readq(sllc_pmu->base +
+		     hisi_sllc_pmu_get_counter_offset(hwc->idx));
+}
+
+static void hisi_sllc_pmu_write_counter(struct hisi_pmu *sllc_pmu,
+					struct hw_perf_event *hwc, u64 val)
+{
+	writeq(val, sllc_pmu->base +
+	       hisi_sllc_pmu_get_counter_offset(hwc->idx));
+}
+
+static void hisi_sllc_pmu_write_evtype(struct hisi_pmu *sllc_pmu, int idx,
+				       u32 type)
+{
+	u32 reg, reg_idx, shift, val;
+
+	/*
+	 * Select the appropriate event select register(SLLC_EVENT_TYPE0/1).
+	 * There are 2 event select registers for the 8 hardware counters.
+	 * Event code is 8-bits and for the former 4 hardware counters,
+	 * SLLC_EVENT_TYPE0 is chosen. For the latter 4 hardware counters,
+	 * SLLC_EVENT_TYPE1 is chosen.
+	 */
+	reg = SLLC_EVENT_TYPE0 + (idx / 4) * 4;
+	reg_idx = idx % 4;
+	shift = 8 * reg_idx;
+
+	/* Write event code to SLLC_EVENT_TYPEx Register */
+	val = readl(sllc_pmu->base + reg);
+	val &= ~(SLLC_EVTYPE_MASK << shift);
+	val |= (type << shift);
+	writel(val, sllc_pmu->base + reg);
+}
+
+static void hisi_sllc_pmu_start_counters(struct hisi_pmu *sllc_pmu)
+{
+	u32 val;
+
+	val = readl(sllc_pmu->base + SLLC_PERF_CTRL);
+	val |= SLLC_PERF_CTRL_EN;
+	writel(val, sllc_pmu->base + SLLC_PERF_CTRL);
+}
+
+static void hisi_sllc_pmu_stop_counters(struct hisi_pmu *sllc_pmu)
+{
+	u32 val;
+
+	val = readl(sllc_pmu->base + SLLC_PERF_CTRL);
+	val &= ~(SLLC_PERF_CTRL_EN);
+	writel(val, sllc_pmu->base + SLLC_PERF_CTRL);
+}
+
+static void hisi_sllc_pmu_enable_counter(struct hisi_pmu *sllc_pmu,
+					 struct hw_perf_event *hwc)
+{
+	u32 val;
+
+	val = readl(sllc_pmu->base + SLLC_EVENT_CTRL);
+	val |= 1 << hwc->idx;
+	writel(val, sllc_pmu->base + SLLC_EVENT_CTRL);
+}
+
+static void hisi_sllc_pmu_disable_counter(struct hisi_pmu *sllc_pmu,
+					  struct hw_perf_event *hwc)
+{
+	u32 val;
+
+	val = readl(sllc_pmu->base + SLLC_EVENT_CTRL);
+	val &= ~(1 << hwc->idx);
+	writel(val, sllc_pmu->base + SLLC_EVENT_CTRL);
+}
+
+static void hisi_sllc_pmu_enable_counter_int(struct hisi_pmu *sllc_pmu,
+					     struct hw_perf_event *hwc)
+{
+	u32 val;
+
+	val = readl(sllc_pmu->base + SLLC_INT_MASK);
+	/* Write 0 to enable interrupt */
+	val &= ~(1 << hwc->idx);
+	writel(val, sllc_pmu->base + SLLC_INT_MASK);
+}
+
+static void hisi_sllc_pmu_disable_counter_int(struct hisi_pmu *sllc_pmu,
+					      struct hw_perf_event *hwc)
+{
+	u32 val;
+
+	val = readl(sllc_pmu->base + SLLC_INT_MASK);
+	/* Write 1 to mask interrupt */
+	val |= 1 << hwc->idx;
+	writel(val, sllc_pmu->base + SLLC_INT_MASK);
+}
+
+static u32 hisi_sllc_pmu_get_int_status(struct hisi_pmu *sllc_pmu)
+{
+	return readl(sllc_pmu->base + SLLC_INT_STATUS);
+}
+
+static void hisi_sllc_pmu_clear_int_status(struct hisi_pmu *sllc_pmu, int idx)
+{
+	writel(1 << idx, sllc_pmu->base + SLLC_INT_CLEAR);
+}
+
+static const struct acpi_device_id hisi_sllc_pmu_acpi_match[] = {
+	{ "HISI0263", },
+	{}
+};
+MODULE_DEVICE_TABLE(acpi, hisi_sllc_pmu_acpi_match);
+
+static int hisi_sllc_pmu_init_data(struct platform_device *pdev,
+				   struct hisi_pmu *sllc_pmu)
+{
+	/*
+	 * Use the SCCL_ID and the index ID to identify the SLLC PMU,
+	 * while SCCL_ID is from MPIDR_EL1 by CPU.
+	 */
+	if (device_property_read_u32(&pdev->dev, "hisilicon,scl-id",
+				     &sllc_pmu->sccl_id)) {
+		dev_err(&pdev->dev, "Cannot read sccl-id!\n");
+		return -EINVAL;
+	}
+
+	if (device_property_read_u32(&pdev->dev, "hisilicon,idx-id",
+				     &sllc_pmu->index_id)) {
+		dev_err(&pdev->dev, "Cannot read idx-id!\n");
+		return -EINVAL;
+	}
+
+	/* SLLC PMUs only share the same SCCL */
+	sllc_pmu->ccl_id = -1;
+
+	sllc_pmu->base = devm_platform_ioremap_resource(pdev, 0);
+	if (IS_ERR(sllc_pmu->base)) {
+		dev_err(&pdev->dev, "ioremap failed for sllc_pmu resource.\n");
+		return PTR_ERR(sllc_pmu->base);
+	}
+
+	sllc_pmu->identifier = readl(sllc_pmu->base + SLLC_VERSION);
+
+	return 0;
+}
+
+static struct attribute *hisi_sllc_pmu_v2_format_attr[] = {
+	HISI_PMU_FORMAT_ATTR(event, "config:0-7"),
+	HISI_PMU_FORMAT_ATTR(tgtid_min, "config1:0-10"),
+	HISI_PMU_FORMAT_ATTR(tgtid_max, "config1:11-21"),
+	HISI_PMU_FORMAT_ATTR(srcid_cmd, "config1:22-32"),
+	HISI_PMU_FORMAT_ATTR(srcid_msk, "config1:33-43"),
+	HISI_PMU_FORMAT_ATTR(tracetag_en, "config1:44"),
+	NULL
+};
+
+static const struct attribute_group hisi_sllc_pmu_v2_format_group = {
+	.name = "format",
+	.attrs = hisi_sllc_pmu_v2_format_attr,
+};
+
+static struct attribute *hisi_sllc_pmu_v2_events_attr[] = {
+	HISI_PMU_EVENT_ATTR(rx_req,             0x30),
+	HISI_PMU_EVENT_ATTR(rx_data,            0x31),
+	HISI_PMU_EVENT_ATTR(tx_req,             0x34),
+	HISI_PMU_EVENT_ATTR(tx_data,            0x35),
+	HISI_PMU_EVENT_ATTR(cycles,             0x09),
+	NULL
+};
+
+static const struct attribute_group hisi_sllc_pmu_v2_events_group = {
+	.name = "events",
+	.attrs = hisi_sllc_pmu_v2_events_attr,
+};
+
+static DEVICE_ATTR(cpumask, 0444, hisi_cpumask_sysfs_show, NULL);
+
+static struct attribute *hisi_sllc_pmu_cpumask_attrs[] = {
+	&dev_attr_cpumask.attr,
+	NULL
+};
+
+static const struct attribute_group hisi_sllc_pmu_cpumask_attr_group = {
+	.attrs = hisi_sllc_pmu_cpumask_attrs,
+};
+
+static struct device_attribute hisi_sllc_pmu_identifier_attr =
+	__ATTR(identifier, 0444, hisi_uncore_pmu_identifier_attr_show, NULL);
+
+static struct attribute *hisi_sllc_pmu_identifier_attrs[] = {
+	&hisi_sllc_pmu_identifier_attr.attr,
+	NULL
+};
+
+static struct attribute_group hisi_sllc_pmu_identifier_group = {
+	.attrs = hisi_sllc_pmu_identifier_attrs,
+};
+
+static const struct attribute_group *hisi_sllc_pmu_v2_attr_groups[] = {
+	&hisi_sllc_pmu_v2_format_group,
+	&hisi_sllc_pmu_v2_events_group,
+	&hisi_sllc_pmu_cpumask_attr_group,
+	&hisi_sllc_pmu_identifier_group,
+	NULL
+};
+
+static const struct hisi_uncore_ops hisi_uncore_sllc_ops = {
+	.write_evtype		= hisi_sllc_pmu_write_evtype,
+	.get_event_idx		= hisi_uncore_pmu_get_event_idx,
+	.start_counters		= hisi_sllc_pmu_start_counters,
+	.stop_counters		= hisi_sllc_pmu_stop_counters,
+	.enable_counter		= hisi_sllc_pmu_enable_counter,
+	.disable_counter	= hisi_sllc_pmu_disable_counter,
+	.enable_counter_int	= hisi_sllc_pmu_enable_counter_int,
+	.disable_counter_int	= hisi_sllc_pmu_disable_counter_int,
+	.write_counter		= hisi_sllc_pmu_write_counter,
+	.read_counter		= hisi_sllc_pmu_read_counter,
+	.get_int_status		= hisi_sllc_pmu_get_int_status,
+	.clear_int_status	= hisi_sllc_pmu_clear_int_status,
+	.enable_filter		= hisi_sllc_pmu_enable_filter,
+	.disable_filter		= hisi_sllc_pmu_clear_filter,
+};
+
+static int hisi_sllc_pmu_dev_probe(struct platform_device *pdev,
+				   struct hisi_pmu *sllc_pmu)
+{
+	int ret;
+
+	ret = hisi_sllc_pmu_init_data(pdev, sllc_pmu);
+	if (ret)
+		return ret;
+
+	ret = hisi_uncore_pmu_init_irq(sllc_pmu, pdev);
+	if (ret)
+		return ret;
+
+	sllc_pmu->pmu_events.attr_groups = hisi_sllc_pmu_v2_attr_groups;
+	sllc_pmu->ops = &hisi_uncore_sllc_ops;
+	sllc_pmu->check_event = SLLC_NR_EVENTS;
+	sllc_pmu->counter_bits = 64;
+	sllc_pmu->num_counters = 8;
+	sllc_pmu->dev = &pdev->dev;
+	sllc_pmu->on_cpu = -1;
+
+	return 0;
+}
+
+/* Probe one SLLC PMU: add its CPU hotplug instance, then register with perf. */
+static int hisi_sllc_pmu_probe(struct platform_device *pdev)
+{
+	struct hisi_pmu *sllc_pmu;
+	char *name;
+	int ret;
+
+	sllc_pmu = devm_kzalloc(&pdev->dev, sizeof(*sllc_pmu), GFP_KERNEL);
+	if (!sllc_pmu)
+		return -ENOMEM;
+
+	ret = hisi_sllc_pmu_dev_probe(pdev, sllc_pmu);
+	if (ret)
+		return ret;
+
+	name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "hisi_sccl%u_sllc%u",
+			      sllc_pmu->sccl_id, sllc_pmu->index_id);
+	if (!name)
+		return -ENOMEM;
+
+	ret = cpuhp_state_add_instance(CPUHP_AP_PERF_ARM_HISI_SLLC_ONLINE,
+				       &sllc_pmu->node);
+	if (ret) {
+		dev_err(&pdev->dev, "Error %d registering hotplug\n", ret);
+		return ret;
+	}
+
+	sllc_pmu->pmu = (struct pmu) {
+		.module		= THIS_MODULE,
+		.task_ctx_nr	= perf_invalid_context,
+		.event_init	= hisi_uncore_pmu_event_init,
+		.pmu_enable	= hisi_uncore_pmu_enable,
+		.pmu_disable	= hisi_uncore_pmu_disable,
+		.add		= hisi_uncore_pmu_add,
+		.del		= hisi_uncore_pmu_del,
+		.start		= hisi_uncore_pmu_start,
+		.stop		= hisi_uncore_pmu_stop,
+		.read		= hisi_uncore_pmu_read,
+		.attr_groups    = sllc_pmu->pmu_events.attr_groups,
+		.capabilities	= PERF_PMU_CAP_NO_EXCLUDE,
+	};
+	ret = perf_pmu_register(&sllc_pmu->pmu, name, -1);
+	if (ret) {
+		dev_err(sllc_pmu->dev, "PMU register failed, ret = %d\n", ret);
+		/* _nocalls: skip the offline callback, the PMU never registered */
+		cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_HISI_SLLC_ONLINE,
+						    &sllc_pmu->node);
+		irq_set_affinity_hint(sllc_pmu->irq, NULL);
+		return ret;
+	}
+
+	platform_set_drvdata(pdev, sllc_pmu);
+	return 0;
+}
+
+static int hisi_sllc_pmu_remove(struct platform_device *pdev)
+{
+	struct hisi_pmu *sllc_pmu = platform_get_drvdata(pdev);
+
+	perf_pmu_unregister(&sllc_pmu->pmu);
+	cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_HISI_SLLC_ONLINE,
+					    &sllc_pmu->node);
+	irq_set_affinity_hint(sllc_pmu->irq, NULL);
+
+	return 0;
+}
+
+static struct platform_driver hisi_sllc_pmu_driver = {
+	.driver = {
+		.name = "hisi_sllc_pmu",
+		.acpi_match_table = hisi_sllc_pmu_acpi_match,
+		.suppress_bind_attrs = true,
+	},
+	.probe = hisi_sllc_pmu_probe,
+	.remove = hisi_sllc_pmu_remove,
+};
+
+static int __init hisi_sllc_pmu_module_init(void)
+{
+	int ret;
+
+	ret = cpuhp_setup_state_multi(CPUHP_AP_PERF_ARM_HISI_SLLC_ONLINE,
+				      "AP_PERF_ARM_HISI_SLLC_ONLINE",
+				      hisi_uncore_pmu_online_cpu,
+				      hisi_uncore_pmu_offline_cpu);
+	if (ret) {
+		pr_err("SLLC PMU: cpuhp state setup failed, ret = %d\n", ret);
+		return ret;
+	}
+
+	ret = platform_driver_register(&hisi_sllc_pmu_driver);
+	if (ret)
+		cpuhp_remove_multi_state(CPUHP_AP_PERF_ARM_HISI_SLLC_ONLINE);
+
+	return ret;
+}
+module_init(hisi_sllc_pmu_module_init);
+
+static void __exit hisi_sllc_pmu_module_exit(void)
+{
+	platform_driver_unregister(&hisi_sllc_pmu_driver);
+	cpuhp_remove_multi_state(CPUHP_AP_PERF_ARM_HISI_SLLC_ONLINE);
+}
+module_exit(hisi_sllc_pmu_module_exit);
+
+MODULE_DESCRIPTION("HiSilicon SLLC uncore PMU driver");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Shaokun Zhang <zhangshaokun@hisilicon.com>");
+MODULE_AUTHOR("Qi Liu <liuqi115@huawei.com>");
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index f14adb882338..5f5ce676532f 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -175,6 +175,7 @@ enum cpuhp_state {
 	CPUHP_AP_PERF_ARM_HISI_DDRC_ONLINE,
 	CPUHP_AP_PERF_ARM_HISI_HHA_ONLINE,
 	CPUHP_AP_PERF_ARM_HISI_L3_ONLINE,
+	CPUHP_AP_PERF_ARM_HISI_SLLC_ONLINE,
 	CPUHP_AP_PERF_ARM_L2X0_ONLINE,
 	CPUHP_AP_PERF_ARM_QCOM_L2_ONLINE,
 	CPUHP_AP_PERF_ARM_QCOM_L3_ONLINE,
-- 
cgit v1.2.3


From a0ab25cd82eeb68bfa19a4d93a097521af5011b8 Mon Sep 17 00:00:00 2001
From: Shaokun Zhang <zhangshaokun@hisilicon.com>
Date: Mon, 8 Mar 2021 14:50:36 +0800
Subject: drivers/perf: hisi: Add support for HiSilicon PA PMU driver

On the HiSilicon Hip09 platform, there is a PA (Protocol Adapter) module on
each chip SICL (Super I/O Cluster), which incorporates three Hydra interfaces
and facilitates cache coherency between the dies on the chip. The PA
uncore PMU model is the same as other Hip09 PMU modules and many PMU events
are supported. Let's support the PMU driver using the HiSilicon uncore PMU
framework.

PA PMU supports the following filter functions:
* tracetag_en: allows user to count events according to tt_req or
tt_core set in L3C PMU. It's the same as other PMUs.

* srcid_cmd & srcid_msk: allows user to filter statistics that come from
specific CCL/ICL by configuration source ID.

* tgtid_cmd & tgtid_msk: similar in function to srcid_cmd &
srcid_msk. Both are used to check where the data comes from or goes to.

Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: John Garry <john.garry@huawei.com>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: John Garry <john.garry@huawei.com>
Co-developed-by: Qi Liu <liuqi115@huawei.com>
Signed-off-by: Qi Liu <liuqi115@huawei.com>
Signed-off-by: Shaokun Zhang <zhangshaokun@hisilicon.com>
Link: https://lore.kernel.org/r/1615186237-22263-9-git-send-email-zhangshaokun@hisilicon.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/hisilicon/Makefile             |   3 +-
 drivers/perf/hisilicon/hisi_uncore_pa_pmu.c | 500 ++++++++++++++++++++++++++++
 include/linux/cpuhotplug.h                  |   1 +
 3 files changed, 503 insertions(+), 1 deletion(-)
 create mode 100644 drivers/perf/hisilicon/hisi_uncore_pa_pmu.c

(limited to 'include/linux')

diff --git a/drivers/perf/hisilicon/Makefile b/drivers/perf/hisilicon/Makefile
index 6600a9d45dd8..7643c9f93e36 100644
--- a/drivers/perf/hisilicon/Makefile
+++ b/drivers/perf/hisilicon/Makefile
@@ -1,3 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only
 obj-$(CONFIG_HISI_PMU) += hisi_uncore_pmu.o hisi_uncore_l3c_pmu.o \
-			  hisi_uncore_hha_pmu.o hisi_uncore_ddrc_pmu.o hisi_uncore_sllc_pmu.o
+			  hisi_uncore_hha_pmu.o hisi_uncore_ddrc_pmu.o hisi_uncore_sllc_pmu.o \
+			  hisi_uncore_pa_pmu.o
diff --git a/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c b/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c
new file mode 100644
index 000000000000..14f23eb31248
--- /dev/null
+++ b/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c
@@ -0,0 +1,500 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * HiSilicon PA uncore Hardware event counters support
+ *
+ * Copyright (C) 2020 HiSilicon Limited
+ * Author: Shaokun Zhang <zhangshaokun@hisilicon.com>
+ *
+ * This code is based on the uncore PMUs like arm-cci and arm-ccn.
+ */
+#include <linux/acpi.h>
+#include <linux/cpuhotplug.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/list.h>
+#include <linux/smp.h>
+
+#include "hisi_uncore_pmu.h"
+
+/* PA register definition */
+#define PA_PERF_CTRL			0x1c00
+#define PA_EVENT_CTRL			0x1c04
+#define PA_TT_CTRL			0x1c08
+#define PA_TGTID_CTRL			0x1c14
+#define PA_SRCID_CTRL			0x1c18
+#define PA_INT_MASK			0x1c70
+#define PA_INT_STATUS			0x1c78
+#define PA_INT_CLEAR			0x1c7c
+#define PA_EVENT_TYPE0			0x1c80
+#define PA_PMU_VERSION			0x1cf0
+#define PA_EVENT_CNT0_L			0x1f00
+
+#define PA_EVTYPE_MASK			0xff
+#define PA_NR_COUNTERS			0x8
+#define PA_PERF_CTRL_EN			BIT(0)
+#define PA_TRACETAG_EN			BIT(4)
+#define PA_TGTID_EN			BIT(11)
+#define PA_SRCID_EN			BIT(11)
+#define PA_TGTID_NONE			0
+#define PA_SRCID_NONE			0
+#define PA_TGTID_MSK_SHIFT		12
+#define PA_SRCID_MSK_SHIFT		12
+
+HISI_PMU_EVENT_ATTR_EXTRACTOR(tgtid_cmd, config1, 10, 0);
+HISI_PMU_EVENT_ATTR_EXTRACTOR(tgtid_msk, config1, 21, 11);
+HISI_PMU_EVENT_ATTR_EXTRACTOR(srcid_cmd, config1, 32, 22);
+HISI_PMU_EVENT_ATTR_EXTRACTOR(srcid_msk, config1, 43, 33);
+HISI_PMU_EVENT_ATTR_EXTRACTOR(tracetag_en, config1, 44, 44);
+
+/* Set the tracetag enable bit when the event's tracetag_en field is set. */
+static void hisi_pa_pmu_enable_tracetag(struct perf_event *event)
+{
+	struct hisi_pmu *pa_pmu = to_hisi_pmu(event->pmu);
+	u32 tt_ctrl;
+
+	if (!hisi_get_tracetag_en(event))
+		return;
+
+	tt_ctrl = readl(pa_pmu->base + PA_TT_CTRL);
+	tt_ctrl |= PA_TRACETAG_EN;
+	writel(tt_ctrl, pa_pmu->base + PA_TT_CTRL);
+}
+
+/* Clear the tracetag enable bit when the event's tracetag_en field is set. */
+static void hisi_pa_pmu_clear_tracetag(struct perf_event *event)
+{
+	struct hisi_pmu *pa_pmu = to_hisi_pmu(event->pmu);
+	u32 tt_ctrl;
+
+	if (!hisi_get_tracetag_en(event))
+		return;
+
+	tt_ctrl = readl(pa_pmu->base + PA_TT_CTRL);
+	tt_ctrl &= ~PA_TRACETAG_EN;
+	writel(tt_ctrl, pa_pmu->base + PA_TT_CTRL);
+}
+
+/* Write tgtid cmd, mask and enable bit; does nothing when cmd is zero. */
+static void hisi_pa_pmu_config_tgtid(struct perf_event *event)
+{
+	struct hisi_pmu *pa_pmu = to_hisi_pmu(event->pmu);
+	u32 tgtid = hisi_get_tgtid_cmd(event);
+
+	if (!tgtid)
+		return;
+
+	tgtid |= PA_TGTID_EN | (hisi_get_tgtid_msk(event) << PA_TGTID_MSK_SHIFT);
+	writel(tgtid, pa_pmu->base + PA_TGTID_CTRL);
+}
+
+/* Zero the target ID control register when the event carries a tgtid_cmd. */
+static void hisi_pa_pmu_clear_tgtid(struct perf_event *event)
+{
+	struct hisi_pmu *pa_pmu = to_hisi_pmu(event->pmu);
+
+	if (hisi_get_tgtid_cmd(event))
+		writel(PA_TGTID_NONE, pa_pmu->base + PA_TGTID_CTRL);
+}
+
+/* Write srcid cmd, mask and enable bit; does nothing when cmd is zero. */
+static void hisi_pa_pmu_config_srcid(struct perf_event *event)
+{
+	struct hisi_pmu *pa_pmu = to_hisi_pmu(event->pmu);
+	u32 srcid = hisi_get_srcid_cmd(event);
+
+	if (!srcid)
+		return;
+
+	srcid |= PA_SRCID_EN | (hisi_get_srcid_msk(event) << PA_SRCID_MSK_SHIFT);
+	writel(srcid, pa_pmu->base + PA_SRCID_CTRL);
+}
+
+/* Zero the source ID control register when the event carries a srcid_cmd. */
+static void hisi_pa_pmu_clear_srcid(struct perf_event *event)
+{
+	struct hisi_pmu *pa_pmu = to_hisi_pmu(event->pmu);
+
+	if (hisi_get_srcid_cmd(event))
+		writel(PA_SRCID_NONE, pa_pmu->base + PA_SRCID_CTRL);
+}
+
+static void hisi_pa_pmu_enable_filter(struct perf_event *event)
+{
+	if (event->attr.config1 != 0x0) {
+		hisi_pa_pmu_enable_tracetag(event);
+		hisi_pa_pmu_config_srcid(event);
+		hisi_pa_pmu_config_tgtid(event);
+	}
+}
+
+static void hisi_pa_pmu_disable_filter(struct perf_event *event)
+{
+	if (event->attr.config1 != 0x0) {
+		hisi_pa_pmu_clear_tgtid(event);
+		hisi_pa_pmu_clear_srcid(event);
+		hisi_pa_pmu_clear_tracetag(event);
+	}
+}
+
+static u32 hisi_pa_pmu_get_counter_offset(int idx)
+{
+	return (PA_EVENT_CNT0_L + idx * 8);
+}
+
+static u64 hisi_pa_pmu_read_counter(struct hisi_pmu *pa_pmu,
+				    struct hw_perf_event *hwc)
+{
+	return readq(pa_pmu->base + hisi_pa_pmu_get_counter_offset(hwc->idx));
+}
+
+static void hisi_pa_pmu_write_counter(struct hisi_pmu *pa_pmu,
+				      struct hw_perf_event *hwc, u64 val)
+{
+	writeq(val, pa_pmu->base + hisi_pa_pmu_get_counter_offset(hwc->idx));
+}
+
+static void hisi_pa_pmu_write_evtype(struct hisi_pmu *pa_pmu, int idx,
+				     u32 type)
+{
+	u32 reg, reg_idx, shift, val;
+
+	/*
+	 * Select the appropriate event select register(PA_EVENT_TYPE0/1).
+	 * There are 2 event select registers for the 8 hardware counters.
+	 * Event code is 8-bits and for the former 4 hardware counters,
+	 * PA_EVENT_TYPE0 is chosen. For the latter 4 hardware counters,
+	 * PA_EVENT_TYPE1 is chosen.
+	 */
+	reg = PA_EVENT_TYPE0 + (idx / 4) * 4;
+	reg_idx = idx % 4;
+	shift = 8 * reg_idx;
+
+	/* Write event code to PA_EVENT_TYPEx Register */
+	val = readl(pa_pmu->base + reg);
+	val &= ~(PA_EVTYPE_MASK << shift);
+	val |= (type << shift);
+	writel(val, pa_pmu->base + reg);
+}
+
+static void hisi_pa_pmu_start_counters(struct hisi_pmu *pa_pmu)
+{
+	u32 val;
+
+	val = readl(pa_pmu->base + PA_PERF_CTRL);
+	val |= PA_PERF_CTRL_EN;
+	writel(val, pa_pmu->base + PA_PERF_CTRL);
+}
+
+static void hisi_pa_pmu_stop_counters(struct hisi_pmu *pa_pmu)
+{
+	u32 val;
+
+	val = readl(pa_pmu->base + PA_PERF_CTRL);
+	val &= ~(PA_PERF_CTRL_EN);
+	writel(val, pa_pmu->base + PA_PERF_CTRL);
+}
+
+static void hisi_pa_pmu_enable_counter(struct hisi_pmu *pa_pmu,
+				       struct hw_perf_event *hwc)
+{
+	u32 val;
+
+	/* Enable counter index in PA_EVENT_CTRL register */
+	val = readl(pa_pmu->base + PA_EVENT_CTRL);
+	val |= 1 << hwc->idx;
+	writel(val, pa_pmu->base + PA_EVENT_CTRL);
+}
+
+static void hisi_pa_pmu_disable_counter(struct hisi_pmu *pa_pmu,
+					struct hw_perf_event *hwc)
+{
+	u32 val;
+
+	/* Clear counter index in PA_EVENT_CTRL register */
+	val = readl(pa_pmu->base + PA_EVENT_CTRL);
+	val &= ~(1 << hwc->idx);
+	writel(val, pa_pmu->base + PA_EVENT_CTRL);
+}
+
+static void hisi_pa_pmu_enable_counter_int(struct hisi_pmu *pa_pmu,
+					   struct hw_perf_event *hwc)
+{
+	u32 val;
+
+	/* Write 0 to enable interrupt */
+	val = readl(pa_pmu->base + PA_INT_MASK);
+	val &= ~(1 << hwc->idx);
+	writel(val, pa_pmu->base + PA_INT_MASK);
+}
+
+static void hisi_pa_pmu_disable_counter_int(struct hisi_pmu *pa_pmu,
+					    struct hw_perf_event *hwc)
+{
+	u32 val;
+
+	/* Write 1 to mask interrupt */
+	val = readl(pa_pmu->base + PA_INT_MASK);
+	val |= 1 << hwc->idx;
+	writel(val, pa_pmu->base + PA_INT_MASK);
+}
+
+static u32 hisi_pa_pmu_get_int_status(struct hisi_pmu *pa_pmu)
+{
+	return readl(pa_pmu->base + PA_INT_STATUS);
+}
+
+static void hisi_pa_pmu_clear_int_status(struct hisi_pmu *pa_pmu, int idx)
+{
+	writel(1 << idx, pa_pmu->base + PA_INT_CLEAR);
+}
+
+static const struct acpi_device_id hisi_pa_pmu_acpi_match[] = {
+	{ "HISI0273", },
+	{}
+};
+MODULE_DEVICE_TABLE(acpi, hisi_pa_pmu_acpi_match);
+
+static int hisi_pa_pmu_init_data(struct platform_device *pdev,
+				   struct hisi_pmu *pa_pmu)
+{
+	/*
+	 * Use the SCCL_ID and the index ID to identify the PA PMU,
+	 * while SCCL_ID is the nearest SCCL_ID from this SICL and
+	 * CPU core is chosen from this SCCL to manage this PMU.
+	 */
+	if (device_property_read_u32(&pdev->dev, "hisilicon,scl-id",
+				     &pa_pmu->sccl_id)) {
+		dev_err(&pdev->dev, "Cannot read sccl-id!\n");
+		return -EINVAL;
+	}
+
+	if (device_property_read_u32(&pdev->dev, "hisilicon,idx-id",
+				     &pa_pmu->index_id)) {
+		dev_err(&pdev->dev, "Cannot read idx-id!\n");
+		return -EINVAL;
+	}
+
+	pa_pmu->ccl_id = -1;
+
+	pa_pmu->base = devm_platform_ioremap_resource(pdev, 0);
+	if (IS_ERR(pa_pmu->base)) {
+		dev_err(&pdev->dev, "ioremap failed for pa_pmu resource.\n");
+		return PTR_ERR(pa_pmu->base);
+	}
+
+	pa_pmu->identifier = readl(pa_pmu->base + PA_PMU_VERSION);
+
+	return 0;
+}
+
+static struct attribute *hisi_pa_pmu_v2_format_attr[] = {
+	HISI_PMU_FORMAT_ATTR(event, "config:0-7"),
+	HISI_PMU_FORMAT_ATTR(tgtid_cmd, "config1:0-10"),
+	HISI_PMU_FORMAT_ATTR(tgtid_msk, "config1:11-21"),
+	HISI_PMU_FORMAT_ATTR(srcid_cmd, "config1:22-32"),
+	HISI_PMU_FORMAT_ATTR(srcid_msk, "config1:33-43"),
+	HISI_PMU_FORMAT_ATTR(tracetag_en, "config1:44"),
+	NULL,
+};
+
+static const struct attribute_group hisi_pa_pmu_v2_format_group = {
+	.name = "format",
+	.attrs = hisi_pa_pmu_v2_format_attr,
+};
+
+static struct attribute *hisi_pa_pmu_v2_events_attr[] = {
+	HISI_PMU_EVENT_ATTR(rx_req,		0x40),
+	HISI_PMU_EVENT_ATTR(tx_req,             0x5c),
+	HISI_PMU_EVENT_ATTR(cycle,		0x78),
+	NULL
+};
+
+static const struct attribute_group hisi_pa_pmu_v2_events_group = {
+	.name = "events",
+	.attrs = hisi_pa_pmu_v2_events_attr,
+};
+
+static DEVICE_ATTR(cpumask, 0444, hisi_cpumask_sysfs_show, NULL);
+
+static struct attribute *hisi_pa_pmu_cpumask_attrs[] = {
+	&dev_attr_cpumask.attr,
+	NULL
+};
+
+static const struct attribute_group hisi_pa_pmu_cpumask_attr_group = {
+	.attrs = hisi_pa_pmu_cpumask_attrs,
+};
+
+static struct device_attribute hisi_pa_pmu_identifier_attr =
+	__ATTR(identifier, 0444, hisi_uncore_pmu_identifier_attr_show, NULL);
+
+static struct attribute *hisi_pa_pmu_identifier_attrs[] = {
+	&hisi_pa_pmu_identifier_attr.attr,
+	NULL
+};
+
+static struct attribute_group hisi_pa_pmu_identifier_group = {
+	.attrs = hisi_pa_pmu_identifier_attrs,
+};
+
+static const struct attribute_group *hisi_pa_pmu_v2_attr_groups[] = {
+	&hisi_pa_pmu_v2_format_group,
+	&hisi_pa_pmu_v2_events_group,
+	&hisi_pa_pmu_cpumask_attr_group,
+	&hisi_pa_pmu_identifier_group,
+	NULL
+};
+
+static const struct hisi_uncore_ops hisi_uncore_pa_ops = {
+	.write_evtype		= hisi_pa_pmu_write_evtype,
+	.get_event_idx		= hisi_uncore_pmu_get_event_idx,
+	.start_counters		= hisi_pa_pmu_start_counters,
+	.stop_counters		= hisi_pa_pmu_stop_counters,
+	.enable_counter		= hisi_pa_pmu_enable_counter,
+	.disable_counter	= hisi_pa_pmu_disable_counter,
+	.enable_counter_int	= hisi_pa_pmu_enable_counter_int,
+	.disable_counter_int	= hisi_pa_pmu_disable_counter_int,
+	.write_counter		= hisi_pa_pmu_write_counter,
+	.read_counter		= hisi_pa_pmu_read_counter,
+	.get_int_status		= hisi_pa_pmu_get_int_status,
+	.clear_int_status	= hisi_pa_pmu_clear_int_status,
+	.enable_filter		= hisi_pa_pmu_enable_filter,
+	.disable_filter		= hisi_pa_pmu_disable_filter,
+};
+
+static int hisi_pa_pmu_dev_probe(struct platform_device *pdev,
+				 struct hisi_pmu *pa_pmu)
+{
+	int ret;
+
+	ret = hisi_pa_pmu_init_data(pdev, pa_pmu);
+	if (ret)
+		return ret;
+
+	ret = hisi_uncore_pmu_init_irq(pa_pmu, pdev);
+	if (ret)
+		return ret;
+
+	pa_pmu->pmu_events.attr_groups = hisi_pa_pmu_v2_attr_groups;
+	pa_pmu->num_counters = PA_NR_COUNTERS;
+	pa_pmu->ops = &hisi_uncore_pa_ops;
+	pa_pmu->check_event = 0xB0;
+	pa_pmu->counter_bits = 64;
+	pa_pmu->dev = &pdev->dev;
+	pa_pmu->on_cpu = -1;
+
+	return 0;
+}
+
+static int hisi_pa_pmu_probe(struct platform_device *pdev)
+{
+	struct hisi_pmu *pa_pmu;
+	char *name;
+	int ret;
+
+	pa_pmu = devm_kzalloc(&pdev->dev, sizeof(*pa_pmu), GFP_KERNEL);
+	if (!pa_pmu)
+		return -ENOMEM;
+
+	ret = hisi_pa_pmu_dev_probe(pdev, pa_pmu);
+	if (ret)
+		return ret;
+	/*
+	 * PA is attached in SICL and the CPU core is chosen to manage this
+	 * PMU which is the nearest SCCL, while its SCCL_ID is greater than
+	 * one with the SICL_ID.
+	 */
+	name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "hisi_sicl%u_pa%u",
+			      pa_pmu->sccl_id - 1, pa_pmu->index_id);
+	if (!name)
+		return -ENOMEM;
+
+	ret = cpuhp_state_add_instance(CPUHP_AP_PERF_ARM_HISI_PA_ONLINE,
+				       &pa_pmu->node);
+	if (ret) {
+		dev_err(&pdev->dev, "Error %d registering hotplug\n", ret);
+		return ret;
+	}
+
+	pa_pmu->pmu = (struct pmu) {
+		.module		= THIS_MODULE,
+		.task_ctx_nr	= perf_invalid_context,
+		.event_init	= hisi_uncore_pmu_event_init,
+		.pmu_enable	= hisi_uncore_pmu_enable,
+		.pmu_disable	= hisi_uncore_pmu_disable,
+		.add		= hisi_uncore_pmu_add,
+		.del		= hisi_uncore_pmu_del,
+		.start		= hisi_uncore_pmu_start,
+		.stop		= hisi_uncore_pmu_stop,
+		.read		= hisi_uncore_pmu_read,
+		.attr_groups    = pa_pmu->pmu_events.attr_groups,
+		.capabilities	= PERF_PMU_CAP_NO_EXCLUDE,
+	};
+
+	ret = perf_pmu_register(&pa_pmu->pmu, name, -1);
+	if (ret) {
+		dev_err(pa_pmu->dev, "PMU register failed, ret = %d\n", ret);
+		cpuhp_state_remove_instance(CPUHP_AP_PERF_ARM_HISI_PA_ONLINE,
+					    &pa_pmu->node);
+		irq_set_affinity_hint(pa_pmu->irq, NULL);
+		return ret;
+	}
+
+	platform_set_drvdata(pdev, pa_pmu);
+	return ret;
+}
+
+static int hisi_pa_pmu_remove(struct platform_device *pdev)
+{
+	struct hisi_pmu *pa_pmu = platform_get_drvdata(pdev);
+
+	perf_pmu_unregister(&pa_pmu->pmu);
+	cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_HISI_PA_ONLINE,
+					    &pa_pmu->node);
+	irq_set_affinity_hint(pa_pmu->irq, NULL);
+
+	return 0;
+}
+
+static struct platform_driver hisi_pa_pmu_driver = {
+	.driver = {
+		.name = "hisi_pa_pmu",
+		.acpi_match_table = hisi_pa_pmu_acpi_match,
+		.suppress_bind_attrs = true,
+	},
+	.probe = hisi_pa_pmu_probe,
+	.remove = hisi_pa_pmu_remove,
+};
+
+static int __init hisi_pa_pmu_module_init(void)
+{
+	int ret;
+
+	ret = cpuhp_setup_state_multi(CPUHP_AP_PERF_ARM_HISI_PA_ONLINE,
+				      "AP_PERF_ARM_HISI_PA_ONLINE",
+				      hisi_uncore_pmu_online_cpu,
+				      hisi_uncore_pmu_offline_cpu);
+	if (ret) {
+		pr_err("PA PMU: cpuhp state setup failed, ret = %d\n", ret);
+		return ret;
+	}
+
+	ret = platform_driver_register(&hisi_pa_pmu_driver);
+	if (ret)
+		cpuhp_remove_multi_state(CPUHP_AP_PERF_ARM_HISI_PA_ONLINE);
+
+	return ret;
+}
+module_init(hisi_pa_pmu_module_init);
+
+static void __exit hisi_pa_pmu_module_exit(void)
+{
+	platform_driver_unregister(&hisi_pa_pmu_driver);
+	cpuhp_remove_multi_state(CPUHP_AP_PERF_ARM_HISI_PA_ONLINE);
+}
+module_exit(hisi_pa_pmu_module_exit);
+
+MODULE_DESCRIPTION("HiSilicon Protocol Adapter uncore PMU driver");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Shaokun Zhang <zhangshaokun@hisilicon.com>");
+MODULE_AUTHOR("Qi Liu <liuqi115@huawei.com>");
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 5f5ce676532f..8ef744007e38 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -175,6 +175,7 @@ enum cpuhp_state {
 	CPUHP_AP_PERF_ARM_HISI_DDRC_ONLINE,
 	CPUHP_AP_PERF_ARM_HISI_HHA_ONLINE,
 	CPUHP_AP_PERF_ARM_HISI_L3_ONLINE,
+	CPUHP_AP_PERF_ARM_HISI_PA_ONLINE,
 	CPUHP_AP_PERF_ARM_HISI_SLLC_ONLINE,
 	CPUHP_AP_PERF_ARM_L2X0_ONLINE,
 	CPUHP_AP_PERF_ARM_QCOM_L2_ONLINE,
-- 
cgit v1.2.3


From d85aecf2844ff02a0e5f077252b2461d4f10c9f0 Mon Sep 17 00:00:00 2001
From: Miaohe Lin <linmiaohe@huawei.com>
Date: Wed, 24 Mar 2021 21:37:17 -0700
Subject: hugetlb_cgroup: fix imbalanced css_get and css_put pair for shared
 mappings

The current implementation of hugetlb_cgroup for shared mappings could
have different behavior.  Consider the following two scenarios:

 1.Assume initial css reference count of hugetlb_cgroup is 1:
  1.1 Call hugetlb_reserve_pages with from = 1, to = 2. So css reference
      count is 2 associated with 1 file_region.
  1.2 Call hugetlb_reserve_pages with from = 2, to = 3. So css reference
      count is 3 associated with 2 file_region.
  1.3 coalesce_file_region will coalesce these two file_regions into
      one. So css reference count is 3 associated with 1 file_region
      now.

 2.Assume initial css reference count of hugetlb_cgroup is 1 again:
  2.1 Call hugetlb_reserve_pages with from = 1, to = 3. So css reference
      count is 2 associated with 1 file_region.

Therefore, we might have one file_region while holding one or more css
reference counts. This inconsistency could lead to imbalanced css_get()
and css_put() pair. If we do css_put one by one (e.g. hole punch case),
scenario 2 would put one more css reference. If we do css_put all
together (e.g. truncate case), scenario 1 will leak one css reference.

The imbalanced css_get() and css_put() pair would result in a non-zero
reference when we try to destroy the hugetlb cgroup. The hugetlb cgroup
directory is removed __but__ the associated resource is not freed. This
might ultimately result in OOM, or in failure to create a new hugetlb
cgroup under a busy workload.

In order to fix this, we have to make sure that one file_region must
hold exactly one css reference. So in coalesce_file_region case, we
should release one css reference before coalescence. Also only put css
reference when the entire file_region is removed.

The last thing to note is that the caller of region_add() will only hold
one reference to h_cg->css for the whole contiguous reservation region.
But this area might be scattered when there are already some
file_regions residing in it. As a result, many file_regions may share only
one h_cg->css reference. In order to ensure that one file_region must
hold exactly one css reference, we should do css_get() for each
file_region and release the reference held by caller when they are done.

[linmiaohe@huawei.com: fix imbalanced css_get and css_put pair for shared mappings]
  Link: https://lkml.kernel.org/r/20210316023002.53921-1-linmiaohe@huawei.com

Link: https://lkml.kernel.org/r/20210301120540.37076-1-linmiaohe@huawei.com
Fixes: 075a61d07a8e ("hugetlb_cgroup: add accounting for shared mappings")
Reported-by: kernel test robot <lkp@intel.com> (auto build test ERROR)
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Wanpeng Li <liwp.linux@gmail.com>
Cc: Mina Almasry <almasrymina@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb_cgroup.h | 15 +++++++++++++--
 mm/hugetlb.c                   | 41 +++++++++++++++++++++++++++++++++++++----
 mm/hugetlb_cgroup.c            | 10 ++++++++--
 3 files changed, 58 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
index 2ad6e92f124a..0bff345c4bc6 100644
--- a/include/linux/hugetlb_cgroup.h
+++ b/include/linux/hugetlb_cgroup.h
@@ -113,6 +113,11 @@ static inline bool hugetlb_cgroup_disabled(void)
 	return !cgroup_subsys_enabled(hugetlb_cgrp_subsys);
 }
 
+static inline void hugetlb_cgroup_put_rsvd_cgroup(struct hugetlb_cgroup *h_cg)
+{
+	css_put(&h_cg->css);
+}
+
 extern int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
 					struct hugetlb_cgroup **ptr);
 extern int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages,
@@ -138,7 +143,8 @@ extern void hugetlb_cgroup_uncharge_counter(struct resv_map *resv,
 
 extern void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv,
 						struct file_region *rg,
-						unsigned long nr_pages);
+						unsigned long nr_pages,
+						bool region_del);
 
 extern void hugetlb_cgroup_file_init(void) __init;
 extern void hugetlb_cgroup_migrate(struct page *oldhpage,
@@ -147,7 +153,8 @@ extern void hugetlb_cgroup_migrate(struct page *oldhpage,
 #else
 static inline void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv,
 						       struct file_region *rg,
-						       unsigned long nr_pages)
+						       unsigned long nr_pages,
+						       bool region_del)
 {
 }
 
@@ -185,6 +192,10 @@ static inline bool hugetlb_cgroup_disabled(void)
 	return true;
 }
 
+static inline void hugetlb_cgroup_put_rsvd_cgroup(struct hugetlb_cgroup *h_cg)
+{
+}
+
 static inline int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
 					       struct hugetlb_cgroup **ptr)
 {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5b1ab1f427c5..a86a58ef132d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -280,6 +280,17 @@ static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
 		nrg->reservation_counter =
 			&h_cg->rsvd_hugepage[hstate_index(h)];
 		nrg->css = &h_cg->css;
+		/*
+		 * The caller will hold exactly one h_cg->css reference for the
+		 * whole contiguous reservation region. But this area might be
+		 * scattered when there are already some file_regions reside in
+		 * it. As a result, many file_regions may share only one css
+		 * reference. In order to ensure that one file_region must hold
+		 * exactly one h_cg->css reference, we should do css_get for
+		 * each file_region and leave the reference held by caller
+		 * untouched.
+		 */
+		css_get(&h_cg->css);
 		if (!resv->pages_per_hpage)
 			resv->pages_per_hpage = pages_per_huge_page(h);
 		/* pages_per_hpage should be the same for all entries in
@@ -293,6 +304,14 @@ static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
 #endif
 }
 
+static void put_uncharge_info(struct file_region *rg)
+{
+#ifdef CONFIG_CGROUP_HUGETLB
+	if (rg->css)
+		css_put(rg->css);
+#endif
+}
+
 static bool has_same_uncharge_info(struct file_region *rg,
 				   struct file_region *org)
 {
@@ -316,6 +335,7 @@ static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
 		prg->to = rg->to;
 
 		list_del(&rg->link);
+		put_uncharge_info(rg);
 		kfree(rg);
 
 		rg = prg;
@@ -327,6 +347,7 @@ static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
 		nrg->from = rg->from;
 
 		list_del(&rg->link);
+		put_uncharge_info(rg);
 		kfree(rg);
 	}
 }
@@ -662,7 +683,7 @@ retry:
 
 			del += t - f;
 			hugetlb_cgroup_uncharge_file_region(
-				resv, rg, t - f);
+				resv, rg, t - f, false);
 
 			/* New entry for end of split region */
 			nrg->from = t;
@@ -683,7 +704,7 @@ retry:
 		if (f <= rg->from && t >= rg->to) { /* Remove entire region */
 			del += rg->to - rg->from;
 			hugetlb_cgroup_uncharge_file_region(resv, rg,
-							    rg->to - rg->from);
+							    rg->to - rg->from, true);
 			list_del(&rg->link);
 			kfree(rg);
 			continue;
@@ -691,13 +712,13 @@ retry:
 
 		if (f <= rg->from) {	/* Trim beginning of region */
 			hugetlb_cgroup_uncharge_file_region(resv, rg,
-							    t - rg->from);
+							    t - rg->from, false);
 
 			del += t - rg->from;
 			rg->from = t;
 		} else {		/* Trim end of region */
 			hugetlb_cgroup_uncharge_file_region(resv, rg,
-							    rg->to - f);
+							    rg->to - f, false);
 
 			del += rg->to - f;
 			rg->to = f;
@@ -5187,6 +5208,10 @@ bool hugetlb_reserve_pages(struct inode *inode,
 			 */
 			long rsv_adjust;
 
+			/*
+			 * hugetlb_cgroup_uncharge_cgroup_rsvd() will put the
+			 * reference to h_cg->css. See comment below for detail.
+			 */
 			hugetlb_cgroup_uncharge_cgroup_rsvd(
 				hstate_index(h),
 				(chg - add) * pages_per_huge_page(h), h_cg);
@@ -5194,6 +5219,14 @@ bool hugetlb_reserve_pages(struct inode *inode,
 			rsv_adjust = hugepage_subpool_put_pages(spool,
 								chg - add);
 			hugetlb_acct_memory(h, -rsv_adjust);
+		} else if (h_cg) {
+			/*
+			 * The file_regions will hold their own reference to
+			 * h_cg->css. So we should release the reference held
+			 * via hugetlb_cgroup_charge_cgroup_rsvd() when we are
+			 * done.
+			 */
+			hugetlb_cgroup_put_rsvd_cgroup(h_cg);
 		}
 	}
 	return true;
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index f68b51fcda3d..603a131e262d 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -391,7 +391,8 @@ void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start,
 
 void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv,
 					 struct file_region *rg,
-					 unsigned long nr_pages)
+					 unsigned long nr_pages,
+					 bool region_del)
 {
 	if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages)
 		return;
@@ -400,7 +401,12 @@ void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv,
 	    !resv->reservation_counter) {
 		page_counter_uncharge(rg->reservation_counter,
 				      nr_pages * resv->pages_per_hpage);
-		css_put(rg->css);
+		/*
+		 * Only do css_put(rg->css) when we delete the entire region
+		 * because one file_region must hold exactly one css reference.
+		 */
+		if (region_del)
+			css_put(rg->css);
 	}
 }
 
-- 
cgit v1.2.3


From cf10bd4c4aff8dd64d1aa7f2a529d0c672bc16af Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Wed, 24 Mar 2021 21:37:20 -0700
Subject: kasan: fix per-page tags for non-page_alloc pages

To allow performing tag checks on page_alloc addresses obtained via
page_address(), tag-based KASAN modes store tags for page_alloc
allocations in page->flags.

Currently, the default tag value stored in page->flags is 0x00.
Therefore, page_address() returns a 0x00ffff...  address for pages that
were not allocated via page_alloc.

This might cause problems.  A particular case we encountered is a
conflict with KFENCE.  If a KFENCE-allocated slab object is being freed
via kfree(page_address(page) + offset), the address passed to kfree()
will get tagged with 0x00 (as slab pages keep the default per-page
tags).  This leads to is_kfence_address() check failing, and a KFENCE
object ending up in normal slab freelist, which causes memory
corruptions.

This patch changes the way KASAN stores tag in page-flags: they are now
stored xor'ed with 0xff.  This way, KASAN doesn't need to initialize
per-page flags for every created page, which might be slow.

With this change, page_address() returns natively-tagged (with 0xff)
pointers for pages that didn't have tags set explicitly.

This patch fixes the encountered conflict with KFENCE and prevents more
similar issues that can occur in the future.

Link: https://lkml.kernel.org/r/1a41abb11c51b264511d9e71c303bb16d5cb367b.1615475452.git.andreyknvl@google.com
Fixes: 2813b9c02962 ("kasan, mm, arm64: tag non slab memory allocated via pagealloc")
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Reviewed-by: Marco Elver <elver@google.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Peter Collingbourne <pcc@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Branislav Rankov <Branislav.Rankov@arm.com>
Cc: Kevin Brodsky <kevin.brodsky@arm.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 64a71bf20536..8ba434287387 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1461,16 +1461,28 @@ static inline bool cpupid_match_pid(struct task_struct *task, int cpupid)
 
 #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
 
+/*
+ * KASAN per-page tags are stored xor'ed with 0xff. This allows to avoid
+ * setting tags for all pages to native kernel tag value 0xff, as the default
+ * value 0x00 maps to 0xff.
+ */
+
 static inline u8 page_kasan_tag(const struct page *page)
 {
-	if (kasan_enabled())
-		return (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK;
-	return 0xff;
+	u8 tag = 0xff;
+
+	if (kasan_enabled()) {
+		tag = (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK;
+		tag ^= 0xff;
+	}
+
+	return tag;
 }
 
 static inline void page_kasan_tag_set(struct page *page, u8 tag)
 {
 	if (kasan_enabled()) {
+		tag ^= 0xff;
 		page->flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT);
 		page->flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT;
 	}
-- 
cgit v1.2.3


From c2655835fd8cabdfe7dab737253de3ffb88da126 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Wed, 24 Mar 2021 21:37:23 -0700
Subject: mm/mmu_notifiers: ensure range_end() is paired with range_start()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If one or more notifiers fails .invalidate_range_start(), invoke
.invalidate_range_end() for "all" notifiers.  If there are multiple
notifiers, those that did not fail are expecting _start() and _end() to
be paired, e.g.  KVM's mmu_notifier_count would become imbalanced.
Disallow notifiers that can fail _start() from implementing _end() so
that it's unnecessary to either track which notifiers rejected _start(),
or had already succeeded prior to a failed _start().

Note, the existing behavior of calling _start() on all notifiers even
after a previous notifier failed _start() was an unintended "feature".
Make it canon now that the behavior is depended on for correctness.

As of today, the bug is likely benign:

  1. The only caller of the non-blocking notifier is OOM kill.
  2. The only notifiers that can fail _start() are the i915 and Nouveau
     drivers.
  3. The only notifiers that utilize _end() are the SGI UV GRU driver
     and KVM.
  4. The GRU driver will never coincide with the i915/Nouveau drivers.
  5. An imbalanced kvm->mmu_notifier_count only causes soft lockup in the
     _guest_, and the guest is already doomed due to being an OOM victim.

Fix the bug now to play nice with future usage, e.g.  KVM has a
potential use case for blocking memslot updates in KVM while an
invalidation is in-progress, and failure to unblock would result in said
updates being blocked indefinitely and hanging.

Found by inspection.  Verified by adding a second notifier in KVM that
periodically returns -EAGAIN on non-blockable ranges, triggering OOM,
and observing that KVM exits with an elevated notifier count.

Link: https://lkml.kernel.org/r/20210311180057.1582638-1-seanjc@google.com
Fixes: 93065ac753e4 ("mm, oom: distinguish blockable mode for mmu notifiers")
Signed-off-by: Sean Christopherson <seanjc@google.com>
Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Ben Gardon <bgardon@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: "Jérôme Glisse" <jglisse@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Dimitri Sivanich <dimitri.sivanich@hpe.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmu_notifier.h | 10 +++++-----
 mm/mmu_notifier.c            | 23 +++++++++++++++++++++++
 2 files changed, 28 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index b8200782dede..1a6a9eb6d3fa 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -169,11 +169,11 @@ struct mmu_notifier_ops {
 	 * the last refcount is dropped.
 	 *
 	 * If blockable argument is set to false then the callback cannot
-	 * sleep and has to return with -EAGAIN. 0 should be returned
-	 * otherwise. Please note that if invalidate_range_start approves
-	 * a non-blocking behavior then the same applies to
-	 * invalidate_range_end.
-	 *
+	 * sleep and has to return with -EAGAIN if sleeping would be required.
+	 * 0 should be returned otherwise. Please note that notifiers that can
+	 * fail invalidate_range_start are not allowed to implement
+	 * invalidate_range_end, as there is no mechanism for informing the
+	 * notifier that its start failed.
 	 */
 	int (*invalidate_range_start)(struct mmu_notifier *subscription,
 				      const struct mmu_notifier_range *range);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 61ee40ed804e..459d195d2ff6 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -501,10 +501,33 @@ static int mn_hlist_invalidate_range_start(
 						"");
 				WARN_ON(mmu_notifier_range_blockable(range) ||
 					_ret != -EAGAIN);
+				/*
+				 * We call all the notifiers on any EAGAIN,
+				 * there is no way for a notifier to know if
+				 * its start method failed, thus a start that
+				 * does EAGAIN can't also do end.
+				 */
+				WARN_ON(ops->invalidate_range_end);
 				ret = _ret;
 			}
 		}
 	}
+
+	if (ret) {
+		/*
+		 * Must be non-blocking to get here.  If there are multiple
+		 * notifiers and one or more failed start, any that succeeded
+		 * start are expecting their end to be called.  Do so now.
+		 */
+		hlist_for_each_entry_rcu(subscription, &subscriptions->list,
+					 hlist, srcu_read_lock_held(&srcu)) {
+			if (!subscription->ops->invalidate_range_end)
+				continue;
+
+			subscription->ops->invalidate_range_end(subscription,
+								range);
+		}
+	}
 	srcu_read_unlock(&srcu, id);
 
 	return ret;
-- 
cgit v1.2.3


From a024b7c2850dddd01e65b8270f0971deaf272f27 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Wed, 24 Mar 2021 21:37:50 -0700
Subject: mm: memblock: fix section mismatch warning again

Commit 34dc2efb39a2 ("memblock: fix section mismatch warning") marked
memblock_bottom_up() and memblock_set_bottom_up() as __init, but they
could be referenced from non-init functions like
memblock_find_in_range_node() on architectures that enable
CONFIG_ARCH_KEEP_MEMBLOCK.

For such builds kernel test robot reports:

   WARNING: modpost: vmlinux.o(.text+0x74fea4): Section mismatch in reference from the function memblock_find_in_range_node() to the function .init.text:memblock_bottom_up()
   The function memblock_find_in_range_node() references the function __init memblock_bottom_up().
   This is often because memblock_find_in_range_node lacks a __init  annotation or the annotation of memblock_bottom_up is wrong.

Replace __init annotations with __init_memblock annotations so that the
appropriate section will be selected depending on
CONFIG_ARCH_KEEP_MEMBLOCK.

Link: https://lore.kernel.org/lkml/202103160133.UzhgY0wt-lkp@intel.com
Link: https://lkml.kernel.org/r/20210316171347.14084-1-rppt@kernel.org
Fixes: 34dc2efb39a2 ("memblock: fix section mismatch warning")
Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Reported-by: kernel test robot <lkp@intel.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Acked-by: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memblock.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index d13e3cd938b4..5984fff3f175 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -460,7 +460,7 @@ static inline void memblock_free_late(phys_addr_t base, phys_addr_t size)
 /*
  * Set the allocation direction to bottom-up or top-down.
  */
-static inline __init void memblock_set_bottom_up(bool enable)
+static inline __init_memblock void memblock_set_bottom_up(bool enable)
 {
 	memblock.bottom_up = enable;
 }
@@ -470,7 +470,7 @@ static inline __init void memblock_set_bottom_up(bool enable)
  * if this is true, that said, memblock will allocate memory
  * in bottom-up direction.
  */
-static inline __init bool memblock_bottom_up(void)
+static inline __init_memblock bool memblock_bottom_up(void)
 {
 	return memblock.bottom_up;
 }
-- 
cgit v1.2.3


From 22f8b5df881e9f1302514bbbbbb8649c2051de55 Mon Sep 17 00:00:00 2001
From: Norbert Ciosek <norbertx.ciosek@intel.com>
Date: Thu, 28 Jan 2021 10:17:02 -0800
Subject: virtchnl: Fix layout of RSS structures

Remove padding from RSS structures. Previous layout
could lead to unwanted compiler optimizations
in loops when iterating over key and lut arrays.

Fixes: 65ece6de0114 ("virtchnl: Add missing explicit padding to structures")
Signed-off-by: Norbert Ciosek <norbertx.ciosek@intel.com>
Tested-by: Konrad Jankowski <konrad0.jankowski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 include/linux/avf/virtchnl.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h
index 40bad71865ea..532bcbfc4716 100644
--- a/include/linux/avf/virtchnl.h
+++ b/include/linux/avf/virtchnl.h
@@ -476,7 +476,6 @@ struct virtchnl_rss_key {
 	u16 vsi_id;
 	u16 key_len;
 	u8 key[1];         /* RSS hash key, packed bytes */
-	u8 pad[1];
 };
 
 VIRTCHNL_CHECK_STRUCT_LEN(6, virtchnl_rss_key);
@@ -485,7 +484,6 @@ struct virtchnl_rss_lut {
 	u16 vsi_id;
 	u16 lut_entries;
 	u8 lut[1];        /* RSS lookup table */
-	u8 pad[1];
 };
 
 VIRTCHNL_CHECK_STRUCT_LEN(6, virtchnl_rss_lut);
-- 
cgit v1.2.3


From 374be283ad429bf703d9036b799331dda793aeb7 Mon Sep 17 00:00:00 2001
From: Stephen Boyd <swboyd@chromium.org>
Date: Wed, 10 Feb 2021 18:45:59 -0800
Subject: platform/chrome: cros_ec: Add SW_FRONT_PROXIMITY MKBP define

Some cros ECs support a front proximity MKBP event via
'EC_MKBP_FRONT_PROXIMITY'. Add this define so it can be used in a
future patch.

Cc: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Cc: Benson Leung <bleung@chromium.org>
Cc: Guenter Roeck <groeck@chromium.org>
Cc: Douglas Anderson <dianders@chromium.org>
Cc: Gwendal Grignou <gwendal@chromium.org>
Acked-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Signed-off-by: Stephen Boyd <swboyd@chromium.org>
Link: https://lore.kernel.org/r/20210211024601.1963379-2-swboyd@chromium.org
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/platform_data/cros_ec_commands.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h
index 5ff8597ceabd..6035d9a98fb8 100644
--- a/include/linux/platform_data/cros_ec_commands.h
+++ b/include/linux/platform_data/cros_ec_commands.h
@@ -3467,6 +3467,7 @@ struct ec_response_get_next_event_v1 {
 #define EC_MKBP_LID_OPEN	0
 #define EC_MKBP_TABLET_MODE	1
 #define EC_MKBP_BASE_ATTACHED	2
+#define EC_MKBP_FRONT_PROXIMITY	3
 
 /* Run keyboard factory test scanning */
 #define EC_CMD_KEYBOARD_FACTORY_TEST 0x0068
-- 
cgit v1.2.3


From 995071d36bb9804b644265450142fcb91c427ee8 Mon Sep 17 00:00:00 2001
From: Gwendal Grignou <gwendal@chromium.org>
Date: Tue, 9 Mar 2021 11:36:13 -0800
Subject: iio: set default trig->dev.parent

When allocated with [devm_]iio_trigger_alloc(), set trig device parent to
the device the trigger is allocated for by default.

It can always be reassigned in the probe routine.

Change iio_trigger_alloc() API to add the device pointer to be coherent
with devm_iio_trigger_alloc, using similar interface to
iio_device_alloc().

Signed-off-by: Gwendal Grignou <gwendal@chromium.org>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Link: https://lore.kernel.org/r/20210309193620.2176163-2-gwendal@chromium.org
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/accel/bma180.c                         |  3 +-
 drivers/iio/adc/ad_sigma_delta.c                   |  6 ++--
 drivers/iio/adc/at91_adc.c                         |  3 +-
 .../iio/common/hid-sensors/hid-sensor-trigger.c    |  4 +--
 drivers/iio/common/st_sensors/st_sensors_trigger.c |  4 +--
 drivers/iio/gyro/itg3200_buffer.c                  |  3 +-
 drivers/iio/industrialio-trigger.c                 | 34 +++++++++++++++-------
 drivers/iio/trigger/iio-trig-hrtimer.c             |  2 +-
 drivers/iio/trigger/iio-trig-interrupt.c           |  2 +-
 drivers/iio/trigger/iio-trig-loop.c                |  2 +-
 drivers/iio/trigger/iio-trig-sysfs.c               |  3 +-
 include/linux/iio/iio.h                            |  2 +-
 include/linux/iio/trigger.h                        |  3 +-
 13 files changed, 40 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/accel/bma180.c b/drivers/iio/accel/bma180.c
index 71f85a3e525b..b8a7469cdae4 100644
--- a/drivers/iio/accel/bma180.c
+++ b/drivers/iio/accel/bma180.c
@@ -1044,7 +1044,7 @@ static int bma180_probe(struct i2c_client *client,
 	indio_dev->info = &bma180_info;
 
 	if (client->irq > 0) {
-		data->trig = iio_trigger_alloc("%s-dev%d", indio_dev->name,
+		data->trig = iio_trigger_alloc(dev, "%s-dev%d", indio_dev->name,
 			indio_dev->id);
 		if (!data->trig) {
 			ret = -ENOMEM;
@@ -1059,7 +1059,6 @@ static int bma180_probe(struct i2c_client *client,
 			goto err_trigger_free;
 		}
 
-		data->trig->dev.parent = dev;
 		data->trig->ops = &bma180_trigger_ops;
 		iio_trigger_set_drvdata(data->trig, indio_dev);
 		indio_dev->trig = iio_trigger_get(data->trig);
diff --git a/drivers/iio/adc/ad_sigma_delta.c b/drivers/iio/adc/ad_sigma_delta.c
index 3a6f239d4acc..9289812c0a94 100644
--- a/drivers/iio/adc/ad_sigma_delta.c
+++ b/drivers/iio/adc/ad_sigma_delta.c
@@ -475,8 +475,9 @@ static int ad_sd_probe_trigger(struct iio_dev *indio_dev)
 	struct ad_sigma_delta *sigma_delta = iio_device_get_drvdata(indio_dev);
 	int ret;
 
-	sigma_delta->trig = iio_trigger_alloc("%s-dev%d", indio_dev->name,
-						indio_dev->id);
+	sigma_delta->trig = iio_trigger_alloc(&sigma_delta->spi->dev,
+					      "%s-dev%d", indio_dev->name,
+					      indio_dev->id);
 	if (sigma_delta->trig == NULL) {
 		ret = -ENOMEM;
 		goto error_ret;
@@ -496,7 +497,6 @@ static int ad_sd_probe_trigger(struct iio_dev *indio_dev)
 		sigma_delta->irq_dis = true;
 		disable_irq_nosync(sigma_delta->spi->irq);
 	}
-	sigma_delta->trig->dev.parent = &sigma_delta->spi->dev;
 	iio_trigger_set_drvdata(sigma_delta->trig, sigma_delta);
 
 	ret = iio_trigger_register(sigma_delta->trig);
diff --git a/drivers/iio/adc/at91_adc.c b/drivers/iio/adc/at91_adc.c
index 70750abb5dea..0b5f0c91d0d7 100644
--- a/drivers/iio/adc/at91_adc.c
+++ b/drivers/iio/adc/at91_adc.c
@@ -625,12 +625,11 @@ static struct iio_trigger *at91_adc_allocate_trigger(struct iio_dev *idev,
 	struct iio_trigger *trig;
 	int ret;
 
-	trig = iio_trigger_alloc("%s-dev%d-%s", idev->name,
+	trig = iio_trigger_alloc(idev->dev.parent, "%s-dev%d-%s", idev->name,
 				 idev->id, trigger->name);
 	if (trig == NULL)
 		return NULL;
 
-	trig->dev.parent = idev->dev.parent;
 	iio_trigger_set_drvdata(trig, idev);
 	trig->ops = &at91_adc_trigger_ops;
 
diff --git a/drivers/iio/common/hid-sensors/hid-sensor-trigger.c b/drivers/iio/common/hid-sensors/hid-sensor-trigger.c
index 064c32bec9c7..95ddccb44f1c 100644
--- a/drivers/iio/common/hid-sensors/hid-sensor-trigger.c
+++ b/drivers/iio/common/hid-sensors/hid-sensor-trigger.c
@@ -255,14 +255,14 @@ int hid_sensor_setup_trigger(struct iio_dev *indio_dev, const char *name,
 		return ret;
 	}
 
-	trig = iio_trigger_alloc("%s-dev%d", name, indio_dev->id);
+	trig = iio_trigger_alloc(indio_dev->dev.parent,
+				 "%s-dev%d", name, indio_dev->id);
 	if (trig == NULL) {
 		dev_err(&indio_dev->dev, "Trigger Allocate Failed\n");
 		ret = -ENOMEM;
 		goto error_triggered_buffer_cleanup;
 	}
 
-	trig->dev.parent = indio_dev->dev.parent;
 	iio_trigger_set_drvdata(trig, attrb);
 	trig->ops = &hid_sensor_trigger_ops;
 	ret = iio_trigger_register(trig);
diff --git a/drivers/iio/common/st_sensors/st_sensors_trigger.c b/drivers/iio/common/st_sensors/st_sensors_trigger.c
index 2dbd2646e44e..0b511665dee5 100644
--- a/drivers/iio/common/st_sensors/st_sensors_trigger.c
+++ b/drivers/iio/common/st_sensors/st_sensors_trigger.c
@@ -123,7 +123,8 @@ int st_sensors_allocate_trigger(struct iio_dev *indio_dev,
 	unsigned long irq_trig;
 	int err;
 
-	sdata->trig = iio_trigger_alloc("%s-trigger", indio_dev->name);
+	sdata->trig = iio_trigger_alloc(sdata->dev, "%s-trigger",
+					indio_dev->name);
 	if (sdata->trig == NULL) {
 		dev_err(&indio_dev->dev, "failed to allocate iio trigger.\n");
 		return -ENOMEM;
@@ -131,7 +132,6 @@ int st_sensors_allocate_trigger(struct iio_dev *indio_dev,
 
 	iio_trigger_set_drvdata(sdata->trig, indio_dev);
 	sdata->trig->ops = trigger_ops;
-	sdata->trig->dev.parent = sdata->dev;
 
 	irq_trig = irqd_get_trigger_type(irq_get_irq_data(sdata->irq));
 	/*
diff --git a/drivers/iio/gyro/itg3200_buffer.c b/drivers/iio/gyro/itg3200_buffer.c
index 1c3c1bd53374..af0aaa146f0c 100644
--- a/drivers/iio/gyro/itg3200_buffer.c
+++ b/drivers/iio/gyro/itg3200_buffer.c
@@ -113,7 +113,7 @@ int itg3200_probe_trigger(struct iio_dev *indio_dev)
 	int ret;
 	struct itg3200 *st = iio_priv(indio_dev);
 
-	st->trig = iio_trigger_alloc("%s-dev%d", indio_dev->name,
+	st->trig = iio_trigger_alloc(&st->i2c->dev, "%s-dev%d", indio_dev->name,
 				     indio_dev->id);
 	if (!st->trig)
 		return -ENOMEM;
@@ -127,7 +127,6 @@ int itg3200_probe_trigger(struct iio_dev *indio_dev)
 		goto error_free_trig;
 
 
-	st->trig->dev.parent = &st->i2c->dev;
 	st->trig->ops = &itg3200_trigger_ops;
 	iio_trigger_set_drvdata(st->trig, indio_dev);
 	ret = iio_trigger_register(st->trig);
diff --git a/drivers/iio/industrialio-trigger.c b/drivers/iio/industrialio-trigger.c
index a035d5c2a445..32ac1bec25e3 100644
--- a/drivers/iio/industrialio-trigger.c
+++ b/drivers/iio/industrialio-trigger.c
@@ -514,8 +514,10 @@ static void iio_trig_subirqunmask(struct irq_data *d)
 	trig->subirqs[d->irq - trig->subirq_base].enabled = true;
 }
 
-static __printf(1, 0)
-struct iio_trigger *viio_trigger_alloc(const char *fmt, va_list vargs)
+static __printf(2, 0)
+struct iio_trigger *viio_trigger_alloc(struct device *parent,
+				       const char *fmt,
+				       va_list vargs)
 {
 	struct iio_trigger *trig;
 	int i;
@@ -524,6 +526,7 @@ struct iio_trigger *viio_trigger_alloc(const char *fmt, va_list vargs)
 	if (!trig)
 		return NULL;
 
+	trig->dev.parent = parent;
 	trig->dev.type = &iio_trig_type;
 	trig->dev.bus = &iio_bus_type;
 	device_initialize(&trig->dev);
@@ -559,13 +562,23 @@ free_trig:
 	return NULL;
 }
 
-struct iio_trigger *iio_trigger_alloc(const char *fmt, ...)
+/**
+ * iio_trigger_alloc - Allocate a trigger
+ * @parent:		Device to allocate iio_trigger for
+ * @fmt:		trigger name format. If it includes format
+ *			specifiers, the additional arguments following
+ *			format are formatted and inserted in the resulting
+ *			string replacing their respective specifiers.
+ * RETURNS:
+ * Pointer to allocated iio_trigger on success, NULL on failure.
+ */
+struct iio_trigger *iio_trigger_alloc(struct device *parent, const char *fmt, ...)
 {
 	struct iio_trigger *trig;
 	va_list vargs;
 
 	va_start(vargs, fmt);
-	trig = viio_trigger_alloc(fmt, vargs);
+	trig = viio_trigger_alloc(parent, fmt, vargs);
 	va_end(vargs);
 
 	return trig;
@@ -586,20 +599,19 @@ static void devm_iio_trigger_release(struct device *dev, void *res)
 
 /**
  * devm_iio_trigger_alloc - Resource-managed iio_trigger_alloc()
- * @dev:		Device to allocate iio_trigger for
+ * Managed iio_trigger_alloc.  iio_trigger allocated with this function is
+ * automatically freed on driver detach.
+ * @parent:		Device to allocate iio_trigger for
  * @fmt:		trigger name format. If it includes format
  *			specifiers, the additional arguments following
  *			format are formatted and inserted in the resulting
  *			string replacing their respective specifiers.
  *
- * Managed iio_trigger_alloc.  iio_trigger allocated with this function is
- * automatically freed on driver detach.
  *
  * RETURNS:
  * Pointer to allocated iio_trigger on success, NULL on failure.
  */
-struct iio_trigger *devm_iio_trigger_alloc(struct device *dev,
-						const char *fmt, ...)
+struct iio_trigger *devm_iio_trigger_alloc(struct device *parent, const char *fmt, ...)
 {
 	struct iio_trigger **ptr, *trig;
 	va_list vargs;
@@ -611,11 +623,11 @@ struct iio_trigger *devm_iio_trigger_alloc(struct device *dev,
 
 	/* use raw alloc_dr for kmalloc caller tracing */
 	va_start(vargs, fmt);
-	trig = viio_trigger_alloc(fmt, vargs);
+	trig = viio_trigger_alloc(parent, fmt, vargs);
 	va_end(vargs);
 	if (trig) {
 		*ptr = trig;
-		devres_add(dev, ptr);
+		devres_add(parent, ptr);
 	} else {
 		devres_free(ptr);
 	}
diff --git a/drivers/iio/trigger/iio-trig-hrtimer.c b/drivers/iio/trigger/iio-trig-hrtimer.c
index e68a2c56d459..51e362f091c2 100644
--- a/drivers/iio/trigger/iio-trig-hrtimer.c
+++ b/drivers/iio/trigger/iio-trig-hrtimer.c
@@ -135,7 +135,7 @@ static struct iio_sw_trigger *iio_trig_hrtimer_probe(const char *name)
 	if (!trig_info)
 		return ERR_PTR(-ENOMEM);
 
-	trig_info->swt.trigger = iio_trigger_alloc("%s", name);
+	trig_info->swt.trigger = iio_trigger_alloc(NULL, "%s", name);
 	if (!trig_info->swt.trigger) {
 		ret = -ENOMEM;
 		goto err_free_trig_info;
diff --git a/drivers/iio/trigger/iio-trig-interrupt.c b/drivers/iio/trigger/iio-trig-interrupt.c
index 94a487caf421..f746c460bf2a 100644
--- a/drivers/iio/trigger/iio-trig-interrupt.c
+++ b/drivers/iio/trigger/iio-trig-interrupt.c
@@ -45,7 +45,7 @@ static int iio_interrupt_trigger_probe(struct platform_device *pdev)
 
 	irq = irq_res->start;
 
-	trig = iio_trigger_alloc("irqtrig%d", irq);
+	trig = iio_trigger_alloc(NULL, "irqtrig%d", irq);
 	if (!trig) {
 		ret = -ENOMEM;
 		goto error_ret;
diff --git a/drivers/iio/trigger/iio-trig-loop.c b/drivers/iio/trigger/iio-trig-loop.c
index 4a00668e3258..96ec06bbe546 100644
--- a/drivers/iio/trigger/iio-trig-loop.c
+++ b/drivers/iio/trigger/iio-trig-loop.c
@@ -84,7 +84,7 @@ static struct iio_sw_trigger *iio_trig_loop_probe(const char *name)
 	if (!trig_info)
 		return ERR_PTR(-ENOMEM);
 
-	trig_info->swt.trigger = iio_trigger_alloc("%s", name);
+	trig_info->swt.trigger = iio_trigger_alloc(NULL, "%s", name);
 	if (!trig_info->swt.trigger) {
 		ret = -ENOMEM;
 		goto err_free_trig_info;
diff --git a/drivers/iio/trigger/iio-trig-sysfs.c b/drivers/iio/trigger/iio-trig-sysfs.c
index 0f6b512a5c37..e9adfff45b39 100644
--- a/drivers/iio/trigger/iio-trig-sysfs.c
+++ b/drivers/iio/trigger/iio-trig-sysfs.c
@@ -149,7 +149,7 @@ static int iio_sysfs_trigger_probe(int id)
 		goto out1;
 	}
 	t->id = id;
-	t->trig = iio_trigger_alloc("sysfstrig%d", id);
+	t->trig = iio_trigger_alloc(&iio_sysfs_trig_dev, "sysfstrig%d", id);
 	if (!t->trig) {
 		ret = -ENOMEM;
 		goto free_t;
@@ -157,7 +157,6 @@ static int iio_sysfs_trigger_probe(int id)
 
 	t->trig->dev.groups = iio_sysfs_trigger_attr_groups;
 	t->trig->ops = &iio_sysfs_trigger_ops;
-	t->trig->dev.parent = &iio_sysfs_trig_dev;
 	iio_trigger_set_drvdata(t->trig, t);
 
 	t->work = IRQ_WORK_INIT_HARD(iio_sysfs_trigger_work);
diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index f8585d01fc76..f2d65e2e88b6 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -693,7 +693,7 @@ static inline void *iio_priv(const struct iio_dev *indio_dev)
 void iio_device_free(struct iio_dev *indio_dev);
 struct iio_dev *devm_iio_device_alloc(struct device *parent, int sizeof_priv);
 __printf(2, 3)
-struct iio_trigger *devm_iio_trigger_alloc(struct device *dev,
+struct iio_trigger *devm_iio_trigger_alloc(struct device *parent,
 					   const char *fmt, ...);
 /**
  * iio_buffer_enabled() - helper function to test if the buffer is enabled
diff --git a/include/linux/iio/trigger.h b/include/linux/iio/trigger.h
index 055890b6ffcf..096f68dd2e0c 100644
--- a/include/linux/iio/trigger.h
+++ b/include/linux/iio/trigger.h
@@ -161,7 +161,8 @@ void iio_trigger_poll_chained(struct iio_trigger *trig);
 
 irqreturn_t iio_trigger_generic_data_rdy_poll(int irq, void *private);
 
-__printf(1, 2) struct iio_trigger *iio_trigger_alloc(const char *fmt, ...);
+__printf(2, 3)
+struct iio_trigger *iio_trigger_alloc(struct device *parent, const char *fmt, ...);
 void iio_trigger_free(struct iio_trigger *trig);
 
 /**
-- 
cgit v1.2.3


From 635ef601b2387c3215252c9931786524d122c0e7 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Mon, 8 Mar 2021 11:02:18 +0100
Subject: iio: Provide iio_read_channel_processed_scale() API

Since the old iio_read_channel_processed() would
lose precision if we fall back to reading raw and
scaling, we introduce a new API that will pass in
a scale factor when reading a processed channel:
iio_read_channel_processed_scale().

Refactor iio_read_channel_processed() as a special
case with scale factor 1.

Cc: Peter Rosin <peda@axentia.se>
Cc: Chris Lesiak <chris.lesiak@licor.com>
Cc: Jonathan Cameron <jic23@kernel.org>
Cc: linux-iio@vger.kernel.org
Link: https://lore.kernel.org/linux-iio/20201224011607.1059534-1-linus.walleij@linaro.org/
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20210308100219.2732156-1-linus.walleij@linaro.org
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/inkern.c         | 16 ++++++++++++++--
 include/linux/iio/consumer.h | 15 +++++++++++++++
 2 files changed, 29 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/inkern.c b/drivers/iio/inkern.c
index db77a2d4a56b..c61fc06f98b8 100644
--- a/drivers/iio/inkern.c
+++ b/drivers/iio/inkern.c
@@ -688,7 +688,8 @@ int iio_read_channel_offset(struct iio_channel *chan, int *val, int *val2)
 }
 EXPORT_SYMBOL_GPL(iio_read_channel_offset);
 
-int iio_read_channel_processed(struct iio_channel *chan, int *val)
+int iio_read_channel_processed_scale(struct iio_channel *chan, int *val,
+				     unsigned int scale)
 {
 	int ret;
 
@@ -701,11 +702,15 @@ int iio_read_channel_processed(struct iio_channel *chan, int *val)
 	if (iio_channel_has_info(chan->channel, IIO_CHAN_INFO_PROCESSED)) {
 		ret = iio_channel_read(chan, val, NULL,
 				       IIO_CHAN_INFO_PROCESSED);
+		if (ret)
+			goto err_unlock;
+		*val *= scale;
 	} else {
 		ret = iio_channel_read(chan, val, NULL, IIO_CHAN_INFO_RAW);
 		if (ret < 0)
 			goto err_unlock;
-		ret = iio_convert_raw_to_processed_unlocked(chan, *val, val, 1);
+		ret = iio_convert_raw_to_processed_unlocked(chan, *val, val,
+							    scale);
 	}
 
 err_unlock:
@@ -713,6 +718,13 @@ err_unlock:
 
 	return ret;
 }
+EXPORT_SYMBOL_GPL(iio_read_channel_processed_scale);
+
+int iio_read_channel_processed(struct iio_channel *chan, int *val)
+{
+	/* This is just a special case with scale factor 1 */
+	return iio_read_channel_processed_scale(chan, val, 1);
+}
 EXPORT_SYMBOL_GPL(iio_read_channel_processed);
 
 int iio_read_channel_scale(struct iio_channel *chan, int *val, int *val2)
diff --git a/include/linux/iio/consumer.h b/include/linux/iio/consumer.h
index 0a90ba8fa1bb..5fa5957586cf 100644
--- a/include/linux/iio/consumer.h
+++ b/include/linux/iio/consumer.h
@@ -241,6 +241,21 @@ int iio_read_channel_average_raw(struct iio_channel *chan, int *val);
  */
 int iio_read_channel_processed(struct iio_channel *chan, int *val);
 
+/**
+ * iio_read_channel_processed_scale() - read and scale a processed value
+ * @chan:		The channel being queried.
+ * @val:		Value read back.
+ * @scale:		Scale factor to apply during the conversion
+ *
+ * Returns an error code or 0.
+ *
+ * This function will read a processed value from a channel. This will work
+ * like @iio_read_channel_processed() but also scale with an additional
+ * scale factor while attempting to minimize any precision loss.
+ */
+int iio_read_channel_processed_scale(struct iio_channel *chan, int *val,
+				     unsigned int scale);
+
 /**
  * iio_write_channel_attribute() - Write values to the device attribute.
  * @chan:	The channel being queried.
-- 
cgit v1.2.3


From 0a21526bc1d41456f1b320cce35c9c66238fb1c9 Mon Sep 17 00:00:00 2001
From: Alexandru Ardelean <aardelean@deviqon.com>
Date: Thu, 11 Mar 2021 11:10:42 +0200
Subject: iio: kfifo: add devm_iio_triggered_buffer_setup_ext variant

This is similar to the {devm_}iio_triggered_buffer_setup_ext variants added
via commit 5164c7889857 ("iio: triggered-buffer: add
{devm_}iio_triggered_buffer_setup_ext variants").

These can be used to pass extra buffer attributes to the buffer object.
This is a bit of a temporary mechanism (hopefully) so that drivers that want
to allocate a kfifo buffer with extra buffer attributes don't need to
include 'buffer_impl.h' directly. This can also become an API function (in
its own right, unfortunately), but it may be a little less bad vs drivers
having to include 'buffer_impl.h'.

So far, the drivers that want to pass buffer attributes all have to do
with some HW FIFO attributes, so there may be a chance of unifying them
into IIO core somehow (as some standard API). But, until that happens, we
just need to let them register their HW FIFO attributes directly (without
having to let them include 'buffer_impl.h' directly).

Signed-off-by: Alexandru Ardelean <aardelean@deviqon.com>
Link: https://lore.kernel.org/r/20210311091042.22417-1-aardelean@deviqon.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/buffer/kfifo_buf.c | 16 ++++++++++------
 include/linux/iio/kfifo_buf.h  | 12 ++++++++----
 2 files changed, 18 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/buffer/kfifo_buf.c b/drivers/iio/buffer/kfifo_buf.c
index d72cfb354be6..516eb3465de1 100644
--- a/drivers/iio/buffer/kfifo_buf.c
+++ b/drivers/iio/buffer/kfifo_buf.c
@@ -206,22 +206,24 @@ static struct iio_buffer *devm_iio_kfifo_allocate(struct device *dev)
 }
 
 /**
- * devm_iio_kfifo_buffer_setup - Allocate a kfifo buffer & attach it to an IIO device
+ * devm_iio_kfifo_buffer_setup_ext - Allocate a kfifo buffer & attach it to an IIO device
  * @dev: Device object to which to attach the life-time of this kfifo buffer
  * @indio_dev: The device the buffer should be attached to
  * @mode_flags: The mode flags for this buffer (INDIO_BUFFER_SOFTWARE and/or
  *		INDIO_BUFFER_TRIGGERED).
  * @setup_ops: The setup_ops required to configure the HW part of the buffer (optional)
+ * @buffer_attrs: Extra sysfs buffer attributes for this IIO buffer
  *
  * This function allocates a kfifo buffer via devm_iio_kfifo_allocate() and
  * attaches it to the IIO device via iio_device_attach_buffer().
  * This is meant to be a bit of a short-hand/helper function as there are a few
  * drivers that seem to do this.
  */
-int devm_iio_kfifo_buffer_setup(struct device *dev,
-				struct iio_dev *indio_dev,
-				int mode_flags,
-				const struct iio_buffer_setup_ops *setup_ops)
+int devm_iio_kfifo_buffer_setup_ext(struct device *dev,
+				    struct iio_dev *indio_dev,
+				    int mode_flags,
+				    const struct iio_buffer_setup_ops *setup_ops,
+				    const struct attribute **buffer_attrs)
 {
 	struct iio_buffer *buffer;
 
@@ -237,8 +239,10 @@ int devm_iio_kfifo_buffer_setup(struct device *dev,
 	indio_dev->modes |= mode_flags;
 	indio_dev->setup_ops = setup_ops;
 
+	buffer->attrs = buffer_attrs;
+
 	return iio_device_attach_buffer(indio_dev, buffer);
 }
-EXPORT_SYMBOL_GPL(devm_iio_kfifo_buffer_setup);
+EXPORT_SYMBOL_GPL(devm_iio_kfifo_buffer_setup_ext);
 
 MODULE_LICENSE("GPL");
diff --git a/include/linux/iio/kfifo_buf.h b/include/linux/iio/kfifo_buf.h
index 1522896e1daf..ccd2ceae7b25 100644
--- a/include/linux/iio/kfifo_buf.h
+++ b/include/linux/iio/kfifo_buf.h
@@ -10,9 +10,13 @@ struct device;
 struct iio_buffer *iio_kfifo_allocate(void);
 void iio_kfifo_free(struct iio_buffer *r);
 
-int devm_iio_kfifo_buffer_setup(struct device *dev,
-				struct iio_dev *indio_dev,
-				int mode_flags,
-				const struct iio_buffer_setup_ops *setup_ops);
+int devm_iio_kfifo_buffer_setup_ext(struct device *dev,
+				    struct iio_dev *indio_dev,
+				    int mode_flags,
+				    const struct iio_buffer_setup_ops *setup_ops,
+				    const struct attribute **buffer_attrs);
+
+#define devm_iio_kfifo_buffer_setup(dev, indio_dev, mode_flags, setup_ops)	\
+	devm_iio_kfifo_buffer_setup_ext((dev), (indio_dev), (mode_flags), (setup_ops), NULL)
 
 #endif
-- 
cgit v1.2.3


From 80346b2b55fcbb042acd0b90120004da8738101f Mon Sep 17 00:00:00 2001
From: Gwendal Grignou <gwendal@chromium.org>
Date: Thu, 18 Mar 2021 11:48:57 -0700
Subject: iio: cros: unify hw fifo attributes without API changes

Commit 2e2366c2d141 ("iio: cros_ec: unify hw fifo attributes into the core file")
should be reverted as it sets the buffer extended attributes at
the wrong place. However, reverting it would require reverting more
commits:
commit 165aea80e2e2 ("iio: cros_ec: use devm_iio_triggered_buffer_setup_ext()")
commit 21232b4456ba ("iio: buffer: remove iio_buffer_set_attrs() helper")
and we would still have conflicts with more recent development:
commit ee708e6baacd ("iio: buffer: introduce support for attaching more IIO buffers")

Instead, this commit reverts the first 2 commits without re-adding
iio_buffer_set_attrs() and sets the buffer extended attributes at the
right place:

1. Instead of adding has_hw_fifo, deduce it from the configuration:
- EC must support FIFO (EC_FEATURE_MOTION_SENSE_FIFO set).
- sensors send data at a regular interval (accelerometer, gyro,
  magnetometer, barometer, light sensor).
- "Legacy accelerometer" is only present on EC without FIFO, so we don't
need to set buffer attributes.

2. devm_iio_triggered_buffer_setup_ext() does not need to be called when
EC does not support FIFO, as there is no FIFO to manage.

3. Use devm_iio_triggered_buffer_setup_ext() when EC has a FIFO to
specify the buffer extended attributes.

Fixes: 2e2366c2d141 ("iio: cros_ec: unify hw fifo attributes into the core file")
Fixes: 165aea80e2e2 ("iio: cros_ec: use devm_iio_triggered_buffer_setup_ext()")
Signed-off-by: Gwendal Grignou <gwendal@chromium.org>
Reviewed-by: Alexandru Ardelean <ardeleanalex@gmail.com>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Link: https://lore.kernel.org/r/20210318184857.2679181-1-gwendal@chromium.org
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/accel/cros_ec_accel_legacy.c           |  2 +-
 .../iio/common/cros_ec_sensors/cros_ec_lid_angle.c |  3 +--
 .../iio/common/cros_ec_sensors/cros_ec_sensors.c   |  3 +--
 .../common/cros_ec_sensors/cros_ec_sensors_core.c  | 24 ++++++++--------------
 drivers/iio/light/cros_ec_light_prox.c             |  3 +--
 drivers/iio/pressure/cros_ec_baro.c                |  3 +--
 include/linux/iio/common/cros_ec_sensors_core.h    |  3 +--
 7 files changed, 14 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/accel/cros_ec_accel_legacy.c b/drivers/iio/accel/cros_ec_accel_legacy.c
index 8f1232c38e0d..b6f3471b62dc 100644
--- a/drivers/iio/accel/cros_ec_accel_legacy.c
+++ b/drivers/iio/accel/cros_ec_accel_legacy.c
@@ -215,7 +215,7 @@ static int cros_ec_accel_legacy_probe(struct platform_device *pdev)
 		return -ENOMEM;
 
 	ret = cros_ec_sensors_core_init(pdev, indio_dev, true,
-					cros_ec_sensors_capture, NULL, false);
+					cros_ec_sensors_capture, NULL);
 	if (ret)
 		return ret;
 
diff --git a/drivers/iio/common/cros_ec_sensors/cros_ec_lid_angle.c b/drivers/iio/common/cros_ec_sensors/cros_ec_lid_angle.c
index 752f59037715..af801e203623 100644
--- a/drivers/iio/common/cros_ec_sensors/cros_ec_lid_angle.c
+++ b/drivers/iio/common/cros_ec_sensors/cros_ec_lid_angle.c
@@ -97,8 +97,7 @@ static int cros_ec_lid_angle_probe(struct platform_device *pdev)
 	if (!indio_dev)
 		return -ENOMEM;
 
-	ret = cros_ec_sensors_core_init(pdev, indio_dev, false, NULL,
-					NULL, false);
+	ret = cros_ec_sensors_core_init(pdev, indio_dev, false, NULL, NULL);
 	if (ret)
 		return ret;
 
diff --git a/drivers/iio/common/cros_ec_sensors/cros_ec_sensors.c b/drivers/iio/common/cros_ec_sensors/cros_ec_sensors.c
index dee1191de752..376a5b30010a 100644
--- a/drivers/iio/common/cros_ec_sensors/cros_ec_sensors.c
+++ b/drivers/iio/common/cros_ec_sensors/cros_ec_sensors.c
@@ -236,8 +236,7 @@ static int cros_ec_sensors_probe(struct platform_device *pdev)
 
 	ret = cros_ec_sensors_core_init(pdev, indio_dev, true,
 					cros_ec_sensors_capture,
-					cros_ec_sensors_push_data,
-					true);
+					cros_ec_sensors_push_data);
 	if (ret)
 		return ret;
 
diff --git a/drivers/iio/common/cros_ec_sensors/cros_ec_sensors_core.c b/drivers/iio/common/cros_ec_sensors/cros_ec_sensors_core.c
index a266eedd9ec1..28bde13003b7 100644
--- a/drivers/iio/common/cros_ec_sensors/cros_ec_sensors_core.c
+++ b/drivers/iio/common/cros_ec_sensors/cros_ec_sensors_core.c
@@ -12,6 +12,7 @@
 #include <linux/iio/iio.h>
 #include <linux/iio/kfifo_buf.h>
 #include <linux/iio/sysfs.h>
+#include <linux/iio/trigger.h>
 #include <linux/iio/trigger_consumer.h>
 #include <linux/iio/triggered_buffer.h>
 #include <linux/kernel.h>
@@ -240,7 +241,6 @@ static void cros_ec_sensors_core_clean(void *arg)
  *    for backward compatibility.
  * @push_data:          function to call when cros_ec_sensorhub receives
  *    a sample for that sensor.
- * @has_hw_fifo:	Set true if this device has/uses a HW FIFO
  *
  * Return: 0 on success, -errno on failure.
  */
@@ -248,8 +248,7 @@ int cros_ec_sensors_core_init(struct platform_device *pdev,
 			      struct iio_dev *indio_dev,
 			      bool physical_device,
 			      cros_ec_sensors_capture_t trigger_capture,
-			      cros_ec_sensorhub_push_data_cb_t push_data,
-			      bool has_hw_fifo)
+			      cros_ec_sensorhub_push_data_cb_t push_data)
 {
 	struct device *dev = &pdev->dev;
 	struct cros_ec_sensors_core_state *state = iio_priv(indio_dev);
@@ -334,9 +333,9 @@ int cros_ec_sensors_core_init(struct platform_device *pdev,
 			 * We can not use trigger here, as events are generated
 			 * as soon as sample_frequency is set.
 			 */
-			ret = devm_iio_kfifo_buffer_setup(dev, indio_dev,
-							  INDIO_BUFFER_SOFTWARE,
-							  NULL);
+			ret = devm_iio_kfifo_buffer_setup_ext(dev, indio_dev,
+							      INDIO_BUFFER_SOFTWARE, NULL,
+							      cros_ec_sensor_fifo_attributes);
 			if (ret)
 				return ret;
 
@@ -355,21 +354,14 @@ int cros_ec_sensors_core_init(struct platform_device *pdev,
 			ret = iio_device_set_clock(indio_dev, CLOCK_BOOTTIME);
 			if (ret)
 				return ret;
-		} else {
-			const struct attribute **fifo_attrs;
-
-			if (has_hw_fifo)
-				fifo_attrs = cros_ec_sensor_fifo_attributes;
-			else
-				fifo_attrs = NULL;
 
+		} else {
 			/*
 			 * The only way to get samples in buffer is to set a
 			 * software trigger (systrig, hrtimer).
 			 */
-			ret = devm_iio_triggered_buffer_setup_ext(
-					dev, indio_dev, NULL, trigger_capture,
-					NULL, fifo_attrs);
+			ret = devm_iio_triggered_buffer_setup(dev, indio_dev,
+					NULL, trigger_capture, NULL);
 			if (ret)
 				return ret;
 		}
diff --git a/drivers/iio/light/cros_ec_light_prox.c b/drivers/iio/light/cros_ec_light_prox.c
index 75d6b5fcf2cc..de472f23d1cb 100644
--- a/drivers/iio/light/cros_ec_light_prox.c
+++ b/drivers/iio/light/cros_ec_light_prox.c
@@ -182,8 +182,7 @@ static int cros_ec_light_prox_probe(struct platform_device *pdev)
 
 	ret = cros_ec_sensors_core_init(pdev, indio_dev, true,
 					cros_ec_sensors_capture,
-					cros_ec_sensors_push_data,
-					true);
+					cros_ec_sensors_push_data);
 	if (ret)
 		return ret;
 
diff --git a/drivers/iio/pressure/cros_ec_baro.c b/drivers/iio/pressure/cros_ec_baro.c
index aa043cb9ac42..2f882e109423 100644
--- a/drivers/iio/pressure/cros_ec_baro.c
+++ b/drivers/iio/pressure/cros_ec_baro.c
@@ -139,8 +139,7 @@ static int cros_ec_baro_probe(struct platform_device *pdev)
 
 	ret = cros_ec_sensors_core_init(pdev, indio_dev, true,
 					cros_ec_sensors_capture,
-					cros_ec_sensors_push_data,
-					true);
+					cros_ec_sensors_push_data);
 	if (ret)
 		return ret;
 
diff --git a/include/linux/iio/common/cros_ec_sensors_core.h b/include/linux/iio/common/cros_ec_sensors_core.h
index c9b80be82440..7ce8a8adad58 100644
--- a/include/linux/iio/common/cros_ec_sensors_core.h
+++ b/include/linux/iio/common/cros_ec_sensors_core.h
@@ -96,8 +96,7 @@ struct platform_device;
 int cros_ec_sensors_core_init(struct platform_device *pdev,
 			      struct iio_dev *indio_dev, bool physical_device,
 			      cros_ec_sensors_capture_t trigger_capture,
-			      cros_ec_sensorhub_push_data_cb_t push_data,
-			      bool has_hw_fifo);
+			      cros_ec_sensorhub_push_data_cb_t push_data);
 
 irqreturn_t cros_ec_sensors_capture(int irq, void *p);
 int cros_ec_sensors_push_data(struct iio_dev *indio_dev,
-- 
cgit v1.2.3


From 5d9034938720a15fa0f62db3e195c0c473c72c1b Mon Sep 17 00:00:00 2001
From: Ricardo Ribalda <ribalda@chromium.org>
Date: Thu, 18 Mar 2021 21:22:22 +0100
Subject: bpf: Fix typo 'accesible' into 'accessible'

Trivial fix.

Signed-off-by: Ricardo Ribalda <ribalda@chromium.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210318202223.164873-8-ribalda@chromium.org
---
 include/linux/bpf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 39dce9d3c3a5..24678d6ecbcf 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -56,7 +56,7 @@ struct bpf_iter_seq_info {
 	u32 seq_priv_size;
 };
 
-/* map is generic key/value storage optionally accesible by eBPF programs */
+/* map is generic key/value storage optionally accessible by eBPF programs */
 struct bpf_map_ops {
 	/* funcs callable from userspace (via syscall) */
 	int (*map_alloc_check)(union bpf_attr *attr);
-- 
cgit v1.2.3


From 8532f613bc78b6e0e32b486e720848d3f5569287 Mon Sep 17 00:00:00 2001
From: Ong Boon Leong <boon.leong.ong@intel.com>
Date: Fri, 26 Mar 2021 01:39:14 +0800
Subject: net: stmmac: introduce MSI Interrupt routines for mac, safety, RX &
 TX

Now we introduce MSI interrupt service routines and hook these routines
up if stmmac_open() sees valid irq line being requested:-

stmmac_mac_interrupt()    :- MAC (dev->irq), WOL (wol_irq), LPI (lpi_irq)
stmmac_safety_interrupt() :- Safety Feat Correctible Error (sfty_ce_irq)
                             & Uncorrectible Error (sfty_ue_irq)
stmmac_msi_intr_rx()      :- For all RX MSI irq (rx_irq)
stmmac_msi_intr_tx()      :- For all TX MSI irq (tx_irq)

Each of the IRQs will have its own unique name so that we can differentiate
them easily under /proc/interrupts.

Signed-off-by: Ong Boon Leong <boon.leong.ong@intel.com>
Signed-off-by: Voon Weifeng <weifeng.voon@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/common.h      |  15 +
 drivers/net/ethernet/stmicro/stmmac/stmmac.h      |  16 +
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 437 +++++++++++++++++++---
 include/linux/stmmac.h                            |   8 +
 4 files changed, 431 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h
index 5afb36a5c94c..c54a56b732b3 100644
--- a/drivers/net/ethernet/stmicro/stmmac/common.h
+++ b/drivers/net/ethernet/stmicro/stmmac/common.h
@@ -259,6 +259,9 @@ struct stmmac_safety_stats {
 #define DMA_HW_FEAT_ACTPHYIF	0x70000000	/* Active/selected PHY iface */
 #define DEFAULT_DMA_PBL		8
 
+/* MSI defines */
+#define STMMAC_MSI_VEC_MAX	32
+
 /* PCS status and mask defines */
 #define	PCS_ANE_IRQ		BIT(2)	/* PCS Auto-Negotiation */
 #define	PCS_LINK_IRQ		BIT(1)	/* PCS Link */
@@ -315,6 +318,18 @@ enum dma_irq_dir {
 	DMA_DIR_RXTX = 0x3,
 };
 
+enum request_irq_err {
+	REQ_IRQ_ERR_ALL,
+	REQ_IRQ_ERR_TX,
+	REQ_IRQ_ERR_RX,
+	REQ_IRQ_ERR_SFTY_UE,
+	REQ_IRQ_ERR_SFTY_CE,
+	REQ_IRQ_ERR_LPI,
+	REQ_IRQ_ERR_WOL,
+	REQ_IRQ_ERR_MAC,
+	REQ_IRQ_ERR_NO,
+};
+
 /* EEE and LPI defines */
 #define	CORE_IRQ_TX_PATH_IN_LPI_MODE	(1 << 0)
 #define	CORE_IRQ_TX_PATH_EXIT_LPI_MODE	(1 << 1)
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
index 4faad331a4ca..9966f6f10905 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
@@ -30,6 +30,10 @@ struct stmmac_resources {
 	int wol_irq;
 	int lpi_irq;
 	int irq;
+	int sfty_ce_irq;
+	int sfty_ue_irq;
+	int rx_irq[MTL_MAX_RX_QUEUES];
+	int tx_irq[MTL_MAX_TX_QUEUES];
 };
 
 struct stmmac_tx_info {
@@ -225,6 +229,18 @@ struct stmmac_priv {
 	void __iomem *mmcaddr;
 	void __iomem *ptpaddr;
 	unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)];
+	int sfty_ce_irq;
+	int sfty_ue_irq;
+	int rx_irq[MTL_MAX_RX_QUEUES];
+	int tx_irq[MTL_MAX_TX_QUEUES];
+	/*irq name */
+	char int_name_mac[IFNAMSIZ + 9];
+	char int_name_wol[IFNAMSIZ + 9];
+	char int_name_lpi[IFNAMSIZ + 9];
+	char int_name_sfty_ce[IFNAMSIZ + 10];
+	char int_name_sfty_ue[IFNAMSIZ + 10];
+	char int_name_rx_irq[MTL_MAX_TX_QUEUES][IFNAMSIZ + 14];
+	char int_name_tx_irq[MTL_MAX_TX_QUEUES][IFNAMSIZ + 18];
 
 #ifdef CONFIG_DEBUG_FS
 	struct dentry *dbgfs_dir;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index abe990b9b07b..459477db455c 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -105,6 +105,11 @@ module_param(chain_mode, int, 0444);
 MODULE_PARM_DESC(chain_mode, "To use chain instead of ring mode");
 
 static irqreturn_t stmmac_interrupt(int irq, void *dev_id);
+/* For MSI interrupts handling */
+static irqreturn_t stmmac_mac_interrupt(int irq, void *dev_id);
+static irqreturn_t stmmac_safety_interrupt(int irq, void *dev_id);
+static irqreturn_t stmmac_msi_intr_tx(int irq, void *data);
+static irqreturn_t stmmac_msi_intr_rx(int irq, void *data);
 
 #ifdef CONFIG_DEBUG_FS
 static const struct net_device_ops stmmac_netdev_ops;
@@ -2986,6 +2991,260 @@ static void stmmac_hw_teardown(struct net_device *dev)
 	clk_disable_unprepare(priv->plat->clk_ptp_ref);
 }
 
+static void stmmac_free_irq(struct net_device *dev,
+			    enum request_irq_err irq_err, int irq_idx)
+{
+	struct stmmac_priv *priv = netdev_priv(dev);
+	int j;
+
+	switch (irq_err) {
+	case REQ_IRQ_ERR_ALL:
+		irq_idx = priv->plat->tx_queues_to_use;
+		fallthrough;
+	case REQ_IRQ_ERR_TX:
+		for (j = irq_idx - 1; j >= 0; j--) {
+			if (priv->tx_irq[j] > 0)
+				free_irq(priv->tx_irq[j], &priv->tx_queue[j]);
+		}
+		irq_idx = priv->plat->rx_queues_to_use;
+		fallthrough;
+	case REQ_IRQ_ERR_RX:
+		for (j = irq_idx - 1; j >= 0; j--) {
+			if (priv->rx_irq[j] > 0)
+				free_irq(priv->rx_irq[j], &priv->rx_queue[j]);
+		}
+
+		if (priv->sfty_ue_irq > 0 && priv->sfty_ue_irq != dev->irq)
+			free_irq(priv->sfty_ue_irq, dev);
+		fallthrough;
+	case REQ_IRQ_ERR_SFTY_UE:
+		if (priv->sfty_ce_irq > 0 && priv->sfty_ce_irq != dev->irq)
+			free_irq(priv->sfty_ce_irq, dev);
+		fallthrough;
+	case REQ_IRQ_ERR_SFTY_CE:
+		if (priv->lpi_irq > 0 && priv->lpi_irq != dev->irq)
+			free_irq(priv->lpi_irq, dev);
+		fallthrough;
+	case REQ_IRQ_ERR_LPI:
+		if (priv->wol_irq > 0 && priv->wol_irq != dev->irq)
+			free_irq(priv->wol_irq, dev);
+		fallthrough;
+	case REQ_IRQ_ERR_WOL:
+		free_irq(dev->irq, dev);
+		fallthrough;
+	case REQ_IRQ_ERR_MAC:
+	case REQ_IRQ_ERR_NO:
+		/* If MAC IRQ request error, no more IRQ to free */
+		break;
+	}
+}
+
+static int stmmac_request_irq_multi_msi(struct net_device *dev)
+{
+	enum request_irq_err irq_err = REQ_IRQ_ERR_NO;
+	struct stmmac_priv *priv = netdev_priv(dev);
+	int irq_idx = 0;
+	char *int_name;
+	int ret;
+	int i;
+
+	/* For common interrupt */
+	int_name = priv->int_name_mac;
+	sprintf(int_name, "%s:%s", dev->name, "mac");
+	ret = request_irq(dev->irq, stmmac_mac_interrupt,
+			  0, int_name, dev);
+	if (unlikely(ret < 0)) {
+		netdev_err(priv->dev,
+			   "%s: alloc mac MSI %d (error: %d)\n",
+			   __func__, dev->irq, ret);
+		irq_err = REQ_IRQ_ERR_MAC;
+		goto irq_error;
+	}
+
+	/* Request the Wake IRQ in case of another line
+	 * is used for WoL
+	 */
+	if (priv->wol_irq > 0 && priv->wol_irq != dev->irq) {
+		int_name = priv->int_name_wol;
+		sprintf(int_name, "%s:%s", dev->name, "wol");
+		ret = request_irq(priv->wol_irq,
+				  stmmac_mac_interrupt,
+				  0, int_name, dev);
+		if (unlikely(ret < 0)) {
+			netdev_err(priv->dev,
+				   "%s: alloc wol MSI %d (error: %d)\n",
+				   __func__, priv->wol_irq, ret);
+			irq_err = REQ_IRQ_ERR_WOL;
+			goto irq_error;
+		}
+	}
+
+	/* Request the LPI IRQ in case of another line
+	 * is used for LPI
+	 */
+	if (priv->lpi_irq > 0 && priv->lpi_irq != dev->irq) {
+		int_name = priv->int_name_lpi;
+		sprintf(int_name, "%s:%s", dev->name, "lpi");
+		ret = request_irq(priv->lpi_irq,
+				  stmmac_mac_interrupt,
+				  0, int_name, dev);
+		if (unlikely(ret < 0)) {
+			netdev_err(priv->dev,
+				   "%s: alloc lpi MSI %d (error: %d)\n",
+				   __func__, priv->lpi_irq, ret);
+			irq_err = REQ_IRQ_ERR_LPI;
+			goto irq_error;
+		}
+	}
+
+	/* Request the Safety Feature Correctible Error line in
+	 * case of another line is used
+	 */
+	if (priv->sfty_ce_irq > 0 && priv->sfty_ce_irq != dev->irq) {
+		int_name = priv->int_name_sfty_ce;
+		sprintf(int_name, "%s:%s", dev->name, "safety-ce");
+		ret = request_irq(priv->sfty_ce_irq,
+				  stmmac_safety_interrupt,
+				  0, int_name, dev);
+		if (unlikely(ret < 0)) {
+			netdev_err(priv->dev,
+				   "%s: alloc sfty ce MSI %d (error: %d)\n",
+				   __func__, priv->sfty_ce_irq, ret);
+			irq_err = REQ_IRQ_ERR_SFTY_CE;
+			goto irq_error;
+		}
+	}
+
+	/* Request the Safety Feature Uncorrectible Error line in
+	 * case of another line is used
+	 */
+	if (priv->sfty_ue_irq > 0 && priv->sfty_ue_irq != dev->irq) {
+		int_name = priv->int_name_sfty_ue;
+		sprintf(int_name, "%s:%s", dev->name, "safety-ue");
+		ret = request_irq(priv->sfty_ue_irq,
+				  stmmac_safety_interrupt,
+				  0, int_name, dev);
+		if (unlikely(ret < 0)) {
+			netdev_err(priv->dev,
+				   "%s: alloc sfty ue MSI %d (error: %d)\n",
+				   __func__, priv->sfty_ue_irq, ret);
+			irq_err = REQ_IRQ_ERR_SFTY_UE;
+			goto irq_error;
+		}
+	}
+
+	/* Request Rx MSI irq */
+	for (i = 0; i < priv->plat->rx_queues_to_use; i++) {
+		if (priv->rx_irq[i] == 0)
+			continue;
+
+		int_name = priv->int_name_rx_irq[i];
+		sprintf(int_name, "%s:%s-%d", dev->name, "rx", i);
+		ret = request_irq(priv->rx_irq[i],
+				  stmmac_msi_intr_rx,
+				  0, int_name, &priv->rx_queue[i]);
+		if (unlikely(ret < 0)) {
+			netdev_err(priv->dev,
+				   "%s: alloc rx-%d  MSI %d (error: %d)\n",
+				   __func__, i, priv->rx_irq[i], ret);
+			irq_err = REQ_IRQ_ERR_RX;
+			irq_idx = i;
+			goto irq_error;
+		}
+	}
+
+	/* Request Tx MSI irq */
+	for (i = 0; i < priv->plat->tx_queues_to_use; i++) {
+		if (priv->tx_irq[i] == 0)
+			continue;
+
+		int_name = priv->int_name_tx_irq[i];
+		sprintf(int_name, "%s:%s-%d", dev->name, "tx", i);
+		ret = request_irq(priv->tx_irq[i],
+				  stmmac_msi_intr_tx,
+				  0, int_name, &priv->tx_queue[i]);
+		if (unlikely(ret < 0)) {
+			netdev_err(priv->dev,
+				   "%s: alloc tx-%d  MSI %d (error: %d)\n",
+				   __func__, i, priv->tx_irq[i], ret);
+			irq_err = REQ_IRQ_ERR_TX;
+			irq_idx = i;
+			goto irq_error;
+		}
+	}
+
+	return 0;
+
+irq_error:
+	stmmac_free_irq(dev, irq_err, irq_idx);
+	return ret;
+}
+
+static int stmmac_request_irq_single(struct net_device *dev)
+{
+	enum request_irq_err irq_err = REQ_IRQ_ERR_NO;
+	struct stmmac_priv *priv = netdev_priv(dev);
+	int ret;
+
+	ret = request_irq(dev->irq, stmmac_interrupt,
+			  IRQF_SHARED, dev->name, dev);
+	if (unlikely(ret < 0)) {
+		netdev_err(priv->dev,
+			   "%s: ERROR: allocating the IRQ %d (error: %d)\n",
+			   __func__, dev->irq, ret);
+		irq_err = REQ_IRQ_ERR_MAC;
+		return ret;
+	}
+
+	/* Request the Wake IRQ in case of another line
+	 * is used for WoL
+	 */
+	if (priv->wol_irq > 0 && priv->wol_irq != dev->irq) {
+		ret = request_irq(priv->wol_irq, stmmac_interrupt,
+				  IRQF_SHARED, dev->name, dev);
+		if (unlikely(ret < 0)) {
+			netdev_err(priv->dev,
+				   "%s: ERROR: allocating the WoL IRQ %d (%d)\n",
+				   __func__, priv->wol_irq, ret);
+			irq_err = REQ_IRQ_ERR_WOL;
+			return ret;
+		}
+	}
+
+	/* Request the IRQ lines */
+	if (priv->lpi_irq > 0 && priv->lpi_irq != dev->irq) {
+		ret = request_irq(priv->lpi_irq, stmmac_interrupt,
+				  IRQF_SHARED, dev->name, dev);
+		if (unlikely(ret < 0)) {
+			netdev_err(priv->dev,
+				   "%s: ERROR: allocating the LPI IRQ %d (%d)\n",
+				   __func__, priv->lpi_irq, ret);
+			irq_err = REQ_IRQ_ERR_LPI;
+			goto irq_error;
+		}
+	}
+
+	return 0;
+
+irq_error:
+	stmmac_free_irq(dev, irq_err, 0);
+	return ret;
+}
+
+static int stmmac_request_irq(struct net_device *dev)
+{
+	struct stmmac_priv *priv = netdev_priv(dev);
+	int ret;
+
+	/* Request the IRQ lines */
+	if (priv->plat->multi_msi_en)
+		ret = stmmac_request_irq_multi_msi(dev);
+	else
+		ret = stmmac_request_irq_single(dev);
+
+	return ret;
+}
+
 /**
  *  stmmac_open - open entry point of the driver
  *  @dev : pointer to the device structure.
@@ -3077,50 +3336,15 @@ static int stmmac_open(struct net_device *dev)
 	/* We may have called phylink_speed_down before */
 	phylink_speed_up(priv->phylink);
 
-	/* Request the IRQ lines */
-	ret = request_irq(dev->irq, stmmac_interrupt,
-			  IRQF_SHARED, dev->name, dev);
-	if (unlikely(ret < 0)) {
-		netdev_err(priv->dev,
-			   "%s: ERROR: allocating the IRQ %d (error: %d)\n",
-			   __func__, dev->irq, ret);
+	ret = stmmac_request_irq(dev);
+	if (ret)
 		goto irq_error;
-	}
-
-	/* Request the Wake IRQ in case of another line is used for WoL */
-	if (priv->wol_irq != dev->irq) {
-		ret = request_irq(priv->wol_irq, stmmac_interrupt,
-				  IRQF_SHARED, dev->name, dev);
-		if (unlikely(ret < 0)) {
-			netdev_err(priv->dev,
-				   "%s: ERROR: allocating the WoL IRQ %d (%d)\n",
-				   __func__, priv->wol_irq, ret);
-			goto wolirq_error;
-		}
-	}
-
-	/* Request the IRQ lines */
-	if (priv->lpi_irq > 0) {
-		ret = request_irq(priv->lpi_irq, stmmac_interrupt, IRQF_SHARED,
-				  dev->name, dev);
-		if (unlikely(ret < 0)) {
-			netdev_err(priv->dev,
-				   "%s: ERROR: allocating the LPI IRQ %d (%d)\n",
-				   __func__, priv->lpi_irq, ret);
-			goto lpiirq_error;
-		}
-	}
 
 	stmmac_enable_all_queues(priv);
 	netif_tx_start_all_queues(priv->dev);
 
 	return 0;
 
-lpiirq_error:
-	if (priv->wol_irq != dev->irq)
-		free_irq(priv->wol_irq, dev);
-wolirq_error:
-	free_irq(dev->irq, dev);
 irq_error:
 	phylink_stop(priv->phylink);
 
@@ -3170,11 +3394,7 @@ static int stmmac_release(struct net_device *dev)
 		hrtimer_cancel(&priv->tx_queue[chan].txtimer);
 
 	/* Free the IRQ lines */
-	free_irq(dev->irq, dev);
-	if (priv->wol_irq != dev->irq)
-		free_irq(priv->wol_irq, dev);
-	if (priv->lpi_irq > 0)
-		free_irq(priv->lpi_irq, dev);
+	stmmac_free_irq(dev, REQ_IRQ_ERR_ALL, 0);
 
 	if (priv->eee_enabled) {
 		priv->tx_path_in_lpi_mode = false;
@@ -4477,15 +4697,136 @@ static irqreturn_t stmmac_interrupt(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
+static irqreturn_t stmmac_mac_interrupt(int irq, void *dev_id)
+{
+	struct net_device *dev = (struct net_device *)dev_id;
+	struct stmmac_priv *priv = netdev_priv(dev);
+
+	if (unlikely(!dev)) {
+		netdev_err(priv->dev, "%s: invalid dev pointer\n", __func__);
+		return IRQ_NONE;
+	}
+
+	/* Check if adapter is up */
+	if (test_bit(STMMAC_DOWN, &priv->state))
+		return IRQ_HANDLED;
+
+	/* To handle Common interrupts */
+	stmmac_common_interrupt(priv);
+
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t stmmac_safety_interrupt(int irq, void *dev_id)
+{
+	struct net_device *dev = (struct net_device *)dev_id;
+	struct stmmac_priv *priv = netdev_priv(dev);
+
+	if (unlikely(!dev)) {
+		netdev_err(priv->dev, "%s: invalid dev pointer\n", __func__);
+		return IRQ_NONE;
+	}
+
+	/* Check if adapter is up */
+	if (test_bit(STMMAC_DOWN, &priv->state))
+		return IRQ_HANDLED;
+
+	/* Check if a fatal error happened */
+	stmmac_safety_feat_interrupt(priv);
+
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t stmmac_msi_intr_tx(int irq, void *data)
+{
+	struct stmmac_tx_queue *tx_q = (struct stmmac_tx_queue *)data;
+	int chan = tx_q->queue_index;
+	struct stmmac_priv *priv;
+	int status;
+
+	priv = container_of(tx_q, struct stmmac_priv, tx_queue[chan]);
+
+	if (unlikely(!data)) {
+		netdev_err(priv->dev, "%s: invalid dev pointer\n", __func__);
+		return IRQ_NONE;
+	}
+
+	/* Check if adapter is up */
+	if (test_bit(STMMAC_DOWN, &priv->state))
+		return IRQ_HANDLED;
+
+	status = stmmac_napi_check(priv, chan, DMA_DIR_TX);
+
+	if (unlikely(status & tx_hard_error_bump_tc)) {
+		/* Try to bump up the dma threshold on this failure */
+		if (unlikely(priv->xstats.threshold != SF_DMA_MODE) &&
+		    tc <= 256) {
+			tc += 64;
+			if (priv->plat->force_thresh_dma_mode)
+				stmmac_set_dma_operation_mode(priv,
+							      tc,
+							      tc,
+							      chan);
+			else
+				stmmac_set_dma_operation_mode(priv,
+							      tc,
+							      SF_DMA_MODE,
+							      chan);
+			priv->xstats.threshold = tc;
+		}
+	} else if (unlikely(status == tx_hard_error)) {
+		stmmac_tx_err(priv, chan);
+	}
+
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t stmmac_msi_intr_rx(int irq, void *data)
+{
+	struct stmmac_rx_queue *rx_q = (struct stmmac_rx_queue *)data;
+	int chan = rx_q->queue_index;
+	struct stmmac_priv *priv;
+
+	priv = container_of(rx_q, struct stmmac_priv, rx_queue[chan]);
+
+	if (unlikely(!data)) {
+		netdev_err(priv->dev, "%s: invalid dev pointer\n", __func__);
+		return IRQ_NONE;
+	}
+
+	/* Check if adapter is up */
+	if (test_bit(STMMAC_DOWN, &priv->state))
+		return IRQ_HANDLED;
+
+	stmmac_napi_check(priv, chan, DMA_DIR_RX);
+
+	return IRQ_HANDLED;
+}
+
 #ifdef CONFIG_NET_POLL_CONTROLLER
 /* Polling receive - used by NETCONSOLE and other diagnostic tools
  * to allow network I/O with interrupts disabled.
  */
 static void stmmac_poll_controller(struct net_device *dev)
 {
-	disable_irq(dev->irq);
-	stmmac_interrupt(dev->irq, dev);
-	enable_irq(dev->irq);
+	struct stmmac_priv *priv = netdev_priv(dev);
+	int i;
+
+	/* If adapter is down, do nothing */
+	if (test_bit(STMMAC_DOWN, &priv->state))
+		return;
+
+	if (priv->plat->multi_msi_en) {
+		for (i = 0; i < priv->plat->rx_queues_to_use; i++)
+			stmmac_msi_intr_rx(0, &priv->rx_queue[i]);
+
+		for (i = 0; i < priv->plat->tx_queues_to_use; i++)
+			stmmac_msi_intr_tx(0, &priv->tx_queue[i]);
+	} else {
+		disable_irq(dev->irq);
+		stmmac_interrupt(dev->irq, dev);
+		enable_irq(dev->irq);
+	}
 }
 #endif
 
@@ -5283,6 +5624,12 @@ int stmmac_dvr_probe(struct device *device,
 	priv->dev->irq = res->irq;
 	priv->wol_irq = res->wol_irq;
 	priv->lpi_irq = res->lpi_irq;
+	priv->sfty_ce_irq = res->sfty_ce_irq;
+	priv->sfty_ue_irq = res->sfty_ue_irq;
+	for (i = 0; i < MTL_MAX_RX_QUEUES; i++)
+		priv->rx_irq[i] = res->rx_irq[i];
+	for (i = 0; i < MTL_MAX_TX_QUEUES; i++)
+		priv->tx_irq[i] = res->tx_irq[i];
 
 	if (!IS_ERR_OR_NULL(res->mac))
 		memcpy(priv->dev->dev_addr, res->mac, ETH_ALEN);
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index febdb43d27e5..afc12b9385db 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -237,5 +237,13 @@ struct plat_stmmacenet_data {
 	struct pci_dev *pdev;
 	bool has_crossts;
 	int int_snapshot_num;
+	bool multi_msi_en;
+	int msi_mac_vec;
+	int msi_wol_vec;
+	int msi_lpi_vec;
+	int msi_sfty_ce_vec;
+	int msi_sfty_ue_vec;
+	int msi_rx_base_vec;
+	int msi_tx_base_vec;
 };
 #endif
-- 
cgit v1.2.3


From 6ccf12ae111e49324b439410066e8cc359aeee6d Mon Sep 17 00:00:00 2001
From: "Wong, Vee Khee" <vee.khee.wong@intel.com>
Date: Fri, 26 Mar 2021 01:39:16 +0800
Subject: net: stmmac: use interrupt mode INTM=1 for multi-MSI

For interrupt mode INTM=0, a TX/RX transfer completion triggers a signal
not only on sbd_perch_[tx|rx]_intr_o (Transmit/Receive Per Channel) but
also on sbd_intr_o (Common).

For the multi-MSI implementation, setting interrupt mode INTM=1 is more
efficient, as each TX and RX interrupt (TI/RI) is handled by the TX/RX
ISR without the need to call the common MAC ISR.

Update the TX/RX NORMAL interrupt status checking, as the NIS status
bit is not asserted for any RI/TI events when INTM=1.

Signed-off-by: Wong, Vee Khee <vee.khee.wong@intel.com>
Co-developed-by: Voon Weifeng <weifeng.voon@intel.com>
Signed-off-by: Voon Weifeng <weifeng.voon@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c  |  7 +++++++
 drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h  |  3 +++
 drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c  | 23 +++++++++++------------
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c |  1 +
 include/linux/stmmac.h                            |  1 +
 5 files changed, 23 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
index 8954b85eb850..cb17f6c35e54 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
@@ -161,6 +161,13 @@ static void dwmac4_dma_init(void __iomem *ioaddr,
 		value |= DMA_SYS_BUS_EAME;
 
 	writel(value, ioaddr + DMA_SYS_BUS_MODE);
+
+	if (dma_cfg->multi_msi_en) {
+		value = readl(ioaddr + DMA_BUS_MODE);
+		value &= ~DMA_BUS_MODE_INTM_MASK;
+		value |= (DMA_BUS_MODE_INTM_MODE1 << DMA_BUS_MODE_INTM_SHIFT);
+		writel(value, ioaddr + DMA_BUS_MODE);
+	}
 }
 
 static void _dwmac4_dump_dma_regs(void __iomem *ioaddr, u32 channel,
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h
index 5c0c53832adb..05481eb13ba6 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h
@@ -25,6 +25,9 @@
 #define DMA_TBS_CTRL			0x00001050
 
 /* DMA Bus Mode bitmap */
+#define DMA_BUS_MODE_INTM_MASK		GENMASK(17, 16)
+#define DMA_BUS_MODE_INTM_SHIFT		16
+#define DMA_BUS_MODE_INTM_MODE1		0x1
 #define DMA_BUS_MODE_SFT_RESET		BIT(0)
 
 /* DMA SYS Bus Mode bitmap */
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c
index 3fa602dabf49..e63270267578 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c
@@ -166,20 +166,19 @@ int dwmac4_dma_interrupt(void __iomem *ioaddr,
 		}
 	}
 	/* TX/RX NORMAL interrupts */
-	if (likely(intr_status & DMA_CHAN_STATUS_NIS)) {
+	if (likely(intr_status & DMA_CHAN_STATUS_NIS))
 		x->normal_irq_n++;
-		if (likely(intr_status & DMA_CHAN_STATUS_RI)) {
-			x->rx_normal_irq_n++;
-			ret |= handle_rx;
-		}
-		if (likely(intr_status & (DMA_CHAN_STATUS_TI |
-					  DMA_CHAN_STATUS_TBU))) {
-			x->tx_normal_irq_n++;
-			ret |= handle_tx;
-		}
-		if (unlikely(intr_status & DMA_CHAN_STATUS_ERI))
-			x->rx_early_irq++;
+	if (likely(intr_status & DMA_CHAN_STATUS_RI)) {
+		x->rx_normal_irq_n++;
+		ret |= handle_rx;
+	}
+	if (likely(intr_status & (DMA_CHAN_STATUS_TI |
+		DMA_CHAN_STATUS_TBU))) {
+		x->tx_normal_irq_n++;
+		ret |= handle_tx;
 	}
+	if (unlikely(intr_status & DMA_CHAN_STATUS_ERI))
+		x->rx_early_irq++;
 
 	writel(intr_status & intr_en, ioaddr + DMA_CHAN_STATUS(chan));
 	return ret;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 459477db455c..f4fa5402cd64 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -5620,6 +5620,7 @@ int stmmac_dvr_probe(struct device *device,
 	priv->plat = plat_dat;
 	priv->ioaddr = res->addr;
 	priv->dev->base_addr = (unsigned long)res->addr;
+	priv->plat->dma_cfg->multi_msi_en = priv->plat->multi_msi_en;
 
 	priv->dev->irq = res->irq;
 	priv->wol_irq = res->wol_irq;
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index afc12b9385db..e338ef7abc00 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -96,6 +96,7 @@ struct stmmac_dma_cfg {
 	int mixed_burst;
 	bool aal;
 	bool eame;
+	bool multi_msi_en;
 };
 
 #define AXI_BLEN	7
-- 
cgit v1.2.3


From cb9444130662c6c13022579c861098f212db2562 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 25 Mar 2021 11:08:13 -0700
Subject: sysctl: add proc_dou8vec_minmax()

Networking has many sysctls that could fit in one u8.

This patch adds proc_dou8vec_minmax() for this purpose.

Note that the .extra1 and .extra2 fields point to integers, because
that makes conversions easier.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/proc/proc_sysctl.c  |  6 +++++
 include/linux/sysctl.h |  2 ++
 kernel/sysctl.c        | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 73 insertions(+)

(limited to 'include/linux')

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 984e42f8cb11..7256b8962e3c 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -1108,6 +1108,11 @@ static int sysctl_check_table_array(const char *path, struct ctl_table *table)
 			err |= sysctl_err(path, table, "array not allowed");
 	}
 
+	if (table->proc_handler == proc_dou8vec_minmax) {
+		if (table->maxlen != sizeof(u8))
+			err |= sysctl_err(path, table, "array not allowed");
+	}
+
 	return err;
 }
 
@@ -1123,6 +1128,7 @@ static int sysctl_check_table(const char *path, struct ctl_table *table)
 		    (table->proc_handler == proc_douintvec) ||
 		    (table->proc_handler == proc_douintvec_minmax) ||
 		    (table->proc_handler == proc_dointvec_minmax) ||
+		    (table->proc_handler == proc_dou8vec_minmax) ||
 		    (table->proc_handler == proc_dointvec_jiffies) ||
 		    (table->proc_handler == proc_dointvec_userhz_jiffies) ||
 		    (table->proc_handler == proc_dointvec_ms_jiffies) ||
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 51298a4f4623..d99ca99837de 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -53,6 +53,8 @@ int proc_douintvec(struct ctl_table *, int, void *, size_t *, loff_t *);
 int proc_dointvec_minmax(struct ctl_table *, int, void *, size_t *, loff_t *);
 int proc_douintvec_minmax(struct ctl_table *table, int write, void *buffer,
 		size_t *lenp, loff_t *ppos);
+int proc_dou8vec_minmax(struct ctl_table *table, int write, void *buffer,
+			size_t *lenp, loff_t *ppos);
 int proc_dointvec_jiffies(struct ctl_table *, int, void *, size_t *, loff_t *);
 int proc_dointvec_userhz_jiffies(struct ctl_table *, int, void *, size_t *,
 		loff_t *);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 62fbd09b5dc1..90d2892ef6a3 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1034,6 +1034,65 @@ int proc_douintvec_minmax(struct ctl_table *table, int write,
 				 do_proc_douintvec_minmax_conv, &param);
 }
 
+/**
+ * proc_dou8vec_minmax - read a vector of unsigned chars with min/max values
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(u8) unsigned chars
+ * values from/to the user buffer, treated as an ASCII string. Negative
+ * strings are not allowed.
+ *
+ * This routine will ensure the values are within the range specified by
+ * table->extra1 (min) and table->extra2 (max).
+ *
+ * Returns 0 on success or an error on write when the range check fails.
+ */
+int proc_dou8vec_minmax(struct ctl_table *table, int write,
+			void *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table tmp;
+	unsigned int min = 0, max = 255U, val;
+	u8 *data = table->data;
+	struct do_proc_douintvec_minmax_conv_param param = {
+		.min = &min,
+		.max = &max,
+	};
+	int res;
+
+	/* Do not support arrays yet. */
+	if (table->maxlen != sizeof(u8))
+		return -EINVAL;
+
+	if (table->extra1) {
+		min = *(unsigned int *) table->extra1;
+		if (min > 255U)
+			return -EINVAL;
+	}
+	if (table->extra2) {
+		max = *(unsigned int *) table->extra2;
+		if (max > 255U)
+			return -EINVAL;
+	}
+
+	tmp = *table;
+
+	tmp.maxlen = sizeof(val);
+	tmp.data = &val;
+	val = *data;
+	res = do_proc_douintvec(&tmp, write, buffer, lenp, ppos,
+				do_proc_douintvec_minmax_conv, &param);
+	if (res)
+		return res;
+	if (write)
+		*data = val;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(proc_dou8vec_minmax);
+
 static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
 					unsigned int *valp,
 					int write, void *data)
@@ -1582,6 +1641,12 @@ int proc_douintvec_minmax(struct ctl_table *table, int write,
 	return -ENOSYS;
 }
 
+int proc_dou8vec_minmax(struct ctl_table *table, int write,
+			void *buffer, size_t *lenp, loff_t *ppos)
+{
+	return -ENOSYS;
+}
+
 int proc_dointvec_jiffies(struct ctl_table *table, int write,
 		    void *buffer, size_t *lenp, loff_t *ppos)
 {
-- 
cgit v1.2.3


From b910eaaaa4b89976ef02e5d6448f3f73dc671d91 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Mon, 22 Mar 2021 22:51:46 -0700
Subject: bpf: Fix NULL pointer dereference in bpf_get_local_storage() helper

Jiri Olsa reported a bug ([1]) in the kernel where the cgroup local
storage pointer may be NULL in the bpf_get_local_storage() helper.
There are two issues uncovered by this bug:
  (1). kprobe or tracepoint prog incorrectly sets cgroup local storage
       before prog run,
  (2). due to change from preempt_disable to migrate_disable,
       preemption is possible and percpu storage might be overwritten
       by other tasks.

Issue (1) is fixed in [2]. This patch addresses issue (2).
The following shows how things can go wrong:
  task 1:   bpf_cgroup_storage_set() for percpu local storage
         preemption happens
  task 2:   bpf_cgroup_storage_set() for percpu local storage
         preemption happens
  task 1:   run bpf program

Task 1 will effectively use the percpu local storage set by task 2,
which will be either NULL or incorrect.

Instead of just one common local storage per cpu, this patch fixes
the issue by permitting 8 local storages per cpu, each identified by
a task_struct pointer. This way, we allow at most 8 nested preemptions
between bpf_cgroup_storage_set() and bpf_cgroup_storage_unset(). The
percpu local storage slot is released (by calling
bpf_cgroup_storage_unset()) by the same task after the bpf program has
finished running.
bpf_test_run() is also fixed to use the new bpf_cgroup_storage_set()
interface.

The patch was tested on top of [2] with the reproducer in [1].
Without this patch, the kernel emits an error within 2-3 minutes.
With this patch, there is still no error after one hour.

 [1] https://lore.kernel.org/bpf/CAKH8qBuXCfUz=w8L+Fj74OaUpbosO29niYwTki7e3Ag044_aww@mail.gmail.com/T
 [2] https://lore.kernel.org/bpf/20210309185028.3763817-1-yhs@fb.com

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Roman Gushchin <guro@fb.com>
Link: https://lore.kernel.org/bpf/20210323055146.3334476-1-yhs@fb.com
---
 include/linux/bpf-cgroup.h | 57 +++++++++++++++++++++++++++++++++++++++-------
 include/linux/bpf.h        | 22 ++++++++++++++----
 kernel/bpf/helpers.c       | 15 ++++++++----
 kernel/bpf/local_storage.c |  5 ++--
 net/bpf/test_run.c         |  6 ++++-
 5 files changed, 86 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index c42e02b4d84b..6a29fe11485d 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -20,14 +20,25 @@ struct bpf_sock_ops_kern;
 struct bpf_cgroup_storage;
 struct ctl_table;
 struct ctl_table_header;
+struct task_struct;
 
 #ifdef CONFIG_CGROUP_BPF
 
 extern struct static_key_false cgroup_bpf_enabled_key[MAX_BPF_ATTACH_TYPE];
 #define cgroup_bpf_enabled(type) static_branch_unlikely(&cgroup_bpf_enabled_key[type])
 
-DECLARE_PER_CPU(struct bpf_cgroup_storage*,
-		bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
+#define BPF_CGROUP_STORAGE_NEST_MAX	8
+
+struct bpf_cgroup_storage_info {
+	struct task_struct *task;
+	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE];
+};
+
+/* For each cpu, permit maximum BPF_CGROUP_STORAGE_NEST_MAX number of tasks
+ * to use bpf cgroup storage simultaneously.
+ */
+DECLARE_PER_CPU(struct bpf_cgroup_storage_info,
+		bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]);
 
 #define for_each_cgroup_storage_type(stype) \
 	for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++)
@@ -161,13 +172,42 @@ static inline enum bpf_cgroup_storage_type cgroup_storage_type(
 	return BPF_CGROUP_STORAGE_SHARED;
 }
 
-static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage
-					  *storage[MAX_BPF_CGROUP_STORAGE_TYPE])
+static inline int bpf_cgroup_storage_set(struct bpf_cgroup_storage
+					 *storage[MAX_BPF_CGROUP_STORAGE_TYPE])
 {
 	enum bpf_cgroup_storage_type stype;
+	int i, err = 0;
+
+	preempt_disable();
+	for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) {
+		if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != NULL))
+			continue;
+
+		this_cpu_write(bpf_cgroup_storage_info[i].task, current);
+		for_each_cgroup_storage_type(stype)
+			this_cpu_write(bpf_cgroup_storage_info[i].storage[stype],
+				       storage[stype]);
+		goto out;
+	}
+	err = -EBUSY;
+	WARN_ON_ONCE(1);
+
+out:
+	preempt_enable();
+	return err;
+}
+
+static inline void bpf_cgroup_storage_unset(void)
+{
+	int i;
+
+	for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) {
+		if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current))
+			continue;
 
-	for_each_cgroup_storage_type(stype)
-		this_cpu_write(bpf_cgroup_storage[stype], storage[stype]);
+		this_cpu_write(bpf_cgroup_storage_info[i].task, NULL);
+		return;
+	}
 }
 
 struct bpf_cgroup_storage *
@@ -448,8 +488,9 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr,
 	return -EINVAL;
 }
 
-static inline void bpf_cgroup_storage_set(
-	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) {}
+static inline int bpf_cgroup_storage_set(
+	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) { return 0; }
+static inline void bpf_cgroup_storage_unset(void) {}
 static inline int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux,
 					    struct bpf_map *map) { return 0; }
 static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 24678d6ecbcf..5a0801b420ca 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1106,6 +1106,13 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
 /* BPF program asks to set CN on the packet. */
 #define BPF_RET_SET_CN						(1 << 0)
 
+/* For BPF_PROG_RUN_ARRAY_FLAGS and __BPF_PROG_RUN_ARRAY,
+ * if bpf_cgroup_storage_set() failed, the rest of programs
+ * will not execute. This should be a really rare scenario
+ * as it requires BPF_CGROUP_STORAGE_NEST_MAX number of
+ * preemptions all between bpf_cgroup_storage_set() and
+ * bpf_cgroup_storage_unset() on the same cpu.
+ */
 #define BPF_PROG_RUN_ARRAY_FLAGS(array, ctx, func, ret_flags)		\
 	({								\
 		struct bpf_prog_array_item *_item;			\
@@ -1118,10 +1125,12 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
 		_array = rcu_dereference(array);			\
 		_item = &_array->items[0];				\
 		while ((_prog = READ_ONCE(_item->prog))) {		\
-			bpf_cgroup_storage_set(_item->cgroup_storage);	\
+			if (unlikely(bpf_cgroup_storage_set(_item->cgroup_storage)))	\
+				break;					\
 			func_ret = func(_prog, ctx);			\
 			_ret &= (func_ret & 1);				\
 			*(ret_flags) |= (func_ret >> 1);			\
+			bpf_cgroup_storage_unset();			\
 			_item++;					\
 		}							\
 		rcu_read_unlock();					\
@@ -1142,9 +1151,14 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
 			goto _out;			\
 		_item = &_array->items[0];		\
 		while ((_prog = READ_ONCE(_item->prog))) {		\
-			if (set_cg_storage)		\
-				bpf_cgroup_storage_set(_item->cgroup_storage);	\
-			_ret &= func(_prog, ctx);	\
+			if (!set_cg_storage) {			\
+				_ret &= func(_prog, ctx);	\
+			} else {				\
+				if (unlikely(bpf_cgroup_storage_set(_item->cgroup_storage)))	\
+					break;			\
+				_ret &= func(_prog, ctx);	\
+				bpf_cgroup_storage_unset();	\
+			}				\
 			_item++;			\
 		}					\
 _out:							\
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 074800226327..f306611c4ddf 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -382,8 +382,8 @@ const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = {
 };
 
 #ifdef CONFIG_CGROUP_BPF
-DECLARE_PER_CPU(struct bpf_cgroup_storage*,
-		bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
+DECLARE_PER_CPU(struct bpf_cgroup_storage_info,
+		bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]);
 
 BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
 {
@@ -392,10 +392,17 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
 	 * verifier checks that its value is correct.
 	 */
 	enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
-	struct bpf_cgroup_storage *storage;
+	struct bpf_cgroup_storage *storage = NULL;
 	void *ptr;
+	int i;
 
-	storage = this_cpu_read(bpf_cgroup_storage[stype]);
+	for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) {
+		if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current))
+			continue;
+
+		storage = this_cpu_read(bpf_cgroup_storage_info[i].storage[stype]);
+		break;
+	}
 
 	if (stype == BPF_CGROUP_STORAGE_SHARED)
 		ptr = &READ_ONCE(storage->buf)->data[0];
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 2d4f9ac12377..bd11db9774c3 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -9,10 +9,11 @@
 #include <linux/slab.h>
 #include <uapi/linux/btf.h>
 
-DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
-
 #ifdef CONFIG_CGROUP_BPF
 
+DEFINE_PER_CPU(struct bpf_cgroup_storage_info,
+	       bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]);
+
 #include "../cgroup/cgroup-internal.h"
 
 #define LOCAL_STORAGE_CREATE_FLAG_MASK					\
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 0abdd67f44b1..4aabf71cd95d 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -106,12 +106,16 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
 
 	bpf_test_timer_enter(&t);
 	do {
-		bpf_cgroup_storage_set(storage);
+		ret = bpf_cgroup_storage_set(storage);
+		if (ret)
+			break;
 
 		if (xdp)
 			*retval = bpf_prog_run_xdp(prog, ctx);
 		else
 			*retval = BPF_PROG_RUN(prog, ctx);
+
+		bpf_cgroup_storage_unset();
 	} while (bpf_test_timer_continue(&t, repeat, &ret, time));
 	bpf_test_timer_leave(&t);
 
-- 
cgit v1.2.3


From 7547738d28dd572d40e0e1c1f854c80e3cb41bec Mon Sep 17 00:00:00 2001
From: Stefan Berger <stefanb@linux.ibm.com>
Date: Tue, 16 Mar 2021 17:07:31 -0400
Subject: oid_registry: Add OIDs for ECDSA with SHA224/256/384/512

Add OIDs for ECDSA with SHA224/256/384/512.

Signed-off-by: Stefan Berger <stefanb@linux.ibm.com>
Acked-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/oid_registry.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/oid_registry.h b/include/linux/oid_registry.h
index 4462ed2c18cd..b504e2f36b25 100644
--- a/include/linux/oid_registry.h
+++ b/include/linux/oid_registry.h
@@ -19,8 +19,12 @@
 enum OID {
 	OID_id_dsa_with_sha1,		/* 1.2.840.10030.4.3 */
 	OID_id_dsa,			/* 1.2.840.10040.4.1 */
-	OID_id_ecdsa_with_sha1,		/* 1.2.840.10045.4.1 */
 	OID_id_ecPublicKey,		/* 1.2.840.10045.2.1 */
+	OID_id_ecdsa_with_sha1,		/* 1.2.840.10045.4.1 */
+	OID_id_ecdsa_with_sha224,	/* 1.2.840.10045.4.3.1 */
+	OID_id_ecdsa_with_sha256,	/* 1.2.840.10045.4.3.2 */
+	OID_id_ecdsa_with_sha384,	/* 1.2.840.10045.4.3.3 */
+	OID_id_ecdsa_with_sha512,	/* 1.2.840.10045.4.3.4 */
 
 	/* PKCS#1 {iso(1) member-body(2) us(840) rsadsi(113549) pkcs(1) pkcs-1(1)} */
 	OID_rsaEncryption,		/* 1.2.840.113549.1.1.1 */
-- 
cgit v1.2.3


From d1a303e8616c5ba1260722bb9068bbc0d1704847 Mon Sep 17 00:00:00 2001
From: Stefan Berger <stefanb@linux.ibm.com>
Date: Tue, 16 Mar 2021 17:07:36 -0400
Subject: x509: Detect sm2 keys by their parameters OID

Detect whether a key is an sm2 type of key by its OID in the parameters
array rather than assuming that everything under OID_id_ecPublicKey
is sm2, which is not the case.

Cc: David Howells <dhowells@redhat.com>
Cc: keyrings@vger.kernel.org
Signed-off-by: Stefan Berger <stefanb@linux.ibm.com>
Reviewed-by: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
Tested-by: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/asymmetric_keys/x509_cert_parser.c | 12 +++++++++++-
 include/linux/oid_registry.h              |  1 +
 lib/oid_registry.c                        | 24 ++++++++++++++++++++++++
 3 files changed, 36 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/crypto/asymmetric_keys/x509_cert_parser.c b/crypto/asymmetric_keys/x509_cert_parser.c
index 52c9b455fc7d..1621ceaf5c95 100644
--- a/crypto/asymmetric_keys/x509_cert_parser.c
+++ b/crypto/asymmetric_keys/x509_cert_parser.c
@@ -459,6 +459,7 @@ int x509_extract_key_data(void *context, size_t hdrlen,
 			  const void *value, size_t vlen)
 {
 	struct x509_parse_context *ctx = context;
+	enum OID oid;
 
 	ctx->key_algo = ctx->last_oid;
 	switch (ctx->last_oid) {
@@ -470,7 +471,16 @@ int x509_extract_key_data(void *context, size_t hdrlen,
 		ctx->cert->pub->pkey_algo = "ecrdsa";
 		break;
 	case OID_id_ecPublicKey:
-		ctx->cert->pub->pkey_algo = "sm2";
+		if (parse_OID(ctx->params, ctx->params_size, &oid) != 0)
+			return -EBADMSG;
+
+		switch (oid) {
+		case OID_sm2:
+			ctx->cert->pub->pkey_algo = "sm2";
+			break;
+		default:
+			return -ENOPKG;
+		}
 		break;
 	default:
 		return -ENOPKG;
diff --git a/include/linux/oid_registry.h b/include/linux/oid_registry.h
index b504e2f36b25..f32d91895e4d 100644
--- a/include/linux/oid_registry.h
+++ b/include/linux/oid_registry.h
@@ -121,6 +121,7 @@ enum OID {
 };
 
 extern enum OID look_up_OID(const void *data, size_t datasize);
+extern int parse_OID(const void *data, size_t datasize, enum OID *oid);
 extern int sprint_oid(const void *, size_t, char *, size_t);
 extern int sprint_OID(enum OID, char *, size_t);
 
diff --git a/lib/oid_registry.c b/lib/oid_registry.c
index f7ad43f28579..3dfaa836e7c5 100644
--- a/lib/oid_registry.c
+++ b/lib/oid_registry.c
@@ -11,6 +11,7 @@
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/bug.h>
+#include <linux/asn1.h>
 #include "oid_registry_data.c"
 
 MODULE_DESCRIPTION("OID Registry");
@@ -92,6 +93,29 @@ enum OID look_up_OID(const void *data, size_t datasize)
 }
 EXPORT_SYMBOL_GPL(look_up_OID);
 
+/**
+ * parse_OID - Parse an OID from a bytestream
+ * @data: Binary representation of the header + OID
+ * @datasize: Size of the binary representation
+ * @oid: Pointer to oid to return result
+ *
+ * Parse an OID from a bytestream that holds the OID in the format
+ * ASN1_OID | length | oid. The length indicator must equal to datasize - 2.
+ * -EBADMSG is returned if the bytestream is too short.
+ */
+int parse_OID(const void *data, size_t datasize, enum OID *oid)
+{
+	const unsigned char *v = data;
+
+	/* we need 2 bytes of header and at least 1 byte for oid */
+	if (datasize < 3 || v[0] != ASN1_OID || v[1] != datasize - 2)
+		return -EBADMSG;
+
+	*oid = look_up_OID(data + 2, datasize - 2);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(parse_OID);
+
 /*
  * sprint_OID - Print an Object Identifier into a buffer
  * @data: The encoded OID to print
-- 
cgit v1.2.3


From 299f561a66939debba70e6d7c67aa01ed32613d9 Mon Sep 17 00:00:00 2001
From: Stefan Berger <stefanb@linux.ibm.com>
Date: Tue, 16 Mar 2021 17:07:37 -0400
Subject: x509: Add support for parsing x509 certs with ECDSA keys

Add support for parsing of x509 certificates that contain ECDSA keys,
such as NIST P256, that have been signed by a CA using any of the
current SHA hash algorithms.

Cc: David Howells <dhowells@redhat.com>
Cc: keyrings@vger.kernel.org
Signed-off-by: Stefan Berger <stefanb@linux.ibm.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/asymmetric_keys/public_key.c       |  4 +++-
 crypto/asymmetric_keys/x509_cert_parser.c | 34 ++++++++++++++++++++++++++++++-
 crypto/asymmetric_keys/x509_public_key.c  |  4 +++-
 include/linux/oid_registry.h              |  2 ++
 4 files changed, 41 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/crypto/asymmetric_keys/public_key.c b/crypto/asymmetric_keys/public_key.c
index 788a4ba1e2e7..4fefb219bfdc 100644
--- a/crypto/asymmetric_keys/public_key.c
+++ b/crypto/asymmetric_keys/public_key.c
@@ -14,6 +14,7 @@
 #include <linux/slab.h>
 #include <linux/seq_file.h>
 #include <linux/scatterlist.h>
+#include <linux/asn1.h>
 #include <keys/asymmetric-subtype.h>
 #include <crypto/public_key.h>
 #include <crypto/akcipher.h>
@@ -85,7 +86,8 @@ int software_key_determine_akcipher(const char *encoding,
 		return n >= CRYPTO_MAX_ALG_NAME ? -EINVAL : 0;
 	}
 
-	if (strcmp(encoding, "raw") == 0) {
+	if (strcmp(encoding, "raw") == 0 ||
+	    strcmp(encoding, "x962") == 0) {
 		strcpy(alg_name, pkey->pkey_algo);
 		return 0;
 	}
diff --git a/crypto/asymmetric_keys/x509_cert_parser.c b/crypto/asymmetric_keys/x509_cert_parser.c
index 1621ceaf5c95..f5d547c6dfb5 100644
--- a/crypto/asymmetric_keys/x509_cert_parser.c
+++ b/crypto/asymmetric_keys/x509_cert_parser.c
@@ -227,6 +227,26 @@ int x509_note_pkey_algo(void *context, size_t hdrlen,
 		ctx->cert->sig->hash_algo = "sha224";
 		goto rsa_pkcs1;
 
+	case OID_id_ecdsa_with_sha1:
+		ctx->cert->sig->hash_algo = "sha1";
+		goto ecdsa;
+
+	case OID_id_ecdsa_with_sha224:
+		ctx->cert->sig->hash_algo = "sha224";
+		goto ecdsa;
+
+	case OID_id_ecdsa_with_sha256:
+		ctx->cert->sig->hash_algo = "sha256";
+		goto ecdsa;
+
+	case OID_id_ecdsa_with_sha384:
+		ctx->cert->sig->hash_algo = "sha384";
+		goto ecdsa;
+
+	case OID_id_ecdsa_with_sha512:
+		ctx->cert->sig->hash_algo = "sha512";
+		goto ecdsa;
+
 	case OID_gost2012Signature256:
 		ctx->cert->sig->hash_algo = "streebog256";
 		goto ecrdsa;
@@ -255,6 +275,11 @@ sm2:
 	ctx->cert->sig->encoding = "raw";
 	ctx->algo_oid = ctx->last_oid;
 	return 0;
+ecdsa:
+	ctx->cert->sig->pkey_algo = "ecdsa";
+	ctx->cert->sig->encoding = "x962";
+	ctx->algo_oid = ctx->last_oid;
+	return 0;
 }
 
 /*
@@ -276,7 +301,8 @@ int x509_note_signature(void *context, size_t hdrlen,
 
 	if (strcmp(ctx->cert->sig->pkey_algo, "rsa") == 0 ||
 	    strcmp(ctx->cert->sig->pkey_algo, "ecrdsa") == 0 ||
-	    strcmp(ctx->cert->sig->pkey_algo, "sm2") == 0) {
+	    strcmp(ctx->cert->sig->pkey_algo, "sm2") == 0 ||
+	    strcmp(ctx->cert->sig->pkey_algo, "ecdsa") == 0) {
 		/* Discard the BIT STRING metadata */
 		if (vlen < 1 || *(const u8 *)value != 0)
 			return -EBADMSG;
@@ -478,6 +504,12 @@ int x509_extract_key_data(void *context, size_t hdrlen,
 		case OID_sm2:
 			ctx->cert->pub->pkey_algo = "sm2";
 			break;
+		case OID_id_prime192v1:
+			ctx->cert->pub->pkey_algo = "ecdsa-nist-p192";
+			break;
+		case OID_id_prime256v1:
+			ctx->cert->pub->pkey_algo = "ecdsa-nist-p256";
+			break;
 		default:
 			return -ENOPKG;
 		}
diff --git a/crypto/asymmetric_keys/x509_public_key.c b/crypto/asymmetric_keys/x509_public_key.c
index ae450eb8be14..3d45161b271a 100644
--- a/crypto/asymmetric_keys/x509_public_key.c
+++ b/crypto/asymmetric_keys/x509_public_key.c
@@ -129,7 +129,9 @@ int x509_check_for_self_signed(struct x509_certificate *cert)
 	}
 
 	ret = -EKEYREJECTED;
-	if (strcmp(cert->pub->pkey_algo, cert->sig->pkey_algo) != 0)
+	if (strcmp(cert->pub->pkey_algo, cert->sig->pkey_algo) != 0 &&
+	    (strncmp(cert->pub->pkey_algo, "ecdsa-", 6) != 0 ||
+	     strcmp(cert->sig->pkey_algo, "ecdsa") != 0))
 		goto out;
 
 	ret = public_key_verify_signature(cert->pub, cert->sig);
diff --git a/include/linux/oid_registry.h b/include/linux/oid_registry.h
index f32d91895e4d..3583908cf1ca 100644
--- a/include/linux/oid_registry.h
+++ b/include/linux/oid_registry.h
@@ -20,6 +20,8 @@ enum OID {
 	OID_id_dsa_with_sha1,		/* 1.2.840.10030.4.3 */
 	OID_id_dsa,			/* 1.2.840.10040.4.1 */
 	OID_id_ecPublicKey,		/* 1.2.840.10045.2.1 */
+	OID_id_prime192v1,		/* 1.2.840.10045.3.1.1 */
+	OID_id_prime256v1,		/* 1.2.840.10045.3.1.7 */
 	OID_id_ecdsa_with_sha1,		/* 1.2.840.10045.4.1 */
 	OID_id_ecdsa_with_sha224,	/* 1.2.840.10045.4.3.1 */
 	OID_id_ecdsa_with_sha256,	/* 1.2.840.10045.4.3.2 */
-- 
cgit v1.2.3


From 2a8e615436de4cd59a7b0af43590ede899906bdf Mon Sep 17 00:00:00 2001
From: Saulo Alessandre <saulo.alessandre@tse.jus.br>
Date: Tue, 16 Mar 2021 17:07:39 -0400
Subject: x509: Add OID for NIST P384 and extend parser for it

Prepare the x509 parser to accept NIST P384 certificates and add the
OID for ansip384r1, which is the identifier for NIST P384.

Summary of changes:

* crypto/asymmetric_keys/x509_cert_parser.c
  - prepare x509 parser to load NIST P384

* include/linux/oid_registry.h
  - add OID_ansip384r1

Signed-off-by: Saulo Alessandre <saulo.alessandre@tse.jus.br>
Tested-by: Stefan Berger <stefanb@linux.ibm.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/asymmetric_keys/x509_cert_parser.c | 3 +++
 include/linux/oid_registry.h              | 1 +
 2 files changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/crypto/asymmetric_keys/x509_cert_parser.c b/crypto/asymmetric_keys/x509_cert_parser.c
index f5d547c6dfb5..6d003096b5bc 100644
--- a/crypto/asymmetric_keys/x509_cert_parser.c
+++ b/crypto/asymmetric_keys/x509_cert_parser.c
@@ -510,6 +510,9 @@ int x509_extract_key_data(void *context, size_t hdrlen,
 		case OID_id_prime256v1:
 			ctx->cert->pub->pkey_algo = "ecdsa-nist-p256";
 			break;
+		case OID_id_ansip384r1:
+			ctx->cert->pub->pkey_algo = "ecdsa-nist-p384";
+			break;
 		default:
 			return -ENOPKG;
 		}
diff --git a/include/linux/oid_registry.h b/include/linux/oid_registry.h
index 3583908cf1ca..cc64d9419746 100644
--- a/include/linux/oid_registry.h
+++ b/include/linux/oid_registry.h
@@ -64,6 +64,7 @@ enum OID {
 
 	OID_certAuthInfoAccess,		/* 1.3.6.1.5.5.7.1.1 */
 	OID_sha1,			/* 1.3.14.3.2.26 */
+	OID_id_ansip384r1,		/* 1.3.132.0.34 */
 	OID_sha256,			/* 2.16.840.1.101.3.4.2.1 */
 	OID_sha384,			/* 2.16.840.1.101.3.4.2.2 */
 	OID_sha512,			/* 2.16.840.1.101.3.4.2.3 */
-- 
cgit v1.2.3


From 67196fea0fcef92b25608882f62f3985bc59f1fe Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 9 Mar 2021 11:37:31 +0200
Subject: irqdomain: Introduce irq_domain_create_simple() API

Linus Walleij pointed out that irq_domain_add_simple() gained
additional functionality and can no longer be replaced with
a simple conditional. In preparation for upgrading the GPIO library
to use fwnode, introduce the irq_domain_create_simple() API, which is
functionally equivalent to the existing irq_domain_add_simple(),
but takes a pointer to the struct fwnode_handle as a parameter.

While at it, amend documentation to mention irq_domain_create_*()
functions where it makes sense.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Bartosz Golaszewski <bgolaszewski@baylibre.com>
---
 Documentation/core-api/irq/irq-domain.rst | 22 ++++++++++++----------
 include/linux/irqdomain.h                 | 19 ++++++++++++++-----
 kernel/irq/irqdomain.c                    | 20 ++++++++++----------
 3 files changed, 36 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/core-api/irq/irq-domain.rst b/Documentation/core-api/irq/irq-domain.rst
index a77c24c27f7b..8214e215a8bf 100644
--- a/Documentation/core-api/irq/irq-domain.rst
+++ b/Documentation/core-api/irq/irq-domain.rst
@@ -42,10 +42,10 @@ irq_domain usage
 ================
 
 An interrupt controller driver creates and registers an irq_domain by
-calling one of the irq_domain_add_*() functions (each mapping method
-has a different allocator function, more on that later).  The function
-will return a pointer to the irq_domain on success.  The caller must
-provide the allocator function with an irq_domain_ops structure.
+calling one of the irq_domain_add_*() or irq_domain_create_*() functions
+(each mapping method has a different allocator function, more on that later).
+The function will return a pointer to the irq_domain on success. The caller
+must provide the allocator function with an irq_domain_ops structure.
 
 In most cases, the irq_domain will begin empty without any mappings
 between hwirq and IRQ numbers.  Mappings are added to the irq_domain
@@ -147,6 +147,7 @@ Legacy
 	irq_domain_add_simple()
 	irq_domain_add_legacy()
 	irq_domain_add_legacy_isa()
+	irq_domain_create_simple()
 	irq_domain_create_legacy()
 
 The Legacy mapping is a special case for drivers that already have a
@@ -169,13 +170,13 @@ supported.  For example, ISA controllers would use the legacy map for
 mapping Linux IRQs 0-15 so that existing ISA drivers get the correct IRQ
 numbers.
 
-Most users of legacy mappings should use irq_domain_add_simple() which
-will use a legacy domain only if an IRQ range is supplied by the
-system and will otherwise use a linear domain mapping. The semantics
-of this call are such that if an IRQ range is specified then
+Most users of legacy mappings should use irq_domain_add_simple() or
+irq_domain_create_simple() which will use a legacy domain only if an IRQ range
+is supplied by the system and will otherwise use a linear domain mapping.
+The semantics of this call are such that if an IRQ range is specified then
 descriptors will be allocated on-the-fly for it, and if no range is
-specified it will fall through to irq_domain_add_linear() which means
-*no* irq descriptors will be allocated.
+specified it will fall through to irq_domain_add_linear() or
+irq_domain_create_linear() which means *no* irq descriptors will be allocated.
 
 A typical use case for simple domains is where an irqchip provider
 is supporting both dynamic and static IRQ assignments.
@@ -186,6 +187,7 @@ that the driver using the simple domain call irq_create_mapping()
 before any irq_find_mapping() since the latter will actually work
 for the static IRQ assignment case.
 
+irq_domain_add_simple() and irq_domain_create_simple() as well as
 irq_domain_add_legacy() and irq_domain_create_legacy() are functionally
 equivalent, except for the first argument is different - the former
 accepts an Open Firmware specific 'struct device_node', while the latter
diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 33cacc8af26d..1ad8d5328715 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -256,11 +256,11 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
 				    irq_hw_number_t hwirq_max, int direct_max,
 				    const struct irq_domain_ops *ops,
 				    void *host_data);
-struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
-					 unsigned int size,
-					 unsigned int first_irq,
-					 const struct irq_domain_ops *ops,
-					 void *host_data);
+struct irq_domain *irq_domain_create_simple(struct fwnode_handle *fwnode,
+					    unsigned int size,
+					    unsigned int first_irq,
+					    const struct irq_domain_ops *ops,
+					    void *host_data);
 struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
 					 unsigned int size,
 					 unsigned int first_irq,
@@ -325,6 +325,15 @@ static inline struct irq_domain *irq_find_host(struct device_node *node)
 	return d;
 }
 
+static inline struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
+						       unsigned int size,
+						       unsigned int first_irq,
+						       const struct irq_domain_ops *ops,
+						       void *host_data)
+{
+	return irq_domain_create_simple(of_node_to_fwnode(of_node), size, first_irq, ops, host_data);
+}
+
 /**
  * irq_domain_add_linear() - Allocate and register a linear revmap irq_domain.
  * @of_node: pointer to interrupt controller's device tree node.
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index d10ab1d689d5..2681dc43813c 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -295,8 +295,8 @@ void irq_domain_update_bus_token(struct irq_domain *domain,
 EXPORT_SYMBOL_GPL(irq_domain_update_bus_token);
 
 /**
- * irq_domain_add_simple() - Register an irq_domain and optionally map a range of irqs
- * @of_node: pointer to interrupt controller's device tree node.
+ * irq_domain_create_simple() - Register an irq_domain and optionally map a range of irqs
+ * @fwnode: firmware node for the interrupt controller
  * @size: total number of irqs in mapping
  * @first_irq: first number of irq block assigned to the domain,
  *	pass zero to assign irqs on-the-fly. If first_irq is non-zero, then
@@ -312,15 +312,15 @@ EXPORT_SYMBOL_GPL(irq_domain_update_bus_token);
  * irqs get mapped dynamically on the fly. However, if the controller requires
  * static virq assignments (non-DT boot) then it will set that up correctly.
  */
-struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
-					 unsigned int size,
-					 unsigned int first_irq,
-					 const struct irq_domain_ops *ops,
-					 void *host_data)
+struct irq_domain *irq_domain_create_simple(struct fwnode_handle *fwnode,
+					    unsigned int size,
+					    unsigned int first_irq,
+					    const struct irq_domain_ops *ops,
+					    void *host_data)
 {
 	struct irq_domain *domain;
 
-	domain = __irq_domain_add(of_node_to_fwnode(of_node), size, size, 0, ops, host_data);
+	domain = __irq_domain_add(fwnode, size, size, 0, ops, host_data);
 	if (!domain)
 		return NULL;
 
@@ -328,7 +328,7 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
 		if (IS_ENABLED(CONFIG_SPARSE_IRQ)) {
 			/* attempt to allocated irq_descs */
 			int rc = irq_alloc_descs(first_irq, first_irq, size,
-						 of_node_to_nid(of_node));
+						 of_node_to_nid(to_of_node(fwnode)));
 			if (rc < 0)
 				pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n",
 					first_irq);
@@ -338,7 +338,7 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
 
 	return domain;
 }
-EXPORT_SYMBOL_GPL(irq_domain_add_simple);
+EXPORT_SYMBOL_GPL(irq_domain_create_simple);
 
 /**
  * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain.
-- 
cgit v1.2.3


From 2d93018fe67d42c44d65a898da2a6a5a0209b9ee Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Tue, 23 Mar 2021 15:19:05 -0700
Subject: gpiolib: some edits of kernel docs for clarity

Fix a few typos and some punctuation. Also, change CONFIG_OF to
CONFIG_OF_GPIO in one comment.

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Linus Walleij <linus.walleij@linaro.org>
Cc: Bartosz Golaszewski <bgolaszewski@baylibre.com>
Cc: linux-gpio@vger.kernel.org
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
[Bartosz: tweaked the commit message]
Signed-off-by: Bartosz Golaszewski <bgolaszewski@baylibre.com>
---
 include/linux/gpio/driver.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index 286de0520574..63283de9daeb 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -227,7 +227,7 @@ struct gpio_irq_chip {
 	/**
 	 * @valid_mask:
 	 *
-	 * If not %NULL holds bitmask of GPIOs which are valid to be included
+	 * If not %NULL, holds bitmask of GPIOs which are valid to be included
 	 * in IRQ domain of the chip.
 	 */
 	unsigned long *valid_mask;
@@ -346,7 +346,7 @@ struct gpio_irq_chip {
  *	output.
  *
  * A gpio_chip can help platforms abstract various sources of GPIOs so
- * they can all be accessed through a common programing interface.
+ * they can all be accessed through a common programming interface.
  * Example sources would be SOC controllers, FPGAs, multifunction
  * chips, dedicated GPIO expanders, and so on.
  *
@@ -435,15 +435,15 @@ struct gpio_chip {
 	/**
 	 * @valid_mask:
 	 *
-	 * If not %NULL holds bitmask of GPIOs which are valid to be used
+	 * If not %NULL, holds bitmask of GPIOs which are valid to be used
 	 * from the chip.
 	 */
 	unsigned long *valid_mask;
 
 #if defined(CONFIG_OF_GPIO)
 	/*
-	 * If CONFIG_OF is enabled, then all GPIO controllers described in the
-	 * device tree automatically may have an OF translation
+	 * If CONFIG_OF_GPIO is enabled, then all GPIO controllers described in
+	 * the device tree automatically may have an OF translation
 	 */
 
 	/**
@@ -508,7 +508,7 @@ extern int gpiochip_add_data_with_key(struct gpio_chip *gc, void *data,
  * for GPIOs will fail rudely.
  *
  * gpiochip_add_data() must only be called after gpiolib initialization,
- * ie after core_initcall().
+ * i.e. after core_initcall().
  *
  * If gc->base is negative, this requests dynamic assignment of
  * a range of valid GPIOs.
-- 
cgit v1.2.3


From 29d96eb261345c8d888e248ae79484e681be2faa Mon Sep 17 00:00:00 2001
From: Roja Rani Yarubandi <rojay@codeaurora.org>
Date: Wed, 24 Mar 2021 15:48:35 +0530
Subject: soc: qcom-geni-se: Cleanup the code to remove proxy votes

This reverts commit 048eb908a1f2 ("soc: qcom-geni-se: Add interconnect
support to fix earlycon crash")

The ICC core and platform drivers support the sync_state feature, which
ensures that the default ICC BW votes from the bootloader are not
removed until all of its consumers have probed.

The proxy votes were needed in case other QUP child drivers
(I2C, SPI) probe before UART: they can turn off the QUP-CORE clock,
which is a shared resource for all QUP drivers, and this causes
unclocked access to HW from earlycon.

Given the above support from ICC, there is no longer any need to
maintain proxy votes on the QUP-CORE ICC node from the QUP wrapper
driver for the early console use case; the default votes won't be
removed until the real console is probed.

Cc: stable@vger.kernel.org
Fixes: 266cd33b5913 ("interconnect: qcom: Ensure that the floor bandwidth value is enforced")
Fixes: 7d3b0b0d8184 ("interconnect: qcom: Use icc_sync_state")
Signed-off-by: Roja Rani Yarubandi <rojay@codeaurora.org>
Signed-off-by: Akash Asthana <akashast@codeaurora.org>
Reviewed-by: Matthias Kaehlcke <mka@chromium.org>
Link: https://lore.kernel.org/r/20210324101836.25272-2-rojay@codeaurora.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/soc/qcom/qcom-geni-se.c       | 74 -----------------------------------
 drivers/tty/serial/qcom_geni_serial.c |  7 ----
 include/linux/qcom-geni-se.h          |  2 -
 3 files changed, 83 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/soc/qcom/qcom-geni-se.c b/drivers/soc/qcom/qcom-geni-se.c
index f42954e2c98e..1fd29f93ff6d 100644
--- a/drivers/soc/qcom/qcom-geni-se.c
+++ b/drivers/soc/qcom/qcom-geni-se.c
@@ -3,7 +3,6 @@
 
 #include <linux/acpi.h>
 #include <linux/clk.h>
-#include <linux/console.h>
 #include <linux/slab.h>
 #include <linux/dma-mapping.h>
 #include <linux/io.h>
@@ -92,14 +91,11 @@ struct geni_wrapper {
 	struct device *dev;
 	void __iomem *base;
 	struct clk_bulk_data ahb_clks[NUM_AHB_CLKS];
-	struct geni_icc_path to_core;
 };
 
 static const char * const icc_path_names[] = {"qup-core", "qup-config",
 						"qup-memory"};
 
-static struct geni_wrapper *earlycon_wrapper;
-
 #define QUP_HW_VER_REG			0x4
 
 /* Common SE registers */
@@ -843,44 +839,11 @@ int geni_icc_disable(struct geni_se *se)
 }
 EXPORT_SYMBOL(geni_icc_disable);
 
-void geni_remove_earlycon_icc_vote(void)
-{
-	struct platform_device *pdev;
-	struct geni_wrapper *wrapper;
-	struct device_node *parent;
-	struct device_node *child;
-
-	if (!earlycon_wrapper)
-		return;
-
-	wrapper = earlycon_wrapper;
-	parent = of_get_next_parent(wrapper->dev->of_node);
-	for_each_child_of_node(parent, child) {
-		if (!of_device_is_compatible(child, "qcom,geni-se-qup"))
-			continue;
-
-		pdev = of_find_device_by_node(child);
-		if (!pdev)
-			continue;
-
-		wrapper = platform_get_drvdata(pdev);
-		icc_put(wrapper->to_core.path);
-		wrapper->to_core.path = NULL;
-
-	}
-	of_node_put(parent);
-
-	earlycon_wrapper = NULL;
-}
-EXPORT_SYMBOL(geni_remove_earlycon_icc_vote);
-
 static int geni_se_probe(struct platform_device *pdev)
 {
 	struct device *dev = &pdev->dev;
 	struct resource *res;
 	struct geni_wrapper *wrapper;
-	struct console __maybe_unused *bcon;
-	bool __maybe_unused has_earlycon = false;
 	int ret;
 
 	wrapper = devm_kzalloc(dev, sizeof(*wrapper), GFP_KERNEL);
@@ -903,43 +866,6 @@ static int geni_se_probe(struct platform_device *pdev)
 		}
 	}
 
-#ifdef CONFIG_SERIAL_EARLYCON
-	for_each_console(bcon) {
-		if (!strcmp(bcon->name, "qcom_geni")) {
-			has_earlycon = true;
-			break;
-		}
-	}
-	if (!has_earlycon)
-		goto exit;
-
-	wrapper->to_core.path = devm_of_icc_get(dev, "qup-core");
-	if (IS_ERR(wrapper->to_core.path))
-		return PTR_ERR(wrapper->to_core.path);
-	/*
-	 * Put minmal BW request on core clocks on behalf of early console.
-	 * The vote will be removed earlycon exit function.
-	 *
-	 * Note: We are putting vote on each QUP wrapper instead only to which
-	 * earlycon is connected because QUP core clock of different wrapper
-	 * share same voltage domain. If core1 is put to 0, then core2 will
-	 * also run at 0, if not voted. Default ICC vote will be removed ASA
-	 * we touch any of the core clock.
-	 * core1 = core2 = max(core1, core2)
-	 */
-	ret = icc_set_bw(wrapper->to_core.path, GENI_DEFAULT_BW,
-				GENI_DEFAULT_BW);
-	if (ret) {
-		dev_err(&pdev->dev, "%s: ICC BW voting failed for core: %d\n",
-			__func__, ret);
-		return ret;
-	}
-
-	if (of_get_compatible_child(pdev->dev.of_node, "qcom,geni-debug-uart"))
-		earlycon_wrapper = wrapper;
-	of_node_put(pdev->dev.of_node);
-exit:
-#endif
 	dev_set_drvdata(dev, wrapper);
 	dev_dbg(dev, "GENI SE Driver probed\n");
 	return devm_of_platform_populate(dev);
diff --git a/drivers/tty/serial/qcom_geni_serial.c b/drivers/tty/serial/qcom_geni_serial.c
index 291649f02821..0d85b55ea823 100644
--- a/drivers/tty/serial/qcom_geni_serial.c
+++ b/drivers/tty/serial/qcom_geni_serial.c
@@ -1177,12 +1177,6 @@ static inline void qcom_geni_serial_enable_early_read(struct geni_se *se,
 						      struct console *con) { }
 #endif
 
-static int qcom_geni_serial_earlycon_exit(struct console *con)
-{
-	geni_remove_earlycon_icc_vote();
-	return 0;
-}
-
 static struct qcom_geni_private_data earlycon_private_data;
 
 static int __init qcom_geni_serial_earlycon_setup(struct earlycon_device *dev,
@@ -1233,7 +1227,6 @@ static int __init qcom_geni_serial_earlycon_setup(struct earlycon_device *dev,
 	writel(stop_bit_len, uport->membase + SE_UART_TX_STOP_BIT_LEN);
 
 	dev->con->write = qcom_geni_serial_earlycon_write;
-	dev->con->exit = qcom_geni_serial_earlycon_exit;
 	dev->con->setup = NULL;
 	qcom_geni_serial_enable_early_read(&se, dev->con);
 
diff --git a/include/linux/qcom-geni-se.h b/include/linux/qcom-geni-se.h
index ec2ad4b0fe14..c4fdb4463f7d 100644
--- a/include/linux/qcom-geni-se.h
+++ b/include/linux/qcom-geni-se.h
@@ -460,7 +460,5 @@ void geni_icc_set_tag(struct geni_se *se, u32 tag);
 int geni_icc_enable(struct geni_se *se);
 
 int geni_icc_disable(struct geni_se *se);
-
-void geni_remove_earlycon_icc_vote(void);
 #endif
 #endif
-- 
cgit v1.2.3


From 861de02e5f3f2a104eecc5af1d248cb7bf8c5f75 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Fri, 26 Mar 2021 11:59:00 +0100
Subject: bpf: Take module reference for trampoline in module

Currently a module can be unloaded even if there's a trampoline
registered in it. It's easily reproduced by running in parallel:

  # while :; do ./test_progs -t module_attach; done
  # while :; do rmmod bpf_testmod; sleep 0.5; done

Take a module reference if the trampoline's ip is within
the module code. Release it when the trampoline's ip is
unregistered.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210326105900.151466-1-jolsa@kernel.org
---
 include/linux/bpf.h     |  2 ++
 kernel/bpf/trampoline.c | 30 ++++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 3625f019767d..fdac0534ce79 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -40,6 +40,7 @@ struct bpf_local_storage;
 struct bpf_local_storage_map;
 struct kobject;
 struct mem_cgroup;
+struct module;
 
 extern struct idr btf_idr;
 extern spinlock_t btf_idr_lock;
@@ -623,6 +624,7 @@ struct bpf_trampoline {
 	/* Executable image of trampoline */
 	struct bpf_tramp_image *cur_image;
 	u64 selector;
+	struct module *mod;
 };
 
 struct bpf_attach_target_info {
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 1f3a4be4b175..4aa8b52adf25 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -9,6 +9,7 @@
 #include <linux/btf.h>
 #include <linux/rcupdate_trace.h>
 #include <linux/rcupdate_wait.h>
+#include <linux/module.h>
 
 /* dummy _ops. The verifier will operate on target program's ops. */
 const struct bpf_verifier_ops bpf_extension_verifier_ops = {
@@ -87,6 +88,26 @@ out:
 	return tr;
 }
 
+static int bpf_trampoline_module_get(struct bpf_trampoline *tr)
+{
+	struct module *mod;
+	int err = 0;
+
+	preempt_disable();
+	mod = __module_text_address((unsigned long) tr->func.addr);
+	if (mod && !try_module_get(mod))
+		err = -ENOENT;
+	preempt_enable();
+	tr->mod = mod;
+	return err;
+}
+
+static void bpf_trampoline_module_put(struct bpf_trampoline *tr)
+{
+	module_put(tr->mod);
+	tr->mod = NULL;
+}
+
 static int is_ftrace_location(void *ip)
 {
 	long addr;
@@ -108,6 +129,9 @@ static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
 		ret = unregister_ftrace_direct((long)ip, (long)old_addr);
 	else
 		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);
+
+	if (!ret)
+		bpf_trampoline_module_put(tr);
 	return ret;
 }
 
@@ -134,10 +158,16 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
 		return ret;
 	tr->func.ftrace_managed = ret;
 
+	if (bpf_trampoline_module_get(tr))
+		return -ENOENT;
+
 	if (tr->func.ftrace_managed)
 		ret = register_ftrace_direct((long)ip, (long)new_addr);
 	else
 		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);
+
+	if (ret)
+		bpf_trampoline_module_put(tr);
 	return ret;
 }
 
-- 
cgit v1.2.3


From e16301fbe1837c9594f9c1957c28fd1bb18fbd15 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 24 Mar 2021 18:51:30 -0700
Subject: bpf: Simplify freeing logic in linfo and jited_linfo

This patch simplifies the linfo freeing logic by combining
"bpf_prog_free_jited_linfo()" and "bpf_prog_free_unused_jited_linfo()"
into the new "bpf_prog_jit_attempt_done()".
It is prep work for the kernel function call support.  In a later
patch, freeing the kernel function call descriptors will also
be done in the "bpf_prog_jit_attempt_done()".

"bpf_prog_free_linfo()" is removed since it is only called by
"__bpf_prog_put_noref()".  The kvfree() are directly called
instead.

It also takes this chance to s/kcalloc/kvcalloc/ for the jited_linfo
allocation.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210325015130.1544323-1-kafai@fb.com
---
 include/linux/filter.h |  3 +--
 kernel/bpf/core.c      | 35 ++++++++++++-----------------------
 kernel/bpf/syscall.c   |  3 ++-
 kernel/bpf/verifier.c  |  4 ++--
 4 files changed, 17 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index b2b85b2cad8e..0d9c710eb050 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -877,8 +877,7 @@ void bpf_prog_free_linfo(struct bpf_prog *prog);
 void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
 			       const u32 *insn_to_jit_off);
 int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog);
-void bpf_prog_free_jited_linfo(struct bpf_prog *prog);
-void bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog);
+void bpf_prog_jit_attempt_done(struct bpf_prog *prog);
 
 struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags);
 struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 75244ecb2389..a35eb3d7b126 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -143,25 +143,22 @@ int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog)
 	if (!prog->aux->nr_linfo || !prog->jit_requested)
 		return 0;
 
-	prog->aux->jited_linfo = kcalloc(prog->aux->nr_linfo,
-					 sizeof(*prog->aux->jited_linfo),
-					 GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
+	prog->aux->jited_linfo = kvcalloc(prog->aux->nr_linfo,
+					  sizeof(*prog->aux->jited_linfo),
+					  GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
 	if (!prog->aux->jited_linfo)
 		return -ENOMEM;
 
 	return 0;
 }
 
-void bpf_prog_free_jited_linfo(struct bpf_prog *prog)
+void bpf_prog_jit_attempt_done(struct bpf_prog *prog)
 {
-	kfree(prog->aux->jited_linfo);
-	prog->aux->jited_linfo = NULL;
-}
-
-void bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog)
-{
-	if (prog->aux->jited_linfo && !prog->aux->jited_linfo[0])
-		bpf_prog_free_jited_linfo(prog);
+	if (prog->aux->jited_linfo &&
+	    (!prog->jited || !prog->aux->jited_linfo[0])) {
+		kvfree(prog->aux->jited_linfo);
+		prog->aux->jited_linfo = NULL;
+	}
 }
 
 /* The jit engine is responsible to provide an array
@@ -217,12 +214,6 @@ void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
 			insn_to_jit_off[linfo[i].insn_off - insn_start - 1];
 }
 
-void bpf_prog_free_linfo(struct bpf_prog *prog)
-{
-	bpf_prog_free_jited_linfo(prog);
-	kvfree(prog->aux->linfo);
-}
-
 struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
 				  gfp_t gfp_extra_flags)
 {
@@ -1866,15 +1857,13 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 			return fp;
 
 		fp = bpf_int_jit_compile(fp);
-		if (!fp->jited) {
-			bpf_prog_free_jited_linfo(fp);
+		bpf_prog_jit_attempt_done(fp);
 #ifdef CONFIG_BPF_JIT_ALWAYS_ON
+		if (!fp->jited) {
 			*err = -ENOTSUPP;
 			return fp;
-#endif
-		} else {
-			bpf_prog_free_unused_jited_linfo(fp);
 		}
+#endif
 	} else {
 		*err = bpf_prog_offload_compile(fp);
 		if (*err)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 250503482cda..eaf85bf51c5a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1694,7 +1694,8 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
 {
 	bpf_prog_kallsyms_del_all(prog);
 	btf_put(prog->aux->btf);
-	bpf_prog_free_linfo(prog);
+	kvfree(prog->aux->jited_linfo);
+	kvfree(prog->aux->linfo);
 	if (prog->aux->attach_btf)
 		btf_put(prog->aux->attach_btf);
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 85f9f842d15c..b7df3f06a279 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -11741,7 +11741,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 	prog->bpf_func = func[0]->bpf_func;
 	prog->aux->func = func;
 	prog->aux->func_cnt = env->subprog_cnt;
-	bpf_prog_free_unused_jited_linfo(prog);
+	bpf_prog_jit_attempt_done(prog);
 	return 0;
 out_free:
 	for (i = 0; i < env->subprog_cnt; i++) {
@@ -11764,7 +11764,7 @@ out_undo_insn:
 		insn->off = 0;
 		insn->imm = env->insn_aux_data[i].call_imm;
 	}
-	bpf_prog_free_jited_linfo(prog);
+	bpf_prog_jit_attempt_done(prog);
 	return err;
 }
 
-- 
cgit v1.2.3


From 34747c4120418143097d4343312a0ca96c986d86 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 24 Mar 2021 18:51:36 -0700
Subject: bpf: Refactor btf_check_func_arg_match

This patch moves the subprog-specific logic from
btf_check_func_arg_match() to the new btf_check_subprog_arg_match().
The core logic is left in btf_check_func_arg_match() which
will be reused later to check the kernel function call.

The "if (!btf_type_is_ptr(t))" is checked first to improve the
indentation which will be useful for a later patch.

Some of the "btf_kind_str[]" usages are replaced with the shortcut
"btf_type_str(t)".

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210325015136.1544504-1-kafai@fb.com
---
 include/linux/bpf.h   |   4 +-
 include/linux/btf.h   |   5 ++
 kernel/bpf/btf.c      | 159 +++++++++++++++++++++++++++-----------------------
 kernel/bpf/verifier.c |   4 +-
 4 files changed, 95 insertions(+), 77 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5a0801b420ca..eaae618a90b5 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1545,8 +1545,8 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
 			   struct btf_func_model *m);
 
 struct bpf_reg_state;
-int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
-			     struct bpf_reg_state *regs);
+int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
+				struct bpf_reg_state *regs);
 int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
 			  struct bpf_reg_state *reg);
 int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog,
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 9c1b52738bbe..8a05687a4ee2 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -141,6 +141,11 @@ static inline bool btf_type_is_enum(const struct btf_type *t)
 	return BTF_INFO_KIND(t->info) == BTF_KIND_ENUM;
 }
 
+static inline bool btf_type_is_scalar(const struct btf_type *t)
+{
+	return btf_type_is_int(t) || btf_type_is_enum(t);
+}
+
 static inline bool btf_type_is_typedef(const struct btf_type *t)
 {
 	return BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF;
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 369faeddf1df..3c489adacf3b 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -4377,7 +4377,7 @@ static u8 bpf_ctx_convert_map[] = {
 #undef BPF_LINK_TYPE
 
 static const struct btf_member *
-btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf,
+btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
 		      const struct btf_type *t, enum bpf_prog_type prog_type,
 		      int arg)
 {
@@ -5362,122 +5362,135 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr
 	return btf_check_func_type_match(log, btf1, t1, btf2, t2);
 }
 
-/* Compare BTF of a function with given bpf_reg_state.
- * Returns:
- * EFAULT - there is a verifier bug. Abort verification.
- * EINVAL - there is a type mismatch or BTF is not available.
- * 0 - BTF matches with what bpf_reg_state expects.
- * Only PTR_TO_CTX and SCALAR_VALUE states are recognized.
- */
-int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
-			     struct bpf_reg_state *regs)
+static int btf_check_func_arg_match(struct bpf_verifier_env *env,
+				    const struct btf *btf, u32 func_id,
+				    struct bpf_reg_state *regs,
+				    bool ptr_to_mem_ok)
 {
 	struct bpf_verifier_log *log = &env->log;
-	struct bpf_prog *prog = env->prog;
-	struct btf *btf = prog->aux->btf;
-	const struct btf_param *args;
+	const char *func_name, *ref_tname;
 	const struct btf_type *t, *ref_t;
-	u32 i, nargs, btf_id, type_size;
-	const char *tname;
-	bool is_global;
-
-	if (!prog->aux->func_info)
-		return -EINVAL;
-
-	btf_id = prog->aux->func_info[subprog].type_id;
-	if (!btf_id)
-		return -EFAULT;
-
-	if (prog->aux->func_info_aux[subprog].unreliable)
-		return -EINVAL;
+	const struct btf_param *args;
+	u32 i, nargs;
 
-	t = btf_type_by_id(btf, btf_id);
+	t = btf_type_by_id(btf, func_id);
 	if (!t || !btf_type_is_func(t)) {
 		/* These checks were already done by the verifier while loading
 		 * struct bpf_func_info
 		 */
-		bpf_log(log, "BTF of func#%d doesn't point to KIND_FUNC\n",
-			subprog);
+		bpf_log(log, "BTF of func_id %u doesn't point to KIND_FUNC\n",
+			func_id);
 		return -EFAULT;
 	}
-	tname = btf_name_by_offset(btf, t->name_off);
+	func_name = btf_name_by_offset(btf, t->name_off);
 
 	t = btf_type_by_id(btf, t->type);
 	if (!t || !btf_type_is_func_proto(t)) {
-		bpf_log(log, "Invalid BTF of func %s\n", tname);
+		bpf_log(log, "Invalid BTF of func %s\n", func_name);
 		return -EFAULT;
 	}
 	args = (const struct btf_param *)(t + 1);
 	nargs = btf_type_vlen(t);
 	if (nargs > MAX_BPF_FUNC_REG_ARGS) {
-		bpf_log(log, "Function %s has %d > %d args\n", tname, nargs,
+		bpf_log(log, "Function %s has %d > %d args\n", func_name, nargs,
 			MAX_BPF_FUNC_REG_ARGS);
-		goto out;
+		return -EINVAL;
 	}
 
-	is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
 	/* check that BTF function arguments match actual types that the
 	 * verifier sees.
 	 */
 	for (i = 0; i < nargs; i++) {
-		struct bpf_reg_state *reg = &regs[i + 1];
+		u32 regno = i + 1;
+		struct bpf_reg_state *reg = &regs[regno];
 
-		t = btf_type_by_id(btf, args[i].type);
-		while (btf_type_is_modifier(t))
-			t = btf_type_by_id(btf, t->type);
-		if (btf_type_is_int(t) || btf_type_is_enum(t)) {
+		t = btf_type_skip_modifiers(btf, args[i].type, NULL);
+		if (btf_type_is_scalar(t)) {
 			if (reg->type == SCALAR_VALUE)
 				continue;
-			bpf_log(log, "R%d is not a scalar\n", i + 1);
-			goto out;
+			bpf_log(log, "R%d is not a scalar\n", regno);
+			return -EINVAL;
 		}
-		if (btf_type_is_ptr(t)) {
+
+		if (!btf_type_is_ptr(t)) {
+			bpf_log(log, "Unrecognized arg#%d type %s\n",
+				i, btf_type_str(t));
+			return -EINVAL;
+		}
+
+		ref_t = btf_type_skip_modifiers(btf, t->type, NULL);
+		ref_tname = btf_name_by_offset(btf, ref_t->name_off);
+		if (btf_get_prog_ctx_type(log, btf, t, env->prog->type, i)) {
 			/* If function expects ctx type in BTF check that caller
 			 * is passing PTR_TO_CTX.
 			 */
-			if (btf_get_prog_ctx_type(log, btf, t, prog->type, i)) {
-				if (reg->type != PTR_TO_CTX) {
-					bpf_log(log,
-						"arg#%d expected pointer to ctx, but got %s\n",
-						i, btf_kind_str[BTF_INFO_KIND(t->info)]);
-					goto out;
-				}
-				if (check_ctx_reg(env, reg, i + 1))
-					goto out;
-				continue;
+			if (reg->type != PTR_TO_CTX) {
+				bpf_log(log,
+					"arg#%d expected pointer to ctx, but got %s\n",
+					i, btf_type_str(t));
+				return -EINVAL;
 			}
+			if (check_ctx_reg(env, reg, regno))
+				return -EINVAL;
+		} else if (ptr_to_mem_ok) {
+			const struct btf_type *resolve_ret;
+			u32 type_size;
 
-			if (!is_global)
-				goto out;
-
-			t = btf_type_skip_modifiers(btf, t->type, NULL);
-
-			ref_t = btf_resolve_size(btf, t, &type_size);
-			if (IS_ERR(ref_t)) {
+			resolve_ret = btf_resolve_size(btf, ref_t, &type_size);
+			if (IS_ERR(resolve_ret)) {
 				bpf_log(log,
-				    "arg#%d reference type('%s %s') size cannot be determined: %ld\n",
-				    i, btf_type_str(t), btf_name_by_offset(btf, t->name_off),
-					PTR_ERR(ref_t));
-				goto out;
+					"arg#%d reference type('%s %s') size cannot be determined: %ld\n",
+					i, btf_type_str(ref_t), ref_tname,
+					PTR_ERR(resolve_ret));
+				return -EINVAL;
 			}
 
-			if (check_mem_reg(env, reg, i + 1, type_size))
-				goto out;
-
-			continue;
+			if (check_mem_reg(env, reg, regno, type_size))
+				return -EINVAL;
+		} else {
+			return -EINVAL;
 		}
-		bpf_log(log, "Unrecognized arg#%d type %s\n",
-			i, btf_kind_str[BTF_INFO_KIND(t->info)]);
-		goto out;
 	}
+
 	return 0;
-out:
+}
+
+/* Compare BTF of a function with given bpf_reg_state.
+ * Returns:
+ * EFAULT - there is a verifier bug. Abort verification.
+ * EINVAL - there is a type mismatch or BTF is not available.
+ * 0 - BTF matches with what bpf_reg_state expects.
+ * Only PTR_TO_CTX and SCALAR_VALUE states are recognized.
+ */
+int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
+				struct bpf_reg_state *regs)
+{
+	struct bpf_prog *prog = env->prog;
+	struct btf *btf = prog->aux->btf;
+	bool is_global;
+	u32 btf_id;
+	int err;
+
+	if (!prog->aux->func_info)
+		return -EINVAL;
+
+	btf_id = prog->aux->func_info[subprog].type_id;
+	if (!btf_id)
+		return -EFAULT;
+
+	if (prog->aux->func_info_aux[subprog].unreliable)
+		return -EINVAL;
+
+	is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
+	err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global);
+
 	/* Compiler optimizations can remove arguments from static functions
 	 * or mismatched type can be passed into a global function.
 	 * In such cases mark the function as unreliable from BTF point of view.
 	 */
-	prog->aux->func_info_aux[subprog].unreliable = true;
-	return -EINVAL;
+	if (err)
+		prog->aux->func_info_aux[subprog].unreliable = true;
+	return err;
 }
 
 /* Convert BTF of a function into bpf_reg_state if possible
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b7df3f06a279..b31e62daafbd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5365,7 +5365,7 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 	func_info_aux = env->prog->aux->func_info_aux;
 	if (func_info_aux)
 		is_global = func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
-	err = btf_check_func_arg_match(env, subprog, caller->regs);
+	err = btf_check_subprog_arg_match(env, subprog, caller->regs);
 	if (err == -EFAULT)
 		return err;
 	if (is_global) {
@@ -12288,7 +12288,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
 		/* 1st arg to a function */
 		regs[BPF_REG_1].type = PTR_TO_CTX;
 		mark_reg_known_zero(env, regs, BPF_REG_1);
-		ret = btf_check_func_arg_match(env, subprog, regs);
+		ret = btf_check_subprog_arg_match(env, subprog, regs);
 		if (ret == -EFAULT)
 			/* unlikely verifier bug. abort.
 			 * ret == 0 and ret < 0 are sadly acceptable for
-- 
cgit v1.2.3


From e6ac2450d6dee3121cd8bbf2907b78a68a8a353d Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 24 Mar 2021 18:51:42 -0700
Subject: bpf: Support bpf program calling kernel function

This patch adds support to the BPF verifier to allow a bpf program to
call a kernel function directly.

The use case included in this set is to allow bpf-tcp-cc to directly
call some tcp-cc helper functions (e.g. "tcp_cong_avoid_ai()").  Those
functions have already been used by some kernel tcp-cc implementations.

This set will also allow the bpf-tcp-cc program to directly call the
kernel tcp-cc implementation.  For example, a bpf_dctcp may only want to
implement its own dctcp_cwnd_event() and reuse other dctcp_*() directly
from the kernel tcp_dctcp.c instead of reimplementing (or
copy-and-pasting) them.

The tcp-cc kernel functions mentioned above will be white listed
for the struct_ops bpf-tcp-cc programs to use in a later patch.
The white listed functions are not bound to a fixed ABI contract.
Those functions have already been used by the existing kernel tcp-cc.
If any of them has changed, both in-tree and out-of-tree kernel tcp-cc
implementations have to be changed.  The same goes for the struct_ops
bpf-tcp-cc programs which have to be adjusted accordingly.

This patch is to make the required changes in the bpf verifier.

The first change is in btf.c: it adds a case in "btf_check_func_arg_match()".
When the passed in "btf->kernel_btf == true", it means matching the
verifier regs' states with a kernel function.  This will handle the
PTR_TO_BTF_ID reg.  It also maps PTR_TO_SOCK_COMMON, PTR_TO_SOCKET,
and PTR_TO_TCP_SOCK to its kernel's btf_id.

In the later libbpf patch, the insn calling a kernel function will
look like:

insn->code == (BPF_JMP | BPF_CALL)
insn->src_reg == BPF_PSEUDO_KFUNC_CALL /* <- new in this patch */
insn->imm == func_btf_id /* btf_id of the running kernel */

[ For the future calling function-in-kernel-module support, an array
  of module btf_fds can be passed at the load time and insn->off
  can be used to index into this array. ]

At the early stage of verifier, the verifier will collect all kernel
function calls into "struct bpf_kfunc_desc".  Those
descriptors are stored in "prog->aux->kfunc_tab" and will
be available to the JIT.  Since this "add" operation is similar
to the current "add_subprog()" and looking for the same insn->code,
they are done together in the new "add_subprog_and_kfunc()".

In the "do_check()" stage, the new "check_kfunc_call()" is added
to verify the kernel function call instruction:
1. Ensure the kernel function can be used by a particular BPF_PROG_TYPE.
   A new bpf_verifier_ops "check_kfunc_call" is added to do that.
   The bpf-tcp-cc struct_ops program will implement this function in
   a later patch.
2. Call "btf_check_kfunc_arg_match()" to ensure the regs can be
   used as the args of a kernel function.
3. Mark the regs' type, subreg_def, and zext_dst.

At the later do_misc_fixups() stage, the new fixup_kfunc_call()
will replace the insn->imm with the function address (relative
to __bpf_call_base).  If needed, the jit can find the btf_func_model
by calling the new bpf_jit_find_kfunc_model(prog, insn).
With the imm set to the function address, "bpftool prog dump xlated"
will be able to display the kernel function calls the same way as
it displays other bpf helper calls.

A gpl_compatible program is required to call a kernel function.

This feature currently requires JIT.

The verifier selftests are adjusted because of the changes in
the verbose log in add_subprog_and_kfunc().

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210325015142.1544736-1-kafai@fb.com
---
 arch/x86/net/bpf_jit_comp.c                      |   5 +
 include/linux/bpf.h                              |  24 ++
 include/linux/btf.h                              |   1 +
 include/linux/filter.h                           |   1 +
 include/uapi/linux/bpf.h                         |   4 +
 kernel/bpf/btf.c                                 |  65 +++-
 kernel/bpf/core.c                                |  18 +-
 kernel/bpf/disasm.c                              |  13 +-
 kernel/bpf/syscall.c                             |   1 +
 kernel/bpf/verifier.c                            | 368 +++++++++++++++++++++--
 tools/include/uapi/linux/bpf.h                   |   4 +
 tools/testing/selftests/bpf/verifier/calls.c     |  12 +-
 tools/testing/selftests/bpf/verifier/dead_code.c |  10 +-
 13 files changed, 480 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index b35fc8023884..9eead60f0301 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -2346,3 +2346,8 @@ out:
 					   tmp : orig_prog);
 	return prog;
 }
+
+bool bpf_jit_supports_kfunc_call(void)
+{
+	return true;
+}
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index eaae618a90b5..b5b7967e3ff3 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -427,6 +427,7 @@ enum bpf_reg_type {
 	PTR_TO_PERCPU_BTF_ID,	 /* reg points to a percpu kernel variable */
 	PTR_TO_FUNC,		 /* reg points to a bpf program function */
 	PTR_TO_MAP_KEY,		 /* reg points to a map element key */
+	__BPF_REG_TYPE_MAX,
 };
 
 /* The information passed from prog-specific *_is_valid_access
@@ -480,6 +481,7 @@ struct bpf_verifier_ops {
 				 const struct btf_type *t, int off, int size,
 				 enum bpf_access_type atype,
 				 u32 *next_btf_id);
+	bool (*check_kfunc_call)(u32 kfunc_btf_id);
 };
 
 struct bpf_prog_offload_ops {
@@ -796,6 +798,8 @@ struct btf_mod_pair {
 	struct module *module;
 };
 
+struct bpf_kfunc_desc_tab;
+
 struct bpf_prog_aux {
 	atomic64_t refcnt;
 	u32 used_map_cnt;
@@ -832,6 +836,7 @@ struct bpf_prog_aux {
 	struct bpf_prog **func;
 	void *jit_data; /* JIT specific data. arch dependent */
 	struct bpf_jit_poke_descriptor *poke_tab;
+	struct bpf_kfunc_desc_tab *kfunc_tab;
 	u32 size_poke_tab;
 	struct bpf_ksym ksym;
 	const struct bpf_prog_ops *ops;
@@ -1547,6 +1552,9 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
 struct bpf_reg_state;
 int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
 				struct bpf_reg_state *regs);
+int btf_check_kfunc_arg_match(struct bpf_verifier_env *env,
+			      const struct btf *btf, u32 func_id,
+			      struct bpf_reg_state *regs);
 int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
 			  struct bpf_reg_state *reg);
 int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog,
@@ -1557,6 +1565,10 @@ struct bpf_link *bpf_link_by_id(u32 id);
 
 const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id);
 void bpf_task_storage_free(struct task_struct *task);
+bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog);
+const struct btf_func_model *
+bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
+			 const struct bpf_insn *insn);
 #else /* !CONFIG_BPF_SYSCALL */
 static inline struct bpf_prog *bpf_prog_get(u32 ufd)
 {
@@ -1737,6 +1749,18 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 static inline void bpf_task_storage_free(struct task_struct *task)
 {
 }
+
+static inline bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog)
+{
+	return false;
+}
+
+static inline const struct btf_func_model *
+bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
+			 const struct bpf_insn *insn)
+{
+	return NULL;
+}
 #endif /* CONFIG_BPF_SYSCALL */
 
 void __bpf_free_used_btfs(struct bpf_prog_aux *aux,
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 8a05687a4ee2..3bac66e0183a 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -110,6 +110,7 @@ const struct btf_type *btf_type_resolve_func_ptr(const struct btf *btf,
 const struct btf_type *
 btf_resolve_size(const struct btf *btf, const struct btf_type *type,
 		 u32 *type_size);
+const char *btf_type_str(const struct btf_type *t);
 
 #define for_each_member(i, struct_type, member)			\
 	for (i = 0, member = btf_type_member(struct_type);	\
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 0d9c710eb050..eecfd82db648 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -918,6 +918,7 @@ u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog);
 void bpf_jit_compile(struct bpf_prog *prog);
 bool bpf_jit_needs_zext(void);
+bool bpf_jit_supports_kfunc_call(void);
 bool bpf_helper_changes_pkt_data(void *func);
 
 static inline bool bpf_dump_raw_ok(const struct cred *cred)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 008edc1dc8c1..598716742593 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1117,6 +1117,10 @@ enum bpf_link_type {
  * offset to another bpf function
  */
 #define BPF_PSEUDO_CALL		1
+/* when bpf_call->src_reg == BPF_PSEUDO_KFUNC_CALL,
+ * bpf_call->imm == btf_id of a BTF_KIND_FUNC in the running kernel
+ */
+#define BPF_PSEUDO_KFUNC_CALL	2
 
 /* flags for BPF_MAP_UPDATE_ELEM command */
 enum {
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 3c489adacf3b..ec8afc4bc560 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -283,7 +283,7 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = {
 	[BTF_KIND_FLOAT]	= "FLOAT",
 };
 
-static const char *btf_type_str(const struct btf_type *t)
+const char *btf_type_str(const struct btf_type *t)
 {
 	return btf_kind_str[BTF_INFO_KIND(t->info)];
 }
@@ -5362,6 +5362,14 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr
 	return btf_check_func_type_match(log, btf1, t1, btf2, t2);
 }
 
+static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {
+#ifdef CONFIG_NET
+	[PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK],
+	[PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
+	[PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP],
+#endif
+};
+
 static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 				    const struct btf *btf, u32 func_id,
 				    struct bpf_reg_state *regs,
@@ -5371,12 +5379,12 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 	const char *func_name, *ref_tname;
 	const struct btf_type *t, *ref_t;
 	const struct btf_param *args;
-	u32 i, nargs;
+	u32 i, nargs, ref_id;
 
 	t = btf_type_by_id(btf, func_id);
 	if (!t || !btf_type_is_func(t)) {
 		/* These checks were already done by the verifier while loading
-		 * struct bpf_func_info
+		 * struct bpf_func_info or in add_kfunc_call().
 		 */
 		bpf_log(log, "BTF of func_id %u doesn't point to KIND_FUNC\n",
 			func_id);
@@ -5418,9 +5426,49 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 			return -EINVAL;
 		}
 
-		ref_t = btf_type_skip_modifiers(btf, t->type, NULL);
+		ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
 		ref_tname = btf_name_by_offset(btf, ref_t->name_off);
-		if (btf_get_prog_ctx_type(log, btf, t, env->prog->type, i)) {
+		if (btf_is_kernel(btf)) {
+			const struct btf_type *reg_ref_t;
+			const struct btf *reg_btf;
+			const char *reg_ref_tname;
+			u32 reg_ref_id;
+
+			if (!btf_type_is_struct(ref_t)) {
+				bpf_log(log, "kernel function %s args#%d pointer type %s %s is not supported\n",
+					func_name, i, btf_type_str(ref_t),
+					ref_tname);
+				return -EINVAL;
+			}
+
+			if (reg->type == PTR_TO_BTF_ID) {
+				reg_btf = reg->btf;
+				reg_ref_id = reg->btf_id;
+			} else if (reg2btf_ids[reg->type]) {
+				reg_btf = btf_vmlinux;
+				reg_ref_id = *reg2btf_ids[reg->type];
+			} else {
+				bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d is not a pointer to btf_id\n",
+					func_name, i,
+					btf_type_str(ref_t), ref_tname, regno);
+				return -EINVAL;
+			}
+
+			reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id,
+							    &reg_ref_id);
+			reg_ref_tname = btf_name_by_offset(reg_btf,
+							   reg_ref_t->name_off);
+			if (!btf_struct_ids_match(log, reg_btf, reg_ref_id,
+						  reg->off, btf, ref_id)) {
+				bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n",
+					func_name, i,
+					btf_type_str(ref_t), ref_tname,
+					regno, btf_type_str(reg_ref_t),
+					reg_ref_tname);
+				return -EINVAL;
+			}
+		} else if (btf_get_prog_ctx_type(log, btf, t,
+						 env->prog->type, i)) {
 			/* If function expects ctx type in BTF check that caller
 			 * is passing PTR_TO_CTX.
 			 */
@@ -5493,6 +5541,13 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
 	return err;
 }
 
+int btf_check_kfunc_arg_match(struct bpf_verifier_env *env,
+			      const struct btf *btf, u32 func_id,
+			      struct bpf_reg_state *regs)
+{
+	return btf_check_func_arg_match(env, btf, func_id, regs, false);
+}
+
 /* Convert BTF of a function into bpf_reg_state if possible
  * Returns:
  * EFAULT - there is a verifier bug. Abort verification.
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index a35eb3d7b126..f5423251c118 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -159,6 +159,9 @@ void bpf_prog_jit_attempt_done(struct bpf_prog *prog)
 		kvfree(prog->aux->jited_linfo);
 		prog->aux->jited_linfo = NULL;
 	}
+
+	kfree(prog->aux->kfunc_tab);
+	prog->aux->kfunc_tab = NULL;
 }
 
 /* The jit engine is responsible to provide an array
@@ -1840,9 +1843,15 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 	/* In case of BPF to BPF calls, verifier did all the prep
 	 * work with regards to JITing, etc.
 	 */
+	bool jit_needed = false;
+
 	if (fp->bpf_func)
 		goto finalize;
 
+	if (IS_ENABLED(CONFIG_BPF_JIT_ALWAYS_ON) ||
+	    bpf_prog_has_kfunc_call(fp))
+		jit_needed = true;
+
 	bpf_prog_select_func(fp);
 
 	/* eBPF JITs can rewrite the program in case constant
@@ -1858,12 +1867,10 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 
 		fp = bpf_int_jit_compile(fp);
 		bpf_prog_jit_attempt_done(fp);
-#ifdef CONFIG_BPF_JIT_ALWAYS_ON
-		if (!fp->jited) {
+		if (!fp->jited && jit_needed) {
 			*err = -ENOTSUPP;
 			return fp;
 		}
-#endif
 	} else {
 		*err = bpf_prog_offload_compile(fp);
 		if (*err)
@@ -2343,6 +2350,11 @@ bool __weak bpf_jit_needs_zext(void)
 	return false;
 }
 
+bool __weak bpf_jit_supports_kfunc_call(void)
+{
+	return false;
+}
+
 /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
  * skb_copy_bits(), so provide a weak definition of it for NET-less config.
  */
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index 3acc7e0b6916..dad821c8ecd0 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -19,16 +19,23 @@ static const char *__func_get_name(const struct bpf_insn_cbs *cbs,
 {
 	BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID);
 
-	if (insn->src_reg != BPF_PSEUDO_CALL &&
+	if (!insn->src_reg &&
 	    insn->imm >= 0 && insn->imm < __BPF_FUNC_MAX_ID &&
 	    func_id_str[insn->imm])
 		return func_id_str[insn->imm];
 
-	if (cbs && cbs->cb_call)
-		return cbs->cb_call(cbs->private_data, insn);
+	if (cbs && cbs->cb_call) {
+		const char *res;
+
+		res = cbs->cb_call(cbs->private_data, insn);
+		if (res)
+			return res;
+	}
 
 	if (insn->src_reg == BPF_PSEUDO_CALL)
 		snprintf(buff, len, "%+d", insn->imm);
+	else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL)
+		snprintf(buff, len, "kernel-function");
 
 	return buff;
 }
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index eaf85bf51c5a..9603de81811a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1696,6 +1696,7 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
 	btf_put(prog->aux->btf);
 	kvfree(prog->aux->jited_linfo);
 	kvfree(prog->aux->linfo);
+	kfree(prog->aux->kfunc_tab);
 	if (prog->aux->attach_btf)
 		btf_put(prog->aux->attach_btf);
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b31e62daafbd..852541a435ef 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -234,6 +234,12 @@ static bool bpf_pseudo_call(const struct bpf_insn *insn)
 	       insn->src_reg == BPF_PSEUDO_CALL;
 }
 
+static bool bpf_pseudo_kfunc_call(const struct bpf_insn *insn)
+{
+	return insn->code == (BPF_JMP | BPF_CALL) &&
+	       insn->src_reg == BPF_PSEUDO_KFUNC_CALL;
+}
+
 static bool bpf_pseudo_func(const struct bpf_insn *insn)
 {
 	return insn->code == (BPF_LD | BPF_IMM | BPF_DW) &&
@@ -1554,47 +1560,205 @@ static int add_subprog(struct bpf_verifier_env *env, int off)
 		verbose(env, "too many subprograms\n");
 		return -E2BIG;
 	}
+	/* determine subprog starts. The end is one before the next starts */
 	env->subprog_info[env->subprog_cnt++].start = off;
 	sort(env->subprog_info, env->subprog_cnt,
 	     sizeof(env->subprog_info[0]), cmp_subprogs, NULL);
 	return env->subprog_cnt - 1;
 }
 
-static int check_subprogs(struct bpf_verifier_env *env)
+struct bpf_kfunc_desc {
+	struct btf_func_model func_model;
+	u32 func_id;
+	s32 imm;
+};
+
+#define MAX_KFUNC_DESCS 256
+struct bpf_kfunc_desc_tab {
+	struct bpf_kfunc_desc descs[MAX_KFUNC_DESCS];
+	u32 nr_descs;
+};
+
+static int kfunc_desc_cmp_by_id(const void *a, const void *b)
+{
+	const struct bpf_kfunc_desc *d0 = a;
+	const struct bpf_kfunc_desc *d1 = b;
+
+	/* func_id is not greater than BTF_MAX_TYPE */
+	return d0->func_id - d1->func_id;
+}
+
+static const struct bpf_kfunc_desc *
+find_kfunc_desc(const struct bpf_prog *prog, u32 func_id)
+{
+	struct bpf_kfunc_desc desc = {
+		.func_id = func_id,
+	};
+	struct bpf_kfunc_desc_tab *tab;
+
+	tab = prog->aux->kfunc_tab;
+	return bsearch(&desc, tab->descs, tab->nr_descs,
+		       sizeof(tab->descs[0]), kfunc_desc_cmp_by_id);
+}
+
+static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id)
+{
+	const struct btf_type *func, *func_proto;
+	struct bpf_kfunc_desc_tab *tab;
+	struct bpf_prog_aux *prog_aux;
+	struct bpf_kfunc_desc *desc;
+	const char *func_name;
+	unsigned long addr;
+	int err;
+
+	prog_aux = env->prog->aux;
+	tab = prog_aux->kfunc_tab;
+	if (!tab) {
+		if (!btf_vmlinux) {
+			verbose(env, "calling kernel function is not supported without CONFIG_DEBUG_INFO_BTF\n");
+			return -ENOTSUPP;
+		}
+
+		if (!env->prog->jit_requested) {
+			verbose(env, "JIT is required for calling kernel function\n");
+			return -ENOTSUPP;
+		}
+
+		if (!bpf_jit_supports_kfunc_call()) {
+			verbose(env, "JIT does not support calling kernel function\n");
+			return -ENOTSUPP;
+		}
+
+		if (!env->prog->gpl_compatible) {
+			verbose(env, "cannot call kernel function from non-GPL compatible program\n");
+			return -EINVAL;
+		}
+
+		tab = kzalloc(sizeof(*tab), GFP_KERNEL);
+		if (!tab)
+			return -ENOMEM;
+		prog_aux->kfunc_tab = tab;
+	}
+
+	if (find_kfunc_desc(env->prog, func_id))
+		return 0;
+
+	if (tab->nr_descs == MAX_KFUNC_DESCS) {
+		verbose(env, "too many different kernel function calls\n");
+		return -E2BIG;
+	}
+
+	func = btf_type_by_id(btf_vmlinux, func_id);
+	if (!func || !btf_type_is_func(func)) {
+		verbose(env, "kernel btf_id %u is not a function\n",
+			func_id);
+		return -EINVAL;
+	}
+	func_proto = btf_type_by_id(btf_vmlinux, func->type);
+	if (!func_proto || !btf_type_is_func_proto(func_proto)) {
+		verbose(env, "kernel function btf_id %u does not have a valid func_proto\n",
+			func_id);
+		return -EINVAL;
+	}
+
+	func_name = btf_name_by_offset(btf_vmlinux, func->name_off);
+	addr = kallsyms_lookup_name(func_name);
+	if (!addr) {
+		verbose(env, "cannot find address for kernel function %s\n",
+			func_name);
+		return -EINVAL;
+	}
+
+	desc = &tab->descs[tab->nr_descs++];
+	desc->func_id = func_id;
+	desc->imm = BPF_CAST_CALL(addr) - __bpf_call_base;
+	err = btf_distill_func_proto(&env->log, btf_vmlinux,
+				     func_proto, func_name,
+				     &desc->func_model);
+	if (!err)
+		sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
+		     kfunc_desc_cmp_by_id, NULL);
+	return err;
+}
+
+static int kfunc_desc_cmp_by_imm(const void *a, const void *b)
+{
+	const struct bpf_kfunc_desc *d0 = a;
+	const struct bpf_kfunc_desc *d1 = b;
+
+	if (d0->imm > d1->imm)
+		return 1;
+	else if (d0->imm < d1->imm)
+		return -1;
+	return 0;
+}
+
+static void sort_kfunc_descs_by_imm(struct bpf_prog *prog)
+{
+	struct bpf_kfunc_desc_tab *tab;
+
+	tab = prog->aux->kfunc_tab;
+	if (!tab)
+		return;
+
+	sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
+	     kfunc_desc_cmp_by_imm, NULL);
+}
+
+bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog)
+{
+	return !!prog->aux->kfunc_tab;
+}
+
+const struct btf_func_model *
+bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
+			 const struct bpf_insn *insn)
+{
+	const struct bpf_kfunc_desc desc = {
+		.imm = insn->imm,
+	};
+	const struct bpf_kfunc_desc *res;
+	struct bpf_kfunc_desc_tab *tab;
+
+	tab = prog->aux->kfunc_tab;
+	res = bsearch(&desc, tab->descs, tab->nr_descs,
+		      sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm);
+
+	return res ? &res->func_model : NULL;
+}
+
+static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
 {
-	int i, ret, subprog_start, subprog_end, off, cur_subprog = 0;
 	struct bpf_subprog_info *subprog = env->subprog_info;
 	struct bpf_insn *insn = env->prog->insnsi;
-	int insn_cnt = env->prog->len;
+	int i, ret, insn_cnt = env->prog->len;
 
 	/* Add entry function. */
 	ret = add_subprog(env, 0);
-	if (ret < 0)
+	if (ret)
 		return ret;
 
-	/* determine subprog starts. The end is one before the next starts */
-	for (i = 0; i < insn_cnt; i++) {
-		if (bpf_pseudo_func(insn + i)) {
-			if (!env->bpf_capable) {
-				verbose(env,
-					"function pointers are allowed for CAP_BPF and CAP_SYS_ADMIN\n");
-				return -EPERM;
-			}
-			ret = add_subprog(env, i + insn[i].imm + 1);
-			if (ret < 0)
-				return ret;
-			/* remember subprog */
-			insn[i + 1].imm = ret;
-			continue;
-		}
-		if (!bpf_pseudo_call(insn + i))
+	for (i = 0; i < insn_cnt; i++, insn++) {
+		if (!bpf_pseudo_func(insn) && !bpf_pseudo_call(insn) &&
+		    !bpf_pseudo_kfunc_call(insn))
 			continue;
+
 		if (!env->bpf_capable) {
-			verbose(env,
-				"function calls to other bpf functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n");
+			verbose(env, "loading/calling other bpf or kernel functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n");
 			return -EPERM;
 		}
-		ret = add_subprog(env, i + insn[i].imm + 1);
+
+		if (bpf_pseudo_func(insn)) {
+			ret = add_subprog(env, i + insn->imm + 1);
+			if (ret >= 0)
+				/* remember subprog */
+				insn[1].imm = ret;
+		} else if (bpf_pseudo_call(insn)) {
+			ret = add_subprog(env, i + insn->imm + 1);
+		} else {
+			ret = add_kfunc_call(env, insn->imm);
+		}
+
 		if (ret < 0)
 			return ret;
 	}
@@ -1608,6 +1772,16 @@ static int check_subprogs(struct bpf_verifier_env *env)
 		for (i = 0; i < env->subprog_cnt; i++)
 			verbose(env, "func#%d @%d\n", i, subprog[i].start);
 
+	return 0;
+}
+
+static int check_subprogs(struct bpf_verifier_env *env)
+{
+	int i, subprog_start, subprog_end, off, cur_subprog = 0;
+	struct bpf_subprog_info *subprog = env->subprog_info;
+	struct bpf_insn *insn = env->prog->insnsi;
+	int insn_cnt = env->prog->len;
+
 	/* now check that all jumps are within the same subprog */
 	subprog_start = subprog[cur_subprog].start;
 	subprog_end = subprog[cur_subprog + 1].start;
@@ -1916,6 +2090,17 @@ static int get_prev_insn_idx(struct bpf_verifier_state *st, int i,
 	return i;
 }
 
+static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn)
+{
+	const struct btf_type *func;
+
+	if (insn->src_reg != BPF_PSEUDO_KFUNC_CALL)
+		return NULL;
+
+	func = btf_type_by_id(btf_vmlinux, insn->imm);
+	return btf_name_by_offset(btf_vmlinux, func->name_off);
+}
+
 /* For given verifier state backtrack_insn() is called from the last insn to
  * the first insn. Its purpose is to compute a bitmask of registers and
  * stack slots that needs precision in the parent verifier state.
@@ -1924,6 +2109,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
 			  u32 *reg_mask, u64 *stack_mask)
 {
 	const struct bpf_insn_cbs cbs = {
+		.cb_call	= disasm_kfunc_name,
 		.cb_print	= verbose,
 		.private_data	= env,
 	};
@@ -5960,6 +6146,98 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 	return 0;
 }
 
+/* mark_btf_func_reg_size() is used when the reg size is determined by
+ * the size of the BTF func_proto's return value or of one of its arguments.
+ */
+static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno,
+				   size_t reg_size)
+{
+	struct bpf_reg_state *reg = &cur_regs(env)[regno];
+
+	if (regno == BPF_REG_0) {
+		/* Function return value */
+		reg->live |= REG_LIVE_WRITTEN;
+		reg->subreg_def = reg_size == sizeof(u64) ?
+			DEF_NOT_SUBREG : env->insn_idx + 1;
+	} else {
+		/* Function argument */
+		if (reg_size == sizeof(u64)) {
+			mark_insn_zext(env, reg);
+			mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
+		} else {
+			mark_reg_read(env, reg, reg->parent, REG_LIVE_READ32);
+		}
+	}
+}
+
+static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn)
+{
+	const struct btf_type *t, *func, *func_proto, *ptr_type;
+	struct bpf_reg_state *regs = cur_regs(env);
+	const char *func_name, *ptr_type_name;
+	u32 i, nargs, func_id, ptr_type_id;
+	const struct btf_param *args;
+	int err;
+
+	func_id = insn->imm;
+	func = btf_type_by_id(btf_vmlinux, func_id);
+	func_name = btf_name_by_offset(btf_vmlinux, func->name_off);
+	func_proto = btf_type_by_id(btf_vmlinux, func->type);
+
+	if (!env->ops->check_kfunc_call ||
+	    !env->ops->check_kfunc_call(func_id)) {
+		verbose(env, "calling kernel function %s is not allowed\n",
+			func_name);
+		return -EACCES;
+	}
+
+	/* Check the arguments */
+	err = btf_check_kfunc_arg_match(env, btf_vmlinux, func_id, regs);
+	if (err)
+		return err;
+
+	for (i = 0; i < CALLER_SAVED_REGS; i++)
+		mark_reg_not_init(env, regs, caller_saved[i]);
+
+	/* Check return type */
+	t = btf_type_skip_modifiers(btf_vmlinux, func_proto->type, NULL);
+	if (btf_type_is_scalar(t)) {
+		mark_reg_unknown(env, regs, BPF_REG_0);
+		mark_btf_func_reg_size(env, BPF_REG_0, t->size);
+	} else if (btf_type_is_ptr(t)) {
+		ptr_type = btf_type_skip_modifiers(btf_vmlinux, t->type,
+						   &ptr_type_id);
+		if (!btf_type_is_struct(ptr_type)) {
+			ptr_type_name = btf_name_by_offset(btf_vmlinux,
+							   ptr_type->name_off);
+			verbose(env, "kernel function %s returns pointer type %s %s is not supported\n",
+				func_name, btf_type_str(ptr_type),
+				ptr_type_name);
+			return -EINVAL;
+		}
+		mark_reg_known_zero(env, regs, BPF_REG_0);
+		regs[BPF_REG_0].btf = btf_vmlinux;
+		regs[BPF_REG_0].type = PTR_TO_BTF_ID;
+		regs[BPF_REG_0].btf_id = ptr_type_id;
+		mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *));
+	} /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */
+
+	nargs = btf_type_vlen(func_proto);
+	args = (const struct btf_param *)(func_proto + 1);
+	for (i = 0; i < nargs; i++) {
+		u32 regno = i + 1;
+
+		t = btf_type_skip_modifiers(btf_vmlinux, args[i].type, NULL);
+		if (btf_type_is_ptr(t))
+			mark_btf_func_reg_size(env, regno, sizeof(void *));
+		else
+			/* scalar. ensured by btf_check_kfunc_arg_match() */
+			mark_btf_func_reg_size(env, regno, t->size);
+	}
+
+	return 0;
+}
+
 static bool signed_add_overflows(s64 a, s64 b)
 {
 	/* Do the add in u64, where overflow is well-defined */
@@ -10162,6 +10440,7 @@ static int do_check(struct bpf_verifier_env *env)
 
 		if (env->log.level & BPF_LOG_LEVEL) {
 			const struct bpf_insn_cbs cbs = {
+				.cb_call	= disasm_kfunc_name,
 				.cb_print	= verbose,
 				.private_data	= env,
 			};
@@ -10309,7 +10588,8 @@ static int do_check(struct bpf_verifier_env *env)
 				if (BPF_SRC(insn->code) != BPF_K ||
 				    insn->off != 0 ||
 				    (insn->src_reg != BPF_REG_0 &&
-				     insn->src_reg != BPF_PSEUDO_CALL) ||
+				     insn->src_reg != BPF_PSEUDO_CALL &&
+				     insn->src_reg != BPF_PSEUDO_KFUNC_CALL) ||
 				    insn->dst_reg != BPF_REG_0 ||
 				    class == BPF_JMP32) {
 					verbose(env, "BPF_CALL uses reserved fields\n");
@@ -10324,6 +10604,8 @@ static int do_check(struct bpf_verifier_env *env)
 				}
 				if (insn->src_reg == BPF_PSEUDO_CALL)
 					err = check_func_call(env, insn, &env->insn_idx);
+				else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL)
+					err = check_kfunc_call(env, insn);
 				else
 					err = check_helper_call(env, insn, &env->insn_idx);
 				if (err)
@@ -11634,6 +11916,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		func[i]->aux->name[0] = 'F';
 		func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
 		func[i]->jit_requested = 1;
+		func[i]->aux->kfunc_tab = prog->aux->kfunc_tab;
 		func[i]->aux->linfo = prog->aux->linfo;
 		func[i]->aux->nr_linfo = prog->aux->nr_linfo;
 		func[i]->aux->jited_linfo = prog->aux->jited_linfo;
@@ -11773,6 +12056,7 @@ static int fixup_call_args(struct bpf_verifier_env *env)
 #ifndef CONFIG_BPF_JIT_ALWAYS_ON
 	struct bpf_prog *prog = env->prog;
 	struct bpf_insn *insn = prog->insnsi;
+	bool has_kfunc_call = bpf_prog_has_kfunc_call(prog);
 	int i, depth;
 #endif
 	int err = 0;
@@ -11786,6 +12070,10 @@ static int fixup_call_args(struct bpf_verifier_env *env)
 			return err;
 	}
 #ifndef CONFIG_BPF_JIT_ALWAYS_ON
+	if (has_kfunc_call) {
+		verbose(env, "calling kernel functions are not allowed in non-JITed programs\n");
+		return -EINVAL;
+	}
 	if (env->subprog_cnt > 1 && env->prog->aux->tail_call_reachable) {
 		/* When JIT fails the progs with bpf2bpf calls and tail_calls
 		 * have to be rejected, since interpreter doesn't support them yet.
@@ -11814,6 +12102,26 @@ static int fixup_call_args(struct bpf_verifier_env *env)
 	return err;
 }
 
+static int fixup_kfunc_call(struct bpf_verifier_env *env,
+			    struct bpf_insn *insn)
+{
+	const struct bpf_kfunc_desc *desc;
+
+	/* insn->imm has the btf func_id. Replace it with
+	 * an address (relative to __bpf_call_base).
+	 */
+	desc = find_kfunc_desc(env->prog, insn->imm);
+	if (!desc) {
+		verbose(env, "verifier internal error: kernel function descriptor not found for func_id %u\n",
+			insn->imm);
+		return -EFAULT;
+	}
+
+	insn->imm = desc->imm;
+
+	return 0;
+}
+
 /* Do various post-verification rewrites in a single program pass.
  * These rewrites simplify JIT and interpreter implementations.
  */
@@ -11949,6 +12257,12 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			continue;
 		if (insn->src_reg == BPF_PSEUDO_CALL)
 			continue;
+		if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
+			ret = fixup_kfunc_call(env, insn);
+			if (ret)
+				return ret;
+			continue;
+		}
 
 		if (insn->imm == BPF_FUNC_get_route_realm)
 			prog->dst_needed = 1;
@@ -12178,6 +12492,8 @@ patch_call_imm:
 		}
 	}
 
+	sort_kfunc_descs_by_imm(env->prog);
+
 	return 0;
 }
 
@@ -12883,6 +13199,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
 	if (!env->explored_states)
 		goto skip_full_check;
 
+	ret = add_subprog_and_kfunc(env);
+	if (ret < 0)
+		goto skip_full_check;
+
 	ret = check_subprogs(env);
 	if (ret < 0)
 		goto skip_full_check;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 2d3036e292a9..ab9f2233607c 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1117,6 +1117,10 @@ enum bpf_link_type {
  * offset to another bpf function
  */
 #define BPF_PSEUDO_CALL		1
+/* when bpf_call->src_reg == BPF_PSEUDO_KFUNC_CALL,
+ * bpf_call->imm == btf_id of a BTF_KIND_FUNC in the running kernel
+ */
+#define BPF_PSEUDO_KFUNC_CALL	2
 
 /* flags for BPF_MAP_UPDATE_ELEM command */
 enum {
diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c
index eb888c8479c3..336a749673d1 100644
--- a/tools/testing/selftests/bpf/verifier/calls.c
+++ b/tools/testing/selftests/bpf/verifier/calls.c
@@ -19,7 +19,7 @@
 	BPF_MOV64_IMM(BPF_REG_0, 2),
 	BPF_EXIT_INSN(),
 	},
-	.errstr_unpriv = "function calls to other bpf functions are allowed for",
+	.errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for",
 	.result_unpriv = REJECT,
 	.result = ACCEPT,
 	.retval = 1,
@@ -136,7 +136,7 @@
 {
 	"calls: wrong src reg",
 	.insns = {
-	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 2, 0, 0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 3, 0, 0),
 	BPF_MOV64_IMM(BPF_REG_0, 1),
 	BPF_EXIT_INSN(),
 	},
@@ -397,7 +397,7 @@
 	BPF_MOV64_IMM(BPF_REG_0, 1),
 	BPF_EXIT_INSN(),
 	},
-	.errstr_unpriv = "function calls to other bpf functions are allowed for",
+	.errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for",
 	.fixup_map_hash_48b = { 3 },
 	.result_unpriv = REJECT,
 	.result = ACCEPT,
@@ -1977,7 +1977,7 @@
 	BPF_EXIT_INSN(),
 	},
 	.prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
-	.errstr_unpriv = "function calls to other bpf functions are allowed for",
+	.errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for",
 	.result_unpriv = REJECT,
 	.result = ACCEPT,
 },
@@ -2003,7 +2003,7 @@
 	BPF_EXIT_INSN(),
 	},
 	.prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
-	.errstr_unpriv = "function calls to other bpf functions are allowed for",
+	.errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for",
 	.errstr = "!read_ok",
 	.result = REJECT,
 },
@@ -2028,7 +2028,7 @@
 	BPF_EXIT_INSN(),
 	},
 	.prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
-	.errstr_unpriv = "function calls to other bpf functions are allowed for",
+	.errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for",
 	.errstr = "!read_ok",
 	.result = REJECT,
 },
diff --git a/tools/testing/selftests/bpf/verifier/dead_code.c b/tools/testing/selftests/bpf/verifier/dead_code.c
index 5cf361d8eb1c..17fe33a75034 100644
--- a/tools/testing/selftests/bpf/verifier/dead_code.c
+++ b/tools/testing/selftests/bpf/verifier/dead_code.c
@@ -85,7 +85,7 @@
 	BPF_MOV64_IMM(BPF_REG_0, 12),
 	BPF_EXIT_INSN(),
 	},
-	.errstr_unpriv = "function calls to other bpf functions are allowed for",
+	.errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for",
 	.result_unpriv = REJECT,
 	.result = ACCEPT,
 	.retval = 7,
@@ -103,7 +103,7 @@
 	BPF_MOV64_IMM(BPF_REG_0, 12),
 	BPF_EXIT_INSN(),
 	},
-	.errstr_unpriv = "function calls to other bpf functions are allowed for",
+	.errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for",
 	.result_unpriv = REJECT,
 	.result = ACCEPT,
 	.retval = 7,
@@ -121,7 +121,7 @@
 	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -5),
 	BPF_EXIT_INSN(),
 	},
-	.errstr_unpriv = "function calls to other bpf functions are allowed for",
+	.errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for",
 	.result_unpriv = REJECT,
 	.result = ACCEPT,
 	.retval = 7,
@@ -137,7 +137,7 @@
 	BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
 	BPF_EXIT_INSN(),
 	},
-	.errstr_unpriv = "function calls to other bpf functions are allowed for",
+	.errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for",
 	.result_unpriv = REJECT,
 	.result = ACCEPT,
 	.retval = 2,
@@ -152,7 +152,7 @@
 	BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
 	BPF_EXIT_INSN(),
 	},
-	.errstr_unpriv = "function calls to other bpf functions are allowed for",
+	.errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for",
 	.result_unpriv = REJECT,
 	.result = ACCEPT,
 	.retval = 2,
-- 
cgit v1.2.3


From 7bd1590d4eba1583f6ee85e8cfe556505f761e19 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 24 Mar 2021 18:52:52 -0700
Subject: bpf: selftests: Add kfunc_call test

This patch adds a few kernel functions, bpf_kfunc_call_test*(), for the
selftest's test_run purpose.  They will be allowed for tc_cls progs.

The selftest calling the kernel function bpf_kfunc_call_test*()
is also added in this patch.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210325015252.1551395-1-kafai@fb.com
---
 include/linux/bpf.h                                |  6 +++
 net/bpf/test_run.c                                 | 28 ++++++++++
 net/core/filter.c                                  |  1 +
 .../testing/selftests/bpf/prog_tests/kfunc_call.c  | 59 ++++++++++++++++++++++
 .../testing/selftests/bpf/progs/kfunc_call_test.c  | 47 +++++++++++++++++
 .../selftests/bpf/progs/kfunc_call_test_subprog.c  | 42 +++++++++++++++
 6 files changed, 183 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/kfunc_call.c
 create mode 100644 tools/testing/selftests/bpf/progs/kfunc_call_test.c
 create mode 100644 tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b5b7967e3ff3..9fdd839b418c 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1532,6 +1532,7 @@ int bpf_prog_test_run_raw_tp(struct bpf_prog *prog,
 int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog,
 				const union bpf_attr *kattr,
 				union bpf_attr __user *uattr);
+bool bpf_prog_test_check_kfunc_call(u32 kfunc_id);
 bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 		    const struct bpf_prog *prog,
 		    struct bpf_insn_access_aux *info);
@@ -1731,6 +1732,11 @@ static inline int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog,
 	return -ENOTSUPP;
 }
 
+static inline bool bpf_prog_test_check_kfunc_call(u32 kfunc_id)
+{
+	return false;
+}
+
 static inline void bpf_map_put(struct bpf_map *map)
 {
 }
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 4aabf71cd95d..a5d72c48fb66 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -2,6 +2,7 @@
 /* Copyright (c) 2017 Facebook
  */
 #include <linux/bpf.h>
+#include <linux/btf_ids.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/etherdevice.h>
@@ -213,10 +214,37 @@ int noinline bpf_modify_return_test(int a, int *b)
 	*b += 1;
 	return a + *b;
 }
+
+u64 noinline bpf_kfunc_call_test1(struct sock *sk, u32 a, u64 b, u32 c, u64 d)
+{
+	return a + b + c + d;
+}
+
+int noinline bpf_kfunc_call_test2(struct sock *sk, u32 a, u32 b)
+{
+	return a + b;
+}
+
+struct sock * noinline bpf_kfunc_call_test3(struct sock *sk)
+{
+	return sk;
+}
+
 __diag_pop();
 
 ALLOW_ERROR_INJECTION(bpf_modify_return_test, ERRNO);
 
+BTF_SET_START(test_sk_kfunc_ids)
+BTF_ID(func, bpf_kfunc_call_test1)
+BTF_ID(func, bpf_kfunc_call_test2)
+BTF_ID(func, bpf_kfunc_call_test3)
+BTF_SET_END(test_sk_kfunc_ids)
+
+bool bpf_prog_test_check_kfunc_call(u32 kfunc_id)
+{
+	return btf_id_set_contains(&test_sk_kfunc_ids, kfunc_id);
+}
+
 static void *bpf_test_init(const union bpf_attr *kattr, u32 size,
 			   u32 headroom, u32 tailroom)
 {
diff --git a/net/core/filter.c b/net/core/filter.c
index 17dc159ec40c..cae56d08a670 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -9813,6 +9813,7 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
 	.convert_ctx_access	= tc_cls_act_convert_ctx_access,
 	.gen_prologue		= tc_cls_act_prologue,
 	.gen_ld_abs		= bpf_gen_ld_abs,
+	.check_kfunc_call	= bpf_prog_test_check_kfunc_call,
 };
 
 const struct bpf_prog_ops tc_cls_act_prog_ops = {
diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c
new file mode 100644
index 000000000000..7fc0951ee75f
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+#include <test_progs.h>
+#include <network_helpers.h>
+#include "kfunc_call_test.skel.h"
+#include "kfunc_call_test_subprog.skel.h"
+
+static void test_main(void)
+{
+	struct kfunc_call_test *skel;
+	int prog_fd, retval, err;
+
+	skel = kfunc_call_test__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "skel"))
+		return;
+
+	prog_fd = bpf_program__fd(skel->progs.kfunc_call_test1);
+	err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
+				NULL, NULL, (__u32 *)&retval, NULL);
+	ASSERT_OK(err, "bpf_prog_test_run(test1)");
+	ASSERT_EQ(retval, 12, "test1-retval");
+
+	prog_fd = bpf_program__fd(skel->progs.kfunc_call_test2);
+	err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
+				NULL, NULL, (__u32 *)&retval, NULL);
+	ASSERT_OK(err, "bpf_prog_test_run(test2)");
+	ASSERT_EQ(retval, 3, "test2-retval");
+
+	kfunc_call_test__destroy(skel);
+}
+
+static void test_subprog(void)
+{
+	struct kfunc_call_test_subprog *skel;
+	int prog_fd, retval, err;
+
+	skel = kfunc_call_test_subprog__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "skel"))
+		return;
+
+	prog_fd = bpf_program__fd(skel->progs.kfunc_call_test1);
+	err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
+				NULL, NULL, (__u32 *)&retval, NULL);
+	ASSERT_OK(err, "bpf_prog_test_run(test1)");
+	ASSERT_EQ(retval, 10, "test1-retval");
+	ASSERT_NEQ(skel->data->active_res, -1, "active_res");
+	ASSERT_EQ(skel->data->sk_state, BPF_TCP_CLOSE, "sk_state");
+
+	kfunc_call_test_subprog__destroy(skel);
+}
+
+void test_kfunc_call(void)
+{
+	if (test__start_subtest("main"))
+		test_main();
+
+	if (test__start_subtest("subprog"))
+		test_subprog();
+}
diff --git a/tools/testing/selftests/bpf/progs/kfunc_call_test.c b/tools/testing/selftests/bpf/progs/kfunc_call_test.c
new file mode 100644
index 000000000000..470f8723e463
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/kfunc_call_test.c
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_tcp_helpers.h"
+
+extern int bpf_kfunc_call_test2(struct sock *sk, __u32 a, __u32 b) __ksym;
+extern __u64 bpf_kfunc_call_test1(struct sock *sk, __u32 a, __u64 b,
+				  __u32 c, __u64 d) __ksym;
+
+SEC("classifier")
+int kfunc_call_test2(struct __sk_buff *skb)
+{
+	struct bpf_sock *sk = skb->sk;
+
+	if (!sk)
+		return -1;
+
+	sk = bpf_sk_fullsock(sk);
+	if (!sk)
+		return -1;
+
+	return bpf_kfunc_call_test2((struct sock *)sk, 1, 2);
+}
+
+SEC("classifier")
+int kfunc_call_test1(struct __sk_buff *skb)
+{
+	struct bpf_sock *sk = skb->sk;
+	__u64 a = 1ULL << 32;
+	__u32 ret;
+
+	if (!sk)
+		return -1;
+
+	sk = bpf_sk_fullsock(sk);
+	if (!sk)
+		return -1;
+
+	a = bpf_kfunc_call_test1((struct sock *)sk, 1, a | 2, 3, a | 4);
+	ret = a >> 32;   /* ret should be 2 */
+	ret += (__u32)a; /* ret should be 12 */
+
+	return ret;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c b/tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c
new file mode 100644
index 000000000000..b2dcb7d9cb03
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_tcp_helpers.h"
+
+extern const int bpf_prog_active __ksym;
+extern __u64 bpf_kfunc_call_test1(struct sock *sk, __u32 a, __u64 b,
+				  __u32 c, __u64 d) __ksym;
+extern struct sock *bpf_kfunc_call_test3(struct sock *sk) __ksym;
+int active_res = -1;
+int sk_state = -1;
+
+int __noinline f1(struct __sk_buff *skb)
+{
+	struct bpf_sock *sk = skb->sk;
+	int *active;
+
+	if (!sk)
+		return -1;
+
+	sk = bpf_sk_fullsock(sk);
+	if (!sk)
+		return -1;
+
+	active = (int *)bpf_per_cpu_ptr(&bpf_prog_active,
+					bpf_get_smp_processor_id());
+	if (active)
+		active_res = *active;
+
+	sk_state = bpf_kfunc_call_test3((struct sock *)sk)->__sk_common.skc_state;
+
+	return (__u32)bpf_kfunc_call_test1((struct sock *)sk, 1, 2, 3, 4);
+}
+
+SEC("classifier")
+int kfunc_call_test1(struct __sk_buff *skb)
+{
+	return f1(skb);
+}
+
+char _license[] SEC("license") = "GPL";
-- 
cgit v1.2.3


From 8c8239c2c1fb82f171cb22a707f3bb88a2f22109 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Thu, 25 Mar 2021 10:47:12 -0600
Subject: of: Add missing 'Return' section in kerneldoc comments

Many of the DT kerneldoc comments are lacking a 'Return' section. Let's
add the section in cases where we have a description of return values.
There are still some cases where the return values are not documented.

Cc: Frank Rowand <frowand.list@gmail.com>
Cc: Mauro Carvalho Chehab <mchehab@kernel.org>
Signed-off-by: Rob Herring <robh@kernel.org>
Reviewed-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Link: https://lore.kernel.org/r/20210325164713.1296407-8-robh@kernel.org
---
 drivers/of/base.c     | 39 ++++++++++++++++--------------
 drivers/of/dynamic.c  | 19 +++++++++------
 drivers/of/fdt.c      |  8 +++----
 drivers/of/irq.c      | 14 +++++------
 drivers/of/overlay.c  | 16 ++++++-------
 drivers/of/platform.c | 10 ++++----
 drivers/of/property.c | 66 ++++++++++++++++++++++++++++++++-------------------
 include/linux/of.h    | 63 +++++++++++++++++++++++++++++++-----------------
 8 files changed, 140 insertions(+), 95 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/of/base.c b/drivers/of/base.c
index a64c093d30ef..0269439d74aa 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -244,7 +244,7 @@ struct device_node *__of_find_all_nodes(struct device_node *prev)
  * @prev:	Previous node or NULL to start iteration
  *		of_node_put() will be called on it
  *
- * Returns a node pointer with refcount incremented, use
+ * Return: A node pointer with refcount incremented, use
  * of_node_put() on it when done.
  */
 struct device_node *of_find_all_nodes(struct device_node *prev)
@@ -374,7 +374,7 @@ bool __weak arch_find_n_match_cpu_physical_id(struct device_node *cpun,
  * before booting secondary cores. This function uses arch_match_cpu_phys_id
  * which can be overridden by architecture specific implementation.
  *
- * Returns a node pointer for the logical cpu with refcount incremented, use
+ * Return: A node pointer for the logical cpu with refcount incremented, use
  * of_node_put() on it when done. Returns NULL if not found.
  */
 struct device_node *of_get_cpu_node(int cpu, unsigned int *thread)
@@ -394,8 +394,8 @@ EXPORT_SYMBOL(of_get_cpu_node);
  *
  * @cpu_node: Pointer to the device_node for CPU.
  *
- * Returns the logical CPU number of the given CPU device_node.
- * Returns -ENODEV if the CPU is not found.
+ * Return: The logical CPU number of the given CPU device_node or -ENODEV if the
+ * CPU is not found.
  */
 int of_cpu_node_to_id(struct device_node *cpu_node)
 {
@@ -427,7 +427,7 @@ EXPORT_SYMBOL(of_cpu_node_to_id);
  * bindings. This function check for both and returns the idle state node for
  * the requested index.
  *
- * In case an idle state node is found at @index, the refcount is incremented
+ * Return: An idle state node if found at @index. The refcount is incremented
  * for it, so call of_node_put() on it when done. Returns NULL if not found.
  */
 struct device_node *of_get_cpu_state_node(struct device_node *cpu_node,
@@ -561,7 +561,7 @@ int of_device_compatible_match(struct device_node *device,
  * of_machine_is_compatible - Test root of device tree for a given compatible value
  * @compat: compatible string to look for in root node's compatible property.
  *
- * Returns a positive integer if the root node has the given value in its
+ * Return: A positive integer if the root node has the given value in its
  * compatible property.
  */
 int of_machine_is_compatible(const char *compat)
@@ -583,7 +583,7 @@ EXPORT_SYMBOL(of_machine_is_compatible);
  *
  *  @device: Node to check for availability, with locks already held
  *
- *  Returns true if the status property is absent or set to "okay" or "ok",
+ *  Return: True if the status property is absent or set to "okay" or "ok",
  *  false otherwise
  */
 static bool __of_device_is_available(const struct device_node *device)
@@ -611,7 +611,7 @@ static bool __of_device_is_available(const struct device_node *device)
  *
  *  @device: Node to check for availability
  *
- *  Returns true if the status property is absent or set to "okay" or "ok",
+ *  Return: True if the status property is absent or set to "okay" or "ok",
  *  false otherwise
  */
 bool of_device_is_available(const struct device_node *device)
@@ -632,7 +632,7 @@ EXPORT_SYMBOL(of_device_is_available);
  *
  *  @device: Node to check for endianness
  *
- *  Returns true if the device has a "big-endian" property, or if the kernel
+ *  Return: True if the device has a "big-endian" property, or if the kernel
  *  was compiled for BE *and* the device has a "native-endian" property.
  *  Returns false otherwise.
  *
@@ -816,7 +816,7 @@ EXPORT_SYMBOL(of_get_next_cpu_node);
  * Lookup child node whose compatible property contains the given compatible
  * string.
  *
- * Returns a node pointer with refcount incremented, use of_node_put() on it
+ * Return: a node pointer with refcount incremented, use of_node_put() on it
  * when done; or NULL if not found.
  */
 struct device_node *of_get_compatible_child(const struct device_node *parent,
@@ -1170,7 +1170,7 @@ EXPORT_SYMBOL(of_find_matching_node_and_match);
  * It does this by stripping the manufacturer prefix (as delimited by a ',')
  * from the first entry in the compatible list property.
  *
- * This routine returns 0 on success, <0 on failure.
+ * Return: 0 on success, <0 on failure.
  */
 int of_modalias_node(struct device_node *node, char *modalias, int len)
 {
@@ -1190,7 +1190,7 @@ EXPORT_SYMBOL_GPL(of_modalias_node);
  * of_find_node_by_phandle - Find a node given a phandle
  * @handle:	phandle of the node to find
  *
- * Returns a node pointer with refcount incremented, use
+ * Return: A node pointer with refcount incremented, use
  * of_node_put() on it when done.
  */
 struct device_node *of_find_node_by_phandle(phandle handle)
@@ -1426,7 +1426,7 @@ static int __of_parse_phandle_with_args(const struct device_node *np,
  * @index: For properties holding a table of phandles, this is the index into
  *         the table
  *
- * Returns the device_node pointer with refcount incremented.  Use
+ * Return: The device_node pointer with refcount incremented.  Use
  * of_node_put() on it when done.
  */
 struct device_node *of_parse_phandle(const struct device_node *np,
@@ -1726,7 +1726,7 @@ EXPORT_SYMBOL(of_parse_phandle_with_fixed_args);
  * @list_name:	property name that contains a list
  * @cells_name:	property name that specifies phandles' arguments count
  *
- * Returns the number of phandle + argument tuples within a property. It
+ * Return: The number of phandle + argument tuples within a property. It
  * is a typical pattern to encode a list of phandle and variable
  * arguments into a single property. The number of arguments is encoded
  * by a property in the phandle-target node. For example, a gpios
@@ -2026,7 +2026,9 @@ void of_alias_scan(void * (*dt_alloc)(u64 size, u64 align))
  * @stem:	Alias stem of the given device_node
  *
  * The function travels the lookup table to get the alias id for the given
- * device_node and alias stem.  It returns the alias id if found.
+ * device_node and alias stem.
+ *
+ * Return: The alias id if found.
  */
 int of_alias_get_id(struct device_node *np, const char *stem)
 {
@@ -2135,8 +2137,9 @@ EXPORT_SYMBOL_GPL(of_alias_get_highest_id);
  * @index: Index to use for preferred console.
  *
  * Check if the given device node matches the stdout-path property in the
- * /chosen node. If it does then register it as the preferred console and return
- * TRUE. Otherwise return FALSE.
+ * /chosen node. If it does then register it as the preferred console.
+ *
+ * Return: TRUE if the console was successfully set up. Otherwise return FALSE.
  */
 bool of_console_check(struct device_node *dn, char *name, int index)
 {
@@ -2187,7 +2190,7 @@ struct device_node *of_find_next_cache_node(const struct device_node *np)
  *
  * @cpu: cpu number(logical index) for which the last cache level is needed
  *
- * Returns the the level at which the last cache is present. It is exactly
+ * Return: The level at which the last cache is present. It is exactly
  * same as  the total number of cache levels for the given logical cpu.
  */
 int of_find_last_cache_level(unsigned int cpu)
diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c
index 1d7a22e44d78..cd3821a6444f 100644
--- a/drivers/of/dynamic.c
+++ b/drivers/of/dynamic.c
@@ -27,7 +27,7 @@ static struct device_node *kobj_to_device_node(struct kobject *kobj)
  * @node:	Node to inc refcount, NULL is supported to simplify writing of
  *		callers
  *
- * Returns node.
+ * Return: The node with refcount incremented.
  */
 struct device_node *of_node_get(struct device_node *node)
 {
@@ -104,7 +104,8 @@ int of_reconfig_notify(unsigned long action, struct of_reconfig_data *p)
  * @arg		- argument of the of notifier
  *
  * Returns the new state of a device based on the notifier used.
- * Returns 0 on device going from enabled to disabled, 1 on device
+ *
+ * Return: 0 on device going from enabled to disabled, 1 on device
  * going from disabled to enabled and -1 on no change.
  */
 int of_reconfig_get_state_change(unsigned long action, struct of_reconfig_data *pr)
@@ -374,7 +375,8 @@ void of_node_release(struct kobject *kobj)
  * property structure and the property name & contents. The property's
  * flags have the OF_DYNAMIC bit set so that we can differentiate between
  * dynamically allocated properties and not.
- * Returns the newly allocated property or NULL on out of memory error.
+ *
+ * Return: The newly allocated property or NULL on out of memory error.
  */
 struct property *__of_prop_dup(const struct property *prop, gfp_t allocflags)
 {
@@ -417,7 +419,7 @@ struct property *__of_prop_dup(const struct property *prop, gfp_t allocflags)
  * another node.  The node data are dynamically allocated and all the node
  * flags have the OF_DYNAMIC & OF_DETACHED bits set.
  *
- * Returns the newly allocated node or NULL on out of memory error.
+ * Return: The newly allocated node or NULL on out of memory error.
  */
 struct device_node *__of_node_dup(const struct device_node *np,
 				  const char *full_name)
@@ -783,7 +785,8 @@ static int __of_changeset_apply(struct of_changeset *ocs)
  * Any side-effects of live tree state changes are applied here on
  * success, like creation/destruction of devices and side-effects
  * like creation of sysfs properties and directories.
- * Returns 0 on success, a negative error value in case of an error.
+ *
+ * Return: 0 on success, a negative error value in case of an error.
  * On error the partially applied effects are reverted.
  */
 int of_changeset_apply(struct of_changeset *ocs)
@@ -877,7 +880,8 @@ static int __of_changeset_revert(struct of_changeset *ocs)
  * was before the application.
  * Any side-effects like creation/destruction of devices and
  * removal of sysfs properties and directories are applied.
- * Returns 0 on success, a negative error value in case of an error.
+ *
+ * Return: 0 on success, a negative error value in case of an error.
  */
 int of_changeset_revert(struct of_changeset *ocs)
 {
@@ -905,7 +909,8 @@ EXPORT_SYMBOL_GPL(of_changeset_revert);
  * + OF_RECONFIG_ADD_PROPERTY
  * + OF_RECONFIG_REMOVE_PROPERTY,
  * + OF_RECONFIG_UPDATE_PROPERTY
- * Returns 0 on success, a negative error value in case of an error.
+ *
+ * Return: 0 on success, a negative error value in case of an error.
  */
 int of_changeset_action(struct of_changeset *ocs, unsigned long action,
 		struct device_node *np, struct property *prop)
diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c
index ba53da9c3895..134c7fb43a14 100644
--- a/drivers/of/fdt.c
+++ b/drivers/of/fdt.c
@@ -282,7 +282,7 @@ static void reverse_nodes(struct device_node *parent)
  * @dad: Parent struct device_node
  * @nodepp: The device_node tree created by the call
  *
- * It returns the size of unflattened device tree or error code
+ * Return: The size of unflattened device tree or error code
  */
 static int unflatten_dt_nodes(const void *blob,
 			      void *mem,
@@ -360,7 +360,7 @@ static int unflatten_dt_nodes(const void *blob,
  * fills the "name" and "type" pointers of the nodes so the normal device-tree
  * walking functions can be used.
  *
- * Returns NULL on failure or the memory chunk containing the unflattened
+ * Return: NULL on failure or the memory chunk containing the unflattened
  * device tree on success.
  */
 void *__unflatten_device_tree(const void *blob,
@@ -441,7 +441,7 @@ static DEFINE_MUTEX(of_fdt_unflatten_mutex);
  * pointers of the nodes so the normal device-tree walking functions
  * can be used.
  *
- * Returns NULL on failure or the memory chunk containing the unflattened
+ * Return: NULL on failure or the memory chunk containing the unflattened
  * device tree on success.
  */
 void *of_fdt_unflatten_tree(const unsigned long *blob,
@@ -716,7 +716,7 @@ const void *__init of_get_flat_dt_prop(unsigned long node, const char *name,
  * @node: node to test
  * @compat: compatible string to compare with compatible list.
  *
- * On match, returns a non-zero value with smaller values returned for more
+ * Return: a non-zero value on match with smaller values returned for more
  * specific compatible values.
  */
 static int of_fdt_is_compatible(const void *blob,
diff --git a/drivers/of/irq.c b/drivers/of/irq.c
index 25d17b8a1a1a..352e14b007e7 100644
--- a/drivers/of/irq.c
+++ b/drivers/of/irq.c
@@ -48,7 +48,7 @@ EXPORT_SYMBOL_GPL(irq_of_parse_and_map);
  * of_irq_find_parent - Given a device node, find its interrupt parent node
  * @child: pointer to device node
  *
- * Returns a pointer to the interrupt parent node, or NULL if the interrupt
+ * Return: A pointer to the interrupt parent node, or NULL if the interrupt
  * parent could not be determined.
  */
 struct device_node *of_irq_find_parent(struct device_node *child)
@@ -81,14 +81,14 @@ EXPORT_SYMBOL_GPL(of_irq_find_parent);
  * @addr:	address specifier (start of "reg" property of the device) in be32 format
  * @out_irq:	structure of_phandle_args updated by this function
  *
- * Returns 0 on success and a negative number on error
- *
  * This function is a low-level interrupt tree walking function. It
  * can be used to do a partial walk with synthetized reg and interrupts
  * properties, for example when resolving PCI interrupts when no device
  * node exist for the parent. It takes an interrupt specifier structure as
  * input, walks the tree looking for any interrupt-map properties, translates
  * the specifier for each map, and then returns the translated map.
+ *
+ * Return: 0 on success and a negative number on error
  */
 int of_irq_parse_raw(const __be32 *addr, struct of_phandle_args *out_irq)
 {
@@ -380,7 +380,7 @@ EXPORT_SYMBOL_GPL(of_irq_to_resource);
  * @dev: pointer to device tree node
  * @index: zero-based index of the IRQ
  *
- * Returns Linux IRQ number on success, or 0 on the IRQ mapping failure, or
+ * Return: Linux IRQ number on success, or 0 on the IRQ mapping failure, or
  * -EPROBE_DEFER if the IRQ domain is not yet created, or error code in case
  * of any other failure.
  */
@@ -407,7 +407,7 @@ EXPORT_SYMBOL_GPL(of_irq_get);
  * @dev: pointer to device tree node
  * @name: IRQ name
  *
- * Returns Linux IRQ number on success, or 0 on the IRQ mapping failure, or
+ * Return: Linux IRQ number on success, or 0 on the IRQ mapping failure, or
  * -EPROBE_DEFER if the IRQ domain is not yet created, or error code in case
  * of any other failure.
  */
@@ -447,7 +447,7 @@ int of_irq_count(struct device_node *dev)
  * @res: array of resources to fill in
  * @nr_irqs: the number of IRQs (and upper bound for num of @res elements)
  *
- * Returns the size of the filled in table (up to @nr_irqs).
+ * Return: The size of the filled in table (up to @nr_irqs).
  */
 int of_irq_to_resource_table(struct device_node *dev, struct resource *res,
 		int nr_irqs)
@@ -602,7 +602,7 @@ static u32 __of_msi_map_id(struct device *dev, struct device_node **np,
  * Walk up the device hierarchy looking for devices with a "msi-map"
  * property.  If found, apply the mapping to @id_in.
  *
- * Returns the mapped MSI ID.
+ * Return: The mapped MSI ID.
  */
 u32 of_msi_map_id(struct device *dev, struct device_node *msi_np, u32 id_in)
 {
diff --git a/drivers/of/overlay.c b/drivers/of/overlay.c
index c25679f7bd3f..d241273170fd 100644
--- a/drivers/of/overlay.c
+++ b/drivers/of/overlay.c
@@ -298,7 +298,7 @@ err_free_target_path:
  *
  * Update of property in symbols node is not allowed.
  *
- * Returns 0 on success, -ENOMEM if memory allocation failure, or -EINVAL if
+ * Return: 0 on success, -ENOMEM if memory allocation failure, or -EINVAL if
  * invalid @overlay.
  */
 static int add_changeset_property(struct overlay_changeset *ovcs,
@@ -403,7 +403,7 @@ static int add_changeset_property(struct overlay_changeset *ovcs,
  *
  * NOTE_2: Multiple mods of created nodes not supported.
  *
- * Returns 0 on success, -ENOMEM if memory allocation failure, or -EINVAL if
+ * Return: 0 on success, -ENOMEM if memory allocation failure, or -EINVAL if
  * invalid @overlay.
  */
 static int add_changeset_node(struct overlay_changeset *ovcs,
@@ -475,7 +475,7 @@ static int add_changeset_node(struct overlay_changeset *ovcs,
  *
  * Do not allow symbols node to have any children.
  *
- * Returns 0 on success, -ENOMEM if memory allocation failure, or -EINVAL if
+ * Return: 0 on success, -ENOMEM if memory allocation failure, or -EINVAL if
  * invalid @overlay_node.
  */
 static int build_changeset_next_level(struct overlay_changeset *ovcs,
@@ -606,7 +606,7 @@ static int find_dup_cset_prop(struct overlay_changeset *ovcs,
  * the same node or duplicate {add, delete, or update} properties entries
  * for the same property.
  *
- * Returns 0 on success, or -EINVAL if duplicate changeset entry found.
+ * Return: 0 on success, or -EINVAL if duplicate changeset entry found.
  */
 static int changeset_dup_entry_check(struct overlay_changeset *ovcs)
 {
@@ -630,7 +630,7 @@ static int changeset_dup_entry_check(struct overlay_changeset *ovcs)
  * any portions of the changeset that were successfully created will remain
  * in @ovcs->cset.
  *
- * Returns 0 on success, -ENOMEM if memory allocation failure, or -EINVAL if
+ * Return: 0 on success, -ENOMEM if memory allocation failure, or -EINVAL if
  * invalid overlay in @ovcs->fragments[].
  */
 static int build_changeset(struct overlay_changeset *ovcs)
@@ -726,7 +726,7 @@ static struct device_node *find_target(struct device_node *info_node)
  * the top level of @tree.  The relevant top level nodes are the fragment
  * nodes and the __symbols__ node.  Any other top level node will be ignored.
  *
- * Returns 0 on success, -ENOMEM if memory allocation failure, -EINVAL if error
+ * Return: 0 on success, -ENOMEM if memory allocation failure, -EINVAL if error
  * detected in @tree, or -ENOSPC if idr_alloc() error.
  */
 static int init_overlay_changeset(struct overlay_changeset *ovcs,
@@ -1181,7 +1181,7 @@ static int overlay_removal_is_ok(struct overlay_changeset *remove_ovcs)
  * If an error is returned by an overlay changeset post-remove notifier
  * then no further overlay changeset post-remove notifier will be called.
  *
- * Returns 0 on success, or a negative error number.  *ovcs_id is set to
+ * Return: 0 on success, or a negative error number.  *ovcs_id is set to
  * zero after reverting the changeset, even if a subsequent error occurs.
  */
 int of_overlay_remove(int *ovcs_id)
@@ -1259,7 +1259,7 @@ EXPORT_SYMBOL_GPL(of_overlay_remove);
  *
  * Removes all overlays from the system in the correct order.
  *
- * Returns 0 on success, or a negative error number
+ * Return: 0 on success, or a negative error number
  */
 int of_overlay_remove_all(void)
 {
diff --git a/drivers/of/platform.c b/drivers/of/platform.c
index 0ed46d301431..25d448f5af91 100644
--- a/drivers/of/platform.c
+++ b/drivers/of/platform.c
@@ -44,7 +44,7 @@ static const struct of_device_id of_skipped_node_table[] = {
  * Takes a reference to the embedded struct device which needs to be dropped
  * after use.
  *
- * Returns platform_device pointer, or NULL if not found
+ * Return: platform_device pointer, or NULL if not found
  */
 struct platform_device *of_find_device_by_node(struct device_node *np)
 {
@@ -160,7 +160,7 @@ EXPORT_SYMBOL(of_device_alloc);
  * @platform_data: pointer to populate platform_data pointer with
  * @parent: Linux device model parent device.
  *
- * Returns pointer to created platform device, or NULL if a device was not
+ * Return: Pointer to created platform device, or NULL if a device was not
  * registered.  Unavailable devices will not get registered.
  */
 static struct platform_device *of_platform_device_create_pdata(
@@ -204,7 +204,7 @@ err_clear_flag:
  * @bus_id: name to assign device
  * @parent: Linux device model parent device.
  *
- * Returns pointer to created platform device, or NULL if a device was not
+ * Return: Pointer to created platform device, or NULL if a device was not
  * registered.  Unavailable devices will not get registered.
  */
 struct platform_device *of_platform_device_create(struct device_node *np,
@@ -463,7 +463,7 @@ EXPORT_SYMBOL(of_platform_bus_probe);
  * New board support should be using this function instead of
  * of_platform_bus_probe().
  *
- * Returns 0 on success, < 0 on failure.
+ * Return: 0 on success, < 0 on failure.
  */
 int of_platform_populate(struct device_node *root,
 			const struct of_device_id *matches,
@@ -607,7 +607,7 @@ static void devm_of_platform_populate_release(struct device *dev, void *res)
  * Similar to of_platform_populate(), but will automatically call
  * of_platform_depopulate() when the device is unbound from the bus.
  *
- * Returns 0 on success, < 0 on failure.
+ * Return: 0 on success, < 0 on failure.
  */
 int devm_of_platform_populate(struct device *dev)
 {
diff --git a/drivers/of/property.c b/drivers/of/property.c
index c000ed01db01..2046ae311322 100644
--- a/drivers/of/property.c
+++ b/drivers/of/property.c
@@ -61,9 +61,11 @@ EXPORT_SYMBOL(of_graph_is_present);
  * @elem_size:	size of the individual element
  *
  * Search for a property in a device node and count the number of elements of
- * size elem_size in it. Returns number of elements on sucess, -EINVAL if the
- * property does not exist or its length does not match a multiple of elem_size
- * and -ENODATA if the property does not have a value.
+ * size elem_size in it.
+ *
+ * Return: The number of elements on success, -EINVAL if the property does not
+ * exist or its length does not match a multiple of elem_size and -ENODATA if
+ * the property does not have a value.
  */
 int of_property_count_elems_of_size(const struct device_node *np,
 				const char *propname, int elem_size)
@@ -95,8 +97,9 @@ EXPORT_SYMBOL_GPL(of_property_count_elems_of_size);
  * @len:	if !=NULL, actual length is written to here
  *
  * Search for a property in a device node and valid the requested size.
- * Returns the property value on success, -EINVAL if the property does not
- *  exist, -ENODATA if property does not have a value, and -EOVERFLOW if the
+ *
+ * Return: The property value on success, -EINVAL if the property does not
+ * exist, -ENODATA if property does not have a value, and -EOVERFLOW if the
  * property data is too small or too large.
  *
  */
@@ -129,7 +132,9 @@ static void *of_find_property_value_of_size(const struct device_node *np,
  * @out_value:	pointer to return value, modified only if no error.
  *
  * Search for a property in a device node and read nth 32-bit value from
- * it. Returns 0 on success, -EINVAL if the property does not exist,
+ * it.
+ *
+ * Return: 0 on success, -EINVAL if the property does not exist,
  * -ENODATA if property does not have a value, and -EOVERFLOW if the
  * property data isn't large enough.
  *
@@ -161,7 +166,9 @@ EXPORT_SYMBOL_GPL(of_property_read_u32_index);
  * @out_value:	pointer to return value, modified only if no error.
  *
  * Search for a property in a device node and read nth 64-bit value from
- * it. Returns 0 on success, -EINVAL if the property does not exist,
+ * it.
+ *
+ * Return: 0 on success, -EINVAL if the property does not exist,
  * -ENODATA if property does not have a value, and -EOVERFLOW if the
  * property data isn't large enough.
  *
@@ -196,12 +203,14 @@ EXPORT_SYMBOL_GPL(of_property_read_u64_index);
  *		sz_min will be read.
  *
  * Search for a property in a device node and read 8-bit value(s) from
- * it. Returns number of elements read on success, -EINVAL if the property
- * does not exist, -ENODATA if property does not have a value, and -EOVERFLOW
- * if the property data is smaller than sz_min or longer than sz_max.
+ * it.
  *
  * dts entry of array should be like:
- *	property = /bits/ 8 <0x50 0x60 0x70>;
+ *  ``property = /bits/ 8 <0x50 0x60 0x70>;``
+ *
+ * Return: The number of elements read on success, -EINVAL if the property
+ * does not exist, -ENODATA if property does not have a value, and -EOVERFLOW
+ * if the property data is smaller than sz_min or longer than sz_max.
  *
  * The out_values is modified only if a valid u8 value can be decoded.
  */
@@ -244,12 +253,14 @@ EXPORT_SYMBOL_GPL(of_property_read_variable_u8_array);
  *		sz_min will be read.
  *
  * Search for a property in a device node and read 16-bit value(s) from
- * it. Returns number of elements read on success, -EINVAL if the property
- * does not exist, -ENODATA if property does not have a value, and -EOVERFLOW
- * if the property data is smaller than sz_min or longer than sz_max.
+ * it.
  *
  * dts entry of array should be like:
- *	property = /bits/ 16 <0x5000 0x6000 0x7000>;
+ *  ``property = /bits/ 16 <0x5000 0x6000 0x7000>;``
+ *
+ * Return: The number of elements read on success, -EINVAL if the property
+ * does not exist, -ENODATA if property does not have a value, and -EOVERFLOW
+ * if the property data is smaller than sz_min or longer than sz_max.
  *
  * The out_values is modified only if a valid u16 value can be decoded.
  */
@@ -292,7 +303,9 @@ EXPORT_SYMBOL_GPL(of_property_read_variable_u16_array);
  *		sz_min will be read.
  *
  * Search for a property in a device node and read 32-bit value(s) from
- * it. Returns number of elements read on success, -EINVAL if the property
+ * it.
+ *
+ * Return: The number of elements read on success, -EINVAL if the property
  * does not exist, -ENODATA if property does not have a value, and -EOVERFLOW
  * if the property data is smaller than sz_min or longer than sz_max.
  *
@@ -331,7 +344,9 @@ EXPORT_SYMBOL_GPL(of_property_read_variable_u32_array);
  * @out_value:	pointer to return value, modified only if return value is 0.
  *
  * Search for a property in a device node and read a 64-bit value from
- * it. Returns 0 on success, -EINVAL if the property does not exist,
+ * it.
+ *
+ * Return: 0 on success, -EINVAL if the property does not exist,
  * -ENODATA if property does not have a value, and -EOVERFLOW if the
  * property data isn't large enough.
  *
@@ -366,7 +381,9 @@ EXPORT_SYMBOL_GPL(of_property_read_u64);
  *		sz_min will be read.
  *
  * Search for a property in a device node and read 64-bit value(s) from
- * it. Returns number of elements read on success, -EINVAL if the property
+ * it.
+ *
+ * Return: The number of elements read on success, -EINVAL if the property
  * does not exist, -ENODATA if property does not have a value, and -EOVERFLOW
  * if the property data is smaller than sz_min or longer than sz_max.
  *
@@ -408,10 +425,11 @@ EXPORT_SYMBOL_GPL(of_property_read_variable_u64_array);
  *		return value is 0.
  *
  * Search for a property in a device tree node and retrieve a null
- * terminated string value (pointer to data, not a copy). Returns 0 on
- * success, -EINVAL if the property does not exist, -ENODATA if property
- * does not have a value, and -EILSEQ if the string is not null-terminated
- * within the length of the property data.
+ * terminated string value (pointer to data, not a copy).
+ *
+ * Return: 0 on success, -EINVAL if the property does not exist, -ENODATA if
+ * property does not have a value, and -EILSEQ if the string is not
+ * null-terminated within the length of the property data.
  *
  * The out_string pointer is modified only if a valid string can be decoded.
  */
@@ -775,7 +793,7 @@ EXPORT_SYMBOL(of_graph_get_remote_port_parent);
  * @node: pointer to a local endpoint device_node
  *
  * Return: Remote port node associated with remote endpoint node linked
- *	   to @node. Use of_node_put() on it when done.
+ * to @node. Use of_node_put() on it when done.
  */
 struct device_node *of_graph_get_remote_port(const struct device_node *node)
 {
@@ -808,7 +826,7 @@ EXPORT_SYMBOL(of_graph_get_endpoint_count);
  * @endpoint: identifier (value of reg property) of the endpoint node
  *
  * Return: Remote device node associated with remote endpoint node linked
- *	   to @node. Use of_node_put() on it when done.
+ * to @node. Use of_node_put() on it when done.
  */
 struct device_node *of_graph_get_remote_node(const struct device_node *node,
 					     u32 port, u32 endpoint)
diff --git a/include/linux/of.h b/include/linux/of.h
index e9209ef44cc0..ef6b161d1f91 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -424,12 +424,14 @@ extern int of_detach_node(struct device_node *);
  * @sz:		number of array elements to read
  *
  * Search for a property in a device node and read 8-bit value(s) from
- * it. Returns 0 on success, -EINVAL if the property does not exist,
- * -ENODATA if property does not have a value, and -EOVERFLOW if the
- * property data isn't large enough.
+ * it.
  *
  * dts entry of array should be like:
- *	property = /bits/ 8 <0x50 0x60 0x70>;
+ *  ``property = /bits/ 8 <0x50 0x60 0x70>;``
+ *
+ * Return: 0 on success, -EINVAL if the property does not exist,
+ * -ENODATA if property does not have a value, and -EOVERFLOW if the
+ * property data isn't large enough.
  *
  * The out_values is modified only if a valid u8 value can be decoded.
  */
@@ -454,12 +456,14 @@ static inline int of_property_read_u8_array(const struct device_node *np,
  * @sz:		number of array elements to read
  *
  * Search for a property in a device node and read 16-bit value(s) from
- * it. Returns 0 on success, -EINVAL if the property does not exist,
- * -ENODATA if property does not have a value, and -EOVERFLOW if the
- * property data isn't large enough.
+ * it.
  *
  * dts entry of array should be like:
- *	property = /bits/ 16 <0x5000 0x6000 0x7000>;
+ *  ``property = /bits/ 16 <0x5000 0x6000 0x7000>;``
+ *
+ * Return: 0 on success, -EINVAL if the property does not exist,
+ * -ENODATA if property does not have a value, and -EOVERFLOW if the
+ * property data isn't large enough.
  *
  * The out_values is modified only if a valid u16 value can be decoded.
  */
@@ -485,7 +489,9 @@ static inline int of_property_read_u16_array(const struct device_node *np,
  * @sz:		number of array elements to read
  *
  * Search for a property in a device node and read 32-bit value(s) from
- * it. Returns 0 on success, -EINVAL if the property does not exist,
+ * it.
+ *
+ * Return: 0 on success, -EINVAL if the property does not exist,
  * -ENODATA if property does not have a value, and -EOVERFLOW if the
  * property data isn't large enough.
  *
@@ -513,7 +519,9 @@ static inline int of_property_read_u32_array(const struct device_node *np,
  * @sz:		number of array elements to read
  *
  * Search for a property in a device node and read 64-bit value(s) from
- * it. Returns 0 on success, -EINVAL if the property does not exist,
+ * it.
+ *
+ * Return: 0 on success, -EINVAL if the property does not exist,
  * -ENODATA if property does not have a value, and -EOVERFLOW if the
  * property data isn't large enough.
  *
@@ -1070,7 +1078,9 @@ static inline bool of_node_is_type(const struct device_node *np, const char *typ
  * @propname:	name of the property to be searched.
  *
  * Search for a property in a device node and count the number of u8 elements
- * in it. Returns number of elements on sucess, -EINVAL if the property does
+ * in it.
+ *
+ * Return: The number of elements on success, -EINVAL if the property does
  * not exist or its length does not match a multiple of u8 and -ENODATA if the
  * property does not have a value.
  */
@@ -1087,7 +1097,9 @@ static inline int of_property_count_u8_elems(const struct device_node *np,
  * @propname:	name of the property to be searched.
  *
  * Search for a property in a device node and count the number of u16 elements
- * in it. Returns number of elements on sucess, -EINVAL if the property does
+ * in it.
+ *
+ * Return: The number of elements on success, -EINVAL if the property does
  * not exist or its length does not match a multiple of u16 and -ENODATA if the
  * property does not have a value.
  */
@@ -1104,7 +1116,9 @@ static inline int of_property_count_u16_elems(const struct device_node *np,
  * @propname:	name of the property to be searched.
  *
  * Search for a property in a device node and count the number of u32 elements
- * in it. Returns number of elements on sucess, -EINVAL if the property does
+ * in it.
+ *
+ * Return: The number of elements on success, -EINVAL if the property does
  * not exist or its length does not match a multiple of u32 and -ENODATA if the
  * property does not have a value.
  */
@@ -1121,7 +1135,9 @@ static inline int of_property_count_u32_elems(const struct device_node *np,
  * @propname:	name of the property to be searched.
  *
  * Search for a property in a device node and count the number of u64 elements
- * in it. Returns number of elements on sucess, -EINVAL if the property does
+ * in it.
+ *
+ * Return: The number of elements on success, -EINVAL if the property does
  * not exist or its length does not match a multiple of u64 and -ENODATA if the
  * property does not have a value.
  */
@@ -1142,7 +1158,7 @@ static inline int of_property_count_u64_elems(const struct device_node *np,
  * Search for a property in a device tree node and retrieve a list of
  * terminated string values (pointer to data, not a copy) in that property.
  *
- * If @out_strs is NULL, the number of strings in the property is returned.
+ * Return: If @out_strs is NULL, the number of strings in the property is returned.
  */
 static inline int of_property_read_string_array(const struct device_node *np,
 						const char *propname, const char **out_strs,
@@ -1158,10 +1174,11 @@ static inline int of_property_read_string_array(const struct device_node *np,
  * @propname:	name of the property to be searched.
  *
  * Search for a property in a device tree node and retrieve the number of null
- * terminated string contain in it. Returns the number of strings on
- * success, -EINVAL if the property does not exist, -ENODATA if property
- * does not have a value, and -EILSEQ if the string is not null-terminated
- * within the length of the property data.
+ * terminated string contain in it.
+ *
+ * Return: The number of strings on success, -EINVAL if the property does not
+ * exist, -ENODATA if property does not have a value, and -EILSEQ if the string
+ * is not null-terminated within the length of the property data.
  */
 static inline int of_property_count_strings(const struct device_node *np,
 					    const char *propname)
@@ -1181,7 +1198,8 @@ static inline int of_property_count_strings(const struct device_node *np,
  * Search for a property in a device tree node and retrieve a null
  * terminated string value (pointer to data, not a copy) in the list of strings
  * contained in that property.
- * Returns 0 on success, -EINVAL if the property does not exist, -ENODATA if
+ *
+ * Return: 0 on success, -EINVAL if the property does not exist, -ENODATA if
  * property does not have a value, and -EILSEQ if the string is not
  * null-terminated within the length of the property data.
  *
@@ -1201,7 +1219,8 @@ static inline int of_property_read_string_index(const struct device_node *np,
  * @propname:	name of the property to be searched.
  *
  * Search for a property in a device node.
- * Returns true if the property exists false otherwise.
+ *
+ * Return: true if the property exists, false otherwise.
  */
 static inline bool of_property_read_bool(const struct device_node *np,
 					 const char *propname)
@@ -1447,7 +1466,7 @@ static inline int of_reconfig_get_state_change(unsigned long action,
  * of_device_is_system_power_controller - Tells if system-power-controller is found for device_node
  * @np: Pointer to the given device_node
  *
- * return true if present false otherwise
+ * Return: true if present, false otherwise
  */
 static inline bool of_device_is_system_power_controller(const struct device_node *np)
 {
-- 
cgit v1.2.3


From c21161e40ee94486f7db701f9b4d3f9c25763e8c Mon Sep 17 00:00:00 2001
From: Ray Chi <raychi@google.com>
Date: Sun, 28 Mar 2021 02:28:09 +0800
Subject: power: supply: Fix build error when CONFIG_POWER_SUPPLY is not
 enabled.

The build error happens when CONFIG_POWER_SUPPLY is not enabled.

h8300-linux-ld: drivers/usb/dwc3/gadget.o: in function `.L59':
>> gadget.c:(.text+0x655): undefined reference to `power_supply_set_property'

Fixes: 99288de36020 ("usb: dwc3: add an alternate path in vbus_draw callback")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Ray Chi <raychi@google.com>
Link: https://lore.kernel.org/r/20210327182809.1814480-3-raychi@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/power_supply.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index 81a55e974feb..b495b4374cd0 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -426,9 +426,16 @@ static inline int power_supply_is_system_supplied(void) { return -ENOSYS; }
 extern int power_supply_get_property(struct power_supply *psy,
 			    enum power_supply_property psp,
 			    union power_supply_propval *val);
+#if IS_ENABLED(CONFIG_POWER_SUPPLY)
 extern int power_supply_set_property(struct power_supply *psy,
 			    enum power_supply_property psp,
 			    const union power_supply_propval *val);
+#else
+static inline int power_supply_set_property(struct power_supply *psy,
+			    enum power_supply_property psp,
+			    const union power_supply_propval *val)
+{ return 0; }
+#endif
 extern int power_supply_property_is_writeable(struct power_supply *psy,
 					enum power_supply_property psp);
 extern void power_supply_external_power_changed(struct power_supply *psy);
-- 
cgit v1.2.3


From 7a534c5e4159f9bbac9f3c146dc78e163d8858c2 Mon Sep 17 00:00:00 2001
From: Zhang Yunkai <zhang.yunkai@zte.com.cn>
Date: Sat, 13 Mar 2021 02:57:02 -0800
Subject: mtd: rawnand: remove duplicate include in rawnand.h

'linux/mtd/nand.h' is included twice in 'rawnand.h'.
It is already included on line 17.

Signed-off-by: Zhang Yunkai <zhang.yunkai@zte.com.cn>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20210313105702.365878-1-zhang.yunkai@zte.com.cn
---
 include/linux/mtd/rawnand.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index 6b3240e44310..93e8f72beba6 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -18,7 +18,6 @@
 #include <linux/mtd/flashchip.h>
 #include <linux/mtd/bbm.h>
 #include <linux/mtd/jedec.h>
-#include <linux/mtd/nand.h>
 #include <linux/mtd/onfi.h>
 #include <linux/mutex.h>
 #include <linux/of.h>
-- 
cgit v1.2.3


From e3c1f1c92d6ede3cfa09d6a103d3d1c1ef645e35 Mon Sep 17 00:00:00 2001
From: Michael Walle <michael@walle.cc>
Date: Wed, 3 Mar 2021 21:18:19 +0100
Subject: mtd: add OTP (one-time-programmable) erase ioctl

This may sound like a contradiction but some SPI-NOR flashes really
support erasing their OTP region until it is finally locked. Having the
possibility to erase an OTP region might come in handy during
development.

The ioctl argument follows the OTPLOCK style.

Signed-off-by: Michael Walle <michael@walle.cc>
Acked-by: Vignesh Raghavendra <vigneshr@ti.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20210303201819.2752-1-michael@walle.cc
---
 drivers/mtd/mtdchar.c      |  7 ++++++-
 drivers/mtd/mtdcore.c      | 12 ++++++++++++
 include/linux/mtd/mtd.h    |  3 +++
 include/uapi/mtd/mtd-abi.h |  2 ++
 4 files changed, 23 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c
index 870f7a19ad9d..155e991d9d75 100644
--- a/drivers/mtd/mtdchar.c
+++ b/drivers/mtd/mtdchar.c
@@ -666,6 +666,7 @@ static int mtdchar_ioctl(struct file *file, u_int cmd, u_long arg)
 	case MEMWRITEOOB64:
 	case MEMWRITE:
 	case OTPLOCK:
+	case OTPERASE:
 		if (!(file->f_mode & FMODE_WRITE))
 			return -EPERM;
 		break;
@@ -930,6 +931,7 @@ static int mtdchar_ioctl(struct file *file, u_int cmd, u_long arg)
 	}
 
 	case OTPLOCK:
+	case OTPERASE:
 	{
 		struct otp_info oinfo;
 
@@ -937,7 +939,10 @@ static int mtdchar_ioctl(struct file *file, u_int cmd, u_long arg)
 			return -EINVAL;
 		if (copy_from_user(&oinfo, argp, sizeof(oinfo)))
 			return -EFAULT;
-		ret = mtd_lock_user_prot_reg(mtd, oinfo.start, oinfo.length);
+		if (cmd == OTPLOCK)
+			ret = mtd_lock_user_prot_reg(mtd, oinfo.start, oinfo.length);
+		else
+			ret = mtd_erase_user_prot_reg(mtd, oinfo.start, oinfo.length);
 		break;
 	}
 
diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c
index 38782ceea1f6..aea58366a94e 100644
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -1919,6 +1919,18 @@ int mtd_lock_user_prot_reg(struct mtd_info *mtd, loff_t from, size_t len)
 }
 EXPORT_SYMBOL_GPL(mtd_lock_user_prot_reg);
 
+int mtd_erase_user_prot_reg(struct mtd_info *mtd, loff_t from, size_t len)
+{
+	struct mtd_info *master = mtd_get_master(mtd);
+
+	if (!master->_erase_user_prot_reg)
+		return -EOPNOTSUPP;
+	if (!len)
+		return 0;
+	return master->_erase_user_prot_reg(master, from, len);
+}
+EXPORT_SYMBOL_GPL(mtd_erase_user_prot_reg);
+
 /* Chip-supported device locking */
 int mtd_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len)
 {
diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index ceabc2cae8a4..4aac200ca8b5 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -337,6 +337,8 @@ struct mtd_info {
 				     size_t len, size_t *retlen, u_char *buf);
 	int (*_lock_user_prot_reg) (struct mtd_info *mtd, loff_t from,
 				    size_t len);
+	int (*_erase_user_prot_reg) (struct mtd_info *mtd, loff_t from,
+				     size_t len);
 	int (*_writev) (struct mtd_info *mtd, const struct kvec *vecs,
 			unsigned long count, loff_t to, size_t *retlen);
 	void (*_sync) (struct mtd_info *mtd);
@@ -518,6 +520,7 @@ int mtd_read_user_prot_reg(struct mtd_info *mtd, loff_t from, size_t len,
 int mtd_write_user_prot_reg(struct mtd_info *mtd, loff_t to, size_t len,
 			    size_t *retlen, u_char *buf);
 int mtd_lock_user_prot_reg(struct mtd_info *mtd, loff_t from, size_t len);
+int mtd_erase_user_prot_reg(struct mtd_info *mtd, loff_t from, size_t len);
 
 int mtd_writev(struct mtd_info *mtd, const struct kvec *vecs,
 	       unsigned long count, loff_t to, size_t *retlen);
diff --git a/include/uapi/mtd/mtd-abi.h b/include/uapi/mtd/mtd-abi.h
index 65b9db936557..b869990c2db2 100644
--- a/include/uapi/mtd/mtd-abi.h
+++ b/include/uapi/mtd/mtd-abi.h
@@ -205,6 +205,8 @@ struct otp_info {
  * without OOB, e.g., NOR flash.
  */
 #define MEMWRITE		_IOWR('M', 24, struct mtd_write_req)
+/* Erase a given range of user data (must be in mode %MTD_FILE_MODE_OTP_USER) */
+#define OTPERASE		_IOW('M', 25, struct otp_info)
 
 /*
  * Obsolete legacy interface. Keep it in order not to break userspace
-- 
cgit v1.2.3


From 28f0be44b263ca4b59ea63c801db3830e65fbe99 Mon Sep 17 00:00:00 2001
From: Wan Jiabing <wanjiabing@vivo.com>
Date: Tue, 23 Mar 2021 11:17:37 +0800
Subject: include: linux: mtd: Remove duplicate include of nand.h

linux/mtd/nand.h has been included at line 17.
So we remove the duplicate one at line 21.

Signed-off-by: Wan Jiabing <wanjiabing@vivo.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20210323031737.259365-1-wanjiabing@vivo.com
---
 include/linux/mtd/rawnand.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index 6b3240e44310..93e8f72beba6 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -18,7 +18,6 @@
 #include <linux/mtd/flashchip.h>
 #include <linux/mtd/bbm.h>
 #include <linux/mtd/jedec.h>
-#include <linux/mtd/nand.h>
 #include <linux/mtd/onfi.h>
 #include <linux/mutex.h>
 #include <linux/of.h>
-- 
cgit v1.2.3


From 48dc16e2e52ca386d97435f257ef8a754becb0b8 Mon Sep 17 00:00:00 2001
From: Cristian Marussi <cristian.marussi@arm.com>
Date: Tue, 16 Mar 2021 12:48:26 +0000
Subject: firmware: arm_scmi: Extend protocol registration interfaces

Extend common protocol registration routines and provide some new generic
protocols get/put helpers that can track protocols usage and automatically
perform the proper initialization and de-initialization on demand when
required.

Convert all standard protocols to use this new registration scheme while
keeping them all still using the usual initialization logic bound to SCMI
devices probing.

Link: https://lore.kernel.org/r/20210316124903.35011-2-cristian.marussi@arm.com
Tested-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_scmi/base.c    |  10 +-
 drivers/firmware/arm_scmi/bus.c     |  59 +++++++++---
 drivers/firmware/arm_scmi/clock.c   |  10 +-
 drivers/firmware/arm_scmi/common.h  |  46 +++++++--
 drivers/firmware/arm_scmi/driver.c  | 182 +++++++++++++++++++++++++++++++++++-
 drivers/firmware/arm_scmi/perf.c    |  10 +-
 drivers/firmware/arm_scmi/power.c   |  10 +-
 drivers/firmware/arm_scmi/reset.c   |  10 +-
 drivers/firmware/arm_scmi/sensors.c |  10 +-
 drivers/firmware/arm_scmi/system.c  |  10 +-
 drivers/firmware/arm_scmi/voltage.c |  10 +-
 include/linux/scmi_protocol.h       |   8 +-
 12 files changed, 332 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/arm_scmi/base.c b/drivers/firmware/arm_scmi/base.c
index 017e5d8bd869..b0e16bea614a 100644
--- a/drivers/firmware/arm_scmi/base.c
+++ b/drivers/firmware/arm_scmi/base.c
@@ -2,7 +2,7 @@
 /*
  * System Control and Management Interface (SCMI) Base Protocol
  *
- * Copyright (C) 2018 ARM Ltd.
+ * Copyright (C) 2018-2021 ARM Ltd.
  */
 
 #define pr_fmt(fmt) "SCMI Notifications BASE - " fmt
@@ -365,3 +365,11 @@ int scmi_base_protocol_init(struct scmi_handle *h)
 
 	return 0;
 }
+
+static const struct scmi_protocol scmi_base = {
+	.id = SCMI_PROTOCOL_BASE,
+	.init = &scmi_base_protocol_init,
+	.ops = NULL,
+};
+
+DEFINE_SCMI_PROTOCOL_REGISTER_UNREGISTER(base, scmi_base)
diff --git a/drivers/firmware/arm_scmi/bus.c b/drivers/firmware/arm_scmi/bus.c
index 1377ec76a45d..4e8f6ea355dd 100644
--- a/drivers/firmware/arm_scmi/bus.c
+++ b/drivers/firmware/arm_scmi/bus.c
@@ -2,7 +2,7 @@
 /*
  * System Control and Management Interface (SCMI) Message Protocol bus layer
  *
- * Copyright (C) 2018 ARM Ltd.
+ * Copyright (C) 2018-2021 ARM Ltd.
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -51,13 +51,29 @@ static int scmi_dev_match(struct device *dev, struct device_driver *drv)
 	return 0;
 }
 
+const struct scmi_protocol *scmi_protocol_get(int protocol_id)
+{
+	const struct scmi_protocol *proto;
+
+	proto = idr_find(&scmi_protocols, protocol_id);
+	if (!proto) {
+		pr_warn("SCMI Protocol 0x%x not found!\n", protocol_id);
+		return NULL;
+	}
+
+	pr_debug("Found SCMI Protocol 0x%x\n", protocol_id);
+
+	return proto;
+}
+
 static int scmi_protocol_init(int protocol_id, struct scmi_handle *handle)
 {
-	scmi_prot_init_fn_t fn = idr_find(&scmi_protocols, protocol_id);
+	const struct scmi_protocol *proto;
 
-	if (unlikely(!fn))
+	proto = scmi_protocol_get(protocol_id);
+	if (!proto)
 		return -EINVAL;
-	return fn(handle);
+	return proto->init(handle);
 }
 
 static int scmi_protocol_dummy_init(struct scmi_handle *handle)
@@ -194,26 +210,45 @@ void scmi_set_handle(struct scmi_device *scmi_dev)
 	scmi_dev->handle = scmi_handle_get(&scmi_dev->dev);
 }
 
-int scmi_protocol_register(int protocol_id, scmi_prot_init_fn_t fn)
+int scmi_protocol_register(const struct scmi_protocol *proto)
 {
 	int ret;
 
+	if (!proto) {
+		pr_err("invalid protocol\n");
+		return -EINVAL;
+	}
+
+	if (!proto->init && !proto->instance_init) {
+		pr_err("missing init for protocol 0x%x\n", proto->id);
+		return -EINVAL;
+	}
+
 	spin_lock(&protocol_lock);
-	ret = idr_alloc(&scmi_protocols, fn, protocol_id, protocol_id + 1,
-			GFP_ATOMIC);
+	ret = idr_alloc(&scmi_protocols, (void *)proto,
+			proto->id, proto->id + 1, GFP_ATOMIC);
 	spin_unlock(&protocol_lock);
-	if (ret != protocol_id)
-		pr_err("unable to allocate SCMI idr slot, err %d\n", ret);
+	if (ret != proto->id) {
+		pr_err("unable to allocate SCMI idr slot for 0x%x - err %d\n",
+		       proto->id, ret);
+		return ret;
+	}
+
+	pr_debug("Registered SCMI Protocol 0x%x\n", proto->id);
 
-	return ret;
+	return 0;
 }
 EXPORT_SYMBOL_GPL(scmi_protocol_register);
 
-void scmi_protocol_unregister(int protocol_id)
+void scmi_protocol_unregister(const struct scmi_protocol *proto)
 {
 	spin_lock(&protocol_lock);
-	idr_remove(&scmi_protocols, protocol_id);
+	idr_remove(&scmi_protocols, proto->id);
 	spin_unlock(&protocol_lock);
+
+	pr_debug("Unregistered SCMI Protocol 0x%x\n", proto->id);
+
+	return;
 }
 EXPORT_SYMBOL_GPL(scmi_protocol_unregister);
 
diff --git a/drivers/firmware/arm_scmi/clock.c b/drivers/firmware/arm_scmi/clock.c
index 4645677d86f1..2312e824ed3c 100644
--- a/drivers/firmware/arm_scmi/clock.c
+++ b/drivers/firmware/arm_scmi/clock.c
@@ -2,7 +2,7 @@
 /*
  * System Control and Management Interface (SCMI) Clock Protocol
  *
- * Copyright (C) 2018 ARM Ltd.
+ * Copyright (C) 2018-2021 ARM Ltd.
  */
 
 #include <linux/sort.h>
@@ -366,4 +366,10 @@ static int scmi_clock_protocol_init(struct scmi_handle *handle)
 	return 0;
 }
 
-DEFINE_SCMI_PROTOCOL_REGISTER_UNREGISTER(SCMI_PROTOCOL_CLOCK, clock)
+static const struct scmi_protocol scmi_clock = {
+	.id = SCMI_PROTOCOL_CLOCK,
+	.init = &scmi_clock_protocol_init,
+	.ops = &clk_ops,
+};
+
+DEFINE_SCMI_PROTOCOL_REGISTER_UNREGISTER(clock, scmi_clock)
diff --git a/drivers/firmware/arm_scmi/common.h b/drivers/firmware/arm_scmi/common.h
index c0fb45e7c3e8..93383154d2c1 100644
--- a/drivers/firmware/arm_scmi/common.h
+++ b/drivers/firmware/arm_scmi/common.h
@@ -4,7 +4,7 @@
  * driver common header file containing some definitions, structures
  * and function prototypes used in all the different SCMI protocols.
  *
- * Copyright (C) 2018 ARM Ltd.
+ * Copyright (C) 2018-2021 ARM Ltd.
  */
 #ifndef _SCMI_COMMON_H
 #define _SCMI_COMMON_H
@@ -157,6 +157,24 @@ void scmi_setup_protocol_implemented(const struct scmi_handle *handle,
 				     u8 *prot_imp);
 
 int scmi_base_protocol_init(struct scmi_handle *h);
+typedef int (*scmi_prot_init_fn_t)(struct scmi_handle *);
+
+/**
+ * struct scmi_protocol  - Protocol descriptor
+ * @id: Protocol ID.
+ * @init: Mandatory protocol initialization function.
+ * @instance_init: Optional protocol instance initialization function.
+ * @instance_deinit: Optional protocol de-initialization function.
+ * @ops: Optional reference to the operations provided by the protocol and
+ *	 exposed in scmi_protocol.h.
+ */
+struct scmi_protocol {
+	const u8				id;
+	const scmi_prot_init_fn_t		init;
+	const scmi_prot_init_fn_t		instance_init;
+	const scmi_prot_init_fn_t		instance_deinit;
+	const void				*ops;
+};
 
 int __init scmi_bus_init(void);
 void __exit scmi_bus_exit(void);
@@ -164,6 +182,7 @@ void __exit scmi_bus_exit(void);
 #define DECLARE_SCMI_REGISTER_UNREGISTER(func)		\
 	int __init scmi_##func##_register(void);	\
 	void __exit scmi_##func##_unregister(void)
+DECLARE_SCMI_REGISTER_UNREGISTER(base);
 DECLARE_SCMI_REGISTER_UNREGISTER(clock);
 DECLARE_SCMI_REGISTER_UNREGISTER(perf);
 DECLARE_SCMI_REGISTER_UNREGISTER(power);
@@ -172,17 +191,24 @@ DECLARE_SCMI_REGISTER_UNREGISTER(sensors);
 DECLARE_SCMI_REGISTER_UNREGISTER(voltage);
 DECLARE_SCMI_REGISTER_UNREGISTER(system);
 
-#define DEFINE_SCMI_PROTOCOL_REGISTER_UNREGISTER(id, name) \
-int __init scmi_##name##_register(void) \
-{ \
-	return scmi_protocol_register((id), &scmi_##name##_protocol_init); \
-} \
-\
-void __exit scmi_##name##_unregister(void) \
-{ \
-	scmi_protocol_unregister((id)); \
+#define DEFINE_SCMI_PROTOCOL_REGISTER_UNREGISTER(name, proto)	\
+static const struct scmi_protocol *__this_proto = &(proto);	\
+								\
+int __init scmi_##name##_register(void)				\
+{								\
+	return scmi_protocol_register(__this_proto);		\
+}								\
+								\
+void __exit scmi_##name##_unregister(void)			\
+{								\
+	scmi_protocol_unregister(__this_proto);			\
 }
 
+const struct scmi_protocol *scmi_protocol_get(int protocol_id);
+
+int scmi_protocol_acquire(struct scmi_handle *handle, u8 protocol_id);
+void scmi_protocol_release(struct scmi_handle *handle, u8 protocol_id);
+
 /* SCMI Transport */
 /**
  * struct scmi_chan_info - Structure representing a SCMI channel information
diff --git a/drivers/firmware/arm_scmi/driver.c b/drivers/firmware/arm_scmi/driver.c
index 3e748e57deab..c1ec80e00734 100644
--- a/drivers/firmware/arm_scmi/driver.c
+++ b/drivers/firmware/arm_scmi/driver.c
@@ -11,11 +11,12 @@
  * various power domain DVFS including the core/cluster, certain system
  * clocks configuration, thermal sensors and many others.
  *
- * Copyright (C) 2018 ARM Ltd.
+ * Copyright (C) 2018-2021 ARM Ltd.
  */
 
 #include <linux/bitmap.h>
 #include <linux/export.h>
+#include <linux/idr.h>
 #include <linux/io.h>
 #include <linux/kernel.h>
 #include <linux/ktime.h>
@@ -23,6 +24,7 @@
 #include <linux/of_address.h>
 #include <linux/of_device.h>
 #include <linux/processor.h>
+#include <linux/refcount.h>
 #include <linux/slab.h>
 
 #include "common.h"
@@ -68,6 +70,21 @@ struct scmi_xfers_info {
 	spinlock_t xfer_lock;
 };
 
+/**
+ * struct scmi_protocol_instance  - Describe an initialized protocol instance.
+ * @proto: A reference to the protocol descriptor.
+ * @gid: A reference for per-protocol devres management.
+ * @users: A refcount to track effective users of this protocol.
+ *
+ * Each protocol is initialized independently once for each SCMI platform in
+ * which it is defined by DT and implemented by the SCMI server fw.
+ */
+struct scmi_protocol_instance {
+	const struct scmi_protocol	*proto;
+	void				*gid;
+	refcount_t			users;
+};
+
 /**
  * struct scmi_info - Structure representing a SCMI instance
  *
@@ -80,6 +97,10 @@ struct scmi_xfers_info {
  * @rx_minfo: Universal Receive Message management info
  * @tx_idr: IDR object to map protocol id to Tx channel info pointer
  * @rx_idr: IDR object to map protocol id to Rx channel info pointer
+ * @protocols: IDR for protocols' instance descriptors initialized for
+ *	       this SCMI instance: populated on protocol's first attempted
+ *	       usage.
+ * @protocols_mtx: A mutex to protect protocols instances initialization.
  * @protocols_imp: List of protocols implemented, currently maximum of
  *	MAX_PROTOCOLS_IMP elements allocated by the base protocol
  * @node: List head
@@ -94,6 +115,9 @@ struct scmi_info {
 	struct scmi_xfers_info rx_minfo;
 	struct idr tx_idr;
 	struct idr rx_idr;
+	struct idr protocols;
+	/* Ensure mutual exclusive access to protocols instance array */
+	struct mutex protocols_mtx;
 	u8 *protocols_imp;
 	struct list_head node;
 	int users;
@@ -519,6 +543,150 @@ int scmi_version_get(const struct scmi_handle *handle, u8 protocol,
 	return ret;
 }
 
+/**
+ * scmi_alloc_init_protocol_instance  - Allocate and initialize a protocol
+ * instance descriptor.
+ * @info: The reference to the related SCMI instance.
+ * @proto: The protocol descriptor.
+ *
+ * Allocate a new protocol instance descriptor, using the provided @proto
+ * description, against the specified SCMI instance @info, and initialize it;
+ * all resources management is handled via a dedicated per-protocol devres
+ * group.
+ *
+ * Context: Must be called with @protocols_mtx already acquired.
+ * Return: A reference to a freshly allocated and initialized protocol instance
+ *	   or ERR_PTR on failure.
+ */
+static struct scmi_protocol_instance *
+scmi_alloc_init_protocol_instance(struct scmi_info *info,
+				  const struct scmi_protocol *proto)
+{
+	int ret = -ENOMEM;
+	void *gid;
+	struct scmi_protocol_instance *pi;
+	struct scmi_handle *handle = &info->handle;
+
+	/* Protocol specific devres group */
+	gid = devres_open_group(handle->dev, NULL, GFP_KERNEL);
+	if (!gid)
+		goto out;
+
+	pi = devm_kzalloc(handle->dev, sizeof(*pi), GFP_KERNEL);
+	if (!pi)
+		goto clean;
+
+	pi->gid = gid;
+	pi->proto = proto;
+	refcount_set(&pi->users, 1);
+	/* NOTE(review): registration only assures init OR instance_init is set */
+	ret = pi->proto->instance_init(handle);
+	if (ret)
+		goto clean;
+
+	ret = idr_alloc(&info->protocols, pi, proto->id, proto->id + 1,
+			GFP_KERNEL);
+	if (ret != proto->id)
+		goto clean;
+
+	devres_close_group(handle->dev, pi->gid);
+	dev_dbg(handle->dev, "Initialized protocol: 0x%X\n", pi->proto->id);
+
+	return pi;
+
+clean:
+	devres_release_group(handle->dev, gid);
+out:
+	return ERR_PTR(ret);
+}
+
+/**
+ * scmi_get_protocol_instance  - Protocol initialization helper.
+ * @handle: A reference to the SCMI platform instance.
+ * @protocol_id: The protocol being requested.
+ *
+ * In case the required protocol has never been requested before for this
+ * instance, allocate and initialize all the needed structures while handling
+ * resource allocation with a dedicated per-protocol devres subgroup.
+ *
+ * Return: A reference to an initialized protocol instance or error on failure.
+ */
+static struct scmi_protocol_instance * __must_check
+scmi_get_protocol_instance(struct scmi_handle *handle, u8 protocol_id)
+{
+	struct scmi_protocol_instance *pi;
+	struct scmi_info *info = handle_to_scmi_info(handle);
+
+	mutex_lock(&info->protocols_mtx);
+	pi = idr_find(&info->protocols, protocol_id);
+
+	if (pi) {
+		refcount_inc(&pi->users);
+	} else {
+		const struct scmi_protocol *proto;
+
+		/* Fails if protocol not registered on bus */
+		proto = scmi_protocol_get(protocol_id);
+		if (proto)
+			pi = scmi_alloc_init_protocol_instance(info, proto);
+		else
+			pi = ERR_PTR(-ENODEV);
+	}
+	mutex_unlock(&info->protocols_mtx);
+
+	return pi;
+}
+
+/**
+ * scmi_protocol_acquire  - Protocol acquire
+ * @handle: A reference to the SCMI platform instance.
+ * @protocol_id: The protocol being requested.
+ *
+ * Register a new user for the requested protocol on the specified SCMI
+ * platform instance, possibly triggering its initialization on first user.
+ *
+ * Return: 0 if protocol was acquired successfully.
+ */
+int scmi_protocol_acquire(struct scmi_handle *handle, u8 protocol_id)
+{
+	return PTR_ERR_OR_ZERO(scmi_get_protocol_instance(handle, protocol_id));
+}
+
+/**
+ * scmi_protocol_release  - Protocol de-initialization helper.
+ * @handle: A reference to the SCMI platform instance.
+ * @protocol_id: The protocol being requested.
+ *
+ * Remove one user for the specified protocol and trigger de-initialization
+ * and resources de-allocation once the last user has gone.
+ */
+void scmi_protocol_release(struct scmi_handle *handle, u8 protocol_id)
+{
+	struct scmi_info *info = handle_to_scmi_info(handle);
+	struct scmi_protocol_instance *pi;
+
+	mutex_lock(&info->protocols_mtx);
+	pi = idr_find(&info->protocols, protocol_id);
+	if (WARN_ON(!pi))
+		goto out;
+
+	if (refcount_dec_and_test(&pi->users)) {
+		void *gid = pi->gid;
+
+		if (pi->proto->instance_deinit)
+			pi->proto->instance_deinit(handle);
+
+		idr_remove(&info->protocols, protocol_id);
+
+		devres_release_group(handle->dev, gid);
+		dev_dbg(handle->dev, "De-Initialized protocol: 0x%X\n",
+			protocol_id);
+	}
+
+out:
+	mutex_unlock(&info->protocols_mtx);
+}
+
 void scmi_setup_protocol_implemented(const struct scmi_handle *handle,
 				     u8 *prot_imp)
 {
@@ -786,6 +954,8 @@ static int scmi_probe(struct platform_device *pdev)
 	info->dev = dev;
 	info->desc = desc;
 	INIT_LIST_HEAD(&info->node);
+	idr_init(&info->protocols);
+	mutex_init(&info->protocols_mtx);
 
 	platform_set_drvdata(pdev, info);
 	idr_init(&info->tx_idr);
@@ -860,6 +1030,10 @@ static int scmi_remove(struct platform_device *pdev)
 
 	scmi_notification_exit(&info->handle);
 
+	mutex_lock(&info->protocols_mtx);
+	idr_destroy(&info->protocols);
+	mutex_unlock(&info->protocols_mtx);
+
 	/* Safe to free channels since no more users */
 	ret = idr_for_each(idr, info->desc->ops->chan_free, idr);
 	idr_destroy(&info->tx_idr);
@@ -942,6 +1116,8 @@ static int __init scmi_driver_init(void)
 {
 	scmi_bus_init();
 
+	scmi_base_register();
+
 	scmi_clock_register();
 	scmi_perf_register();
 	scmi_power_register();
@@ -956,7 +1132,7 @@ subsys_initcall(scmi_driver_init);
 
 static void __exit scmi_driver_exit(void)
 {
-	scmi_bus_exit();
+	scmi_base_unregister();
 
 	scmi_clock_unregister();
 	scmi_perf_unregister();
@@ -966,6 +1142,8 @@ static void __exit scmi_driver_exit(void)
 	scmi_voltage_unregister();
 	scmi_system_unregister();
 
+	scmi_bus_exit();
+
 	platform_driver_unregister(&scmi_driver);
 }
 module_exit(scmi_driver_exit);
diff --git a/drivers/firmware/arm_scmi/perf.c b/drivers/firmware/arm_scmi/perf.c
index e374b1125fca..4630dcae3d21 100644
--- a/drivers/firmware/arm_scmi/perf.c
+++ b/drivers/firmware/arm_scmi/perf.c
@@ -2,7 +2,7 @@
 /*
  * System Control and Management Interface (SCMI) Performance Protocol
  *
- * Copyright (C) 2018 ARM Ltd.
+ * Copyright (C) 2018-2021 ARM Ltd.
  */
 
 #define pr_fmt(fmt) "SCMI Notifications PERF - " fmt
@@ -900,4 +900,10 @@ static int scmi_perf_protocol_init(struct scmi_handle *handle)
 	return 0;
 }
 
-DEFINE_SCMI_PROTOCOL_REGISTER_UNREGISTER(SCMI_PROTOCOL_PERF, perf)
+static const struct scmi_protocol scmi_perf = {
+	.id = SCMI_PROTOCOL_PERF,
+	.init = &scmi_perf_protocol_init,
+	.ops = &perf_ops,
+};
+
+DEFINE_SCMI_PROTOCOL_REGISTER_UNREGISTER(perf, scmi_perf)
diff --git a/drivers/firmware/arm_scmi/power.c b/drivers/firmware/arm_scmi/power.c
index 1f37258e9bee..a5e5e22913b5 100644
--- a/drivers/firmware/arm_scmi/power.c
+++ b/drivers/firmware/arm_scmi/power.c
@@ -2,7 +2,7 @@
 /*
  * System Control and Management Interface (SCMI) Power Protocol
  *
- * Copyright (C) 2018 ARM Ltd.
+ * Copyright (C) 2018-2021 ARM Ltd.
  */
 
 #define pr_fmt(fmt) "SCMI Notifications POWER - " fmt
@@ -301,4 +301,10 @@ static int scmi_power_protocol_init(struct scmi_handle *handle)
 	return 0;
 }
 
-DEFINE_SCMI_PROTOCOL_REGISTER_UNREGISTER(SCMI_PROTOCOL_POWER, power)
+static const struct scmi_protocol scmi_power = {
+	.id = SCMI_PROTOCOL_POWER,
+	.init = &scmi_power_protocol_init,
+	.ops = &power_ops,
+};
+
+DEFINE_SCMI_PROTOCOL_REGISTER_UNREGISTER(power, scmi_power)
diff --git a/drivers/firmware/arm_scmi/reset.c b/drivers/firmware/arm_scmi/reset.c
index a981a22cfe89..ef9d80cafb56 100644
--- a/drivers/firmware/arm_scmi/reset.c
+++ b/drivers/firmware/arm_scmi/reset.c
@@ -2,7 +2,7 @@
 /*
  * System Control and Management Interface (SCMI) Reset Protocol
  *
- * Copyright (C) 2019 ARM Ltd.
+ * Copyright (C) 2019-2021 ARM Ltd.
  */
 
 #define pr_fmt(fmt) "SCMI Notifications RESET - " fmt
@@ -311,4 +311,10 @@ static int scmi_reset_protocol_init(struct scmi_handle *handle)
 	return 0;
 }
 
-DEFINE_SCMI_PROTOCOL_REGISTER_UNREGISTER(SCMI_PROTOCOL_RESET, reset)
+static const struct scmi_protocol scmi_reset = {
+	.id = SCMI_PROTOCOL_RESET,
+	.init = &scmi_reset_protocol_init,
+	.ops = &reset_ops,
+};
+
+DEFINE_SCMI_PROTOCOL_REGISTER_UNREGISTER(reset, scmi_reset)
diff --git a/drivers/firmware/arm_scmi/sensors.c b/drivers/firmware/arm_scmi/sensors.c
index 4541b891b733..1eb8a36abf64 100644
--- a/drivers/firmware/arm_scmi/sensors.c
+++ b/drivers/firmware/arm_scmi/sensors.c
@@ -2,7 +2,7 @@
 /*
  * System Control and Management Interface (SCMI) Sensor Protocol
  *
- * Copyright (C) 2018-2020 ARM Ltd.
+ * Copyright (C) 2018-2021 ARM Ltd.
  */
 
 #define pr_fmt(fmt) "SCMI Notifications SENSOR - " fmt
@@ -983,4 +983,10 @@ static int scmi_sensors_protocol_init(struct scmi_handle *handle)
 	return 0;
 }
 
-DEFINE_SCMI_PROTOCOL_REGISTER_UNREGISTER(SCMI_PROTOCOL_SENSOR, sensors)
+static const struct scmi_protocol scmi_sensors = {
+	.id = SCMI_PROTOCOL_SENSOR,
+	.init = &scmi_sensors_protocol_init,
+	.ops = &sensor_ops,
+};
+
+DEFINE_SCMI_PROTOCOL_REGISTER_UNREGISTER(sensors, scmi_sensors)
diff --git a/drivers/firmware/arm_scmi/system.c b/drivers/firmware/arm_scmi/system.c
index 283e12d5f24b..d6d7400449fc 100644
--- a/drivers/firmware/arm_scmi/system.c
+++ b/drivers/firmware/arm_scmi/system.c
@@ -2,7 +2,7 @@
 /*
  * System Control and Management Interface (SCMI) System Power Protocol
  *
- * Copyright (C) 2020 ARM Ltd.
+ * Copyright (C) 2020-2021 ARM Ltd.
  */
 
 #define pr_fmt(fmt) "SCMI Notifications SYSTEM - " fmt
@@ -128,4 +128,10 @@ static int scmi_system_protocol_init(struct scmi_handle *handle)
 	return 0;
 }
 
-DEFINE_SCMI_PROTOCOL_REGISTER_UNREGISTER(SCMI_PROTOCOL_SYSTEM, system)
+static const struct scmi_protocol scmi_system = {
+	.id = SCMI_PROTOCOL_SYSTEM,
+	.init = &scmi_system_protocol_init,
+	.ops = NULL,
+};
+
+DEFINE_SCMI_PROTOCOL_REGISTER_UNREGISTER(system, scmi_system)
diff --git a/drivers/firmware/arm_scmi/voltage.c b/drivers/firmware/arm_scmi/voltage.c
index e794e4349ae6..b16f8d42fa0a 100644
--- a/drivers/firmware/arm_scmi/voltage.c
+++ b/drivers/firmware/arm_scmi/voltage.c
@@ -2,7 +2,7 @@
 /*
  * System Control and Management Interface (SCMI) Voltage Protocol
  *
- * Copyright (C) 2020 ARM Ltd.
+ * Copyright (C) 2020-2021 ARM Ltd.
  */
 
 #include <linux/scmi_protocol.h>
@@ -377,4 +377,10 @@ static int scmi_voltage_protocol_init(struct scmi_handle *handle)
 	return 0;
 }
 
-DEFINE_SCMI_PROTOCOL_REGISTER_UNREGISTER(SCMI_PROTOCOL_VOLTAGE, voltage)
+static const struct scmi_protocol scmi_voltage = {
+	.id = SCMI_PROTOCOL_VOLTAGE,
+	.init = &scmi_voltage_protocol_init,
+	.ops = &voltage_ops,
+};
+
+DEFINE_SCMI_PROTOCOL_REGISTER_UNREGISTER(voltage, scmi_voltage)
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index ecb3aad1a964..ea301c579c6f 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -2,7 +2,7 @@
 /*
  * SCMI Message Protocol driver header
  *
- * Copyright (C) 2018 ARM Ltd.
+ * Copyright (C) 2018-2021 ARM Ltd.
  */
 
 #ifndef _LINUX_SCMI_PROTOCOL_H
@@ -712,9 +712,9 @@ static inline void scmi_driver_unregister(struct scmi_driver *driver) {}
 #define module_scmi_driver(__scmi_driver)	\
 	module_driver(__scmi_driver, scmi_register, scmi_unregister)
 
-typedef int (*scmi_prot_init_fn_t)(struct scmi_handle *);
-int scmi_protocol_register(int protocol_id, scmi_prot_init_fn_t fn);
-void scmi_protocol_unregister(int protocol_id);
+struct scmi_protocol;
+int scmi_protocol_register(const struct scmi_protocol *proto);
+void scmi_protocol_unregister(const struct scmi_protocol *proto);
 
 /* SCMI Notification API - Custom Event Reports */
 enum scmi_notification_events {
-- 
cgit v1.2.3


From 23934efe3748f6d9d8ac0760178a5ef1ed8320f4 Mon Sep 17 00:00:00 2001
From: Cristian Marussi <cristian.marussi@arm.com>
Date: Tue, 16 Mar 2021 12:48:28 +0000
Subject: firmware: arm_scmi: Introduce devres get/put protocols operations

Expose to the SCMI drivers a new devres managed common protocols API
based on generic get/put methods and protocol handles.

All drivers still keep using the old API, no functional change.

Link: https://lore.kernel.org/r/20210316124903.35011-4-cristian.marussi@arm.com
Tested-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_scmi/driver.c | 92 ++++++++++++++++++++++++++++++++++++++
 include/linux/scmi_protocol.h      | 11 +++++
 2 files changed, 103 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/firmware/arm_scmi/driver.c b/drivers/firmware/arm_scmi/driver.c
index b6f1a2866248..5a960a7fd973 100644
--- a/drivers/firmware/arm_scmi/driver.c
+++ b/drivers/firmware/arm_scmi/driver.c
@@ -15,6 +15,7 @@
  */
 
 #include <linux/bitmap.h>
+#include <linux/device.h>
 #include <linux/export.h>
 #include <linux/idr.h>
 #include <linux/io.h>
@@ -755,6 +756,95 @@ scmi_is_protocol_implemented(const struct scmi_handle *handle, u8 prot_id)
 	return false;
 }
 
+struct scmi_protocol_devres {
+	struct scmi_handle *handle;
+	u8 protocol_id;
+};
+
+static void scmi_devm_release_protocol(struct device *dev, void *res)
+{
+	struct scmi_protocol_devres *dres = res;
+
+	scmi_protocol_release(dres->handle, dres->protocol_id);
+}
+
+/**
+ * scmi_devm_protocol_get  - Devres managed get protocol operations and handle
+ * @sdev: A reference to an scmi_device whose embedded struct device is to
+ *	  be used for devres accounting.
+ * @protocol_id: The protocol being requested.
+ * @ph: A pointer reference used to pass back the associated protocol handle.
+ *
+ * Get hold of a protocol accounting for its usage, eventually triggering its
+ * initialization, and returning the protocol specific operations and related
+ * protocol handle which will be used as first argument in most of the
+ * protocols operations methods.
+ * Being a devres based managed method, protocol hold will be automatically
+ * released, and possibly de-initialized on last user, once the SCMI driver
+ * owning the scmi_device is unbound from it.
+ *
+ * Return: A reference to the requested protocol operations or error.
+ *	   Must be checked for errors by caller.
+ */
+static const void __must_check *
+scmi_devm_protocol_get(struct scmi_device *sdev, u8 protocol_id,
+		       struct scmi_protocol_handle **ph)
+{
+	struct scmi_protocol_instance *pi;
+	struct scmi_protocol_devres *dres;
+	struct scmi_handle *handle = sdev->handle;
+
+	if (!ph)
+		return ERR_PTR(-EINVAL);
+
+	dres = devres_alloc(scmi_devm_release_protocol,
+			    sizeof(*dres), GFP_KERNEL);
+	if (!dres)
+		return ERR_PTR(-ENOMEM);
+
+	pi = scmi_get_protocol_instance(handle, protocol_id);
+	if (IS_ERR(pi)) {
+		devres_free(dres);
+		return pi;
+	}
+
+	dres->handle = handle;
+	dres->protocol_id = protocol_id;
+	devres_add(&sdev->dev, dres);
+
+	*ph = &pi->ph;
+
+	return pi->proto->ops;
+}
+
+static int scmi_devm_protocol_match(struct device *dev, void *res, void *data)
+{
+	struct scmi_protocol_devres *dres = res;
+
+	if (WARN_ON(!dres || !data))
+		return 0;
+
+	return dres->protocol_id == *((u8 *)data);
+}
+
+/**
+ * scmi_devm_protocol_put  - Devres managed put protocol operations and handle
+ * @sdev: A reference to an scmi_device whose embedded struct device is to
+ *	  be used for devres accounting.
+ * @protocol_id: The protocol being requested.
+ *
+ * Explicitly release a protocol hold previously obtained calling the above
+ * @scmi_devm_protocol_get.
+ */
+static void scmi_devm_protocol_put(struct scmi_device *sdev, u8 protocol_id)
+{
+	int ret;
+
+	ret = devres_release(&sdev->dev, scmi_devm_release_protocol,
+			     scmi_devm_protocol_match, &protocol_id);
+	WARN_ON(ret);
+}
+
 /**
  * scmi_handle_get() - Get the SCMI handle for a device
  *
@@ -1009,6 +1099,8 @@ static int scmi_probe(struct platform_device *pdev)
 	handle = &info->handle;
 	handle->dev = info->dev;
 	handle->version = &info->version;
+	handle->devm_protocol_get = scmi_devm_protocol_get;
+	handle->devm_protocol_put = scmi_devm_protocol_put;
 
 	ret = scmi_txrx_setup(info, dev, SCMI_PROTOCOL_BASE);
 	if (ret)
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index ea301c579c6f..a9becd356fe8 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -57,6 +57,8 @@ struct scmi_clock_info {
 };
 
 struct scmi_handle;
+struct scmi_device;
+struct scmi_protocol_handle;
 
 /**
  * struct scmi_clk_ops - represents the various operations provided
@@ -593,6 +595,9 @@ struct scmi_notify_ops {
  * @sensor_ops: pointer to set of sensor protocol operations
  * @reset_ops: pointer to set of reset protocol operations
  * @voltage_ops: pointer to set of voltage protocol operations
+ * @devm_protocol_get: devres managed method to acquire a protocol and get specific
+ *		       operations and a dedicated protocol handle
+ * @devm_protocol_put: devres managed method to release a protocol
  * @notify_ops: pointer to set of notifications related operations
  * @perf_priv: pointer to private data structure specific to performance
  *	protocol(for internal use only)
@@ -618,6 +623,12 @@ struct scmi_handle {
 	const struct scmi_sensor_ops *sensor_ops;
 	const struct scmi_reset_ops *reset_ops;
 	const struct scmi_voltage_ops *voltage_ops;
+
+	const void __must_check *
+		(*devm_protocol_get)(struct scmi_device *sdev, u8 proto,
+				     struct scmi_protocol_handle **ph);
+	void (*devm_protocol_put)(struct scmi_device *sdev, u8 proto);
+
 	const struct scmi_notify_ops *notify_ops;
 	/* for protocol internal use */
 	void *perf_priv;
-- 
cgit v1.2.3


From 5ad3d1cf7d34c00aa5b425d93373b5c0044aa0a9 Mon Sep 17 00:00:00 2001
From: Cristian Marussi <cristian.marussi@arm.com>
Date: Tue, 16 Mar 2021 12:48:30 +0000
Subject: firmware: arm_scmi: Introduce new devres notification ops

Expose to the SCMI drivers a new alternative devres managed notifications
API based on protocol handles.

All drivers still keep using the old API, no functional change.

Link: https://lore.kernel.org/r/20210316124903.35011-6-cristian.marussi@arm.com
Tested-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_scmi/notify.c | 129 ++++++++++++++++++++++++++++++++++++-
 include/linux/scmi_protocol.h      |  22 ++++++-
 2 files changed, 146 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/arm_scmi/notify.c b/drivers/firmware/arm_scmi/notify.c
index 09015f1f9942..9dd729ac9fe2 100644
--- a/drivers/firmware/arm_scmi/notify.c
+++ b/drivers/firmware/arm_scmi/notify.c
@@ -1317,7 +1317,7 @@ static int scmi_event_handler_enable_events(struct scmi_event_handler *hndl)
  * Return: 0 on Success
  */
 static int scmi_register_notifier(const struct scmi_handle *handle,
-				  u8 proto_id, u8 evt_id, u32 *src_id,
+				  u8 proto_id, u8 evt_id, const u32 *src_id,
 				  struct notifier_block *nb)
 {
 	int ret = 0;
@@ -1366,7 +1366,7 @@ static int scmi_register_notifier(const struct scmi_handle *handle,
  * Return: 0 on Success
  */
 static int scmi_unregister_notifier(const struct scmi_handle *handle,
-				    u8 proto_id, u8 evt_id, u32 *src_id,
+				    u8 proto_id, u8 evt_id, const u32 *src_id,
 				    struct notifier_block *nb)
 {
 	u32 evt_key;
@@ -1408,6 +1408,129 @@ static int scmi_unregister_notifier(const struct scmi_handle *handle,
 	return 0;
 }
 
+struct scmi_notifier_devres {
+	const struct scmi_handle *handle;
+	u8 proto_id;
+	u8 evt_id;
+	u32 __src_id;
+	u32 *src_id;
+	struct notifier_block *nb;
+};
+
+static void scmi_devm_release_notifier(struct device *dev, void *res)
+{
+	struct scmi_notifier_devres *dres = res;
+
+	scmi_unregister_notifier(dres->handle, dres->proto_id, dres->evt_id,
+				 dres->src_id, dres->nb);
+}
+
+/**
+ * scmi_devm_notifier_register()  - Managed registration of a notifier_block
+ * for an event
+ * @sdev: A reference to an scmi_device whose embedded struct device is to
+ *	  be used for devres accounting.
+ * @proto_id: Protocol ID
+ * @evt_id: Event ID
+ * @src_id: Source ID, when NULL register for events coming from ALL possible
+ *	    sources
+ * @nb: A standard notifier block to register for the specified event
+ *
+ * Generic devres managed helper to register a notifier_block against a
+ * protocol event.
+ */
+static int scmi_devm_notifier_register(struct scmi_device *sdev,
+				       u8 proto_id, u8 evt_id,
+				       const u32 *src_id,
+				       struct notifier_block *nb)
+{
+	int ret;
+	struct scmi_notifier_devres *dres;
+
+	dres = devres_alloc(scmi_devm_release_notifier,
+			    sizeof(*dres), GFP_KERNEL);
+	if (!dres)
+		return -ENOMEM;
+
+	ret = scmi_register_notifier(sdev->handle, proto_id,
+				     evt_id, src_id, nb);
+	if (ret) {
+		devres_free(dres);
+		return ret;
+	}
+
+	dres->handle = sdev->handle;
+	dres->proto_id = proto_id;
+	dres->evt_id = evt_id;
+	dres->nb = nb;
+	if (src_id) {
+		dres->__src_id = *src_id;
+		dres->src_id = &dres->__src_id;
+	} else {
+		dres->src_id = NULL;
+	}
+	devres_add(&sdev->dev, dres);
+
+	return ret;
+}
+
+static int scmi_devm_notifier_match(struct device *dev, void *res, void *data)
+{
+	struct scmi_notifier_devres *dres = res;
+	struct scmi_notifier_devres *xres = data;
+
+	if (WARN_ON(!dres || !xres))
+		return 0;
+
+	return dres->proto_id == xres->proto_id &&
+		dres->evt_id == xres->evt_id &&
+		dres->nb == xres->nb &&
+		((!dres->src_id && !xres->src_id) ||
+		  (dres->src_id && xres->src_id &&
+		   dres->__src_id == xres->__src_id));
+}
+
+/**
+ * scmi_devm_notifier_unregister()  - Managed un-registration of a
+ * notifier_block for an event
+ * @sdev: A reference to an scmi_device whose embedded struct device is to
+ *	  be used for devres accounting.
+ * @proto_id: Protocol ID
+ * @evt_id: Event ID
+ * @src_id: Source ID, NULL if registered for events from ALL possible sources
+ * @nb: The notifier block that was registered for the specified event
+ *
+ * Generic devres managed helper to explicitly un-register a notifier_block
+ * against a protocol event, which was previously registered using the above
+ * @scmi_devm_notifier_register.
+ */
+static int scmi_devm_notifier_unregister(struct scmi_device *sdev,
+					 u8 proto_id, u8 evt_id,
+					 const u32 *src_id,
+					 struct notifier_block *nb)
+{
+	int ret;
+	struct scmi_notifier_devres dres;
+
+	dres.handle = sdev->handle;
+	dres.proto_id = proto_id;
+	dres.evt_id = evt_id;
+	/* scmi_devm_notifier_match() also compares nb: it must be set too */
+	dres.nb = nb;
+	if (src_id) {
+		dres.__src_id = *src_id;
+		dres.src_id = &dres.__src_id;
+	} else {
+		dres.src_id = NULL;
+	}
+
+	ret = devres_release(&sdev->dev, scmi_devm_release_notifier,
+			     scmi_devm_notifier_match, &dres);
+	WARN_ON(ret);
+
+	return ret;
+}
+
 /**
  * scmi_protocols_late_init()  - Worker for late initialization
  * @work: The work item to use associated to the proper SCMI instance
@@ -1465,6 +1588,8 @@ static void scmi_protocols_late_init(struct work_struct *work)
  * directly from an scmi_driver to register its own notifiers.
  */
 static const struct scmi_notify_ops notify_ops = {
+	.devm_event_notifier_register = scmi_devm_notifier_register,
+	.devm_event_notifier_unregister = scmi_devm_notifier_unregister,
 	.register_event_notifier = scmi_register_notifier,
 	.unregister_event_notifier = scmi_unregister_notifier,
 };
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index a9becd356fe8..2b66b243e841 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -544,6 +544,10 @@ struct scmi_voltage_ops {
 /**
  * struct scmi_notify_ops  - represents notifications' operations provided by
  * SCMI core
+ * @devm_event_notifier_register: Managed registration of a notifier_block for
+ *				  the requested event
+ * @devm_event_notifier_unregister: Managed unregistration of a notifier_block
+ *				    for the requested event
  * @register_event_notifier: Register a notifier_block for the requested event
  * @unregister_event_notifier: Unregister a notifier_block for the requested
  *			       event
@@ -553,7 +557,9 @@ struct scmi_voltage_ops {
  * tuple: (proto_id, evt_id, src_id) using the provided register/unregister
  * interface where:
  *
- * @handle: The handle identifying the platform instance to use
+ * @sdev: The scmi_device to use when calling the devres managed ops devm_
+ * @handle: The handle identifying the platform instance to use, when not
+ *	    calling the managed ops devm_
  * @proto_id: The protocol ID as in SCMI Specification
  * @evt_id: The message ID of the desired event as in SCMI Specification
  * @src_id: A pointer to the desired source ID if different sources are
@@ -576,11 +582,21 @@ struct scmi_voltage_ops {
  * @report: A custom struct describing the specific event delivered
  */
 struct scmi_notify_ops {
+	int (*devm_event_notifier_register)(struct scmi_device *sdev,
+					    u8 proto_id, u8 evt_id,
+					    const u32 *src_id,
+					    struct notifier_block *nb);
+	int (*devm_event_notifier_unregister)(struct scmi_device *sdev,
+					      u8 proto_id, u8 evt_id,
+					      const u32 *src_id,
+					      struct notifier_block *nb);
 	int (*register_event_notifier)(const struct scmi_handle *handle,
-				       u8 proto_id, u8 evt_id, u32 *src_id,
+				       u8 proto_id, u8 evt_id,
+				       const u32 *src_id,
 				       struct notifier_block *nb);
 	int (*unregister_event_notifier)(const struct scmi_handle *handle,
-					 u8 proto_id, u8 evt_id, u32 *src_id,
+					 u8 proto_id, u8 evt_id,
+					 const u32 *src_id,
 					 struct notifier_block *nb);
 };
 
-- 
cgit v1.2.3


From 1fec5e6b5233a08ebf43011703b8baf9c9856862 Mon Sep 17 00:00:00 2001
From: Cristian Marussi <cristian.marussi@arm.com>
Date: Tue, 16 Mar 2021 12:48:36 +0000
Subject: firmware: arm_scmi: Port perf protocol to new protocols interface

Convert internals of protocol implementation to use protocol handles and
expose a new protocol operations interface for SCMI driver using the new
get/put common operations, while keeping the old handle->perf_ops still
around to ease transition.

Remove handle->perf_priv now unused.

Link: https://lore.kernel.org/r/20210316124903.35011-12-cristian.marussi@arm.com
Tested-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_scmi/perf.c | 349 ++++++++++++++++++++++++++-------------
 include/linux/scmi_protocol.h    |  30 +++-
 2 files changed, 258 insertions(+), 121 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/arm_scmi/perf.c b/drivers/firmware/arm_scmi/perf.c
index 6e37a6ebfcd8..73f04d7809b3 100644
--- a/drivers/firmware/arm_scmi/perf.c
+++ b/drivers/firmware/arm_scmi/perf.c
@@ -175,21 +175,21 @@ static enum scmi_performance_protocol_cmd evt_2_cmd[] = {
 	PERF_NOTIFY_LEVEL,
 };
 
-static int scmi_perf_attributes_get(const struct scmi_handle *handle,
+static int scmi_perf_attributes_get(const struct scmi_protocol_handle *ph,
 				    struct scmi_perf_info *pi)
 {
 	int ret;
 	struct scmi_xfer *t;
 	struct scmi_msg_resp_perf_attributes *attr;
 
-	ret = scmi_xfer_get_init(handle, PROTOCOL_ATTRIBUTES,
-				 SCMI_PROTOCOL_PERF, 0, sizeof(*attr), &t);
+	ret = ph->xops->xfer_get_init(ph, PROTOCOL_ATTRIBUTES, 0,
+				      sizeof(*attr), &t);
 	if (ret)
 		return ret;
 
 	attr = t->rx.buf;
 
-	ret = scmi_do_xfer(handle, t);
+	ret = ph->xops->do_xfer(ph, t);
 	if (!ret) {
 		u16 flags = le16_to_cpu(attr->flags);
 
@@ -200,28 +200,27 @@ static int scmi_perf_attributes_get(const struct scmi_handle *handle,
 		pi->stats_size = le32_to_cpu(attr->stats_size);
 	}
 
-	scmi_xfer_put(handle, t);
+	ph->xops->xfer_put(ph, t);
 	return ret;
 }
 
 static int
-scmi_perf_domain_attributes_get(const struct scmi_handle *handle, u32 domain,
-				struct perf_dom_info *dom_info)
+scmi_perf_domain_attributes_get(const struct scmi_protocol_handle *ph,
+				u32 domain, struct perf_dom_info *dom_info)
 {
 	int ret;
 	struct scmi_xfer *t;
 	struct scmi_msg_resp_perf_domain_attributes *attr;
 
-	ret = scmi_xfer_get_init(handle, PERF_DOMAIN_ATTRIBUTES,
-				 SCMI_PROTOCOL_PERF, sizeof(domain),
-				 sizeof(*attr), &t);
+	ret = ph->xops->xfer_get_init(ph, PERF_DOMAIN_ATTRIBUTES,
+				     sizeof(domain), sizeof(*attr), &t);
 	if (ret)
 		return ret;
 
 	put_unaligned_le32(domain, t->tx.buf);
 	attr = t->rx.buf;
 
-	ret = scmi_do_xfer(handle, t);
+	ret = ph->xops->do_xfer(ph, t);
 	if (!ret) {
 		u32 flags = le32_to_cpu(attr->flags);
 
@@ -245,7 +244,7 @@ scmi_perf_domain_attributes_get(const struct scmi_handle *handle, u32 domain,
 		strlcpy(dom_info->name, attr->name, SCMI_MAX_STR_SIZE);
 	}
 
-	scmi_xfer_put(handle, t);
+	ph->xops->xfer_put(ph, t);
 	return ret;
 }
 
@@ -257,7 +256,7 @@ static int opp_cmp_func(const void *opp1, const void *opp2)
 }
 
 static int
-scmi_perf_describe_levels_get(const struct scmi_handle *handle, u32 domain,
+scmi_perf_describe_levels_get(const struct scmi_protocol_handle *ph, u32 domain,
 			      struct perf_dom_info *perf_dom)
 {
 	int ret, cnt;
@@ -268,8 +267,8 @@ scmi_perf_describe_levels_get(const struct scmi_handle *handle, u32 domain,
 	struct scmi_msg_perf_describe_levels *dom_info;
 	struct scmi_msg_resp_perf_describe_levels *level_info;
 
-	ret = scmi_xfer_get_init(handle, PERF_DESCRIBE_LEVELS,
-				 SCMI_PROTOCOL_PERF, sizeof(*dom_info), 0, &t);
+	ret = ph->xops->xfer_get_init(ph, PERF_DESCRIBE_LEVELS,
+				      sizeof(*dom_info), 0, &t);
 	if (ret)
 		return ret;
 
@@ -281,14 +280,14 @@ scmi_perf_describe_levels_get(const struct scmi_handle *handle, u32 domain,
 		/* Set the number of OPPs to be skipped/already read */
 		dom_info->level_index = cpu_to_le32(tot_opp_cnt);
 
-		ret = scmi_do_xfer(handle, t);
+		ret = ph->xops->do_xfer(ph, t);
 		if (ret)
 			break;
 
 		num_returned = le16_to_cpu(level_info->num_returned);
 		num_remaining = le16_to_cpu(level_info->num_remaining);
 		if (tot_opp_cnt + num_returned > MAX_OPPS) {
-			dev_err(handle->dev, "No. of OPPs exceeded MAX_OPPS");
+			dev_err(ph->dev, "No. of OPPs exceeded MAX_OPPS");
 			break;
 		}
 
@@ -299,13 +298,13 @@ scmi_perf_describe_levels_get(const struct scmi_handle *handle, u32 domain,
 			opp->trans_latency_us = le16_to_cpu
 				(level_info->opp[cnt].transition_latency_us);
 
-			dev_dbg(handle->dev, "Level %d Power %d Latency %dus\n",
+			dev_dbg(ph->dev, "Level %d Power %d Latency %dus\n",
 				opp->perf, opp->power, opp->trans_latency_us);
 		}
 
 		tot_opp_cnt += num_returned;
 
-		scmi_reset_rx_to_maxsz(handle, t);
+		ph->xops->reset_rx_to_maxsz(ph, t);
 		/*
 		 * check for both returned and remaining to avoid infinite
 		 * loop due to buggy firmware
@@ -313,7 +312,7 @@ scmi_perf_describe_levels_get(const struct scmi_handle *handle, u32 domain,
 	} while (num_returned && num_remaining);
 
 	perf_dom->opp_count = tot_opp_cnt;
-	scmi_xfer_put(handle, t);
+	ph->xops->xfer_put(ph, t);
 
 	sort(perf_dom->opp, tot_opp_cnt, sizeof(*opp), opp_cmp_func, NULL);
 	return ret;
@@ -353,15 +352,15 @@ static void scmi_perf_fc_ring_db(struct scmi_fc_db_info *db)
 #endif
 }
 
-static int scmi_perf_mb_limits_set(const struct scmi_handle *handle, u32 domain,
-				   u32 max_perf, u32 min_perf)
+static int scmi_perf_mb_limits_set(const struct scmi_protocol_handle *ph,
+				   u32 domain, u32 max_perf, u32 min_perf)
 {
 	int ret;
 	struct scmi_xfer *t;
 	struct scmi_perf_set_limits *limits;
 
-	ret = scmi_xfer_get_init(handle, PERF_LIMITS_SET, SCMI_PROTOCOL_PERF,
-				 sizeof(*limits), 0, &t);
+	ret = ph->xops->xfer_get_init(ph, PERF_LIMITS_SET,
+				      sizeof(*limits), 0, &t);
 	if (ret)
 		return ret;
 
@@ -370,16 +369,16 @@ static int scmi_perf_mb_limits_set(const struct scmi_handle *handle, u32 domain,
 	limits->max_level = cpu_to_le32(max_perf);
 	limits->min_level = cpu_to_le32(min_perf);
 
-	ret = scmi_do_xfer(handle, t);
+	ret = ph->xops->do_xfer(ph, t);
 
-	scmi_xfer_put(handle, t);
+	ph->xops->xfer_put(ph, t);
 	return ret;
 }
 
-static int scmi_perf_limits_set(const struct scmi_handle *handle, u32 domain,
-				u32 max_perf, u32 min_perf)
+static int scmi_perf_limits_set(const struct scmi_protocol_handle *ph,
+				u32 domain, u32 max_perf, u32 min_perf)
 {
-	struct scmi_perf_info *pi = handle->perf_priv;
+	struct scmi_perf_info *pi = ph->get_priv(ph);
 	struct perf_dom_info *dom = pi->dom_info + domain;
 
 	if (dom->fc_info && dom->fc_info->limit_set_addr) {
@@ -389,24 +388,33 @@ static int scmi_perf_limits_set(const struct scmi_handle *handle, u32 domain,
 		return 0;
 	}
 
-	return scmi_perf_mb_limits_set(handle, domain, max_perf, min_perf);
+	return scmi_perf_mb_limits_set(ph, domain, max_perf, min_perf);
 }
 
-static int scmi_perf_mb_limits_get(const struct scmi_handle *handle, u32 domain,
-				   u32 *max_perf, u32 *min_perf)
+static int __scmi_perf_limits_set(const struct scmi_handle *handle,
+				  u32 domain, u32 max_perf, u32 min_perf)
+{
+	const struct scmi_protocol_handle *ph =
+		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_PERF);
+
+	return scmi_perf_limits_set(ph, domain, max_perf, min_perf);
+}
+
+static int scmi_perf_mb_limits_get(const struct scmi_protocol_handle *ph,
+				   u32 domain, u32 *max_perf, u32 *min_perf)
 {
 	int ret;
 	struct scmi_xfer *t;
 	struct scmi_perf_get_limits *limits;
 
-	ret = scmi_xfer_get_init(handle, PERF_LIMITS_GET, SCMI_PROTOCOL_PERF,
-				 sizeof(__le32), 0, &t);
+	ret = ph->xops->xfer_get_init(ph, PERF_LIMITS_GET,
+				      sizeof(__le32), 0, &t);
 	if (ret)
 		return ret;
 
 	put_unaligned_le32(domain, t->tx.buf);
 
-	ret = scmi_do_xfer(handle, t);
+	ret = ph->xops->do_xfer(ph, t);
 	if (!ret) {
 		limits = t->rx.buf;
 
@@ -414,14 +422,14 @@ static int scmi_perf_mb_limits_get(const struct scmi_handle *handle, u32 domain,
 		*min_perf = le32_to_cpu(limits->min_level);
 	}
 
-	scmi_xfer_put(handle, t);
+	ph->xops->xfer_put(ph, t);
 	return ret;
 }
 
-static int scmi_perf_limits_get(const struct scmi_handle *handle, u32 domain,
-				u32 *max_perf, u32 *min_perf)
+static int scmi_perf_limits_get(const struct scmi_protocol_handle *ph,
+				u32 domain, u32 *max_perf, u32 *min_perf)
 {
-	struct scmi_perf_info *pi = handle->perf_priv;
+	struct scmi_perf_info *pi = ph->get_priv(ph);
 	struct perf_dom_info *dom = pi->dom_info + domain;
 
 	if (dom->fc_info && dom->fc_info->limit_get_addr) {
@@ -430,18 +438,26 @@ static int scmi_perf_limits_get(const struct scmi_handle *handle, u32 domain,
 		return 0;
 	}
 
-	return scmi_perf_mb_limits_get(handle, domain, max_perf, min_perf);
+	return scmi_perf_mb_limits_get(ph, domain, max_perf, min_perf);
 }
 
-static int scmi_perf_mb_level_set(const struct scmi_handle *handle, u32 domain,
-				  u32 level, bool poll)
+static int __scmi_perf_limits_get(const struct scmi_handle *handle,
+				  u32 domain, u32 *max_perf, u32 *min_perf)
+{
+	const struct scmi_protocol_handle *ph =
+		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_PERF);
+
+	return scmi_perf_limits_get(ph, domain, max_perf, min_perf);
+}
+
+static int scmi_perf_mb_level_set(const struct scmi_protocol_handle *ph,
+				  u32 domain, u32 level, bool poll)
 {
 	int ret;
 	struct scmi_xfer *t;
 	struct scmi_perf_set_level *lvl;
 
-	ret = scmi_xfer_get_init(handle, PERF_LEVEL_SET, SCMI_PROTOCOL_PERF,
-				 sizeof(*lvl), 0, &t);
+	ret = ph->xops->xfer_get_init(ph, PERF_LEVEL_SET, sizeof(*lvl), 0, &t);
 	if (ret)
 		return ret;
 
@@ -450,16 +466,16 @@ static int scmi_perf_mb_level_set(const struct scmi_handle *handle, u32 domain,
 	lvl->domain = cpu_to_le32(domain);
 	lvl->level = cpu_to_le32(level);
 
-	ret = scmi_do_xfer(handle, t);
+	ret = ph->xops->do_xfer(ph, t);
 
-	scmi_xfer_put(handle, t);
+	ph->xops->xfer_put(ph, t);
 	return ret;
 }
 
-static int scmi_perf_level_set(const struct scmi_handle *handle, u32 domain,
-			       u32 level, bool poll)
+static int scmi_perf_level_set(const struct scmi_protocol_handle *ph,
+			       u32 domain, u32 level, bool poll)
 {
-	struct scmi_perf_info *pi = handle->perf_priv;
+	struct scmi_perf_info *pi = ph->get_priv(ph);
 	struct perf_dom_info *dom = pi->dom_info + domain;
 
 	if (dom->fc_info && dom->fc_info->level_set_addr) {
@@ -468,35 +484,44 @@ static int scmi_perf_level_set(const struct scmi_handle *handle, u32 domain,
 		return 0;
 	}
 
-	return scmi_perf_mb_level_set(handle, domain, level, poll);
+	return scmi_perf_mb_level_set(ph, domain, level, poll);
+}
+
+static int __scmi_perf_level_set(const struct scmi_handle *handle,
+				 u32 domain, u32 level, bool poll)
+{
+	const struct scmi_protocol_handle *ph =
+		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_PERF);
+
+	return scmi_perf_level_set(ph, domain, level, poll);
 }
 
-static int scmi_perf_mb_level_get(const struct scmi_handle *handle, u32 domain,
-				  u32 *level, bool poll)
+static int scmi_perf_mb_level_get(const struct scmi_protocol_handle *ph,
+				  u32 domain, u32 *level, bool poll)
 {
 	int ret;
 	struct scmi_xfer *t;
 
-	ret = scmi_xfer_get_init(handle, PERF_LEVEL_GET, SCMI_PROTOCOL_PERF,
-				 sizeof(u32), sizeof(u32), &t);
+	ret = ph->xops->xfer_get_init(ph, PERF_LEVEL_GET,
+				     sizeof(u32), sizeof(u32), &t);
 	if (ret)
 		return ret;
 
 	t->hdr.poll_completion = poll;
 	put_unaligned_le32(domain, t->tx.buf);
 
-	ret = scmi_do_xfer(handle, t);
+	ret = ph->xops->do_xfer(ph, t);
 	if (!ret)
 		*level = get_unaligned_le32(t->rx.buf);
 
-	scmi_xfer_put(handle, t);
+	ph->xops->xfer_put(ph, t);
 	return ret;
 }
 
-static int scmi_perf_level_get(const struct scmi_handle *handle, u32 domain,
-			       u32 *level, bool poll)
+static int scmi_perf_level_get(const struct scmi_protocol_handle *ph,
+			       u32 domain, u32 *level, bool poll)
 {
-	struct scmi_perf_info *pi = handle->perf_priv;
+	struct scmi_perf_info *pi = ph->get_priv(ph);
 	struct perf_dom_info *dom = pi->dom_info + domain;
 
 	if (dom->fc_info && dom->fc_info->level_get_addr) {
@@ -504,10 +529,19 @@ static int scmi_perf_level_get(const struct scmi_handle *handle, u32 domain,
 		return 0;
 	}
 
-	return scmi_perf_mb_level_get(handle, domain, level, poll);
+	return scmi_perf_mb_level_get(ph, domain, level, poll);
+}
+
+static int __scmi_perf_level_get(const struct scmi_handle *handle,
+				 u32 domain, u32 *level, bool poll)
+{
+	const struct scmi_protocol_handle *ph =
+		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_PERF);
+
+	return scmi_perf_level_get(ph, domain, level, poll);
 }
 
-static int scmi_perf_level_limits_notify(const struct scmi_handle *handle,
+static int scmi_perf_level_limits_notify(const struct scmi_protocol_handle *ph,
 					 u32 domain, int message_id,
 					 bool enable)
 {
@@ -515,8 +549,7 @@ static int scmi_perf_level_limits_notify(const struct scmi_handle *handle,
 	struct scmi_xfer *t;
 	struct scmi_perf_notify_level_or_limits *notify;
 
-	ret = scmi_xfer_get_init(handle, message_id, SCMI_PROTOCOL_PERF,
-				 sizeof(*notify), 0, &t);
+	ret = ph->xops->xfer_get_init(ph, message_id, sizeof(*notify), 0, &t);
 	if (ret)
 		return ret;
 
@@ -524,9 +557,9 @@ static int scmi_perf_level_limits_notify(const struct scmi_handle *handle,
 	notify->domain = cpu_to_le32(domain);
 	notify->notify_enable = enable ? cpu_to_le32(BIT(0)) : 0;
 
-	ret = scmi_do_xfer(handle, t);
+	ret = ph->xops->do_xfer(ph, t);
 
-	scmi_xfer_put(handle, t);
+	ph->xops->xfer_put(ph, t);
 	return ret;
 }
 
@@ -540,7 +573,7 @@ static bool scmi_perf_fc_size_is_valid(u32 msg, u32 size)
 }
 
 static void
-scmi_perf_domain_desc_fc(const struct scmi_handle *handle, u32 domain,
+scmi_perf_domain_desc_fc(const struct scmi_protocol_handle *ph, u32 domain,
 			 u32 message_id, void __iomem **p_addr,
 			 struct scmi_fc_db_info **p_db)
 {
@@ -557,9 +590,8 @@ scmi_perf_domain_desc_fc(const struct scmi_handle *handle, u32 domain,
 	if (!p_addr)
 		return;
 
-	ret = scmi_xfer_get_init(handle, PERF_DESCRIBE_FASTCHANNEL,
-				 SCMI_PROTOCOL_PERF,
-				 sizeof(*info), sizeof(*resp), &t);
+	ret = ph->xops->xfer_get_init(ph, PERF_DESCRIBE_FASTCHANNEL,
+				      sizeof(*info), sizeof(*resp), &t);
 	if (ret)
 		return;
 
@@ -567,7 +599,7 @@ scmi_perf_domain_desc_fc(const struct scmi_handle *handle, u32 domain,
 	info->domain = cpu_to_le32(domain);
 	info->message_id = cpu_to_le32(message_id);
 
-	ret = scmi_do_xfer(handle, t);
+	ret = ph->xops->do_xfer(ph, t);
 	if (ret)
 		goto err_xfer;
 
@@ -579,20 +611,20 @@ scmi_perf_domain_desc_fc(const struct scmi_handle *handle, u32 domain,
 
 	phys_addr = le32_to_cpu(resp->chan_addr_low);
 	phys_addr |= (u64)le32_to_cpu(resp->chan_addr_high) << 32;
-	addr = devm_ioremap(handle->dev, phys_addr, size);
+	addr = devm_ioremap(ph->dev, phys_addr, size);
 	if (!addr)
 		goto err_xfer;
 	*p_addr = addr;
 
 	if (p_db && SUPPORTS_DOORBELL(flags)) {
-		db = devm_kzalloc(handle->dev, sizeof(*db), GFP_KERNEL);
+		db = devm_kzalloc(ph->dev, sizeof(*db), GFP_KERNEL);
 		if (!db)
 			goto err_xfer;
 
 		size = 1 << DOORBELL_REG_WIDTH(flags);
 		phys_addr = le32_to_cpu(resp->db_addr_low);
 		phys_addr |= (u64)le32_to_cpu(resp->db_addr_high) << 32;
-		addr = devm_ioremap(handle->dev, phys_addr, size);
+		addr = devm_ioremap(ph->dev, phys_addr, size);
 		if (!addr)
 			goto err_xfer;
 
@@ -605,25 +637,25 @@ scmi_perf_domain_desc_fc(const struct scmi_handle *handle, u32 domain,
 		*p_db = db;
 	}
 err_xfer:
-	scmi_xfer_put(handle, t);
+	ph->xops->xfer_put(ph, t);
 }
 
-static void scmi_perf_domain_init_fc(const struct scmi_handle *handle,
+static void scmi_perf_domain_init_fc(const struct scmi_protocol_handle *ph,
 				     u32 domain, struct scmi_fc_info **p_fc)
 {
 	struct scmi_fc_info *fc;
 
-	fc = devm_kzalloc(handle->dev, sizeof(*fc), GFP_KERNEL);
+	fc = devm_kzalloc(ph->dev, sizeof(*fc), GFP_KERNEL);
 	if (!fc)
 		return;
 
-	scmi_perf_domain_desc_fc(handle, domain, PERF_LEVEL_SET,
+	scmi_perf_domain_desc_fc(ph, domain, PERF_LEVEL_SET,
 				 &fc->level_set_addr, &fc->level_set_db);
-	scmi_perf_domain_desc_fc(handle, domain, PERF_LEVEL_GET,
+	scmi_perf_domain_desc_fc(ph, domain, PERF_LEVEL_GET,
 				 &fc->level_get_addr, NULL);
-	scmi_perf_domain_desc_fc(handle, domain, PERF_LIMITS_SET,
+	scmi_perf_domain_desc_fc(ph, domain, PERF_LIMITS_SET,
 				 &fc->limit_set_addr, &fc->limit_set_db);
-	scmi_perf_domain_desc_fc(handle, domain, PERF_LIMITS_GET,
+	scmi_perf_domain_desc_fc(ph, domain, PERF_LIMITS_GET,
 				 &fc->limit_get_addr, NULL);
 	*p_fc = fc;
 }
@@ -640,14 +672,14 @@ static int scmi_dev_domain_id(struct device *dev)
 	return clkspec.args[0];
 }
 
-static int scmi_dvfs_device_opps_add(const struct scmi_handle *handle,
+static int scmi_dvfs_device_opps_add(const struct scmi_protocol_handle *ph,
 				     struct device *dev)
 {
 	int idx, ret, domain;
 	unsigned long freq;
 	struct scmi_opp *opp;
 	struct perf_dom_info *dom;
-	struct scmi_perf_info *pi = handle->perf_priv;
+	struct scmi_perf_info *pi = ph->get_priv(ph);
 
 	domain = scmi_dev_domain_id(dev);
 	if (domain < 0)
@@ -672,11 +704,21 @@ static int scmi_dvfs_device_opps_add(const struct scmi_handle *handle,
 	return 0;
 }
 
-static int scmi_dvfs_transition_latency_get(const struct scmi_handle *handle,
-					    struct device *dev)
+static int __scmi_dvfs_device_opps_add(const struct scmi_handle *handle,
+				       struct device *dev)
+{
+	const struct scmi_protocol_handle *ph =
+		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_PERF);
+
+	return scmi_dvfs_device_opps_add(ph, dev);
+}
+
+static int
+scmi_dvfs_transition_latency_get(const struct scmi_protocol_handle *ph,
+				 struct device *dev)
 {
 	struct perf_dom_info *dom;
-	struct scmi_perf_info *pi = handle->perf_priv;
+	struct scmi_perf_info *pi = ph->get_priv(ph);
 	int domain = scmi_dev_domain_id(dev);
 
 	if (domain < 0)
@@ -687,35 +729,63 @@ static int scmi_dvfs_transition_latency_get(const struct scmi_handle *handle,
 	return dom->opp[dom->opp_count - 1].trans_latency_us * 1000;
 }
 
-static int scmi_dvfs_freq_set(const struct scmi_handle *handle, u32 domain,
+static int
+__scmi_dvfs_transition_latency_get(const struct scmi_handle *handle,
+				   struct device *dev)
+{
+	const struct scmi_protocol_handle *ph =
+		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_PERF);
+
+	return scmi_dvfs_transition_latency_get(ph, dev);
+}
+
+static int scmi_dvfs_freq_set(const struct scmi_protocol_handle *ph, u32 domain,
 			      unsigned long freq, bool poll)
 {
-	struct scmi_perf_info *pi = handle->perf_priv;
+	struct scmi_perf_info *pi = ph->get_priv(ph);
 	struct perf_dom_info *dom = pi->dom_info + domain;
 
-	return scmi_perf_level_set(handle, domain, freq / dom->mult_factor,
-				   poll);
+	return scmi_perf_level_set(ph, domain, freq / dom->mult_factor, poll);
 }
 
-static int scmi_dvfs_freq_get(const struct scmi_handle *handle, u32 domain,
+static int __scmi_dvfs_freq_set(const struct scmi_handle *handle,
+				u32 domain, unsigned long freq, bool poll)
+{
+	const struct scmi_protocol_handle *ph =
+		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_PERF);
+
+	return scmi_dvfs_freq_set(ph, domain, freq, poll);
+}
+
+static int scmi_dvfs_freq_get(const struct scmi_protocol_handle *ph, u32 domain,
 			      unsigned long *freq, bool poll)
 {
 	int ret;
 	u32 level;
-	struct scmi_perf_info *pi = handle->perf_priv;
+	struct scmi_perf_info *pi = ph->get_priv(ph);
 	struct perf_dom_info *dom = pi->dom_info + domain;
 
-	ret = scmi_perf_level_get(handle, domain, &level, poll);
+	ret = scmi_perf_level_get(ph, domain, &level, poll);
 	if (!ret)
 		*freq = level * dom->mult_factor;
 
 	return ret;
 }
 
-static int scmi_dvfs_est_power_get(const struct scmi_handle *handle, u32 domain,
-				   unsigned long *freq, unsigned long *power)
+static int __scmi_dvfs_freq_get(const struct scmi_handle *handle, u32 domain,
+				unsigned long *freq, bool poll)
 {
-	struct scmi_perf_info *pi = handle->perf_priv;
+	const struct scmi_protocol_handle *ph =
+		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_PERF);
+
+	return scmi_dvfs_freq_get(ph, domain, freq, poll);
+}
+
+static int scmi_dvfs_est_power_get(const struct scmi_protocol_handle *ph,
+				   u32 domain, unsigned long *freq,
+				   unsigned long *power)
+{
+	struct scmi_perf_info *pi = ph->get_priv(ph);
 	struct perf_dom_info *dom;
 	unsigned long opp_freq;
 	int idx, ret = -EINVAL;
@@ -739,25 +809,67 @@ static int scmi_dvfs_est_power_get(const struct scmi_handle *handle, u32 domain,
 	return ret;
 }
 
-static bool scmi_fast_switch_possible(const struct scmi_handle *handle,
+static int __scmi_dvfs_est_power_get(const struct scmi_handle *handle,
+				     u32 domain, unsigned long *freq,
+				     unsigned long *power)
+{
+	const struct scmi_protocol_handle *ph =
+		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_PERF);
+
+	return scmi_dvfs_est_power_get(ph, domain, freq, power);
+}
+
+static bool scmi_fast_switch_possible(const struct scmi_protocol_handle *ph,
 				      struct device *dev)
 {
 	struct perf_dom_info *dom;
-	struct scmi_perf_info *pi = handle->perf_priv;
+	struct scmi_perf_info *pi = ph->get_priv(ph);
 
 	dom = pi->dom_info + scmi_dev_domain_id(dev);
 
 	return dom->fc_info && dom->fc_info->level_set_addr;
 }
 
-static bool scmi_power_scale_mw_get(const struct scmi_handle *handle)
+static bool __scmi_fast_switch_possible(const struct scmi_handle *handle,
+					struct device *dev)
 {
-	struct scmi_perf_info *pi = handle->perf_priv;
+	const struct scmi_protocol_handle *ph =
+		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_PERF);
+
+	return scmi_fast_switch_possible(ph, dev);
+}
+
+static bool scmi_power_scale_mw_get(const struct scmi_protocol_handle *ph)
+{
+	struct scmi_perf_info *pi = ph->get_priv(ph);
 
 	return pi->power_scale_mw;
 }
 
+static bool __scmi_power_scale_mw_get(const struct scmi_handle *handle)
+{
+	const struct scmi_protocol_handle *ph =
+			scmi_map_protocol_handle(handle, SCMI_PROTOCOL_PERF);
+
+	return scmi_power_scale_mw_get(ph);
+}
+
 static const struct scmi_perf_ops perf_ops = {
+	.limits_set = __scmi_perf_limits_set,
+	.limits_get = __scmi_perf_limits_get,
+	.level_set = __scmi_perf_level_set,
+	.level_get = __scmi_perf_level_get,
+	.device_domain_id = scmi_dev_domain_id,
+	.transition_latency_get = __scmi_dvfs_transition_latency_get,
+	.device_opps_add = __scmi_dvfs_device_opps_add,
+	.freq_set = __scmi_dvfs_freq_set,
+	.freq_get = __scmi_dvfs_freq_get,
+	.est_power_get = __scmi_dvfs_est_power_get,
+	.fast_switch_possible = __scmi_fast_switch_possible,
+	.power_scale_mw_get = __scmi_power_scale_mw_get,
+};
+
+static const struct scmi_perf_proto_ops perf_proto_ops = {
 	.limits_set = scmi_perf_limits_set,
 	.limits_get = scmi_perf_limits_get,
 	.level_set = scmi_perf_level_set,
@@ -772,7 +884,7 @@ static const struct scmi_perf_ops perf_ops = {
 	.power_scale_mw_get = scmi_power_scale_mw_get,
 };
 
-static int scmi_perf_set_notify_enabled(const void *handle,
+static int scmi_perf_set_notify_enabled(const void *ph,
 					u8 evt_id, u32 src_id, bool enable)
 {
 	int ret, cmd_id;
@@ -781,7 +893,7 @@ static int scmi_perf_set_notify_enabled(const void *handle,
 		return -EINVAL;
 
 	cmd_id = evt_2_cmd[evt_id];
-	ret = scmi_perf_level_limits_notify(handle, src_id, cmd_id, enable);
+	ret = scmi_perf_level_limits_notify(ph, src_id, cmd_id, enable);
 	if (ret)
 		pr_debug("FAIL_ENABLED - evt[%X] dom[%d] - ret:%d\n",
 			 evt_id, src_id, ret);
@@ -789,7 +901,7 @@ static int scmi_perf_set_notify_enabled(const void *handle,
 	return ret;
 }
 
-static void *scmi_perf_fill_custom_report(const void *handle,
+static void *scmi_perf_fill_custom_report(const void *ph,
 					  u8 evt_id, ktime_t timestamp,
 					  const void *payld, size_t payld_sz,
 					  void *report, u32 *src_id)
@@ -837,10 +949,10 @@ static void *scmi_perf_fill_custom_report(const void *handle,
 	return rep;
 }
 
-static int scmi_perf_get_num_sources(const void *handle)
+static int scmi_perf_get_num_sources(const void *ph)
 {
 	struct scmi_perf_info *pi =
-		((const struct scmi_handle *)(handle))->perf_priv;
+		((const struct scmi_protocol_handle *)ph)->get_priv(ph);
 
 	if (!pi)
 		return -EINVAL;
@@ -874,24 +986,25 @@ static const struct scmi_protocol_events perf_protocol_events = {
 	.num_events = ARRAY_SIZE(perf_events),
 };
 
-static int scmi_perf_protocol_init(struct scmi_handle *handle)
+static int scmi_perf_protocol_init(const struct scmi_protocol_handle *ph)
 {
 	int domain;
 	u32 version;
 	struct scmi_perf_info *pinfo;
+	struct scmi_handle *handle;
 
-	scmi_version_get(handle, SCMI_PROTOCOL_PERF, &version);
+	ph->xops->version_get(ph, &version);
 
-	dev_dbg(handle->dev, "Performance Version %d.%d\n",
+	dev_dbg(ph->dev, "Performance Version %d.%d\n",
 		PROTOCOL_REV_MAJOR(version), PROTOCOL_REV_MINOR(version));
 
-	pinfo = devm_kzalloc(handle->dev, sizeof(*pinfo), GFP_KERNEL);
+	pinfo = devm_kzalloc(ph->dev, sizeof(*pinfo), GFP_KERNEL);
 	if (!pinfo)
 		return -ENOMEM;
 
-	scmi_perf_attributes_get(handle, pinfo);
+	scmi_perf_attributes_get(ph, pinfo);
 
-	pinfo->dom_info = devm_kcalloc(handle->dev, pinfo->num_domains,
+	pinfo->dom_info = devm_kcalloc(ph->dev, pinfo->num_domains,
 				       sizeof(*pinfo->dom_info), GFP_KERNEL);
 	if (!pinfo->dom_info)
 		return -ENOMEM;
@@ -899,24 +1012,26 @@ static int scmi_perf_protocol_init(struct scmi_handle *handle)
 	for (domain = 0; domain < pinfo->num_domains; domain++) {
 		struct perf_dom_info *dom = pinfo->dom_info + domain;
 
-		scmi_perf_domain_attributes_get(handle, domain, dom);
-		scmi_perf_describe_levels_get(handle, domain, dom);
+		scmi_perf_domain_attributes_get(ph, domain, dom);
+		scmi_perf_describe_levels_get(ph, domain, dom);
 
 		if (dom->perf_fastchannels)
-			scmi_perf_domain_init_fc(handle, domain, &dom->fc_info);
+			scmi_perf_domain_init_fc(ph, domain, &dom->fc_info);
 	}
 
 	pinfo->version = version;
+
+	/* Transient code for legacy ops interface */
+	handle = scmi_map_scmi_handle(ph);
 	handle->perf_ops = &perf_ops;
-	handle->perf_priv = pinfo;
 
-	return 0;
+	return ph->set_priv(ph, pinfo);
 }
 
 static const struct scmi_protocol scmi_perf = {
 	.id = SCMI_PROTOCOL_PERF,
-	.init = &scmi_perf_protocol_init,
-	.ops = &perf_ops,
+	.instance_init = &scmi_perf_protocol_init,
+	.ops = &perf_proto_ops,
 	.events = &perf_protocol_events,
 };
 
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index 2b66b243e841..c658aec78e15 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -85,7 +85,7 @@ struct scmi_clk_ops {
 };
 
 /**
- * struct scmi_perf_ops - represents the various operations provided
+ * struct scmi_perf_proto_ops - represents the various operations provided
  *	by SCMI Performance Protocol
  *
  * @limits_set: sets limits on the performance level of a domain
@@ -102,6 +102,31 @@ struct scmi_clk_ops {
  * @est_power_get: gets the estimated power cost for a given performance domain
  *	at a given frequency
  */
+struct scmi_perf_proto_ops {
+	int (*limits_set)(const struct scmi_protocol_handle *ph, u32 domain,
+			  u32 max_perf, u32 min_perf);
+	int (*limits_get)(const struct scmi_protocol_handle *ph, u32 domain,
+			  u32 *max_perf, u32 *min_perf);
+	int (*level_set)(const struct scmi_protocol_handle *ph, u32 domain,
+			 u32 level, bool poll);
+	int (*level_get)(const struct scmi_protocol_handle *ph, u32 domain,
+			 u32 *level, bool poll);
+	int (*device_domain_id)(struct device *dev);
+	int (*transition_latency_get)(const struct scmi_protocol_handle *ph,
+				      struct device *dev);
+	int (*device_opps_add)(const struct scmi_protocol_handle *ph,
+			       struct device *dev);
+	int (*freq_set)(const struct scmi_protocol_handle *ph, u32 domain,
+			unsigned long rate, bool poll);
+	int (*freq_get)(const struct scmi_protocol_handle *ph, u32 domain,
+			unsigned long *rate, bool poll);
+	int (*est_power_get)(const struct scmi_protocol_handle *ph, u32 domain,
+			     unsigned long *rate, unsigned long *power);
+	bool (*fast_switch_possible)(const struct scmi_protocol_handle *ph,
+				     struct device *dev);
+	bool (*power_scale_mw_get)(const struct scmi_protocol_handle *ph);
+};
+
 struct scmi_perf_ops {
 	int (*limits_set)(const struct scmi_handle *handle, u32 domain,
 			  u32 max_perf, u32 min_perf);
@@ -615,8 +640,6 @@ struct scmi_notify_ops {
  *		       operations and a dedicated protocol handler
  * @devm_protocol_put: devres managed method to release a protocol
  * @notify_ops: pointer to set of notifications related operations
- * @perf_priv: pointer to private data structure specific to performance
- *	protocol(for internal use only)
  * @clk_priv: pointer to private data structure specific to clock
  *	protocol(for internal use only)
  * @power_priv: pointer to private data structure specific to power
@@ -647,7 +670,6 @@ struct scmi_handle {
 
 	const struct scmi_notify_ops *notify_ops;
 	/* for protocol internal use */
-	void *perf_priv;
 	void *clk_priv;
 	void *power_priv;
 	void *sensor_priv;
-- 
cgit v1.2.3


From f58315a49ce98007870e1206ca5d79fee09cf36e Mon Sep 17 00:00:00 2001
From: Cristian Marussi <cristian.marussi@arm.com>
Date: Tue, 16 Mar 2021 12:48:38 +0000
Subject: firmware: arm_scmi: Remove legacy scmi_perf_ops protocol interface

Now that all the SCMI driver users have been migrated to the new interface,
remove the legacy interface and all the transient code.

Link: https://lore.kernel.org/r/20210316124903.35011-14-cristian.marussi@arm.com
Tested-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_scmi/perf.c | 120 ---------------------------------------
 include/linux/scmi_protocol.h    |  27 ---------
 2 files changed, 147 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/arm_scmi/perf.c b/drivers/firmware/arm_scmi/perf.c
index 73f04d7809b3..9770821c4819 100644
--- a/drivers/firmware/arm_scmi/perf.c
+++ b/drivers/firmware/arm_scmi/perf.c
@@ -391,15 +391,6 @@ static int scmi_perf_limits_set(const struct scmi_protocol_handle *ph,
 	return scmi_perf_mb_limits_set(ph, domain, max_perf, min_perf);
 }
 
-static int __scmi_perf_limits_set(const struct scmi_handle *handle,
-				  u32 domain, u32 max_perf, u32 min_perf)
-{
-	const struct scmi_protocol_handle *ph =
-		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_PERF);
-
-	return scmi_perf_limits_set(ph, domain, max_perf, min_perf);
-}
-
 static int scmi_perf_mb_limits_get(const struct scmi_protocol_handle *ph,
 				   u32 domain, u32 *max_perf, u32 *min_perf)
 {
@@ -441,15 +432,6 @@ static int scmi_perf_limits_get(const struct scmi_protocol_handle *ph,
 	return scmi_perf_mb_limits_get(ph, domain, max_perf, min_perf);
 }
 
-static int __scmi_perf_limits_get(const struct scmi_handle *handle,
-				  u32 domain, u32 *max_perf, u32 *min_perf)
-{
-	const struct scmi_protocol_handle *ph =
-		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_PERF);
-
-	return scmi_perf_limits_get(ph, domain, max_perf, min_perf);
-}
-
 static int scmi_perf_mb_level_set(const struct scmi_protocol_handle *ph,
 				  u32 domain, u32 level, bool poll)
 {
@@ -487,15 +469,6 @@ static int scmi_perf_level_set(const struct scmi_protocol_handle *ph,
 	return scmi_perf_mb_level_set(ph, domain, level, poll);
 }
 
-static int __scmi_perf_level_set(const struct scmi_handle *handle,
-				 u32 domain, u32 level, bool poll)
-{
-	const struct scmi_protocol_handle *ph =
-		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_PERF);
-
-	return scmi_perf_level_set(ph, domain, level, poll);
-}
-
 static int scmi_perf_mb_level_get(const struct scmi_protocol_handle *ph,
 				  u32 domain, u32 *level, bool poll)
 {
@@ -532,15 +505,6 @@ static int scmi_perf_level_get(const struct scmi_protocol_handle *ph,
 	return scmi_perf_mb_level_get(ph, domain, level, poll);
 }
 
-static int __scmi_perf_level_get(const struct scmi_handle *handle,
-				 u32 domain, u32 *level, bool poll)
-{
-	const struct scmi_protocol_handle *ph =
-		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_PERF);
-
-	return scmi_perf_level_get(ph, domain, level, poll);
-}
-
 static int scmi_perf_level_limits_notify(const struct scmi_protocol_handle *ph,
 					 u32 domain, int message_id,
 					 bool enable)
@@ -704,15 +668,6 @@ static int scmi_dvfs_device_opps_add(const struct scmi_protocol_handle *ph,
 	return 0;
 }
 
-static int __scmi_dvfs_device_opps_add(const struct scmi_handle *handle,
-				       struct device *dev)
-{
-	const struct scmi_protocol_handle *ph =
-		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_PERF);
-
-	return scmi_dvfs_device_opps_add(ph, dev);
-}
-
 static int
 scmi_dvfs_transition_latency_get(const struct scmi_protocol_handle *ph,
 				 struct device *dev)
@@ -729,16 +684,6 @@ scmi_dvfs_transition_latency_get(const struct scmi_protocol_handle *ph,
 	return dom->opp[dom->opp_count - 1].trans_latency_us * 1000;
 }
 
-static int
-__scmi_dvfs_transition_latency_get(const struct scmi_handle *handle,
-				   struct device *dev)
-{
-	const struct scmi_protocol_handle *ph =
-		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_PERF);
-
-	return scmi_dvfs_transition_latency_get(ph, dev);
-}
-
 static int scmi_dvfs_freq_set(const struct scmi_protocol_handle *ph, u32 domain,
 			      unsigned long freq, bool poll)
 {
@@ -748,15 +693,6 @@ static int scmi_dvfs_freq_set(const struct scmi_protocol_handle *ph, u32 domain,
 	return scmi_perf_level_set(ph, domain, freq / dom->mult_factor, poll);
 }
 
-static int __scmi_dvfs_freq_set(const struct scmi_handle *handle,
-				u32 domain, unsigned long freq, bool poll)
-{
-	const struct scmi_protocol_handle *ph =
-		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_PERF);
-
-	return scmi_dvfs_freq_set(ph, domain, freq, poll);
-}
-
 static int scmi_dvfs_freq_get(const struct scmi_protocol_handle *ph, u32 domain,
 			      unsigned long *freq, bool poll)
 {
@@ -772,15 +708,6 @@ static int scmi_dvfs_freq_get(const struct scmi_protocol_handle *ph, u32 domain,
 	return ret;
 }
 
-static int __scmi_dvfs_freq_get(const struct scmi_handle *handle, u32 domain,
-				unsigned long *freq, bool poll)
-{
-	const struct scmi_protocol_handle *ph =
-		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_PERF);
-
-	return scmi_dvfs_freq_get(ph, domain, freq, poll);
-}
-
 static int scmi_dvfs_est_power_get(const struct scmi_protocol_handle *ph,
 				   u32 domain, unsigned long *freq,
 				   unsigned long *power)
@@ -809,16 +736,6 @@ static int scmi_dvfs_est_power_get(const struct scmi_protocol_handle *ph,
 	return ret;
 }
 
-static int __scmi_dvfs_est_power_get(const struct scmi_handle *handle,
-				     u32 domain, unsigned long *freq,
-				     unsigned long *power)
-{
-	const struct scmi_protocol_handle *ph =
-		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_PERF);
-
-	return scmi_dvfs_est_power_get(ph, domain, freq, power);
-}
-
 static bool scmi_fast_switch_possible(const struct scmi_protocol_handle *ph,
 				      struct device *dev)
 {
@@ -830,15 +747,6 @@ static bool scmi_fast_switch_possible(const struct scmi_protocol_handle *ph,
 	return dom->fc_info && dom->fc_info->level_set_addr;
 }
 
-static bool __scmi_fast_switch_possible(const struct scmi_handle *handle,
-					struct device *dev)
-{
-	const struct scmi_protocol_handle *ph =
-		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_PERF);
-
-	return scmi_fast_switch_possible(ph, dev);
-}
-
 static bool scmi_power_scale_mw_get(const struct scmi_protocol_handle *ph)
 {
 	struct scmi_perf_info *pi = ph->get_priv(ph);
@@ -846,29 +754,6 @@ static bool scmi_power_scale_mw_get(const struct scmi_protocol_handle *ph)
 	return pi->power_scale_mw;
 }
 
-static bool __scmi_power_scale_mw_get(const struct scmi_handle *handle)
-{
-	const struct scmi_protocol_handle *ph =
-			scmi_map_protocol_handle(handle, SCMI_PROTOCOL_PERF);
-
-	return scmi_power_scale_mw_get(ph);
-}
-
-static const struct scmi_perf_ops perf_ops = {
-	.limits_set = __scmi_perf_limits_set,
-	.limits_get = __scmi_perf_limits_get,
-	.level_set = __scmi_perf_level_set,
-	.level_get = __scmi_perf_level_get,
-	.device_domain_id = scmi_dev_domain_id,
-	.transition_latency_get = __scmi_dvfs_transition_latency_get,
-	.device_opps_add = __scmi_dvfs_device_opps_add,
-	.freq_set = __scmi_dvfs_freq_set,
-	.freq_get = __scmi_dvfs_freq_get,
-	.est_power_get = __scmi_dvfs_est_power_get,
-	.fast_switch_possible = __scmi_fast_switch_possible,
-	.power_scale_mw_get = __scmi_power_scale_mw_get,
-};
-
 static const struct scmi_perf_proto_ops perf_proto_ops = {
 	.limits_set = scmi_perf_limits_set,
 	.limits_get = scmi_perf_limits_get,
@@ -991,7 +876,6 @@ static int scmi_perf_protocol_init(const struct scmi_protocol_handle *ph)
 	int domain;
 	u32 version;
 	struct scmi_perf_info *pinfo;
-	struct scmi_handle *handle;
 
 	ph->xops->version_get(ph, &version);
 
@@ -1021,10 +905,6 @@ static int scmi_perf_protocol_init(const struct scmi_protocol_handle *ph)
 
 	pinfo->version = version;
 
-	/* Transient code for legacy ops interface */
-	handle = scmi_map_scmi_handle(ph);
-	handle->perf_ops = &perf_ops;
-
 	return ph->set_priv(ph, pinfo);
 }
 
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index c658aec78e15..c71d7c297125 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -127,31 +127,6 @@ struct scmi_perf_proto_ops {
 	bool (*power_scale_mw_get)(const struct scmi_protocol_handle *ph);
 };
 
-struct scmi_perf_ops {
-	int (*limits_set)(const struct scmi_handle *handle, u32 domain,
-			  u32 max_perf, u32 min_perf);
-	int (*limits_get)(const struct scmi_handle *handle, u32 domain,
-			  u32 *max_perf, u32 *min_perf);
-	int (*level_set)(const struct scmi_handle *handle, u32 domain,
-			 u32 level, bool poll);
-	int (*level_get)(const struct scmi_handle *handle, u32 domain,
-			 u32 *level, bool poll);
-	int (*device_domain_id)(struct device *dev);
-	int (*transition_latency_get)(const struct scmi_handle *handle,
-				      struct device *dev);
-	int (*device_opps_add)(const struct scmi_handle *handle,
-			       struct device *dev);
-	int (*freq_set)(const struct scmi_handle *handle, u32 domain,
-			unsigned long rate, bool poll);
-	int (*freq_get)(const struct scmi_handle *handle, u32 domain,
-			unsigned long *rate, bool poll);
-	int (*est_power_get)(const struct scmi_handle *handle, u32 domain,
-			     unsigned long *rate, unsigned long *power);
-	bool (*fast_switch_possible)(const struct scmi_handle *handle,
-				     struct device *dev);
-	bool (*power_scale_mw_get)(const struct scmi_handle *handle);
-};
-
 /**
  * struct scmi_power_ops - represents the various operations provided
  *	by SCMI Power Protocol
@@ -631,7 +606,6 @@ struct scmi_notify_ops {
  * @dev: pointer to the SCMI device
  * @version: pointer to the structure containing SCMI version information
  * @power_ops: pointer to set of power protocol operations
- * @perf_ops: pointer to set of performance protocol operations
  * @clk_ops: pointer to set of clock protocol operations
  * @sensor_ops: pointer to set of sensor protocol operations
  * @reset_ops: pointer to set of reset protocol operations
@@ -656,7 +630,6 @@ struct scmi_notify_ops {
 struct scmi_handle {
 	struct device *dev;
 	struct scmi_revision_info *version;
-	const struct scmi_perf_ops *perf_ops;
 	const struct scmi_clk_ops *clk_ops;
 	const struct scmi_power_ops *power_ops;
 	const struct scmi_sensor_ops *sensor_ops;
-- 
cgit v1.2.3


From 9bc8069c85678e9a61584b58951b7c5fcdd5d357 Mon Sep 17 00:00:00 2001
From: Cristian Marussi <cristian.marussi@arm.com>
Date: Tue, 16 Mar 2021 12:48:39 +0000
Subject: firmware: arm_scmi: Port power protocol to new protocols interface

Convert the internals of the protocol implementation to use protocol
handles and expose a new protocol operations interface for SCMI drivers
using the new common get/put operations, while keeping the old
handle->power_ops around to ease the transition.

Remove handle->power_priv, which is now unused.

Link: https://lore.kernel.org/r/20210316124903.35011-15-cristian.marussi@arm.com
Tested-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_scmi/power.c | 141 +++++++++++++++++++++++++-------------
 include/linux/scmi_protocol.h     |  20 ++++--
 2 files changed, 105 insertions(+), 56 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/arm_scmi/power.c b/drivers/firmware/arm_scmi/power.c
index 5f43c45ebc13..c950c1fc0fbe 100644
--- a/drivers/firmware/arm_scmi/power.c
+++ b/drivers/firmware/arm_scmi/power.c
@@ -68,21 +68,21 @@ struct scmi_power_info {
 	struct power_dom_info *dom_info;
 };
 
-static int scmi_power_attributes_get(const struct scmi_handle *handle,
+static int scmi_power_attributes_get(const struct scmi_protocol_handle *ph,
 				     struct scmi_power_info *pi)
 {
 	int ret;
 	struct scmi_xfer *t;
 	struct scmi_msg_resp_power_attributes *attr;
 
-	ret = scmi_xfer_get_init(handle, PROTOCOL_ATTRIBUTES,
-				 SCMI_PROTOCOL_POWER, 0, sizeof(*attr), &t);
+	ret = ph->xops->xfer_get_init(ph, PROTOCOL_ATTRIBUTES,
+				      0, sizeof(*attr), &t);
 	if (ret)
 		return ret;
 
 	attr = t->rx.buf;
 
-	ret = scmi_do_xfer(handle, t);
+	ret = ph->xops->do_xfer(ph, t);
 	if (!ret) {
 		pi->num_domains = le16_to_cpu(attr->num_domains);
 		pi->stats_addr = le32_to_cpu(attr->stats_addr_low) |
@@ -90,28 +90,27 @@ static int scmi_power_attributes_get(const struct scmi_handle *handle,
 		pi->stats_size = le32_to_cpu(attr->stats_size);
 	}
 
-	scmi_xfer_put(handle, t);
+	ph->xops->xfer_put(ph, t);
 	return ret;
 }
 
 static int
-scmi_power_domain_attributes_get(const struct scmi_handle *handle, u32 domain,
-				 struct power_dom_info *dom_info)
+scmi_power_domain_attributes_get(const struct scmi_protocol_handle *ph,
+				 u32 domain, struct power_dom_info *dom_info)
 {
 	int ret;
 	struct scmi_xfer *t;
 	struct scmi_msg_resp_power_domain_attributes *attr;
 
-	ret = scmi_xfer_get_init(handle, POWER_DOMAIN_ATTRIBUTES,
-				 SCMI_PROTOCOL_POWER, sizeof(domain),
-				 sizeof(*attr), &t);
+	ret = ph->xops->xfer_get_init(ph, POWER_DOMAIN_ATTRIBUTES,
+				      sizeof(domain), sizeof(*attr), &t);
 	if (ret)
 		return ret;
 
 	put_unaligned_le32(domain, t->tx.buf);
 	attr = t->rx.buf;
 
-	ret = scmi_do_xfer(handle, t);
+	ret = ph->xops->do_xfer(ph, t);
 	if (!ret) {
 		u32 flags = le32_to_cpu(attr->flags);
 
@@ -121,19 +120,18 @@ scmi_power_domain_attributes_get(const struct scmi_handle *handle, u32 domain,
 		strlcpy(dom_info->name, attr->name, SCMI_MAX_STR_SIZE);
 	}
 
-	scmi_xfer_put(handle, t);
+	ph->xops->xfer_put(ph, t);
 	return ret;
 }
 
-static int
-scmi_power_state_set(const struct scmi_handle *handle, u32 domain, u32 state)
+static int scmi_power_state_set(const struct scmi_protocol_handle *ph,
+				u32 domain, u32 state)
 {
 	int ret;
 	struct scmi_xfer *t;
 	struct scmi_power_set_state *st;
 
-	ret = scmi_xfer_get_init(handle, POWER_STATE_SET, SCMI_PROTOCOL_POWER,
-				 sizeof(*st), 0, &t);
+	ret = ph->xops->xfer_get_init(ph, POWER_STATE_SET, sizeof(*st), 0, &t);
 	if (ret)
 		return ret;
 
@@ -142,64 +140,106 @@ scmi_power_state_set(const struct scmi_handle *handle, u32 domain, u32 state)
 	st->domain = cpu_to_le32(domain);
 	st->state = cpu_to_le32(state);
 
-	ret = scmi_do_xfer(handle, t);
+	ret = ph->xops->do_xfer(ph, t);
 
-	scmi_xfer_put(handle, t);
+	ph->xops->xfer_put(ph, t);
 	return ret;
 }
 
-static int
-scmi_power_state_get(const struct scmi_handle *handle, u32 domain, u32 *state)
+static int __scmi_power_state_set(const struct scmi_handle *handle,
+				  u32 domain, u32 state)
+{
+	const struct scmi_protocol_handle *ph =
+		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_POWER);
+
+	return scmi_power_state_set(ph, domain, state);
+}
+
+static int scmi_power_state_get(const struct scmi_protocol_handle *ph,
+				u32 domain, u32 *state)
 {
 	int ret;
 	struct scmi_xfer *t;
 
-	ret = scmi_xfer_get_init(handle, POWER_STATE_GET, SCMI_PROTOCOL_POWER,
-				 sizeof(u32), sizeof(u32), &t);
+	ret = ph->xops->xfer_get_init(ph, POWER_STATE_GET, sizeof(u32), sizeof(u32), &t);
 	if (ret)
 		return ret;
 
 	put_unaligned_le32(domain, t->tx.buf);
 
-	ret = scmi_do_xfer(handle, t);
+	ret = ph->xops->do_xfer(ph, t);
 	if (!ret)
 		*state = get_unaligned_le32(t->rx.buf);
 
-	scmi_xfer_put(handle, t);
+	ph->xops->xfer_put(ph, t);
 	return ret;
 }
 
-static int scmi_power_num_domains_get(const struct scmi_handle *handle)
+static int __scmi_power_state_get(const struct scmi_handle *handle,
+				  u32 domain, u32 *state)
 {
-	struct scmi_power_info *pi = handle->power_priv;
+	const struct scmi_protocol_handle *ph =
+		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_POWER);
+
+	return scmi_power_state_get(ph, domain, state);
+}
+
+static int scmi_power_num_domains_get(const struct scmi_protocol_handle *ph)
+{
+	struct scmi_power_info *pi = ph->get_priv(ph);
 
 	return pi->num_domains;
 }
 
-static char *scmi_power_name_get(const struct scmi_handle *handle, u32 domain)
+static int __scmi_power_num_domains_get(const struct scmi_handle *handle)
 {
-	struct scmi_power_info *pi = handle->power_priv;
+	const struct scmi_protocol_handle *ph =
+		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_POWER);
+
+	return scmi_power_num_domains_get(ph);
+}
+
+static char *scmi_power_name_get(const struct scmi_protocol_handle *ph,
+				 u32 domain)
+{
+	struct scmi_power_info *pi = ph->get_priv(ph);
 	struct power_dom_info *dom = pi->dom_info + domain;
 
 	return dom->name;
 }
 
+static char *__scmi_power_name_get(const struct scmi_handle *handle,
+				   u32 domain)
+{
+	const struct scmi_protocol_handle *ph =
+		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_POWER);
+
+	return scmi_power_name_get(ph, domain);
+}
+
 static const struct scmi_power_ops power_ops = {
+	.num_domains_get = __scmi_power_num_domains_get,
+	.name_get = __scmi_power_name_get,
+	.state_set = __scmi_power_state_set,
+	.state_get = __scmi_power_state_get,
+};
+
+static const struct scmi_power_proto_ops power_proto_ops = {
 	.num_domains_get = scmi_power_num_domains_get,
 	.name_get = scmi_power_name_get,
 	.state_set = scmi_power_state_set,
 	.state_get = scmi_power_state_get,
 };
 
-static int scmi_power_request_notify(const struct scmi_handle *handle,
+static int scmi_power_request_notify(const struct scmi_protocol_handle *ph,
 				     u32 domain, bool enable)
 {
 	int ret;
 	struct scmi_xfer *t;
 	struct scmi_power_state_notify *notify;
 
-	ret = scmi_xfer_get_init(handle, POWER_STATE_NOTIFY,
-				 SCMI_PROTOCOL_POWER, sizeof(*notify), 0, &t);
+	ret = ph->xops->xfer_get_init(ph, POWER_STATE_NOTIFY,
+				      sizeof(*notify), 0, &t);
 	if (ret)
 		return ret;
 
@@ -207,18 +247,18 @@ static int scmi_power_request_notify(const struct scmi_handle *handle,
 	notify->domain = cpu_to_le32(domain);
 	notify->notify_enable = enable ? cpu_to_le32(BIT(0)) : 0;
 
-	ret = scmi_do_xfer(handle, t);
+	ret = ph->xops->do_xfer(ph, t);
 
-	scmi_xfer_put(handle, t);
+	ph->xops->xfer_put(ph, t);
 	return ret;
 }
 
-static int scmi_power_set_notify_enabled(const void *handle,
+static int scmi_power_set_notify_enabled(const void *ph,
 					 u8 evt_id, u32 src_id, bool enable)
 {
 	int ret;
 
-	ret = scmi_power_request_notify(handle, src_id, enable);
+	ret = scmi_power_request_notify(ph, src_id, enable);
 	if (ret)
 		pr_debug("FAIL_ENABLE - evt[%X] dom[%d] - ret:%d\n",
 			 evt_id, src_id, ret);
@@ -226,7 +266,7 @@ static int scmi_power_set_notify_enabled(const void *handle,
 	return ret;
 }
 
-static void *scmi_power_fill_custom_report(const void *handle,
+static void *scmi_power_fill_custom_report(const void *ph,
 					   u8 evt_id, ktime_t timestamp,
 					   const void *payld, size_t payld_sz,
 					   void *report, u32 *src_id)
@@ -246,10 +286,10 @@ static void *scmi_power_fill_custom_report(const void *handle,
 	return r;
 }
 
-static int scmi_power_get_num_sources(const void *handle)
+static int scmi_power_get_num_sources(const void *ph)
 {
 	struct scmi_power_info *pinfo =
-		((const struct scmi_handle *)(handle))->power_priv;
+		((const struct scmi_protocol_handle *)ph)->get_priv(ph);
 
 	if (!pinfo)
 		return -EINVAL;
@@ -279,24 +319,25 @@ static const struct scmi_protocol_events power_protocol_events = {
 	.num_events = ARRAY_SIZE(power_events),
 };
 
-static int scmi_power_protocol_init(struct scmi_handle *handle)
+static int scmi_power_protocol_init(const struct scmi_protocol_handle *ph)
 {
 	int domain;
 	u32 version;
 	struct scmi_power_info *pinfo;
+	struct scmi_handle *handle;
 
-	scmi_version_get(handle, SCMI_PROTOCOL_POWER, &version);
+	ph->xops->version_get(ph, &version);
 
-	dev_dbg(handle->dev, "Power Version %d.%d\n",
+	dev_dbg(ph->dev, "Power Version %d.%d\n",
 		PROTOCOL_REV_MAJOR(version), PROTOCOL_REV_MINOR(version));
 
-	pinfo = devm_kzalloc(handle->dev, sizeof(*pinfo), GFP_KERNEL);
+	pinfo = devm_kzalloc(ph->dev, sizeof(*pinfo), GFP_KERNEL);
 	if (!pinfo)
 		return -ENOMEM;
 
-	scmi_power_attributes_get(handle, pinfo);
+	scmi_power_attributes_get(ph, pinfo);
 
-	pinfo->dom_info = devm_kcalloc(handle->dev, pinfo->num_domains,
+	pinfo->dom_info = devm_kcalloc(ph->dev, pinfo->num_domains,
 				       sizeof(*pinfo->dom_info), GFP_KERNEL);
 	if (!pinfo->dom_info)
 		return -ENOMEM;
@@ -304,20 +345,22 @@ static int scmi_power_protocol_init(struct scmi_handle *handle)
 	for (domain = 0; domain < pinfo->num_domains; domain++) {
 		struct power_dom_info *dom = pinfo->dom_info + domain;
 
-		scmi_power_domain_attributes_get(handle, domain, dom);
+		scmi_power_domain_attributes_get(ph, domain, dom);
 	}
 
 	pinfo->version = version;
+
+	/* Transient code for legacy ops interface */
+	handle = scmi_map_scmi_handle(ph);
 	handle->power_ops = &power_ops;
-	handle->power_priv = pinfo;
 
-	return 0;
+	return ph->set_priv(ph, pinfo);
 }
 
 static const struct scmi_protocol scmi_power = {
 	.id = SCMI_PROTOCOL_POWER,
-	.init = &scmi_power_protocol_init,
-	.ops = &power_ops,
+	.instance_init = &scmi_power_protocol_init,
+	.ops = &power_proto_ops,
 	.events = &power_protocol_events,
 };
 
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index c71d7c297125..dfb3d2f5e0d3 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -128,7 +128,7 @@ struct scmi_perf_proto_ops {
 };
 
 /**
- * struct scmi_power_ops - represents the various operations provided
+ * struct scmi_power_proto_ops - represents the various operations provided
  *	by SCMI Power Protocol
  *
  * @num_domains_get: get the count of power domains provided by SCMI
@@ -136,9 +136,9 @@ struct scmi_perf_proto_ops {
  * @state_set: sets the power state of a power domain
  * @state_get: gets the power state of a power domain
  */
-struct scmi_power_ops {
-	int (*num_domains_get)(const struct scmi_handle *handle);
-	char *(*name_get)(const struct scmi_handle *handle, u32 domain);
+struct scmi_power_proto_ops {
+	int (*num_domains_get)(const struct scmi_protocol_handle *ph);
+	char *(*name_get)(const struct scmi_protocol_handle *ph, u32 domain);
 #define SCMI_POWER_STATE_TYPE_SHIFT	30
 #define SCMI_POWER_STATE_ID_MASK	(BIT(28) - 1)
 #define SCMI_POWER_STATE_PARAM(type, id) \
@@ -146,6 +146,15 @@ struct scmi_power_ops {
 		((id) & SCMI_POWER_STATE_ID_MASK))
 #define SCMI_POWER_STATE_GENERIC_ON	SCMI_POWER_STATE_PARAM(0, 0)
 #define SCMI_POWER_STATE_GENERIC_OFF	SCMI_POWER_STATE_PARAM(1, 0)
+	int (*state_set)(const struct scmi_protocol_handle *ph, u32 domain,
+			 u32 state);
+	int (*state_get)(const struct scmi_protocol_handle *ph, u32 domain,
+			 u32 *state);
+};
+
+struct scmi_power_ops {
+	int (*num_domains_get)(const struct scmi_handle *handle);
+	char *(*name_get)(const struct scmi_handle *handle, u32 domain);
 	int (*state_set)(const struct scmi_handle *handle, u32 domain,
 			 u32 state);
 	int (*state_get)(const struct scmi_handle *handle, u32 domain,
@@ -616,8 +625,6 @@ struct scmi_notify_ops {
  * @notify_ops: pointer to set of notifications related operations
  * @clk_priv: pointer to private data structure specific to clock
  *	protocol(for internal use only)
- * @power_priv: pointer to private data structure specific to power
- *	protocol(for internal use only)
  * @sensor_priv: pointer to private data structure specific to sensors
  *	protocol(for internal use only)
  * @reset_priv: pointer to private data structure specific to reset
@@ -644,7 +651,6 @@ struct scmi_handle {
 	const struct scmi_notify_ops *notify_ops;
 	/* for protocol internal use */
 	void *clk_priv;
-	void *power_priv;
 	void *sensor_priv;
 	void *reset_priv;
 	void *voltage_priv;
-- 
cgit v1.2.3


From 0f84576a62c4b42ae7110f1c27bbb51183f232fb Mon Sep 17 00:00:00 2001
From: Cristian Marussi <cristian.marussi@arm.com>
Date: Tue, 16 Mar 2021 12:48:41 +0000
Subject: firmware: arm_scmi: Remove legacy scmi_power_ops protocol interface

Now that all the SCMI driver users have been migrated to the new interface,
remove the legacy interface and all the transient code.

Link: https://lore.kernel.org/r/20210316124903.35011-17-cristian.marussi@arm.com
Tested-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_scmi/power.c | 47 ---------------------------------------
 include/linux/scmi_protocol.h     | 11 ---------
 2 files changed, 58 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/arm_scmi/power.c b/drivers/firmware/arm_scmi/power.c
index c950c1fc0fbe..ecdf49abd981 100644
--- a/drivers/firmware/arm_scmi/power.c
+++ b/drivers/firmware/arm_scmi/power.c
@@ -146,15 +146,6 @@ static int scmi_power_state_set(const struct scmi_protocol_handle *ph,
 	return ret;
 }
 
-static int __scmi_power_state_set(const struct scmi_handle *handle,
-				  u32 domain, u32 state)
-{
-	const struct scmi_protocol_handle *ph =
-		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_POWER);
-
-	return scmi_power_state_set(ph, domain, state);
-}
-
 static int scmi_power_state_get(const struct scmi_protocol_handle *ph,
 				u32 domain, u32 *state)
 {
@@ -175,15 +166,6 @@ static int scmi_power_state_get(const struct scmi_protocol_handle *ph,
 	return ret;
 }
 
-static int __scmi_power_state_get(const struct scmi_handle *handle,
-				  u32 domain, u32 *state)
-{
-	const struct scmi_protocol_handle *ph =
-		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_POWER);
-
-	return scmi_power_state_get(ph, domain, state);
-}
-
 static int scmi_power_num_domains_get(const struct scmi_protocol_handle *ph)
 {
 	struct scmi_power_info *pi = ph->get_priv(ph);
@@ -191,14 +173,6 @@ static int scmi_power_num_domains_get(const struct scmi_protocol_handle *ph)
 	return pi->num_domains;
 }
 
-static int __scmi_power_num_domains_get(const struct scmi_handle *handle)
-{
-	const struct scmi_protocol_handle *ph =
-		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_POWER);
-
-	return scmi_power_num_domains_get(ph);
-}
-
 static char *scmi_power_name_get(const struct scmi_protocol_handle *ph,
 				 u32 domain)
 {
@@ -208,22 +182,6 @@ static char *scmi_power_name_get(const struct scmi_protocol_handle *ph,
 	return dom->name;
 }
 
-static char *__scmi_power_name_get(const struct scmi_handle *handle,
-				   u32 domain)
-{
-	const struct scmi_protocol_handle *ph =
-		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_POWER);
-
-	return scmi_power_name_get(ph, domain);
-}
-
-static const struct scmi_power_ops power_ops = {
-	.num_domains_get = __scmi_power_num_domains_get,
-	.name_get = __scmi_power_name_get,
-	.state_set = __scmi_power_state_set,
-	.state_get = __scmi_power_state_get,
-};
-
 static const struct scmi_power_proto_ops power_proto_ops = {
 	.num_domains_get = scmi_power_num_domains_get,
 	.name_get = scmi_power_name_get,
@@ -324,7 +282,6 @@ static int scmi_power_protocol_init(const struct scmi_protocol_handle *ph)
 	int domain;
 	u32 version;
 	struct scmi_power_info *pinfo;
-	struct scmi_handle *handle;
 
 	ph->xops->version_get(ph, &version);
 
@@ -350,10 +307,6 @@ static int scmi_power_protocol_init(const struct scmi_protocol_handle *ph)
 
 	pinfo->version = version;
 
-	/* Transient code for legacy ops interface */
-	handle = scmi_map_scmi_handle(ph);
-	handle->power_ops = &power_ops;
-
 	return ph->set_priv(ph, pinfo);
 }
 
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index dfb3d2f5e0d3..2328d36b7597 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -152,15 +152,6 @@ struct scmi_power_proto_ops {
 			 u32 *state);
 };
 
-struct scmi_power_ops {
-	int (*num_domains_get)(const struct scmi_handle *handle);
-	char *(*name_get)(const struct scmi_handle *handle, u32 domain);
-	int (*state_set)(const struct scmi_handle *handle, u32 domain,
-			 u32 state);
-	int (*state_get)(const struct scmi_handle *handle, u32 domain,
-			 u32 *state);
-};
-
 /**
  * scmi_sensor_reading  - represent a timestamped read
  *
@@ -614,7 +605,6 @@ struct scmi_notify_ops {
  *
  * @dev: pointer to the SCMI device
  * @version: pointer to the structure containing SCMI version information
- * @power_ops: pointer to set of power protocol operations
  * @clk_ops: pointer to set of clock protocol operations
  * @sensor_ops: pointer to set of sensor protocol operations
  * @reset_ops: pointer to set of reset protocol operations
@@ -638,7 +628,6 @@ struct scmi_handle {
 	struct device *dev;
 	struct scmi_revision_info *version;
 	const struct scmi_clk_ops *clk_ops;
-	const struct scmi_power_ops *power_ops;
 	const struct scmi_sensor_ops *sensor_ops;
 	const struct scmi_reset_ops *reset_ops;
 	const struct scmi_voltage_ops *voltage_ops;
-- 
cgit v1.2.3


From 887281c7519d6e291be5353449d1e1bc00aa63af Mon Sep 17 00:00:00 2001
From: Cristian Marussi <cristian.marussi@arm.com>
Date: Tue, 16 Mar 2021 12:48:42 +0000
Subject: firmware: arm_scmi: Port clock protocol to new protocols interface

Convert internals of protocol implementation to use protocol handles and
expose a new protocol operations interface for SCMI driver using the new
get/put common operations, while keeping the old handle->clk_ops still
around to ease transition.

Remove handle->clk_priv, which is now unused.

Link: https://lore.kernel.org/r/20210316124903.35011-18-cristian.marussi@arm.com
Tested-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_scmi/clock.c | 182 ++++++++++++++++++++++++++------------
 include/linux/scmi_protocol.h     |  20 +++--
 2 files changed, 139 insertions(+), 63 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/arm_scmi/clock.c b/drivers/firmware/arm_scmi/clock.c
index 2312e824ed3c..1b3b0278b73d 100644
--- a/drivers/firmware/arm_scmi/clock.c
+++ b/drivers/firmware/arm_scmi/clock.c
@@ -74,52 +74,53 @@ struct clock_info {
 	struct scmi_clock_info *clk;
 };
 
-static int scmi_clock_protocol_attributes_get(const struct scmi_handle *handle,
-					      struct clock_info *ci)
+static int
+scmi_clock_protocol_attributes_get(const struct scmi_protocol_handle *ph,
+				   struct clock_info *ci)
 {
 	int ret;
 	struct scmi_xfer *t;
 	struct scmi_msg_resp_clock_protocol_attributes *attr;
 
-	ret = scmi_xfer_get_init(handle, PROTOCOL_ATTRIBUTES,
-				 SCMI_PROTOCOL_CLOCK, 0, sizeof(*attr), &t);
+	ret = ph->xops->xfer_get_init(ph, PROTOCOL_ATTRIBUTES,
+				      0, sizeof(*attr), &t);
 	if (ret)
 		return ret;
 
 	attr = t->rx.buf;
 
-	ret = scmi_do_xfer(handle, t);
+	ret = ph->xops->do_xfer(ph, t);
 	if (!ret) {
 		ci->num_clocks = le16_to_cpu(attr->num_clocks);
 		ci->max_async_req = attr->max_async_req;
 	}
 
-	scmi_xfer_put(handle, t);
+	ph->xops->xfer_put(ph, t);
 	return ret;
 }
 
-static int scmi_clock_attributes_get(const struct scmi_handle *handle,
+static int scmi_clock_attributes_get(const struct scmi_protocol_handle *ph,
 				     u32 clk_id, struct scmi_clock_info *clk)
 {
 	int ret;
 	struct scmi_xfer *t;
 	struct scmi_msg_resp_clock_attributes *attr;
 
-	ret = scmi_xfer_get_init(handle, CLOCK_ATTRIBUTES, SCMI_PROTOCOL_CLOCK,
-				 sizeof(clk_id), sizeof(*attr), &t);
+	ret = ph->xops->xfer_get_init(ph, CLOCK_ATTRIBUTES,
+				      sizeof(clk_id), sizeof(*attr), &t);
 	if (ret)
 		return ret;
 
 	put_unaligned_le32(clk_id, t->tx.buf);
 	attr = t->rx.buf;
 
-	ret = scmi_do_xfer(handle, t);
+	ret = ph->xops->do_xfer(ph, t);
 	if (!ret)
 		strlcpy(clk->name, attr->name, SCMI_MAX_STR_SIZE);
 	else
 		clk->name[0] = '\0';
 
-	scmi_xfer_put(handle, t);
+	ph->xops->xfer_put(ph, t);
 	return ret;
 }
 
@@ -136,7 +137,7 @@ static int rate_cmp_func(const void *_r1, const void *_r2)
 }
 
 static int
-scmi_clock_describe_rates_get(const struct scmi_handle *handle, u32 clk_id,
+scmi_clock_describe_rates_get(const struct scmi_protocol_handle *ph, u32 clk_id,
 			      struct scmi_clock_info *clk)
 {
 	u64 *rate = NULL;
@@ -148,8 +149,8 @@ scmi_clock_describe_rates_get(const struct scmi_handle *handle, u32 clk_id,
 	struct scmi_msg_clock_describe_rates *clk_desc;
 	struct scmi_msg_resp_clock_describe_rates *rlist;
 
-	ret = scmi_xfer_get_init(handle, CLOCK_DESCRIBE_RATES,
-				 SCMI_PROTOCOL_CLOCK, sizeof(*clk_desc), 0, &t);
+	ret = ph->xops->xfer_get_init(ph, CLOCK_DESCRIBE_RATES,
+				      sizeof(*clk_desc), 0, &t);
 	if (ret)
 		return ret;
 
@@ -161,7 +162,7 @@ scmi_clock_describe_rates_get(const struct scmi_handle *handle, u32 clk_id,
 		/* Set the number of rates to be skipped/already read */
 		clk_desc->rate_index = cpu_to_le32(tot_rate_cnt);
 
-		ret = scmi_do_xfer(handle, t);
+		ret = ph->xops->do_xfer(ph, t);
 		if (ret)
 			goto err;
 
@@ -171,7 +172,7 @@ scmi_clock_describe_rates_get(const struct scmi_handle *handle, u32 clk_id,
 		num_returned = NUM_RETURNED(rates_flag);
 
 		if (tot_rate_cnt + num_returned > SCMI_MAX_NUM_RATES) {
-			dev_err(handle->dev, "No. of rates > MAX_NUM_RATES");
+			dev_err(ph->dev, "No. of rates > MAX_NUM_RATES");
 			break;
 		}
 
@@ -179,7 +180,7 @@ scmi_clock_describe_rates_get(const struct scmi_handle *handle, u32 clk_id,
 			clk->range.min_rate = RATE_TO_U64(rlist->rate[0]);
 			clk->range.max_rate = RATE_TO_U64(rlist->rate[1]);
 			clk->range.step_size = RATE_TO_U64(rlist->rate[2]);
-			dev_dbg(handle->dev, "Min %llu Max %llu Step %llu Hz\n",
+			dev_dbg(ph->dev, "Min %llu Max %llu Step %llu Hz\n",
 				clk->range.min_rate, clk->range.max_rate,
 				clk->range.step_size);
 			break;
@@ -188,12 +189,12 @@ scmi_clock_describe_rates_get(const struct scmi_handle *handle, u32 clk_id,
 		rate = &clk->list.rates[tot_rate_cnt];
 		for (cnt = 0; cnt < num_returned; cnt++, rate++) {
 			*rate = RATE_TO_U64(rlist->rate[cnt]);
-			dev_dbg(handle->dev, "Rate %llu Hz\n", *rate);
+			dev_dbg(ph->dev, "Rate %llu Hz\n", *rate);
 		}
 
 		tot_rate_cnt += num_returned;
 
-		scmi_reset_rx_to_maxsz(handle, t);
+		ph->xops->reset_rx_to_maxsz(ph, t);
 		/*
 		 * check for both returned and remaining to avoid infinite
 		 * loop due to buggy firmware
@@ -208,42 +209,52 @@ scmi_clock_describe_rates_get(const struct scmi_handle *handle, u32 clk_id,
 	clk->rate_discrete = rate_discrete;
 
 err:
-	scmi_xfer_put(handle, t);
+	ph->xops->xfer_put(ph, t);
 	return ret;
 }
 
 static int
-scmi_clock_rate_get(const struct scmi_handle *handle, u32 clk_id, u64 *value)
+scmi_clock_rate_get(const struct scmi_protocol_handle *ph,
+		    u32 clk_id, u64 *value)
 {
 	int ret;
 	struct scmi_xfer *t;
 
-	ret = scmi_xfer_get_init(handle, CLOCK_RATE_GET, SCMI_PROTOCOL_CLOCK,
-				 sizeof(__le32), sizeof(u64), &t);
+	ret = ph->xops->xfer_get_init(ph, CLOCK_RATE_GET,
+				      sizeof(__le32), sizeof(u64), &t);
 	if (ret)
 		return ret;
 
 	put_unaligned_le32(clk_id, t->tx.buf);
 
-	ret = scmi_do_xfer(handle, t);
+	ret = ph->xops->do_xfer(ph, t);
 	if (!ret)
 		*value = get_unaligned_le64(t->rx.buf);
 
-	scmi_xfer_put(handle, t);
+	ph->xops->xfer_put(ph, t);
 	return ret;
 }
 
-static int scmi_clock_rate_set(const struct scmi_handle *handle, u32 clk_id,
-			       u64 rate)
+static int
+__scmi_clock_rate_get(const struct scmi_handle *handle,
+		      u32 clk_id, u64 *value)
+{
+	const struct scmi_protocol_handle *ph =
+		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_CLOCK);
+
+	return scmi_clock_rate_get(ph, clk_id, value);
+}
+
+static int scmi_clock_rate_set(const struct scmi_protocol_handle *ph,
+			       u32 clk_id, u64 rate)
 {
 	int ret;
 	u32 flags = 0;
 	struct scmi_xfer *t;
 	struct scmi_clock_set_rate *cfg;
-	struct clock_info *ci = handle->clk_priv;
+	struct clock_info *ci = ph->get_priv(ph);
 
-	ret = scmi_xfer_get_init(handle, CLOCK_RATE_SET, SCMI_PROTOCOL_CLOCK,
-				 sizeof(*cfg), 0, &t);
+	ret = ph->xops->xfer_get_init(ph, CLOCK_RATE_SET, sizeof(*cfg), 0, &t);
 	if (ret)
 		return ret;
 
@@ -258,26 +269,36 @@ static int scmi_clock_rate_set(const struct scmi_handle *handle, u32 clk_id,
 	cfg->value_high = cpu_to_le32(rate >> 32);
 
 	if (flags & CLOCK_SET_ASYNC)
-		ret = scmi_do_xfer_with_response(handle, t);
+		ret = ph->xops->do_xfer_with_response(ph, t);
 	else
-		ret = scmi_do_xfer(handle, t);
+		ret = ph->xops->do_xfer(ph, t);
 
 	if (ci->max_async_req)
 		atomic_dec(&ci->cur_async_req);
 
-	scmi_xfer_put(handle, t);
+	ph->xops->xfer_put(ph, t);
 	return ret;
 }
 
+static int __scmi_clock_rate_set(const struct scmi_handle *handle,
+				 u32 clk_id, u64 rate)
+{
+	const struct scmi_protocol_handle *ph =
+		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_CLOCK);
+
+	return scmi_clock_rate_set(ph, clk_id, rate);
+}
+
 static int
-scmi_clock_config_set(const struct scmi_handle *handle, u32 clk_id, u32 config)
+scmi_clock_config_set(const struct scmi_protocol_handle *ph, u32 clk_id,
+		      u32 config)
 {
 	int ret;
 	struct scmi_xfer *t;
 	struct scmi_clock_set_config *cfg;
 
-	ret = scmi_xfer_get_init(handle, CLOCK_CONFIG_SET, SCMI_PROTOCOL_CLOCK,
-				 sizeof(*cfg), 0, &t);
+	ret = ph->xops->xfer_get_init(ph, CLOCK_CONFIG_SET,
+				      sizeof(*cfg), 0, &t);
 	if (ret)
 		return ret;
 
@@ -285,33 +306,57 @@ scmi_clock_config_set(const struct scmi_handle *handle, u32 clk_id, u32 config)
 	cfg->id = cpu_to_le32(clk_id);
 	cfg->attributes = cpu_to_le32(config);
 
-	ret = scmi_do_xfer(handle, t);
+	ret = ph->xops->do_xfer(ph, t);
 
-	scmi_xfer_put(handle, t);
+	ph->xops->xfer_put(ph, t);
 	return ret;
 }
 
-static int scmi_clock_enable(const struct scmi_handle *handle, u32 clk_id)
+static int scmi_clock_enable(const struct scmi_protocol_handle *ph, u32 clk_id)
+{
+	return scmi_clock_config_set(ph, clk_id, CLOCK_ENABLE);
+}
+
+static int __scmi_clock_enable(const struct scmi_handle *handle, u32 clk_id)
+{
+	const struct scmi_protocol_handle *ph =
+		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_CLOCK);
+
+	return scmi_clock_enable(ph, clk_id);
+}
+
+static int scmi_clock_disable(const struct scmi_protocol_handle *ph, u32 clk_id)
 {
-	return scmi_clock_config_set(handle, clk_id, CLOCK_ENABLE);
+	return scmi_clock_config_set(ph, clk_id, 0);
 }
 
-static int scmi_clock_disable(const struct scmi_handle *handle, u32 clk_id)
+static int __scmi_clock_disable(const struct scmi_handle *handle, u32 clk_id)
 {
-	return scmi_clock_config_set(handle, clk_id, 0);
+	const struct scmi_protocol_handle *ph =
+		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_CLOCK);
+
+	return scmi_clock_disable(ph, clk_id);
 }
 
-static int scmi_clock_count_get(const struct scmi_handle *handle)
+static int scmi_clock_count_get(const struct scmi_protocol_handle *ph)
 {
-	struct clock_info *ci = handle->clk_priv;
+	struct clock_info *ci = ph->get_priv(ph);
 
 	return ci->num_clocks;
 }
 
+static int __scmi_clock_count_get(const struct scmi_handle *handle)
+{
+	const struct scmi_protocol_handle *ph =
+		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_CLOCK);
+
+	return scmi_clock_count_get(ph);
+}
+
 static const struct scmi_clock_info *
-scmi_clock_info_get(const struct scmi_handle *handle, u32 clk_id)
+scmi_clock_info_get(const struct scmi_protocol_handle *ph, u32 clk_id)
 {
-	struct clock_info *ci = handle->clk_priv;
+	struct clock_info *ci = ph->get_priv(ph);
 	struct scmi_clock_info *clk = ci->clk + clk_id;
 
 	if (!clk->name[0])
@@ -320,7 +365,25 @@ scmi_clock_info_get(const struct scmi_handle *handle, u32 clk_id)
 	return clk;
 }
 
+static const struct scmi_clock_info *
+__scmi_clock_info_get(const struct scmi_handle *handle, u32 clk_id)
+{
+	const struct scmi_protocol_handle *ph =
+		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_CLOCK);
+
+	return scmi_clock_info_get(ph, clk_id);
+}
+
 static const struct scmi_clk_ops clk_ops = {
+	.count_get = __scmi_clock_count_get,
+	.info_get = __scmi_clock_info_get,
+	.rate_get = __scmi_clock_rate_get,
+	.rate_set = __scmi_clock_rate_set,
+	.enable = __scmi_clock_enable,
+	.disable = __scmi_clock_disable,
+};
+
+static const struct scmi_clk_proto_ops clk_proto_ops = {
 	.count_get = scmi_clock_count_get,
 	.info_get = scmi_clock_info_get,
 	.rate_get = scmi_clock_rate_get,
@@ -329,24 +392,25 @@ static const struct scmi_clk_ops clk_ops = {
 	.disable = scmi_clock_disable,
 };
 
-static int scmi_clock_protocol_init(struct scmi_handle *handle)
+static int scmi_clock_protocol_init(const struct scmi_protocol_handle *ph)
 {
 	u32 version;
 	int clkid, ret;
 	struct clock_info *cinfo;
+	struct scmi_handle *handle;
 
-	scmi_version_get(handle, SCMI_PROTOCOL_CLOCK, &version);
+	ph->xops->version_get(ph, &version);
 
-	dev_dbg(handle->dev, "Clock Version %d.%d\n",
+	dev_dbg(ph->dev, "Clock Version %d.%d\n",
 		PROTOCOL_REV_MAJOR(version), PROTOCOL_REV_MINOR(version));
 
-	cinfo = devm_kzalloc(handle->dev, sizeof(*cinfo), GFP_KERNEL);
+	cinfo = devm_kzalloc(ph->dev, sizeof(*cinfo), GFP_KERNEL);
 	if (!cinfo)
 		return -ENOMEM;
 
-	scmi_clock_protocol_attributes_get(handle, cinfo);
+	scmi_clock_protocol_attributes_get(ph, cinfo);
 
-	cinfo->clk = devm_kcalloc(handle->dev, cinfo->num_clocks,
+	cinfo->clk = devm_kcalloc(ph->dev, cinfo->num_clocks,
 				  sizeof(*cinfo->clk), GFP_KERNEL);
 	if (!cinfo->clk)
 		return -ENOMEM;
@@ -354,22 +418,24 @@ static int scmi_clock_protocol_init(struct scmi_handle *handle)
 	for (clkid = 0; clkid < cinfo->num_clocks; clkid++) {
 		struct scmi_clock_info *clk = cinfo->clk + clkid;
 
-		ret = scmi_clock_attributes_get(handle, clkid, clk);
+		ret = scmi_clock_attributes_get(ph, clkid, clk);
 		if (!ret)
-			scmi_clock_describe_rates_get(handle, clkid, clk);
+			scmi_clock_describe_rates_get(ph, clkid, clk);
 	}
 
 	cinfo->version = version;
+
+	/* Transient code for legacy ops interface */
+	handle = scmi_map_scmi_handle(ph);
 	handle->clk_ops = &clk_ops;
-	handle->clk_priv = cinfo;
 
-	return 0;
+	return ph->set_priv(ph, cinfo);
 }
 
 static const struct scmi_protocol scmi_clock = {
 	.id = SCMI_PROTOCOL_CLOCK,
-	.init = &scmi_clock_protocol_init,
-	.ops = &clk_ops,
+	.instance_init = &scmi_clock_protocol_init,
+	.ops = &clk_proto_ops,
 };
 
 DEFINE_SCMI_PROTOCOL_REGISTER_UNREGISTER(clock, scmi_clock)
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index 2328d36b7597..b19797b23c5f 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -61,7 +61,7 @@ struct scmi_device;
 struct scmi_protocol_handle;
 
 /**
- * struct scmi_clk_ops - represents the various operations provided
+ * struct scmi_clk_proto_ops - represents the various operations provided
  *	by SCMI Clock Protocol
  *
  * @count_get: get the count of clocks provided by SCMI
@@ -71,8 +71,21 @@ struct scmi_protocol_handle;
  * @enable: enables the specified clock
  * @disable: disables the specified clock
  */
+struct scmi_clk_proto_ops {
+	int (*count_get)(const struct scmi_protocol_handle *ph);
+
+	const struct scmi_clock_info *(*info_get)
+		(const struct scmi_protocol_handle *ph, u32 clk_id);
+	int (*rate_get)(const struct scmi_protocol_handle *ph, u32 clk_id,
+			u64 *rate);
+	int (*rate_set)(const struct scmi_protocol_handle *ph, u32 clk_id,
+			u64 rate);
+	int (*enable)(const struct scmi_protocol_handle *ph, u32 clk_id);
+	int (*disable)(const struct scmi_protocol_handle *ph, u32 clk_id);
+};
+
 struct scmi_clk_ops {
-	int (*count_get)(const struct scmi_handle *handle);
+	int (*count_get)(const struct scmi_handle *hamdle);
 
 	const struct scmi_clock_info *(*info_get)
 		(const struct scmi_handle *handle, u32 clk_id);
@@ -613,8 +626,6 @@ struct scmi_notify_ops {
  *		       operations and a dedicated protocol handler
  * @devm_protocol_put: devres managed method to release a protocol
  * @notify_ops: pointer to set of notifications related operations
- * @clk_priv: pointer to private data structure specific to clock
- *	protocol(for internal use only)
  * @sensor_priv: pointer to private data structure specific to sensors
  *	protocol(for internal use only)
  * @reset_priv: pointer to private data structure specific to reset
@@ -639,7 +650,6 @@ struct scmi_handle {
 
 	const struct scmi_notify_ops *notify_ops;
 	/* for protocol internal use */
-	void *clk_priv;
 	void *sensor_priv;
 	void *reset_priv;
 	void *voltage_priv;
-- 
cgit v1.2.3


From 7b2d92a3c8e5e8a7ef2710a928a1011bcebc2b91 Mon Sep 17 00:00:00 2001
From: Guoqing Chi <chiguoqing@yulong.com>
Date: Mon, 22 Mar 2021 09:30:24 +0800
Subject: iio:imu:mpu6050: Modify matricies to matrices

The plural of "matrix" is "matrices".

Signed-off-by: Guoqing Chi <chiguoqing@yulong.com>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Acked-by: Jean-Baptiste Maneyrol <jmaneyrol@invensense.com>
Link: https://lore.kernel.org/r/20210322013024.1849-1-chi962464zy@163.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/platform_data/invensense_mpu6050.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/invensense_mpu6050.h b/include/linux/platform_data/invensense_mpu6050.h
index 93974f4cfba1..f05b37521f67 100644
--- a/include/linux/platform_data/invensense_mpu6050.h
+++ b/include/linux/platform_data/invensense_mpu6050.h
@@ -12,7 +12,7 @@
  *			mounting matrix retrieved from device-tree)
  *
  * Contains platform specific information on how to configure the MPU6050 to
- * work on this platform.  The orientation matricies are 3x3 rotation matricies
+ * work on this platform.  The orientation matrices are 3x3 rotation matrices
  * that are applied to the data to rotate from the mounting orientation to the
  * platform orientation.  The values must be one of 0, 1, or -1 and each row and
  * column should have exactly 1 non-zero value.
-- 
cgit v1.2.3


From 9c7d24693d864f90b27aad5d15fbfe226c02898b Mon Sep 17 00:00:00 2001
From: Álvaro Fernández Rojas <noltari@gmail.com>
Date: Wed, 24 Mar 2021 09:19:02 +0100
Subject: gpio: guard gpiochip_irqchip_add_domain() with GPIOLIB_IRQCHIP
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The current code doesn't check if GPIOLIB_IRQCHIP is enabled, which results in
a compilation error when trying to build gpio-regmap if CONFIG_GPIOLIB_IRQCHIP
isn't enabled.

Fixes: 6a45b0e2589f ("gpiolib: Introduce gpiochip_irqchip_add_domain()")
Suggested-by: Michael Walle <michael@walle.cc>
Signed-off-by: Álvaro Fernández Rojas <noltari@gmail.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Michael Walle <michael@walle.cc>
Acked-by: Bartosz Golaszewski <bgolaszewski@baylibre.com>
Link: https://lore.kernel.org/r/20210324081923.20379-2-noltari@gmail.com
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/gpio/driver.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index 286de0520574..ecf0032a0995 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -624,8 +624,17 @@ void gpiochip_irq_domain_deactivate(struct irq_domain *domain,
 bool gpiochip_irqchip_irq_valid(const struct gpio_chip *gc,
 				unsigned int offset);
 
+#ifdef CONFIG_GPIOLIB_IRQCHIP
 int gpiochip_irqchip_add_domain(struct gpio_chip *gc,
 				struct irq_domain *domain);
+#else
+static inline int gpiochip_irqchip_add_domain(struct gpio_chip *gc,
+					      struct irq_domain *domain)
+{
+	WARN_ON(1);
+	return -EINVAL;
+}
+#endif
 
 int gpiochip_generic_request(struct gpio_chip *gc, unsigned int offset);
 void gpiochip_generic_free(struct gpio_chip *gc, unsigned int offset);
-- 
cgit v1.2.3


From d46bf9ec4596654f36245e3b14765bcb422be6ad Mon Sep 17 00:00:00 2001
From: Álvaro Fernández Rojas <noltari@gmail.com>
Date: Wed, 24 Mar 2021 09:19:03 +0100
Subject: gpio: regmap: set gpio_chip of_node
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is needed for properly registering GPIO regmap as a child of a regmap
pin controller.

Signed-off-by: Álvaro Fernández Rojas <noltari@gmail.com>
Reviewed-by: Michael Walle <michael@walle.cc>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Acked-by: Bartosz Golaszewski <bgolaszewski@baylibre.com>
Link: https://lore.kernel.org/r/20210324081923.20379-3-noltari@gmail.com
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpio-regmap.c  | 5 +++++
 include/linux/gpio/regmap.h | 4 ++++
 2 files changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpio-regmap.c b/drivers/gpio/gpio-regmap.c
index 5412cb3b0b2a..134cedf151a7 100644
--- a/drivers/gpio/gpio-regmap.c
+++ b/drivers/gpio/gpio-regmap.c
@@ -254,6 +254,11 @@ struct gpio_regmap *gpio_regmap_register(const struct gpio_regmap_config *config
 	chip->names = config->names;
 	chip->label = config->label ?: dev_name(config->parent);
 
+#if defined(CONFIG_OF_GPIO)
+	/* gpiolib will use of_node of the parent if chip->of_node is NULL */
+	chip->of_node = to_of_node(config->fwnode);
+#endif /* CONFIG_OF_GPIO */
+
 	/*
 	 * If our regmap is fast_io we should probably set can_sleep to false.
 	 * Right now, the regmap doesn't save this property, nor is there any
diff --git a/include/linux/gpio/regmap.h b/include/linux/gpio/regmap.h
index ad76f3d0a6ba..334dd928042b 100644
--- a/include/linux/gpio/regmap.h
+++ b/include/linux/gpio/regmap.h
@@ -4,6 +4,7 @@
 #define _LINUX_GPIO_REGMAP_H
 
 struct device;
+struct fwnode_handle;
 struct gpio_regmap;
 struct irq_domain;
 struct regmap;
@@ -16,6 +17,8 @@ struct regmap;
  * @parent:		The parent device
  * @regmap:		The regmap used to access the registers
  *			given, the name of the device is used
+ * @fwnode:		(Optional) The firmware node.
+ *			If not given, the fwnode of the parent is used.
  * @label:		(Optional) Descriptive name for GPIO controller.
  *			If not given, the name of the device is used.
  * @ngpio:		Number of GPIOs
@@ -57,6 +60,7 @@ struct regmap;
 struct gpio_regmap_config {
 	struct device *parent;
 	struct regmap *regmap;
+	struct fwnode_handle *fwnode;
 
 	const char *label;
 	int ngpio;
-- 
cgit v1.2.3


From 0f00b82e5413571ed225ddbccad6882d7ea60bc7 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 8 Mar 2021 08:45:50 +0100
Subject: block: remove the revalidate_disk method

No implementations left.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20210308074550.422714-4-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 Documentation/filesystems/locking.rst | 2 --
 fs/block_dev.c                        | 3 ---
 include/linux/blkdev.h                | 1 -
 3 files changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
index b7dcc86c92a4..9774e92e449f 100644
--- a/Documentation/filesystems/locking.rst
+++ b/Documentation/filesystems/locking.rst
@@ -469,7 +469,6 @@ prototypes::
 	int (*direct_access) (struct block_device *, sector_t, void **,
 				unsigned long *);
 	void (*unlock_native_capacity) (struct gendisk *);
-	int (*revalidate_disk) (struct gendisk *);
 	int (*getgeo)(struct block_device *, struct hd_geometry *);
 	void (*swap_slot_free_notify) (struct block_device *, unsigned long);
 
@@ -484,7 +483,6 @@ ioctl:			no
 compat_ioctl:		no
 direct_access:		no
 unlock_native_capacity:	no
-revalidate_disk:	no
 getgeo:			no
 swap_slot_free_notify:	no	(see below)
 ======================= ===================
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 92ed7d5df677..535d29fa06fa 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1259,9 +1259,6 @@ rescan:
 		if (disk_part_scan_enabled(disk) ||
 		    !(disk->flags & GENHD_FL_REMOVABLE))
 			set_capacity(disk, 0);
-	} else {
-		if (disk->fops->revalidate_disk)
-			disk->fops->revalidate_disk(disk);
 	}
 
 	if (get_capacity(disk)) {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index bc6bc8383b43..b4241f73f7a8 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1870,7 +1870,6 @@ struct block_device_operations {
 	unsigned int (*check_events) (struct gendisk *disk,
 				      unsigned int clearing);
 	void (*unlock_native_capacity) (struct gendisk *);
-	int (*revalidate_disk) (struct gendisk *);
 	int (*getgeo)(struct block_device *, struct hd_geometry *);
 	int (*set_read_only)(struct block_device *bdev, bool ro);
 	/* this callback is with swap_lock and sometimes page table lock held */
-- 
cgit v1.2.3


From c15380b72d7ae821ee090ba5a56fc6310828dbda Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 26 Mar 2021 16:29:30 +0100
Subject: locking/rtmutex: Remove rt_mutex_timed_lock()

rt_mutex_timed_lock() has no callers since:

  c051b21f71d1f ("rtmutex: Confine deadlock logic to futex")

Remove it.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20210326153943.061103415@linutronix.de
---
 include/linux/rtmutex.h  |  3 ---
 kernel/locking/rtmutex.c | 46 ----------------------------------------------
 2 files changed, 49 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
index 6fd615a0eea9..32f4a3538c3c 100644
--- a/include/linux/rtmutex.h
+++ b/include/linux/rtmutex.h
@@ -115,9 +115,6 @@ extern void rt_mutex_lock(struct rt_mutex *lock);
 #endif
 
 extern int rt_mutex_lock_interruptible(struct rt_mutex *lock);
-extern int rt_mutex_timed_lock(struct rt_mutex *lock,
-			       struct hrtimer_sleeper *timeout);
-
 extern int rt_mutex_trylock(struct rt_mutex *lock);
 
 extern void rt_mutex_unlock(struct rt_mutex *lock);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index db31bce114f8..ca93e5d7b026 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1394,21 +1394,6 @@ rt_mutex_fastlock(struct rt_mutex *lock, int state,
 	return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
 }
 
-static inline int
-rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
-			struct hrtimer_sleeper *timeout,
-			enum rtmutex_chainwalk chwalk,
-			int (*slowfn)(struct rt_mutex *lock, int state,
-				      struct hrtimer_sleeper *timeout,
-				      enum rtmutex_chainwalk chwalk))
-{
-	if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
-	    likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
-		return 0;
-
-	return slowfn(lock, state, timeout, chwalk);
-}
-
 static inline int
 rt_mutex_fasttrylock(struct rt_mutex *lock,
 		     int (*slowfn)(struct rt_mutex *lock))
@@ -1516,37 +1501,6 @@ int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock)
 	return __rt_mutex_slowtrylock(lock);
 }
 
-/**
- * rt_mutex_timed_lock - lock a rt_mutex interruptible
- *			the timeout structure is provided
- *			by the caller
- *
- * @lock:		the rt_mutex to be locked
- * @timeout:		timeout structure or NULL (no timeout)
- *
- * Returns:
- *  0		on success
- * -EINTR	when interrupted by a signal
- * -ETIMEDOUT	when the timeout expired
- */
-int
-rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)
-{
-	int ret;
-
-	might_sleep();
-
-	mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
-	ret = rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
-				       RT_MUTEX_MIN_CHAINWALK,
-				       rt_mutex_slowlock);
-	if (ret)
-		mutex_release(&lock->dep_map, _RET_IP_);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
-
 /**
  * rt_mutex_trylock - try to lock a rt_mutex
  *
-- 
cgit v1.2.3


From 2d445c3e4a8216cfa9703998124c13250cc13e5e Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 26 Mar 2021 16:29:31 +0100
Subject: locking/rtmutex: Remove rtmutex deadlock tester leftovers

The following debug members of 'struct rtmutex' are unused:

 - save_state: No users

 - file,line: Printed if ::name is NULL. This is only used for non-futex
	      locks so ::name is never NULL

 - magic:     Assigned to NULL by rt_mutex_destroy(), no further usage

Remove them along with unused inline and macro leftovers related to
the long gone deadlock tester.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20210326153943.195064296@linutronix.de
---
 include/linux/rtmutex.h         | 7 ++-----
 kernel/locking/rtmutex-debug.c  | 7 +------
 kernel/locking/rtmutex-debug.h  | 2 --
 kernel/locking/rtmutex.c        | 3 ---
 kernel/locking/rtmutex.h        | 2 --
 kernel/locking/rtmutex_common.h | 1 -
 6 files changed, 3 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
index 32f4a3538c3c..48b334b9eb87 100644
--- a/include/linux/rtmutex.h
+++ b/include/linux/rtmutex.h
@@ -32,10 +32,7 @@ struct rt_mutex {
 	struct rb_root_cached   waiters;
 	struct task_struct	*owner;
 #ifdef CONFIG_DEBUG_RT_MUTEXES
-	int			save_state;
-	const char		*name, *file;
-	int			line;
-	void			*magic;
+	const char		*name;
 #endif
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	struct lockdep_map	dep_map;
@@ -60,7 +57,7 @@ struct hrtimer_sleeper;
 
 #ifdef CONFIG_DEBUG_RT_MUTEXES
 # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
-	, .name = #mutexname, .file = __FILE__, .line = __LINE__
+	, .name = #mutexname
 
 # define rt_mutex_init(mutex) \
 do { \
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
index 36e69100e8e0..7e411b946d4c 100644
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -42,12 +42,7 @@ static void printk_task(struct task_struct *p)
 
 static void printk_lock(struct rt_mutex *lock, int print_owner)
 {
-	if (lock->name)
-		printk(" [%p] {%s}\n",
-			lock, lock->name);
-	else
-		printk(" [%p] {%s:%d}\n",
-			lock, lock->file, lock->line);
+	printk(" [%p] {%s}\n", lock, lock->name);
 
 	if (print_owner && rt_mutex_owner(lock)) {
 		printk(".. ->owner: %p\n", lock->owner);
diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
index fc549713bba3..772c9b012b62 100644
--- a/kernel/locking/rtmutex-debug.h
+++ b/kernel/locking/rtmutex-debug.h
@@ -22,8 +22,6 @@ extern void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk,
 				    struct rt_mutex_waiter *waiter,
 				    struct rt_mutex *lock);
 extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter);
-# define debug_rt_mutex_reset_waiter(w)			\
-	do { (w)->deadlock_lock = NULL; } while (0)
 
 static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
 						  enum rtmutex_chainwalk walk)
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index ca93e5d7b026..11abc60d1478 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1594,9 +1594,6 @@ void __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
 void rt_mutex_destroy(struct rt_mutex *lock)
 {
 	WARN_ON(rt_mutex_is_locked(lock));
-#ifdef CONFIG_DEBUG_RT_MUTEXES
-	lock->magic = NULL;
-#endif
 }
 EXPORT_SYMBOL_GPL(rt_mutex_destroy);
 
diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h
index 732f96abf462..4dbdec15f1a0 100644
--- a/kernel/locking/rtmutex.h
+++ b/kernel/locking/rtmutex.h
@@ -11,7 +11,6 @@
  * Non-debug version.
  */
 
-#define rt_mutex_deadlock_check(l)			(0)
 #define debug_rt_mutex_init_waiter(w)			do { } while (0)
 #define debug_rt_mutex_free_waiter(w)			do { } while (0)
 #define debug_rt_mutex_lock(l)				do { } while (0)
@@ -21,7 +20,6 @@
 #define debug_rt_mutex_init(m, n, k)			do { } while (0)
 #define debug_rt_mutex_deadlock(d, a ,l)		do { } while (0)
 #define debug_rt_mutex_print_deadlock(w)		do { } while (0)
-#define debug_rt_mutex_reset_waiter(w)			do { } while (0)
 
 static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
 {
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index a5007f00c1b7..aa047436dadf 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -30,7 +30,6 @@ struct rt_mutex_waiter {
 	struct task_struct	*task;
 	struct rt_mutex		*lock;
 #ifdef CONFIG_DEBUG_RT_MUTEXES
-	unsigned long		ip;
 	struct pid		*deadlock_task_pid;
 	struct rt_mutex		*deadlock_lock;
 #endif
-- 
cgit v1.2.3


From 6d41c675a5394057f6fb1dc97cc0a0e360f2c2f8 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 26 Mar 2021 16:29:32 +0100
Subject: locking/rtmutex: Remove output from deadlock detector

The rtmutex specific deadlock detector predates lockdep coverage of rtmutex
and since commit f5694788ad8da ("rt_mutex: Add lockdep annotations") it
contains a lot of redundant functionality:

 - lockdep will detect a potential deadlock before rtmutex-debug
   has a chance to do so

 - the deadlock debugging is restricted to rtmutexes which are not
   associated to futexes and have an active waiter, which is covered by
   lockdep already

Remove the redundant functionality and move actual deadlock WARN() into the
deadlock code path. The latter needs a separate cleanup.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20210326153943.320398604@linutronix.de
---
 include/linux/rtmutex.h         |  7 ---
 kernel/locking/rtmutex-debug.c  | 97 -----------------------------------------
 kernel/locking/rtmutex-debug.h  |  9 ----
 kernel/locking/rtmutex.c        |  7 +--
 kernel/locking/rtmutex.h        |  7 ---
 kernel/locking/rtmutex_common.h |  4 --
 6 files changed, 1 insertion(+), 130 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
index 48b334b9eb87..0725c4b45749 100644
--- a/include/linux/rtmutex.h
+++ b/include/linux/rtmutex.h
@@ -31,9 +31,6 @@ struct rt_mutex {
 	raw_spinlock_t		wait_lock;
 	struct rb_root_cached   waiters;
 	struct task_struct	*owner;
-#ifdef CONFIG_DEBUG_RT_MUTEXES
-	const char		*name;
-#endif
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	struct lockdep_map	dep_map;
 #endif
@@ -56,8 +53,6 @@ struct hrtimer_sleeper;
 #endif
 
 #ifdef CONFIG_DEBUG_RT_MUTEXES
-# define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
-	, .name = #mutexname
 
 # define rt_mutex_init(mutex) \
 do { \
@@ -67,7 +62,6 @@ do { \
 
  extern void rt_mutex_debug_task_free(struct task_struct *tsk);
 #else
-# define __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
 # define rt_mutex_init(mutex)			__rt_mutex_init(mutex, NULL, NULL)
 # define rt_mutex_debug_task_free(t)			do { } while (0)
 #endif
@@ -83,7 +77,6 @@ do { \
 	{ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
 	, .waiters = RB_ROOT_CACHED \
 	, .owner = NULL \
-	__DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
 	__DEP_MAP_RT_MUTEX_INITIALIZER(mutexname)}
 
 #define DEFINE_RT_MUTEX(mutexname) \
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
index 7e411b946d4c..fb150100335f 100644
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -32,105 +32,12 @@
 
 #include "rtmutex_common.h"
 
-static void printk_task(struct task_struct *p)
-{
-	if (p)
-		printk("%16s:%5d [%p, %3d]", p->comm, task_pid_nr(p), p, p->prio);
-	else
-		printk("<none>");
-}
-
-static void printk_lock(struct rt_mutex *lock, int print_owner)
-{
-	printk(" [%p] {%s}\n", lock, lock->name);
-
-	if (print_owner && rt_mutex_owner(lock)) {
-		printk(".. ->owner: %p\n", lock->owner);
-		printk(".. held by:  ");
-		printk_task(rt_mutex_owner(lock));
-		printk("\n");
-	}
-}
-
 void rt_mutex_debug_task_free(struct task_struct *task)
 {
 	DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters.rb_root));
 	DEBUG_LOCKS_WARN_ON(task->pi_blocked_on);
 }
 
-/*
- * We fill out the fields in the waiter to store the information about
- * the deadlock. We print when we return. act_waiter can be NULL in
- * case of a remove waiter operation.
- */
-void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk,
-			     struct rt_mutex_waiter *act_waiter,
-			     struct rt_mutex *lock)
-{
-	struct task_struct *task;
-
-	if (!debug_locks || chwalk == RT_MUTEX_FULL_CHAINWALK || !act_waiter)
-		return;
-
-	task = rt_mutex_owner(act_waiter->lock);
-	if (task && task != current) {
-		act_waiter->deadlock_task_pid = get_pid(task_pid(task));
-		act_waiter->deadlock_lock = lock;
-	}
-}
-
-void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
-{
-	struct task_struct *task;
-
-	if (!waiter->deadlock_lock || !debug_locks)
-		return;
-
-	rcu_read_lock();
-	task = pid_task(waiter->deadlock_task_pid, PIDTYPE_PID);
-	if (!task) {
-		rcu_read_unlock();
-		return;
-	}
-
-	if (!debug_locks_off()) {
-		rcu_read_unlock();
-		return;
-	}
-
-	pr_warn("\n");
-	pr_warn("============================================\n");
-	pr_warn("WARNING: circular locking deadlock detected!\n");
-	pr_warn("%s\n", print_tainted());
-	pr_warn("--------------------------------------------\n");
-	printk("%s/%d is deadlocking current task %s/%d\n\n",
-	       task->comm, task_pid_nr(task),
-	       current->comm, task_pid_nr(current));
-
-	printk("\n1) %s/%d is trying to acquire this lock:\n",
-	       current->comm, task_pid_nr(current));
-	printk_lock(waiter->lock, 1);
-
-	printk("\n2) %s/%d is blocked on this lock:\n",
-		task->comm, task_pid_nr(task));
-	printk_lock(waiter->deadlock_lock, 1);
-
-	debug_show_held_locks(current);
-	debug_show_held_locks(task);
-
-	printk("\n%s/%d's [blocked] stackdump:\n\n",
-		task->comm, task_pid_nr(task));
-	show_stack(task, NULL, KERN_DEFAULT);
-	printk("\n%s/%d's [current] stackdump:\n\n",
-		current->comm, task_pid_nr(current));
-	dump_stack();
-	debug_show_all_locks();
-	rcu_read_unlock();
-
-	printk("[ turning off deadlock detection."
-	       "Please report this trace. ]\n\n");
-}
-
 void debug_rt_mutex_lock(struct rt_mutex *lock)
 {
 }
@@ -153,12 +60,10 @@ void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
 void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
 {
 	memset(waiter, 0x11, sizeof(*waiter));
-	waiter->deadlock_task_pid = NULL;
 }
 
 void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
 {
-	put_pid(waiter->deadlock_task_pid);
 	memset(waiter, 0x22, sizeof(*waiter));
 }
 
@@ -168,10 +73,8 @@ void debug_rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_cl
 	 * Make sure we are not reinitializing a held lock:
 	 */
 	debug_check_no_locks_freed((void *)lock, sizeof(*lock));
-	lock->name = name;
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	lockdep_init_map(&lock->dep_map, name, key, 0);
 #endif
 }
-
diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
index 772c9b012b62..659e93e256c6 100644
--- a/kernel/locking/rtmutex-debug.h
+++ b/kernel/locking/rtmutex-debug.h
@@ -18,18 +18,9 @@ extern void debug_rt_mutex_unlock(struct rt_mutex *lock);
 extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
 				      struct task_struct *powner);
 extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock);
-extern void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk,
-				    struct rt_mutex_waiter *waiter,
-				    struct rt_mutex *lock);
-extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter);
 
 static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
 						  enum rtmutex_chainwalk walk)
 {
 	return (waiter != NULL);
 }
-
-static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
-{
-	debug_rt_mutex_print_deadlock(w);
-}
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 11abc60d1478..4beca549aeeb 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -579,7 +579,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
 	 * walk, we detected a deadlock.
 	 */
 	if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
-		debug_rt_mutex_deadlock(chwalk, orig_waiter, lock);
 		raw_spin_unlock(&lock->wait_lock);
 		ret = -EDEADLK;
 		goto out_unlock_pi;
@@ -1171,8 +1170,6 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
 
 		raw_spin_unlock_irq(&lock->wait_lock);
 
-		debug_rt_mutex_print_deadlock(waiter);
-
 		schedule();
 
 		raw_spin_lock_irq(&lock->wait_lock);
@@ -1196,7 +1193,7 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
 	/*
 	 * Yell loudly and stop the task right here.
 	 */
-	rt_mutex_print_deadlock(w);
+	WARN(1, "rtmutex deadlock detected\n");
 	while (1) {
 		set_current_state(TASK_INTERRUPTIBLE);
 		schedule();
@@ -1704,8 +1701,6 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
 		ret = 0;
 	}
 
-	debug_rt_mutex_print_deadlock(waiter);
-
 	return ret;
 }
 
diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h
index 4dbdec15f1a0..d77cb8280aa6 100644
--- a/kernel/locking/rtmutex.h
+++ b/kernel/locking/rtmutex.h
@@ -18,13 +18,6 @@
 #define debug_rt_mutex_proxy_unlock(l)			do { } while (0)
 #define debug_rt_mutex_unlock(l)			do { } while (0)
 #define debug_rt_mutex_init(m, n, k)			do { } while (0)
-#define debug_rt_mutex_deadlock(d, a ,l)		do { } while (0)
-#define debug_rt_mutex_print_deadlock(w)		do { } while (0)
-
-static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
-{
-	WARN(1, "rtmutex deadlock detected\n");
-}
 
 static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *w,
 						  enum rtmutex_chainwalk walk)
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index aa047436dadf..badb2a2803aa 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -29,10 +29,6 @@ struct rt_mutex_waiter {
 	struct rb_node          pi_tree_entry;
 	struct task_struct	*task;
 	struct rt_mutex		*lock;
-#ifdef CONFIG_DEBUG_RT_MUTEXES
-	struct pid		*deadlock_task_pid;
-	struct rt_mutex		*deadlock_lock;
-#endif
 	int prio;
 	u64 deadline;
 };
-- 
cgit v1.2.3


From 199cacd1a625cfc499d624b98b10dc763062f7dd Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 26 Mar 2021 16:29:33 +0100
Subject: locking/rtmutex: Consolidate rt_mutex_init()

rt_mutex_init() only initializes lockdep if CONFIG_DEBUG_RT_MUTEXES is
enabled, which is fine because all lockdep variants select it, but there is
no reason to do so.

Move the function outside of the CONFIG_DEBUG_RT_MUTEXES block which
removes #ifdeffery.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20210326153943.437405350@linutronix.de
---
 include/linux/rtmutex.h | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
index 0725c4b45749..243fabc2c85f 100644
--- a/include/linux/rtmutex.h
+++ b/include/linux/rtmutex.h
@@ -43,6 +43,7 @@ struct hrtimer_sleeper;
  extern int rt_mutex_debug_check_no_locks_freed(const void *from,
 						unsigned long len);
  extern void rt_mutex_debug_check_no_locks_held(struct task_struct *task);
+ extern void rt_mutex_debug_task_free(struct task_struct *tsk);
 #else
  static inline int rt_mutex_debug_check_no_locks_freed(const void *from,
 						       unsigned long len)
@@ -50,22 +51,15 @@ struct hrtimer_sleeper;
 	return 0;
  }
 # define rt_mutex_debug_check_no_locks_held(task)	do { } while (0)
+# define rt_mutex_debug_task_free(t)			do { } while (0)
 #endif
 
-#ifdef CONFIG_DEBUG_RT_MUTEXES
-
-# define rt_mutex_init(mutex) \
+#define rt_mutex_init(mutex) \
 do { \
 	static struct lock_class_key __key; \
 	__rt_mutex_init(mutex, __func__, &__key); \
 } while (0)
 
- extern void rt_mutex_debug_task_free(struct task_struct *tsk);
-#else
-# define rt_mutex_init(mutex)			__rt_mutex_init(mutex, NULL, NULL)
-# define rt_mutex_debug_task_free(t)			do { } while (0)
-#endif
-
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 #define __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname) \
 	, .dep_map = { .name = #mutexname }
-- 
cgit v1.2.3


From 8188d74e68174b11ff7c4a635ffc8fd31eacc6b9 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 26 Mar 2021 16:29:34 +0100
Subject: locking/rtmutex: Remove empty and unused debug stubs

These stubs either have no users or are useless, and are therefore just ballast.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20210326153943.549192485@linutronix.de
---
 include/linux/rtmutex.h        | 14 ++------------
 kernel/locking/rtmutex-debug.c |  9 ---------
 kernel/locking/rtmutex-debug.h |  3 ---
 kernel/locking/rtmutex.c       | 18 ------------------
 kernel/locking/rtmutex.h       |  2 --
 5 files changed, 2 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
index 243fabc2c85f..d1672de9ca89 100644
--- a/include/linux/rtmutex.h
+++ b/include/linux/rtmutex.h
@@ -40,18 +40,9 @@ struct rt_mutex_waiter;
 struct hrtimer_sleeper;
 
 #ifdef CONFIG_DEBUG_RT_MUTEXES
- extern int rt_mutex_debug_check_no_locks_freed(const void *from,
-						unsigned long len);
- extern void rt_mutex_debug_check_no_locks_held(struct task_struct *task);
- extern void rt_mutex_debug_task_free(struct task_struct *tsk);
+extern void rt_mutex_debug_task_free(struct task_struct *tsk);
 #else
- static inline int rt_mutex_debug_check_no_locks_freed(const void *from,
-						       unsigned long len)
- {
-	return 0;
- }
-# define rt_mutex_debug_check_no_locks_held(task)	do { } while (0)
-# define rt_mutex_debug_task_free(t)			do { } while (0)
+static inline void rt_mutex_debug_task_free(struct task_struct *tsk) { }
 #endif
 
 #define rt_mutex_init(mutex) \
@@ -88,7 +79,6 @@ static inline int rt_mutex_is_locked(struct rt_mutex *lock)
 }
 
 extern void __rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key);
-extern void rt_mutex_destroy(struct rt_mutex *lock);
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 extern void rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass);
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
index fb150100335f..df584c91710b 100644
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -38,20 +38,11 @@ void rt_mutex_debug_task_free(struct task_struct *task)
 	DEBUG_LOCKS_WARN_ON(task->pi_blocked_on);
 }
 
-void debug_rt_mutex_lock(struct rt_mutex *lock)
-{
-}
-
 void debug_rt_mutex_unlock(struct rt_mutex *lock)
 {
 	DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current);
 }
 
-void
-debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner)
-{
-}
-
 void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
 {
 	DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock));
diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
index 659e93e256c6..392468d7253f 100644
--- a/kernel/locking/rtmutex-debug.h
+++ b/kernel/locking/rtmutex-debug.h
@@ -13,10 +13,7 @@
 extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
 extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
 extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key);
-extern void debug_rt_mutex_lock(struct rt_mutex *lock);
 extern void debug_rt_mutex_unlock(struct rt_mutex *lock);
-extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
-				      struct task_struct *powner);
 extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock);
 
 static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 4beca549aeeb..96c7c537eab4 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -885,9 +885,6 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
 	raw_spin_unlock(&task->pi_lock);
 
 takeit:
-	/* We got the lock. */
-	debug_rt_mutex_lock(lock);
-
 	/*
 	 * This either preserves the RT_MUTEX_HAS_WAITERS bit if there
 	 * are still waiters or clears it.
@@ -1580,20 +1577,6 @@ void __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
 		rt_mutex_postunlock(&wake_q);
 }
 
-/**
- * rt_mutex_destroy - mark a mutex unusable
- * @lock: the mutex to be destroyed
- *
- * This function marks the mutex uninitialized, and any subsequent
- * use of the mutex is forbidden. The mutex must not be locked when
- * this function is called.
- */
-void rt_mutex_destroy(struct rt_mutex *lock)
-{
-	WARN_ON(rt_mutex_is_locked(lock));
-}
-EXPORT_SYMBOL_GPL(rt_mutex_destroy);
-
 /**
  * __rt_mutex_init - initialize the rt_mutex
  *
@@ -1635,7 +1618,6 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
 				struct task_struct *proxy_owner)
 {
 	__rt_mutex_init(lock, NULL, NULL);
-	debug_rt_mutex_proxy_lock(lock, proxy_owner);
 	rt_mutex_set_owner(lock, proxy_owner);
 }
 
diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h
index d77cb8280aa6..1e484abc94ae 100644
--- a/kernel/locking/rtmutex.h
+++ b/kernel/locking/rtmutex.h
@@ -13,8 +13,6 @@
 
 #define debug_rt_mutex_init_waiter(w)			do { } while (0)
 #define debug_rt_mutex_free_waiter(w)			do { } while (0)
-#define debug_rt_mutex_lock(l)				do { } while (0)
-#define debug_rt_mutex_proxy_lock(l,p)			do { } while (0)
 #define debug_rt_mutex_proxy_unlock(l)			do { } while (0)
 #define debug_rt_mutex_unlock(l)			do { } while (0)
 #define debug_rt_mutex_init(m, n, k)			do { } while (0)
-- 
cgit v1.2.3


From 1a1c130ab7575498eed5bcf7220037ae09cd1f8a Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 23 Mar 2021 20:26:52 +0100
Subject: ACPI: tables: x86: Reserve memory occupied by ACPI tables

The following problem has been reported by George Kennedy:

 Since commit 7fef431be9c9 ("mm/page_alloc: place pages to tail
 in __free_pages_core()") the following use after free occurs
 intermittently when ACPI tables are accessed.

 BUG: KASAN: use-after-free in ibft_init+0x134/0xc49
 Read of size 4 at addr ffff8880be453004 by task swapper/0/1
 CPU: 3 PID: 1 Comm: swapper/0 Not tainted 5.12.0-rc1-7a7fd0d #1
 Call Trace:
  dump_stack+0xf6/0x158
  print_address_description.constprop.9+0x41/0x60
  kasan_report.cold.14+0x7b/0xd4
  __asan_report_load_n_noabort+0xf/0x20
  ibft_init+0x134/0xc49
  do_one_initcall+0xc4/0x3e0
  kernel_init_freeable+0x5af/0x66b
  kernel_init+0x16/0x1d0
  ret_from_fork+0x22/0x30

 ACPI tables mapped via kmap() do not have their mapped pages
 reserved and the pages can be "stolen" by the buddy allocator.

Apparently, on the affected system, the ACPI table in question is
not located in "reserved" memory, like ACPI NVS or ACPI Data, that
will not be used by the buddy allocator, so the memory occupied by
that table has to be explicitly reserved to prevent the buddy
allocator from using it.

In order to address this problem, rearrange the initialization of the
ACPI tables on x86 to locate the initial tables earlier and reserve
the memory occupied by them.

The other architectures using ACPI should not be affected by this
change.

Link: https://lore.kernel.org/linux-acpi/1614802160-29362-1-git-send-email-george.kennedy@oracle.com/
Reported-by: George Kennedy <george.kennedy@oracle.com>
Tested-by: George Kennedy <george.kennedy@oracle.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Mike Rapoport <rppt@linux.ibm.com>
Cc: 5.10+ <stable@vger.kernel.org> # 5.10+
---
 arch/x86/kernel/acpi/boot.c | 25 ++++++++++++-------------
 arch/x86/kernel/setup.c     |  8 +++-----
 drivers/acpi/tables.c       | 42 +++++++++++++++++++++++++++++++++++++++---
 include/linux/acpi.h        |  9 ++++++++-
 4 files changed, 62 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 7bdc0239a943..14cd3186dc77 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1554,10 +1554,18 @@ void __init acpi_boot_table_init(void)
 	/*
 	 * Initialize the ACPI boot-time table parser.
 	 */
-	if (acpi_table_init()) {
+	if (acpi_locate_initial_tables())
 		disable_acpi();
-		return;
-	}
+	else
+		acpi_reserve_initial_tables();
+}
+
+int __init early_acpi_boot_init(void)
+{
+	if (acpi_disabled)
+		return 1;
+
+	acpi_table_init_complete();
 
 	acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf);
 
@@ -1570,18 +1578,9 @@ void __init acpi_boot_table_init(void)
 		} else {
 			printk(KERN_WARNING PREFIX "Disabling ACPI support\n");
 			disable_acpi();
-			return;
+			return 1;
 		}
 	}
-}
-
-int __init early_acpi_boot_init(void)
-{
-	/*
-	 * If acpi_disabled, bail out
-	 */
-	if (acpi_disabled)
-		return 1;
 
 	/*
 	 * Process the Multiple APIC Description Table (MADT), if present
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index d883176ef2ce..5ecd69a48393 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1045,6 +1045,9 @@ void __init setup_arch(char **cmdline_p)
 
 	cleanup_highmap();
 
+	/* Look for ACPI tables and reserve memory occupied by them. */
+	acpi_boot_table_init();
+
 	memblock_set_current_limit(ISA_END_ADDRESS);
 	e820__memblock_setup();
 
@@ -1136,11 +1139,6 @@ void __init setup_arch(char **cmdline_p)
 
 	early_platform_quirks();
 
-	/*
-	 * Parse the ACPI tables for possible boot-time SMP configuration.
-	 */
-	acpi_boot_table_init();
-
 	early_acpi_boot_init();
 
 	initmem_init();
diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c
index e48690a006a4..9d581045acff 100644
--- a/drivers/acpi/tables.c
+++ b/drivers/acpi/tables.c
@@ -780,7 +780,7 @@ acpi_status acpi_os_table_override(struct acpi_table_header *existing_table,
 }
 
 /*
- * acpi_table_init()
+ * acpi_locate_initial_tables()
  *
  * find RSDP, find and checksum SDT/XSDT.
  * checksum all tables, print SDT/XSDT
@@ -788,7 +788,7 @@ acpi_status acpi_os_table_override(struct acpi_table_header *existing_table,
  * result: sdt_entry[] is initialized
  */
 
-int __init acpi_table_init(void)
+int __init acpi_locate_initial_tables(void)
 {
 	acpi_status status;
 
@@ -803,9 +803,45 @@ int __init acpi_table_init(void)
 	status = acpi_initialize_tables(initial_tables, ACPI_MAX_TABLES, 0);
 	if (ACPI_FAILURE(status))
 		return -EINVAL;
-	acpi_table_initrd_scan();
 
+	return 0;
+}
+
+void __init acpi_reserve_initial_tables(void)
+{
+	int i;
+
+	for (i = 0; i < ACPI_MAX_TABLES; i++) {
+		struct acpi_table_desc *table_desc = &initial_tables[i];
+		u64 start = table_desc->address;
+		u64 size = table_desc->length;
+
+		if (!start || !size)
+			break;
+
+		pr_info("Reserving %4s table memory at [mem 0x%llx-0x%llx]\n",
+			table_desc->signature.ascii, start, start + size - 1);
+
+		memblock_reserve(start, size);
+	}
+}
+
+void __init acpi_table_init_complete(void)
+{
+	acpi_table_initrd_scan();
 	check_multiple_madt();
+}
+
+int __init acpi_table_init(void)
+{
+	int ret;
+
+	ret = acpi_locate_initial_tables();
+	if (ret)
+		return ret;
+
+	acpi_table_init_complete();
+
 	return 0;
 }
 
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index fcdaab723916..3bdcfc4401b7 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -222,10 +222,14 @@ void __iomem *__acpi_map_table(unsigned long phys, unsigned long size);
 void __acpi_unmap_table(void __iomem *map, unsigned long size);
 int early_acpi_boot_init(void);
 int acpi_boot_init (void);
+void acpi_boot_table_prepare (void);
 void acpi_boot_table_init (void);
 int acpi_mps_check (void);
 int acpi_numa_init (void);
 
+int acpi_locate_initial_tables (void);
+void acpi_reserve_initial_tables (void);
+void acpi_table_init_complete (void);
 int acpi_table_init (void);
 int acpi_table_parse(char *id, acpi_tbl_table_handler handler);
 int __init acpi_table_parse_entries(char *id, unsigned long table_size,
@@ -814,9 +818,12 @@ static inline int acpi_boot_init(void)
 	return 0;
 }
 
+static inline void acpi_boot_table_prepare(void)
+{
+}
+
 static inline void acpi_boot_table_init(void)
 {
-	return;
 }
 
 static inline int acpi_mps_check(void)
-- 
cgit v1.2.3


From 8058dfa05ab765153f20020fc4ea3b296e391a00 Mon Sep 17 00:00:00 2001
From: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Date: Sat, 27 Mar 2021 07:56:42 +0100
Subject: soc: qcom: address kernel-doc warnings

The command:

  find ./include/linux/soc/qcom/ | xargs ./scripts/kernel-doc -none

reports:

  ./include/linux/soc/qcom/qmi.h:26: warning: cannot understand function prototype: 'struct qmi_header '
  ./include/linux/soc/qcom/qmi.h:101: warning: cannot understand function prototype: 'struct qmi_response_type_v01 '
  ./include/linux/soc/qcom/irq.h:19: warning: expecting prototype for QCOM specific IRQ domain flags that distinguishes the handling of wakeup(). Prototype was for IRQ_DOMAIN_FLAG_QCOM_PDC_WAKEUP() instead
  ./include/linux/soc/qcom/apr.h:126: warning: Function parameter or member '__apr_driver' not described in 'module_apr_driver'
  ./include/linux/soc/qcom/apr.h:126: warning: Excess function parameter '__aprbus_driver' description in 'module_apr_driver'
  ./include/linux/soc/qcom/llcc-qcom.h:43: warning: cannot understand function prototype: 'struct llcc_slice_desc '
  ./include/linux/soc/qcom/llcc-qcom.h:60: warning: cannot understand function prototype: 'struct llcc_edac_reg_data '
  ./include/linux/soc/qcom/llcc-qcom.h:86: warning: cannot understand function prototype: 'struct llcc_drv_data '

Address all those warnings by:
  - prefixing kernel-doc descriptions for structs with the keyword 'struct',
  - turning a kernel-doc comment that does not follow the kernel-doc syntax
  into a normal comment, and
  - correcting a parameter name in a kernel-doc comment.

Signed-off-by: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Link: https://lore.kernel.org/r/20210327065642.11969-3-lukas.bulwahn@gmail.com
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
---
 include/linux/soc/qcom/apr.h       | 2 +-
 include/linux/soc/qcom/irq.h       | 2 +-
 include/linux/soc/qcom/llcc-qcom.h | 6 +++---
 include/linux/soc/qcom/qmi.h       | 4 ++--
 4 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/soc/qcom/apr.h b/include/linux/soc/qcom/apr.h
index 7f0bc3cf4d61..137f9f2ac4c3 100644
--- a/include/linux/soc/qcom/apr.h
+++ b/include/linux/soc/qcom/apr.h
@@ -113,7 +113,7 @@ void apr_driver_unregister(struct apr_driver *drv);
 
 /**
  * module_apr_driver() - Helper macro for registering a aprbus driver
- * @__aprbus_driver: aprbus_driver struct
+ * @__apr_driver: apr_driver struct
  *
  * Helper macro for aprbus drivers which do not do anything special in
  * module init/exit. This eliminates a lot of boilerplate. Each module
diff --git a/include/linux/soc/qcom/irq.h b/include/linux/soc/qcom/irq.h
index 9e1ece58e55b..72b9231e9fdd 100644
--- a/include/linux/soc/qcom/irq.h
+++ b/include/linux/soc/qcom/irq.h
@@ -7,7 +7,7 @@
 
 #define GPIO_NO_WAKE_IRQ	~0U
 
-/**
+/*
  * QCOM specific IRQ domain flags that distinguishes the handling of wakeup
  * capable interrupts by different interrupt controllers.
  *
diff --git a/include/linux/soc/qcom/llcc-qcom.h b/include/linux/soc/qcom/llcc-qcom.h
index 64fc582ae415..437c9df13229 100644
--- a/include/linux/soc/qcom/llcc-qcom.h
+++ b/include/linux/soc/qcom/llcc-qcom.h
@@ -35,7 +35,7 @@
 #define LLCC_WRCACHE     31
 
 /**
- * llcc_slice_desc - Cache slice descriptor
+ * struct llcc_slice_desc - Cache slice descriptor
  * @slice_id: llcc slice id
  * @slice_size: Size allocated for the llcc slice
  */
@@ -45,7 +45,7 @@ struct llcc_slice_desc {
 };
 
 /**
- * llcc_edac_reg_data - llcc edac registers data for each error type
+ * struct llcc_edac_reg_data - llcc edac registers data for each error type
  * @name: Name of the error
  * @synd_reg: Syndrome register address
  * @count_status_reg: Status register address to read the error count
@@ -69,7 +69,7 @@ struct llcc_edac_reg_data {
 };
 
 /**
- * llcc_drv_data - Data associated with the llcc driver
+ * struct llcc_drv_data - Data associated with the llcc driver
  * @regmap: regmap associated with the llcc device
  * @bcast_regmap: regmap associated with llcc broadcast offset
  * @cfg: pointer to the data structure for slice configuration
diff --git a/include/linux/soc/qcom/qmi.h b/include/linux/soc/qcom/qmi.h
index e712f94b89fc..b1f80e756d2a 100644
--- a/include/linux/soc/qcom/qmi.h
+++ b/include/linux/soc/qcom/qmi.h
@@ -16,7 +16,7 @@
 struct socket;
 
 /**
- * qmi_header - wireformat header of QMI messages
+ * struct qmi_header - wireformat header of QMI messages
  * @type:	type of message
  * @txn_id:	transaction id
  * @msg_id:	message id
@@ -93,7 +93,7 @@ struct qmi_elem_info {
 #define QMI_ERR_NOT_SUPPORTED_V01		94
 
 /**
- * qmi_response_type_v01 - common response header (decoded)
+ * struct qmi_response_type_v01 - common response header (decoded)
  * @result:	result of the transaction
  * @error:	error value, when @result is QMI_RESULT_FAILURE_V01
  */
-- 
cgit v1.2.3


From 886ce97a36a05e7a9c9d5d894e72d31f50146f5d Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Tue, 23 Mar 2021 13:07:00 +0800
Subject: soundwire: add definition for DPn BlockPackingMode

For some reason we don't have an enum for this concept. Add
definitions following Table 102 of the SoundWire 1.2 specification.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Link: https://lore.kernel.org/r/20210323050701.23760-2-yung-chuan.liao@linux.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/soundwire/sdw.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index eaa1486bdca9..350436db6ddb 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -125,6 +125,12 @@ enum sdw_dpn_grouping {
 	SDW_BLK_GRP_CNT_4 = 3,
 };
 
+/* block packing mode enum */
+enum sdw_dpn_pkg_mode {
+	SDW_BLK_PKG_PER_PORT = 0,
+	SDW_BLK_PKG_PER_CHANNEL = 1
+};
+
 /**
  * enum sdw_stream_type: data stream type
  *
-- 
cgit v1.2.3


From f318482a1c57315d0efccd2861f153f55c2117c6 Mon Sep 17 00:00:00 2001
From: Marc Kleine-Budde <mkl@pengutronix.de>
Date: Fri, 19 Mar 2021 15:21:32 +0100
Subject: can: dev: can_free_echo_skb(): extend to return can frame length

In order to implement byte queue limits (bql) in CAN drivers, the
length of the CAN frame needs to be passed into the networking stack
even if the transmission failed for some reason.

To avoid calculating this length twice, extend can_free_echo_skb() to
return that value. Convert all users of this function, too.

This patch is the natural extension of commit:

| 9420e1d495e2 ("can: dev: can_get_echo_skb(): extend to return can
|                frame length")

Link: https://lore.kernel.org/r/20210319142700.305648-3-mkl@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/dev/skb.c                        | 11 +++++++++--
 drivers/net/can/grcan.c                          |  2 +-
 drivers/net/can/m_can/m_can.c                    |  2 +-
 drivers/net/can/rcar/rcar_can.c                  |  2 +-
 drivers/net/can/rcar/rcar_canfd.c                |  2 +-
 drivers/net/can/sja1000/sja1000.c                |  2 +-
 drivers/net/can/spi/hi311x.c                     |  2 +-
 drivers/net/can/spi/mcp251x.c                    |  2 +-
 drivers/net/can/usb/ems_usb.c                    |  2 +-
 drivers/net/can/usb/esd_usb2.c                   |  4 ++--
 drivers/net/can/usb/gs_usb.c                     |  2 +-
 drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c |  2 +-
 drivers/net/can/usb/mcba_usb.c                   |  2 +-
 drivers/net/can/usb/peak_usb/pcan_usb_core.c     |  2 +-
 drivers/net/can/usb/ucan.c                       |  6 +++---
 drivers/net/can/usb/usb_8dev.c                   |  2 +-
 include/linux/can/skb.h                          |  3 ++-
 17 files changed, 29 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/can/dev/skb.c b/drivers/net/can/dev/skb.c
index 2256391ddbb3..387c0bc0fb9c 100644
--- a/drivers/net/can/dev/skb.c
+++ b/drivers/net/can/dev/skb.c
@@ -153,7 +153,8 @@ EXPORT_SYMBOL_GPL(can_get_echo_skb);
  *
  * The function is typically called when TX failed.
  */
-void can_free_echo_skb(struct net_device *dev, unsigned int idx)
+void can_free_echo_skb(struct net_device *dev, unsigned int idx,
+		       unsigned int *frame_len_ptr)
 {
 	struct can_priv *priv = netdev_priv(dev);
 
@@ -164,7 +165,13 @@ void can_free_echo_skb(struct net_device *dev, unsigned int idx)
 	}
 
 	if (priv->echo_skb[idx]) {
-		dev_kfree_skb_any(priv->echo_skb[idx]);
+		struct sk_buff *skb = priv->echo_skb[idx];
+		struct can_skb_priv *can_skb_priv = can_skb_prv(skb);
+
+		if (frame_len_ptr)
+			*frame_len_ptr = can_skb_priv->frame_len;
+
+		dev_kfree_skb_any(skb);
 		priv->echo_skb[idx] = NULL;
 	}
 }
diff --git a/drivers/net/can/grcan.c b/drivers/net/can/grcan.c
index 4a8453290530..78e27940b2af 100644
--- a/drivers/net/can/grcan.c
+++ b/drivers/net/can/grcan.c
@@ -520,7 +520,7 @@ static int catch_up_echo_skb(struct net_device *dev, int budget, bool echo)
 			can_get_echo_skb(dev, i, NULL);
 		} else {
 			/* For cleanup of untransmitted messages */
-			can_free_echo_skb(dev, i);
+			can_free_echo_skb(dev, i, NULL);
 		}
 
 		priv->eskbp = grcan_ring_add(priv->eskbp, GRCAN_MSG_SIZE,
diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c
index 0c8d36bc668c..2ae3da16cbfe 100644
--- a/drivers/net/can/m_can/m_can.c
+++ b/drivers/net/can/m_can/m_can.c
@@ -425,7 +425,7 @@ static void m_can_clean(struct net_device *net)
 			putidx = ((m_can_read(cdev, M_CAN_TXFQS) &
 				   TXFQS_TFQPI_MASK) >> TXFQS_TFQPI_SHIFT);
 
-		can_free_echo_skb(cdev->net, putidx);
+		can_free_echo_skb(cdev->net, putidx, NULL);
 		cdev->tx_skb = NULL;
 	}
 }
diff --git a/drivers/net/can/rcar/rcar_can.c b/drivers/net/can/rcar/rcar_can.c
index 4870c4ea190a..00e4533c8bdd 100644
--- a/drivers/net/can/rcar/rcar_can.c
+++ b/drivers/net/can/rcar/rcar_can.c
@@ -217,7 +217,7 @@ static void tx_failure_cleanup(struct net_device *ndev)
 	int i;
 
 	for (i = 0; i < RCAR_CAN_FIFO_DEPTH; i++)
-		can_free_echo_skb(ndev, i);
+		can_free_echo_skb(ndev, i, NULL);
 }
 
 static void rcar_can_error(struct net_device *ndev)
diff --git a/drivers/net/can/rcar/rcar_canfd.c b/drivers/net/can/rcar/rcar_canfd.c
index d8d233e62990..311e6ca3bdc4 100644
--- a/drivers/net/can/rcar/rcar_canfd.c
+++ b/drivers/net/can/rcar/rcar_canfd.c
@@ -617,7 +617,7 @@ static void rcar_canfd_tx_failure_cleanup(struct net_device *ndev)
 	u32 i;
 
 	for (i = 0; i < RCANFD_FIFO_DEPTH; i++)
-		can_free_echo_skb(ndev, i);
+		can_free_echo_skb(ndev, i, NULL);
 }
 
 static int rcar_canfd_reset_controller(struct rcar_canfd_global *gpriv)
diff --git a/drivers/net/can/sja1000/sja1000.c b/drivers/net/can/sja1000/sja1000.c
index 9e86488ba55f..3fad54646746 100644
--- a/drivers/net/can/sja1000/sja1000.c
+++ b/drivers/net/can/sja1000/sja1000.c
@@ -525,7 +525,7 @@ irqreturn_t sja1000_interrupt(int irq, void *dev_id)
 			if (priv->can.ctrlmode & CAN_CTRLMODE_ONE_SHOT &&
 			    !(status & SR_TCS)) {
 				stats->tx_errors++;
-				can_free_echo_skb(dev, 0);
+				can_free_echo_skb(dev, 0, NULL);
 			} else {
 				/* transmission complete */
 				stats->tx_bytes +=
diff --git a/drivers/net/can/spi/hi311x.c b/drivers/net/can/spi/hi311x.c
index c3e020c90111..6f5d6d04a8b9 100644
--- a/drivers/net/can/spi/hi311x.c
+++ b/drivers/net/can/spi/hi311x.c
@@ -179,7 +179,7 @@ static void hi3110_clean(struct net_device *net)
 		net->stats.tx_errors++;
 	dev_kfree_skb(priv->tx_skb);
 	if (priv->tx_len)
-		can_free_echo_skb(priv->net, 0);
+		can_free_echo_skb(priv->net, 0, NULL);
 	priv->tx_skb = NULL;
 	priv->tx_len = 0;
 }
diff --git a/drivers/net/can/spi/mcp251x.c b/drivers/net/can/spi/mcp251x.c
index f69fb4238a65..80ab1593ca31 100644
--- a/drivers/net/can/spi/mcp251x.c
+++ b/drivers/net/can/spi/mcp251x.c
@@ -276,7 +276,7 @@ static void mcp251x_clean(struct net_device *net)
 		net->stats.tx_errors++;
 	dev_kfree_skb(priv->tx_skb);
 	if (priv->tx_len)
-		can_free_echo_skb(priv->net, 0);
+		can_free_echo_skb(priv->net, 0, NULL);
 	priv->tx_skb = NULL;
 	priv->tx_len = 0;
 }
diff --git a/drivers/net/can/usb/ems_usb.c b/drivers/net/can/usb/ems_usb.c
index 18f40eb20360..5af69787d9d5 100644
--- a/drivers/net/can/usb/ems_usb.c
+++ b/drivers/net/can/usb/ems_usb.c
@@ -807,7 +807,7 @@ static netdev_tx_t ems_usb_start_xmit(struct sk_buff *skb, struct net_device *ne
 
 	err = usb_submit_urb(urb, GFP_ATOMIC);
 	if (unlikely(err)) {
-		can_free_echo_skb(netdev, context->echo_index);
+		can_free_echo_skb(netdev, context->echo_index, NULL);
 
 		usb_unanchor_urb(urb);
 		usb_free_coherent(dev->udev, size, buf, urb->transfer_dma);
diff --git a/drivers/net/can/usb/esd_usb2.c b/drivers/net/can/usb/esd_usb2.c
index 562acbf454fd..65b58f8fc328 100644
--- a/drivers/net/can/usb/esd_usb2.c
+++ b/drivers/net/can/usb/esd_usb2.c
@@ -360,7 +360,7 @@ static void esd_usb2_tx_done_msg(struct esd_usb2_net_priv *priv,
 		can_get_echo_skb(netdev, context->echo_index, NULL);
 	} else {
 		stats->tx_errors++;
-		can_free_echo_skb(netdev, context->echo_index);
+		can_free_echo_skb(netdev, context->echo_index, NULL);
 	}
 
 	/* Release context */
@@ -793,7 +793,7 @@ static netdev_tx_t esd_usb2_start_xmit(struct sk_buff *skb,
 
 	err = usb_submit_urb(urb, GFP_ATOMIC);
 	if (err) {
-		can_free_echo_skb(netdev, context->echo_index);
+		can_free_echo_skb(netdev, context->echo_index, NULL);
 
 		atomic_dec(&priv->active_tx_jobs);
 		usb_unanchor_urb(urb);
diff --git a/drivers/net/can/usb/gs_usb.c b/drivers/net/can/usb/gs_usb.c
index a00dc1904415..5e892bef46b0 100644
--- a/drivers/net/can/usb/gs_usb.c
+++ b/drivers/net/can/usb/gs_usb.c
@@ -533,7 +533,7 @@ static netdev_tx_t gs_can_start_xmit(struct sk_buff *skb,
 	if (unlikely(rc)) {			/* usb send failed */
 		atomic_dec(&dev->active_tx_urbs);
 
-		can_free_echo_skb(netdev, idx);
+		can_free_echo_skb(netdev, idx, NULL);
 		gs_free_tx_context(txc);
 
 		usb_unanchor_urb(urb);
diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c
index 4e97da8434ab..90ebcae13409 100644
--- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c
+++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c
@@ -593,7 +593,7 @@ static netdev_tx_t kvaser_usb_start_xmit(struct sk_buff *skb,
 	if (unlikely(err)) {
 		spin_lock_irqsave(&priv->tx_contexts_lock, flags);
 
-		can_free_echo_skb(netdev, context->echo_index);
+		can_free_echo_skb(netdev, context->echo_index, NULL);
 		context->echo_index = dev->max_tx_urbs;
 		--priv->active_tx_contexts;
 		netif_wake_queue(netdev);
diff --git a/drivers/net/can/usb/mcba_usb.c b/drivers/net/can/usb/mcba_usb.c
index 1f649d178010..029e77dfa773 100644
--- a/drivers/net/can/usb/mcba_usb.c
+++ b/drivers/net/can/usb/mcba_usb.c
@@ -364,7 +364,7 @@ static netdev_tx_t mcba_usb_start_xmit(struct sk_buff *skb,
 	return NETDEV_TX_OK;
 
 xmit_failed:
-	can_free_echo_skb(priv->netdev, ctx->ndx);
+	can_free_echo_skb(priv->netdev, ctx->ndx, NULL);
 	mcba_usb_free_ctx(ctx);
 	dev_kfree_skb(skb);
 	stats->tx_dropped++;
diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_core.c b/drivers/net/can/usb/peak_usb/pcan_usb_core.c
index 573b11559d73..29227b5851fe 100644
--- a/drivers/net/can/usb/peak_usb/pcan_usb_core.c
+++ b/drivers/net/can/usb/peak_usb/pcan_usb_core.c
@@ -371,7 +371,7 @@ static netdev_tx_t peak_usb_ndo_start_xmit(struct sk_buff *skb,
 
 	err = usb_submit_urb(urb, GFP_ATOMIC);
 	if (err) {
-		can_free_echo_skb(netdev, context->echo_index);
+		can_free_echo_skb(netdev, context->echo_index, NULL);
 
 		usb_unanchor_urb(urb);
 
diff --git a/drivers/net/can/usb/ucan.c b/drivers/net/can/usb/ucan.c
index fa403c080871..11fddedc36d4 100644
--- a/drivers/net/can/usb/ucan.c
+++ b/drivers/net/can/usb/ucan.c
@@ -675,7 +675,7 @@ static void ucan_tx_complete_msg(struct ucan_priv *up,
 			can_get_echo_skb(up->netdev, echo_index, NULL);
 		} else {
 			up->netdev->stats.tx_dropped++;
-			can_free_echo_skb(up->netdev, echo_index);
+			can_free_echo_skb(up->netdev, echo_index, NULL);
 		}
 		spin_unlock_irqrestore(&up->echo_skb_lock, flags);
 	}
@@ -843,7 +843,7 @@ static void ucan_write_bulk_callback(struct urb *urb)
 
 		/* update counters an cleanup */
 		spin_lock_irqsave(&up->echo_skb_lock, flags);
-		can_free_echo_skb(up->netdev, context - up->context_array);
+		can_free_echo_skb(up->netdev, context - up->context_array, NULL);
 		spin_unlock_irqrestore(&up->echo_skb_lock, flags);
 
 		up->netdev->stats.tx_dropped++;
@@ -1157,7 +1157,7 @@ static netdev_tx_t ucan_start_xmit(struct sk_buff *skb,
 		 * frees the skb
 		 */
 		spin_lock_irqsave(&up->echo_skb_lock, flags);
-		can_free_echo_skb(up->netdev, echo_index);
+		can_free_echo_skb(up->netdev, echo_index, NULL);
 		spin_unlock_irqrestore(&up->echo_skb_lock, flags);
 
 		if (ret == -ENODEV) {
diff --git a/drivers/net/can/usb/usb_8dev.c b/drivers/net/can/usb/usb_8dev.c
index e8c42430a4fc..b6e7ef0d5bc6 100644
--- a/drivers/net/can/usb/usb_8dev.c
+++ b/drivers/net/can/usb/usb_8dev.c
@@ -691,7 +691,7 @@ nofreecontext:
 	return NETDEV_TX_BUSY;
 
 failed:
-	can_free_echo_skb(netdev, context->echo_index);
+	can_free_echo_skb(netdev, context->echo_index, NULL);
 
 	usb_unanchor_urb(urb);
 	usb_free_coherent(priv->udev, size, buf, urb->transfer_dma);
diff --git a/include/linux/can/skb.h b/include/linux/can/skb.h
index d438eb058069..d311bc369a39 100644
--- a/include/linux/can/skb.h
+++ b/include/linux/can/skb.h
@@ -23,7 +23,8 @@ struct sk_buff *__can_get_echo_skb(struct net_device *dev, unsigned int idx,
 				   u8 *len_ptr, unsigned int *frame_len_ptr);
 unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx,
 			      unsigned int *frame_len_ptr);
-void can_free_echo_skb(struct net_device *dev, unsigned int idx);
+void can_free_echo_skb(struct net_device *dev, unsigned int idx,
+		       unsigned int *frame_len_ptr);
 struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf);
 struct sk_buff *alloc_canfd_skb(struct net_device *dev,
 				struct canfd_frame **cfd);
-- 
cgit v1.2.3


From 289ea9e4ae595545e736a63ccaadba65f880e9a4 Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Date: Wed, 24 Feb 2021 09:20:04 +0900
Subject: can: add new CAN FD bittiming parameters: Transmitter Delay
 Compensation (TDC)

At high bit rates, the propagation delay from the TX pin to the RX pin
of the transceiver causes measurement errors: the sample point on the
RX pin might occur on the previous bit.

This issue is addressed in ISO 11898-1 section 11.3.3 "Transmitter
delay compensation" (TDC).

This patch adds two new structures: can_tdc and can_tdc_const in order
to implement this TDC.

The structures are then added to can_priv.

A controller supports TDC if and only if can_priv::tdc_const is not
NULL.

TDC is active if and only if:
  - fd flag is on
  - can_priv::tdc.tdco is not zero.
It is the driver's responsibility to check that those two conditions are met.

No new controller modes are introduced (i.e. no CAN_CTRL_MODE_TDC) in
order not to be redundant with above logic.

The names of the parameters are chosen to match existing CAN
controllers specification. References:
  - Bosch C_CAN FD8:
https://www.bosch-semiconductors.com/media/ip_modules/pdf_2/c_can_fd8/users_manual_c_can_fd8_r210_1.pdf
  - Microchip CAN FD Controller Module:
http://ww1.microchip.com/downloads/en/DeviceDoc/MCP251XXFD-CAN-FD-Controller-Module-Family-Reference-Manual-20005678B.pdf
  - SAM E701/S70/V70/V71 Family:
https://www.mouser.com/datasheet/2/268/60001527A-1284321.pdf

Link: https://lore.kernel.org/r/20210224002008.4158-2-mailhol.vincent@wanadoo.fr
Signed-off-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 include/linux/can/bittiming.h | 65 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/can/dev.h       |  3 ++
 2 files changed, 68 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/can/bittiming.h b/include/linux/can/bittiming.h
index 707575c668f4..b31a49f19b47 100644
--- a/include/linux/can/bittiming.h
+++ b/include/linux/can/bittiming.h
@@ -1,5 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /* Copyright (c) 2020 Pengutronix, Marc Kleine-Budde <kernel@pengutronix.de>
+ * Copyright (c) 2021 Vincent Mailhol <mailhol.vincent@wanadoo.fr>
  */
 
 #ifndef _CAN_BITTIMING_H
@@ -10,6 +11,70 @@
 
 #define CAN_SYNC_SEG 1
 
+/*
+ * struct can_tdc - CAN FD Transmission Delay Compensation parameters
+ *
+ * At high bit rates, the propagation delay from the TX pin to the RX
+ * pin of the transceiver causes measurement errors: the sample point
+ * on the RX pin might occur on the previous bit.
+ *
+ * To solve this issue, ISO 11898-1 introduces in section 11.3.3
+ * "Transmitter delay compensation" a SSP (Secondary Sample Point)
+ * equal to the distance, in time quanta, from the start of the bit
+ * time on the TX pin to the actual measurement on the RX pin.
+ *
+ * This structure contains the parameters to calculate that SSP.
+ *
+ * @tdcv: Transmitter Delay Compensation Value. Distance, in time
+ *	quanta, from when the bit is sent on the TX pin to when it is
+ *	received on the RX pin of the transmitter. Possible options:
+ *
+ *	  0: automatic mode. The controller dynamically measures @tdcv
+ *	  for each transmitted CAN FD frame.
+ *
+ *	  Other values: manual mode. Use the fixed provided value.
+ *
+ * @tdco: Transmitter Delay Compensation Offset. Offset value, in time
+ *	quanta, defining the distance between the start of the bit
+ *	reception on the RX pin of the transceiver and the SSP
+ *	position such as SSP = @tdcv + @tdco.
+ *
+ *	If @tdco is zero, then TDC is disabled and both @tdcv and
+ *	@tdcf should be ignored.
+ *
+ * @tdcf: Transmitter Delay Compensation Filter window. Defines the
+ *	minimum value for the SSP position in time quanta. If SSP is
+ *	less than @tdcf, then no delay compensations occur and the
+ *	normal sampling point is used instead. The feature is enabled
+ *	if and only if @tdcv is set to zero (automatic mode) and @tdcf
+ *	is configured to a value greater than @tdco.
+ */
+struct can_tdc {
+	u32 tdcv;
+	u32 tdco;
+	u32 tdcf;
+};
+
+/*
+ * struct can_tdc_const - CAN hardware-dependent constant for
+ *	Transmission Delay Compensation
+ *
+ * @tdcv_max: Transmitter Delay Compensation Value maximum value.
+ *	Should be set to zero if the controller does not support
+ *	manual mode for tdcv.
+ * @tdco_max: Transmitter Delay Compensation Offset maximum value.
+ *	Should not be zero. If the controller does not support TDC,
+ *	then the pointer to this structure should be NULL.
+ * @tdcf_max: Transmitter Delay Compensation Filter window maximum
+ *	value. Should be set to zero if the controller does not
+ *	support this feature.
+ */
+struct can_tdc_const {
+	u32 tdcv_max;
+	u32 tdco_max;
+	u32 tdcf_max;
+};
+
 #ifdef CONFIG_CAN_CALC_BITTIMING
 int can_calc_bittiming(struct net_device *dev, struct can_bittiming *bt,
 		       const struct can_bittiming_const *btc);
diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index ac4d83a1ab81..4795da0eb949 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -42,6 +42,9 @@ struct can_priv {
 	struct can_bittiming bittiming, data_bittiming;
 	const struct can_bittiming_const *bittiming_const,
 		*data_bittiming_const;
+	struct can_tdc tdc;
+	const struct can_tdc_const *tdc_const;
+
 	const u16 *termination_const;
 	unsigned int termination_const_cnt;
 	u16 termination;
-- 
cgit v1.2.3


From 4c9258dd26fdb3bacb35e767fa55c9a03a78a08e Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Date: Wed, 24 Feb 2021 09:20:05 +0900
Subject: can: dev: reorder struct can_priv members for better packing

Save eight bytes of holes on x86-64 architectures by reordering struct
can_priv members.

Before:

$ pahole -C can_priv drivers/net/can/dev/dev.o
struct can_priv {
	struct net_device *        dev;                  /*     0     8 */
	struct can_device_stats    can_stats;            /*     8    24 */
	struct can_bittiming       bittiming;            /*    32    32 */
	/* --- cacheline 1 boundary (64 bytes) --- */
	struct can_bittiming       data_bittiming;       /*    64    32 */
	const struct can_bittiming_const  * bittiming_const; /*    96     8 */
	const struct can_bittiming_const  * data_bittiming_const; /*   104     8 */
	struct can_tdc             tdc;                  /*   112    12 */

	/* XXX 4 bytes hole, try to pack */

	/* --- cacheline 2 boundary (128 bytes) --- */
	const struct can_tdc_const  * tdc_const;         /*   128     8 */
	const u16  *               termination_const;    /*   136     8 */
	unsigned int               termination_const_cnt; /*   144     4 */
	u16                        termination;          /*   148     2 */

	/* XXX 2 bytes hole, try to pack */

	const u32  *               bitrate_const;        /*   152     8 */
	unsigned int               bitrate_const_cnt;    /*   160     4 */

	/* XXX 4 bytes hole, try to pack */

	const u32  *               data_bitrate_const;   /*   168     8 */
	unsigned int               data_bitrate_const_cnt; /*   176     4 */
	u32                        bitrate_max;          /*   180     4 */
	struct can_clock           clock;                /*   184     4 */
	enum can_state             state;                /*   188     4 */
	/* --- cacheline 3 boundary (192 bytes) --- */
	u32                        ctrlmode;             /*   192     4 */
	u32                        ctrlmode_supported;   /*   196     4 */
	u32                        ctrlmode_static;      /*   200     4 */
	int                        restart_ms;           /*   204     4 */
	struct delayed_work        restart_work;         /*   208   168 */

	/* XXX last struct has 4 bytes of padding */

	/* --- cacheline 5 boundary (320 bytes) was 56 bytes ago --- */
	int                        (*do_set_bittiming)(struct net_device *); /*   376     8 */
	/* --- cacheline 6 boundary (384 bytes) --- */
	int                        (*do_set_data_bittiming)(struct net_device *); /*   384     8 */
	int                        (*do_set_mode)(struct net_device *, enum can_mode); /*   392     8 */
	int                        (*do_set_termination)(struct net_device *, u16); /*   400     8 */
	int                        (*do_get_state)(const struct net_device  *, enum can_state *); /*   408     8 */
	int                        (*do_get_berr_counter)(const struct net_device  *, struct can_berr_counter *); /*   416     8 */
	unsigned int               echo_skb_max;         /*   424     4 */

	/* XXX 4 bytes hole, try to pack */

	struct sk_buff * *         echo_skb;             /*   432     8 */

	/* size: 440, cachelines: 7, members: 31 */
	/* sum members: 426, holes: 4, sum holes: 14 */
	/* paddings: 1, sum paddings: 4 */
	/* last cacheline: 56 bytes */
};

After:

$ pahole -C can_priv drivers/net/can/dev/dev.o
struct can_priv {
	struct net_device *        dev;                  /*     0     8 */
	struct can_device_stats    can_stats;            /*     8    24 */
	const struct can_bittiming_const  * bittiming_const; /*    32     8 */
	const struct can_bittiming_const  * data_bittiming_const; /*    40     8 */
	struct can_bittiming       bittiming;            /*    48    32 */
	/* --- cacheline 1 boundary (64 bytes) was 16 bytes ago --- */
	struct can_bittiming       data_bittiming;       /*    80    32 */
	const struct can_tdc_const  * tdc_const;         /*   112     8 */
	struct can_tdc             tdc;                  /*   120    12 */
	/* --- cacheline 2 boundary (128 bytes) was 4 bytes ago --- */
	unsigned int               bitrate_const_cnt;    /*   132     4 */
	const u32  *               bitrate_const;        /*   136     8 */
	const u32  *               data_bitrate_const;   /*   144     8 */
	unsigned int               data_bitrate_const_cnt; /*   152     4 */
	u32                        bitrate_max;          /*   156     4 */
	struct can_clock           clock;                /*   160     4 */
	unsigned int               termination_const_cnt; /*   164     4 */
	const u16  *               termination_const;    /*   168     8 */
	u16                        termination;          /*   176     2 */

	/* XXX 2 bytes hole, try to pack */

	enum can_state             state;                /*   180     4 */
	u32                        ctrlmode;             /*   184     4 */
	u32                        ctrlmode_supported;   /*   188     4 */
	/* --- cacheline 3 boundary (192 bytes) --- */
	u32                        ctrlmode_static;      /*   192     4 */
	int                        restart_ms;           /*   196     4 */
	struct delayed_work        restart_work;         /*   200   168 */

	/* XXX last struct has 4 bytes of padding */

	/* --- cacheline 5 boundary (320 bytes) was 48 bytes ago --- */
	int                        (*do_set_bittiming)(struct net_device *); /*   368     8 */
	int                        (*do_set_data_bittiming)(struct net_device *); /*   376     8 */
	/* --- cacheline 6 boundary (384 bytes) --- */
	int                        (*do_set_mode)(struct net_device *, enum can_mode); /*   384     8 */
	int                        (*do_set_termination)(struct net_device *, u16); /*   392     8 */
	int                        (*do_get_state)(const struct net_device  *, enum can_state *); /*   400     8 */
	int                        (*do_get_berr_counter)(const struct net_device  *, struct can_berr_counter *); /*   408     8 */
	unsigned int               echo_skb_max;         /*   416     4 */

	/* XXX 4 bytes hole, try to pack */

	struct sk_buff * *         echo_skb;             /*   424     8 */

	/* size: 432, cachelines: 7, members: 31 */
	/* sum members: 426, holes: 2, sum holes: 6 */
	/* paddings: 1, sum paddings: 4 */
	/* last cacheline: 48 bytes */
};

Link: https://lore.kernel.org/r/20210224002008.4158-3-mailhol.vincent@wanadoo.fr
Signed-off-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 include/linux/can/dev.h | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index 4795da0eb949..27b275e463da 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -39,22 +39,23 @@ struct can_priv {
 	struct net_device *dev;
 	struct can_device_stats can_stats;
 
-	struct can_bittiming bittiming, data_bittiming;
 	const struct can_bittiming_const *bittiming_const,
 		*data_bittiming_const;
-	struct can_tdc tdc;
+	struct can_bittiming bittiming, data_bittiming;
 	const struct can_tdc_const *tdc_const;
+	struct can_tdc tdc;
 
-	const u16 *termination_const;
-	unsigned int termination_const_cnt;
-	u16 termination;
-	const u32 *bitrate_const;
 	unsigned int bitrate_const_cnt;
+	const u32 *bitrate_const;
 	const u32 *data_bitrate_const;
 	unsigned int data_bitrate_const_cnt;
 	u32 bitrate_max;
 	struct can_clock clock;
 
+	unsigned int termination_const_cnt;
+	const u16 *termination_const;
+	u16 termination;
+
 	enum can_state state;
 
 	/* CAN controller features - see include/uapi/linux/can/netlink.h */
-- 
cgit v1.2.3


From c25cc7993243fdc00ab7e608e3764819538015ab Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Date: Wed, 24 Feb 2021 09:20:08 +0900
Subject: can: bittiming: add calculation for CAN FD Transmitter Delay
 Compensation (TDC)

The logic for the tdco calculation is to just reuse the normal sample
point: tdco = sp. Because the sample point is expressed in tenths of a
percent and the tdco is expressed in time quanta, a conversion is
needed.

At the end,
     ssp = tdcv + tdco
         = tdcv + sp.

Another popular method is to set tdco to the middle of the bit:
     tdc->tdco = can_bit_time(dbt) / 2
During benchmark tests, we could not find a clear advantage for either
of the two methods.

The tdco calculation is triggered each time the data_bittiming is
changed so that users relying on automated calculation can use the
netlink interface the exact same way without need of new parameters.
For example, a command such as:
	ip link set canX type can bitrate 500000 dbitrate 4000000 fd on
would trigger the calculation.

The user using CONFIG_CAN_CALC_BITTIMING who does not want automated
calculation needs to manually set tdco to zero.
For example with:
	ip link set canX type can tdco 0 bitrate 500000 dbitrate 4000000 fd on
(if the tdco parameter is provided in a previous command, it will be
overwritten).

If tdcv is set to zero (default), it is automatically calculated by
the transceiver for each frame. As such, there is no code in the kernel
to calculate it.

tdcf has no automated calculation functions because we could not
figure out a formula for this parameter.

Link: https://lore.kernel.org/r/20210224002008.4158-6-mailhol.vincent@wanadoo.fr
Signed-off-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/dev/bittiming.c | 24 ++++++++++++++++++++++++
 drivers/net/can/dev/netlink.c   |  2 ++
 include/linux/can/bittiming.h   |  6 ++++++
 3 files changed, 32 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/can/dev/bittiming.c b/drivers/net/can/dev/bittiming.c
index f7fe226bb395..2907e60c9a57 100644
--- a/drivers/net/can/dev/bittiming.c
+++ b/drivers/net/can/dev/bittiming.c
@@ -174,6 +174,30 @@ int can_calc_bittiming(struct net_device *dev, struct can_bittiming *bt,
 
 	return 0;
 }
+
+void can_calc_tdco(struct net_device *dev)
+{
+	struct can_priv *priv = netdev_priv(dev);
+	const struct can_bittiming *dbt = &priv->data_bittiming;
+	struct can_tdc *tdc = &priv->tdc;
+	const struct can_tdc_const *tdc_const = priv->tdc_const;
+
+	if (!tdc_const)
+		return;
+
+	/* As specified in ISO 11898-1 section 11.3.3 "Transmitter
+	 * delay compensation" (TDC) is only applicable if data BRP is
+	 * one or two.
+	 */
+	if (dbt->brp == 1 || dbt->brp == 2) {
+		/* Reuse "normal" sample point and convert it to time quanta */
+		u32 sample_point_in_tq = can_bit_time(dbt) * dbt->sample_point / 1000;
+
+		tdc->tdco = min(sample_point_in_tq, tdc_const->tdco_max);
+	} else {
+		tdc->tdco = 0;
+	}
+}
 #endif /* CONFIG_CAN_CALC_BITTIMING */
 
 /* Checks the validity of the specified bit-timing parameters prop_seg,
diff --git a/drivers/net/can/dev/netlink.c b/drivers/net/can/dev/netlink.c
index 8443480a703d..e38c2566aff4 100644
--- a/drivers/net/can/dev/netlink.c
+++ b/drivers/net/can/dev/netlink.c
@@ -186,6 +186,8 @@ static int can_changelink(struct net_device *dev, struct nlattr *tb[],
 
 		memcpy(&priv->data_bittiming, &dbt, sizeof(dbt));
 
+		can_calc_tdco(dev);
+
 		if (priv->do_set_data_bittiming) {
 			/* Finally, set the bit-timing registers */
 			err = priv->do_set_data_bittiming(dev);
diff --git a/include/linux/can/bittiming.h b/include/linux/can/bittiming.h
index b31a49f19b47..3c4cad7b52c0 100644
--- a/include/linux/can/bittiming.h
+++ b/include/linux/can/bittiming.h
@@ -78,6 +78,8 @@ struct can_tdc_const {
 #ifdef CONFIG_CAN_CALC_BITTIMING
 int can_calc_bittiming(struct net_device *dev, struct can_bittiming *bt,
 		       const struct can_bittiming_const *btc);
+
+void can_calc_tdco(struct net_device *dev);
 #else /* !CONFIG_CAN_CALC_BITTIMING */
 static inline int
 can_calc_bittiming(struct net_device *dev, struct can_bittiming *bt,
@@ -86,6 +88,10 @@ can_calc_bittiming(struct net_device *dev, struct can_bittiming *bt,
 	netdev_err(dev, "bit-timing calculation not available\n");
 	return -EINVAL;
 }
+
+static inline void can_calc_tdco(struct net_device *dev)
+{
+}
 #endif /* CONFIG_CAN_CALC_BITTIMING */
 
 int can_get_bittiming(struct net_device *dev, struct can_bittiming *bt,
-- 
cgit v1.2.3


From 1d7750760b70ba8b0e641146eee1b3a343d1b292 Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Date: Sat, 6 Mar 2021 14:40:40 +0900
Subject: can: bittiming: add CAN_KBPS, CAN_MBPS and CAN_MHZ macros

Add three macros to improve the readability of big bit timing numbers:
  - CAN_KBPS: kilobits per second (one thousand)
  - CAN_MBPS: megabits per second (one million)
  - CAN_MHZ: megahertz (one million)

Example:
	u32 bitrate_max = 8 * CAN_MBPS;
	struct can_clock clock = {.freq = 80 * CAN_MHZ};
instead of:
	u32 bitrate_max = 8000000;
	struct can_clock clock = {.freq = 80000000};

Apply the new macros to drivers/net/can/dev/bittiming.c.

Link: https://lore.kernel.org/r/20210306054040.76483-1-mailhol.vincent@wanadoo.fr
Signed-off-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/dev/bittiming.c | 4 ++--
 include/linux/can/bittiming.h   | 8 ++++++++
 2 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/can/dev/bittiming.c b/drivers/net/can/dev/bittiming.c
index 2907e60c9a57..f49170eadd54 100644
--- a/drivers/net/can/dev/bittiming.c
+++ b/drivers/net/can/dev/bittiming.c
@@ -81,9 +81,9 @@ int can_calc_bittiming(struct net_device *dev, struct can_bittiming *bt,
 	if (bt->sample_point) {
 		sample_point_nominal = bt->sample_point;
 	} else {
-		if (bt->bitrate > 800000)
+		if (bt->bitrate > 800 * CAN_KBPS)
 			sample_point_nominal = 750;
-		else if (bt->bitrate > 500000)
+		else if (bt->bitrate > 500 * CAN_KBPS)
 			sample_point_nominal = 800;
 		else
 			sample_point_nominal = 875;
diff --git a/include/linux/can/bittiming.h b/include/linux/can/bittiming.h
index 3c4cad7b52c0..ae7a3411167c 100644
--- a/include/linux/can/bittiming.h
+++ b/include/linux/can/bittiming.h
@@ -11,6 +11,14 @@
 
 #define CAN_SYNC_SEG 1
 
+
+/* Kilobits and Megabits per second */
+#define CAN_KBPS 1000UL
+#define CAN_MBPS 1000000UL
+
+/* Megahertz */
+#define CAN_MHZ 1000000UL
+
 /*
  * struct can_tdc - CAN FD Transmission Delay Compensation parameters
  *
-- 
cgit v1.2.3


From e9ce2ce17da626d930812199568bd426b2832f57 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Fri, 12 Feb 2021 14:16:10 +0100
Subject: mmc: core: Drop reference counting of the bus_ops

When the mmc_rescan work is enabled for execution (host->rescan_disable),
it's the only instance per mmc host that is allowed to set/clear the
host->bus_ops pointer.

Besides the mmc_rescan work, there are a couple of scenarios when the
host->bus_ops pointer may be accessed. Typically, those can be described
as below:

*)
Upper mmc driver layers (like the mmc block device driver or an SDIO
functional driver) need to execute a host->bus_ops callback. This can be
considered as safe without having to use some special locking mechanism,
because they operate on top of the struct mmc_card. As long as there is a
card to operate upon, the mmc core guarantees that there is a host->bus_ops
assigned as well. Note that upper layer mmc drivers are of course
responsible for cleaning up after themselves in their ->remove() callbacks,
otherwise things would fall apart anyway.

**)
Via the mmc host instance, we may need to force a removal of an inserted
mmc card. This happens when a mmc host driver gets unbound, for example. In
this case, we protect the host->bus_ops pointer from concurrent accesses,
by disabling the mmc_rescan work upfront (host->rescan_disable). See
mmc_stop_host() for example.

This said, it seems like the reference counting of the host->bus_ops
pointer has at some point become superfluous. As this is an old mechanism
of the mmc core, it is a bit difficult to digest the history of when that
could have happened. However, let's drop the reference counting to avoid
unnecessary code-paths and locking.

Cc: Pierre Ossman <pierre@ossman.eu>
Cc: Russell King <linux@armlinux.org.uk>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Adrian Hunter <adrian.hunter@intel.com>
Link: https://lore.kernel.org/r/20210212131610.236843-1-ulf.hansson@linaro.org
---
 drivers/mmc/core/core.c  | 89 +++---------------------------------------------
 include/linux/mmc/host.h |  2 --
 2 files changed, 4 insertions(+), 87 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c
index 99f0df9e0491..9c13f7a52699 100644
--- a/drivers/mmc/core/core.c
+++ b/drivers/mmc/core/core.c
@@ -1377,63 +1377,13 @@ void mmc_power_cycle(struct mmc_host *host, u32 ocr)
 	mmc_power_up(host, ocr);
 }
 
-/*
- * Cleanup when the last reference to the bus operator is dropped.
- */
-static void __mmc_release_bus(struct mmc_host *host)
-{
-	WARN_ON(!host->bus_dead);
-
-	host->bus_ops = NULL;
-}
-
-/*
- * Increase reference count of bus operator
- */
-static inline void mmc_bus_get(struct mmc_host *host)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&host->lock, flags);
-	host->bus_refs++;
-	spin_unlock_irqrestore(&host->lock, flags);
-}
-
-/*
- * Decrease reference count of bus operator and free it if
- * it is the last reference.
- */
-static inline void mmc_bus_put(struct mmc_host *host)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&host->lock, flags);
-	host->bus_refs--;
-	if ((host->bus_refs == 0) && host->bus_ops)
-		__mmc_release_bus(host);
-	spin_unlock_irqrestore(&host->lock, flags);
-}
-
 /*
  * Assign a mmc bus handler to a host. Only one bus handler may control a
  * host at any given time.
  */
 void mmc_attach_bus(struct mmc_host *host, const struct mmc_bus_ops *ops)
 {
-	unsigned long flags;
-
-	WARN_ON(!host->claimed);
-
-	spin_lock_irqsave(&host->lock, flags);
-
-	WARN_ON(host->bus_ops);
-	WARN_ON(host->bus_refs);
-
 	host->bus_ops = ops;
-	host->bus_refs = 1;
-	host->bus_dead = 0;
-
-	spin_unlock_irqrestore(&host->lock, flags);
 }
 
 /*
@@ -1441,18 +1391,7 @@ void mmc_attach_bus(struct mmc_host *host, const struct mmc_bus_ops *ops)
  */
 void mmc_detach_bus(struct mmc_host *host)
 {
-	unsigned long flags;
-
-	WARN_ON(!host->claimed);
-	WARN_ON(!host->bus_ops);
-
-	spin_lock_irqsave(&host->lock, flags);
-
-	host->bus_dead = 1;
-
-	spin_unlock_irqrestore(&host->lock, flags);
-
-	mmc_bus_put(host);
+	host->bus_ops = NULL;
 }
 
 void _mmc_detect_change(struct mmc_host *host, unsigned long delay, bool cd_irq)
@@ -2244,32 +2183,15 @@ void mmc_rescan(struct work_struct *work)
 		host->trigger_card_event = false;
 	}
 
-	mmc_bus_get(host);
-
 	/* Verify a registered card to be functional, else remove it. */
-	if (host->bus_ops && !host->bus_dead)
+	if (host->bus_ops)
 		host->bus_ops->detect(host);
 
 	host->detect_change = 0;
 
-	/*
-	 * Let mmc_bus_put() free the bus/bus_ops if we've found that
-	 * the card is no longer present.
-	 */
-	mmc_bus_put(host);
-	mmc_bus_get(host);
-
 	/* if there still is a card present, stop here */
-	if (host->bus_ops != NULL) {
-		mmc_bus_put(host);
+	if (host->bus_ops != NULL)
 		goto out;
-	}
-
-	/*
-	 * Only we can add a new handler, so it's safe to
-	 * release the lock here.
-	 */
-	mmc_bus_put(host);
 
 	mmc_claim_host(host);
 	if (mmc_card_is_removable(host) && host->ops->get_cd &&
@@ -2332,18 +2254,15 @@ void mmc_stop_host(struct mmc_host *host)
 	/* clear pm flags now and let card drivers set them as needed */
 	host->pm_flags = 0;
 
-	mmc_bus_get(host);
-	if (host->bus_ops && !host->bus_dead) {
+	if (host->bus_ops) {
 		/* Calling bus_ops->remove() with a claimed host can deadlock */
 		host->bus_ops->remove(host);
 		mmc_claim_host(host);
 		mmc_detach_bus(host);
 		mmc_power_off(host);
 		mmc_release_host(host);
-		mmc_bus_put(host);
 		return;
 	}
-	mmc_bus_put(host);
 
 	mmc_claim_host(host);
 	mmc_power_off(host);
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 26a3c7bc29ae..a001ad2f5f23 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -423,7 +423,6 @@ struct mmc_host {
 	/* group bitfields together to minimize padding */
 	unsigned int		use_spi_crc:1;
 	unsigned int		claimed:1;	/* host exclusively claimed */
-	unsigned int		bus_dead:1;	/* bus has been released */
 	unsigned int		doing_init_tune:1; /* initial tuning in progress */
 	unsigned int		can_retune:1;	/* re-tuning can be used */
 	unsigned int		doing_retune:1;	/* re-tuning in progress */
@@ -454,7 +453,6 @@ struct mmc_host {
 	struct mmc_slot		slot;
 
 	const struct mmc_bus_ops *bus_ops;	/* current bus driver */
-	unsigned int		bus_refs;	/* reference counter */
 
 	unsigned int		sdio_irqs;
 	struct task_struct	*sdio_irq_thread;
-- 
cgit v1.2.3


From 18bbda900ffa7770b93daa1bc1ce3be39e643101 Mon Sep 17 00:00:00 2001
From: Jisheng Zhang <Jisheng.Zhang@synaptics.com>
Date: Thu, 11 Mar 2021 17:41:57 +0800
Subject: mmc: sdio: fix a typo in the comment of SDIO_SD_REV_3_00

I believe "Spev" is a typo; it should be "Spec".

Signed-off-by: Jisheng Zhang <Jisheng.Zhang@synaptics.com>
Link: https://lore.kernel.org/r/20210311174157.561dada9@xhacker.debian
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 include/linux/mmc/sdio.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mmc/sdio.h b/include/linux/mmc/sdio.h
index e28769991e82..2a05d1ac4f0e 100644
--- a/include/linux/mmc/sdio.h
+++ b/include/linux/mmc/sdio.h
@@ -82,7 +82,7 @@
 #define  SDIO_SD_REV_1_01	0	/* SD Physical Spec Version 1.01 */
 #define  SDIO_SD_REV_1_10	1	/* SD Physical Spec Version 1.10 */
 #define  SDIO_SD_REV_2_00	2	/* SD Physical Spec Version 2.00 */
-#define  SDIO_SD_REV_3_00	3	/* SD Physical Spev Version 3.00 */
+#define  SDIO_SD_REV_3_00	3	/* SD Physical Spec Version 3.00 */
 
 #define SDIO_CCCR_IOEx		0x02
 #define SDIO_CCCR_IORx		0x03
-- 
cgit v1.2.3


From 17a17bf50612e6048a9975450cf1bd30f93815b5 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Wed, 10 Mar 2021 16:29:00 +0100
Subject: mmc: core: Fix hanging on I/O during system suspend for removable
 cards

During system suspend, the mmc core uses a PM notifier to temporarily turn
off the card detection mechanism for removal/insertion of (e)MMC/SD/SDIO
cards. Additionally, the notifier may be used to remove an SDIO card
entirely, if a corresponding SDIO functional driver doesn't have the system
suspend/resume callbacks assigned. This behaviour has been around for a
very long time.

However, a recent bug report tells us there are problems with this
approach. More precisely, when receiving the PM_SUSPEND_PREPARE
notification, we may end up hanging on I/O to be completed, thus also
preventing the system from getting suspended.

In the end what happens, is that the cancel_delayed_work_sync() in
mmc_pm_notify() ends up waiting for mmc_rescan() to complete - and since
mmc_rescan() wants to claim the host, it needs to wait for the I/O to be
completed first.

Typically, this problem is triggered in Android, if there is ongoing I/O
while the user decides to suspend, resume and then suspend the system
again. This is because, after the resume, an mmc_rescan() work gets punted
to the workqueue, whose job is to verify that the card remains inserted
after the system has resumed.

To fix this problem, userspace needs to become frozen to suspend the I/O,
prior to turning off the card detection mechanism. Therefore, let's drop
the PM notifiers for mmc subsystem altogether and rely on the card
detection to be turned off/on as a part of the system_freezable_wq, that we
are already using.

Moreover, to allow an SDIO card to be removed during system suspend, let's
manage this from a ->prepare() callback, assigned at the mmc_host_class
level. In this way, we can use the parent device (the mmc_host_class
device), to remove the card device that is the child, in the
device_prepare() phase.

Reported-by: Kiwoong Kim <kwmad.kim@samsung.com>
Cc: stable@vger.kernel.org # v4.5+
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20210310152900.149380-1-ulf.hansson@linaro.org
Reviewed-by: Kiwoong Kim <kwmad.kim@samsung.com>
---
 drivers/mmc/core/core.c  | 74 ------------------------------------------------
 drivers/mmc/core/core.h  |  8 ------
 drivers/mmc/core/host.c  | 40 ++++++++++++++++++++++++--
 drivers/mmc/core/sdio.c  | 28 ++++++++++++++----
 include/linux/mmc/host.h |  3 --
 5 files changed, 59 insertions(+), 94 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c
index 9c13f7a52699..f194940c5974 100644
--- a/drivers/mmc/core/core.c
+++ b/drivers/mmc/core/core.c
@@ -2269,80 +2269,6 @@ void mmc_stop_host(struct mmc_host *host)
 	mmc_release_host(host);
 }
 
-#ifdef CONFIG_PM_SLEEP
-/* Do the card removal on suspend if card is assumed removeable
- * Do that in pm notifier while userspace isn't yet frozen, so we will be able
-   to sync the card.
-*/
-static int mmc_pm_notify(struct notifier_block *notify_block,
-			unsigned long mode, void *unused)
-{
-	struct mmc_host *host = container_of(
-		notify_block, struct mmc_host, pm_notify);
-	unsigned long flags;
-	int err = 0;
-
-	switch (mode) {
-	case PM_HIBERNATION_PREPARE:
-	case PM_SUSPEND_PREPARE:
-	case PM_RESTORE_PREPARE:
-		spin_lock_irqsave(&host->lock, flags);
-		host->rescan_disable = 1;
-		spin_unlock_irqrestore(&host->lock, flags);
-		cancel_delayed_work_sync(&host->detect);
-
-		if (!host->bus_ops)
-			break;
-
-		/* Validate prerequisites for suspend */
-		if (host->bus_ops->pre_suspend)
-			err = host->bus_ops->pre_suspend(host);
-		if (!err)
-			break;
-
-		if (!mmc_card_is_removable(host)) {
-			dev_warn(mmc_dev(host),
-				 "pre_suspend failed for non-removable host: "
-				 "%d\n", err);
-			/* Avoid removing non-removable hosts */
-			break;
-		}
-
-		/* Calling bus_ops->remove() with a claimed host can deadlock */
-		host->bus_ops->remove(host);
-		mmc_claim_host(host);
-		mmc_detach_bus(host);
-		mmc_power_off(host);
-		mmc_release_host(host);
-		host->pm_flags = 0;
-		break;
-
-	case PM_POST_SUSPEND:
-	case PM_POST_HIBERNATION:
-	case PM_POST_RESTORE:
-
-		spin_lock_irqsave(&host->lock, flags);
-		host->rescan_disable = 0;
-		spin_unlock_irqrestore(&host->lock, flags);
-		_mmc_detect_change(host, 0, false);
-
-	}
-
-	return 0;
-}
-
-void mmc_register_pm_notifier(struct mmc_host *host)
-{
-	host->pm_notify.notifier_call = mmc_pm_notify;
-	register_pm_notifier(&host->pm_notify);
-}
-
-void mmc_unregister_pm_notifier(struct mmc_host *host)
-{
-	unregister_pm_notifier(&host->pm_notify);
-}
-#endif
-
 static int __init mmc_init(void)
 {
 	int ret;
diff --git a/drivers/mmc/core/core.h b/drivers/mmc/core/core.h
index 575ac0257af2..8032451abaea 100644
--- a/drivers/mmc/core/core.h
+++ b/drivers/mmc/core/core.h
@@ -93,14 +93,6 @@ int mmc_execute_tuning(struct mmc_card *card);
 int mmc_hs200_to_hs400(struct mmc_card *card);
 int mmc_hs400_to_hs200(struct mmc_card *card);
 
-#ifdef CONFIG_PM_SLEEP
-void mmc_register_pm_notifier(struct mmc_host *host);
-void mmc_unregister_pm_notifier(struct mmc_host *host);
-#else
-static inline void mmc_register_pm_notifier(struct mmc_host *host) { }
-static inline void mmc_unregister_pm_notifier(struct mmc_host *host) { }
-#endif
-
 void mmc_wait_for_req_done(struct mmc_host *host, struct mmc_request *mrq);
 bool mmc_is_req_done(struct mmc_host *host, struct mmc_request *mrq);
 
diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c
index 9b89a91b6b47..fe05b3645fe9 100644
--- a/drivers/mmc/core/host.c
+++ b/drivers/mmc/core/host.c
@@ -35,6 +35,42 @@
 
 static DEFINE_IDA(mmc_host_ida);
 
+#ifdef CONFIG_PM_SLEEP
+static int mmc_host_class_prepare(struct device *dev)
+{
+	struct mmc_host *host = cls_dev_to_mmc_host(dev);
+
+	/*
+	 * It's safe to access the bus_ops pointer, as both userspace and the
+	 * workqueue for detecting cards are frozen at this point.
+	 */
+	if (!host->bus_ops)
+		return 0;
+
+	/* Validate conditions for system suspend. */
+	if (host->bus_ops->pre_suspend)
+		return host->bus_ops->pre_suspend(host);
+
+	return 0;
+}
+
+static void mmc_host_class_complete(struct device *dev)
+{
+	struct mmc_host *host = cls_dev_to_mmc_host(dev);
+
+	_mmc_detect_change(host, 0, false);
+}
+
+static const struct dev_pm_ops mmc_host_class_dev_pm_ops = {
+	.prepare = mmc_host_class_prepare,
+	.complete = mmc_host_class_complete,
+};
+
+#define MMC_HOST_CLASS_DEV_PM_OPS (&mmc_host_class_dev_pm_ops)
+#else
+#define MMC_HOST_CLASS_DEV_PM_OPS NULL
+#endif
+
 static void mmc_host_classdev_release(struct device *dev)
 {
 	struct mmc_host *host = cls_dev_to_mmc_host(dev);
@@ -46,6 +82,7 @@ static void mmc_host_classdev_release(struct device *dev)
 static struct class mmc_host_class = {
 	.name		= "mmc_host",
 	.dev_release	= mmc_host_classdev_release,
+	.pm		= MMC_HOST_CLASS_DEV_PM_OPS,
 };
 
 int mmc_register_host_class(void)
@@ -538,8 +575,6 @@ int mmc_add_host(struct mmc_host *host)
 #endif
 
 	mmc_start_host(host);
-	mmc_register_pm_notifier(host);
-
 	return 0;
 }
 
@@ -555,7 +590,6 @@ EXPORT_SYMBOL(mmc_add_host);
  */
 void mmc_remove_host(struct mmc_host *host)
 {
-	mmc_unregister_pm_notifier(host);
 	mmc_stop_host(host);
 
 #ifdef CONFIG_DEBUG_FS
diff --git a/drivers/mmc/core/sdio.c b/drivers/mmc/core/sdio.c
index 0fda7784cab2..3eb94ac2712e 100644
--- a/drivers/mmc/core/sdio.c
+++ b/drivers/mmc/core/sdio.c
@@ -985,21 +985,37 @@ out:
  */
 static int mmc_sdio_pre_suspend(struct mmc_host *host)
 {
-	int i, err = 0;
+	int i;
 
 	for (i = 0; i < host->card->sdio_funcs; i++) {
 		struct sdio_func *func = host->card->sdio_func[i];
 		if (func && sdio_func_present(func) && func->dev.driver) {
 			const struct dev_pm_ops *pmops = func->dev.driver->pm;
-			if (!pmops || !pmops->suspend || !pmops->resume) {
+			if (!pmops || !pmops->suspend || !pmops->resume)
 				/* force removal of entire card in that case */
-				err = -ENOSYS;
-				break;
-			}
+				goto remove;
 		}
 	}
 
-	return err;
+	return 0;
+
+remove:
+	if (!mmc_card_is_removable(host)) {
+		dev_warn(mmc_dev(host),
+			 "missing suspend/resume ops for non-removable SDIO card\n");
+		/* Don't remove a non-removable card - we can't re-detect it. */
+		return 0;
+	}
+
+	/* Remove the SDIO card and let it be re-detected later on. */
+	mmc_sdio_remove(host);
+	mmc_claim_host(host);
+	mmc_detach_bus(host);
+	mmc_power_off(host);
+	mmc_release_host(host);
+	host->pm_flags = 0;
+
+	return 0;
 }
 
 /*
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index a001ad2f5f23..17d7b326af29 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -302,9 +302,6 @@ struct mmc_host {
 	u32			ocr_avail_sdio;	/* SDIO-specific OCR */
 	u32			ocr_avail_sd;	/* SD-specific OCR */
 	u32			ocr_avail_mmc;	/* MMC-specific OCR */
-#ifdef CONFIG_PM_SLEEP
-	struct notifier_block	pm_notify;
-#endif
 	struct wakeup_source	*ws;		/* Enable consume of uevents */
 	u32			max_current_330;
 	u32			max_current_300;
-- 
cgit v1.2.3


From 137e68659e90b242049bfd1fab2700bbcf476140 Mon Sep 17 00:00:00 2001
From: Cristian Marussi <cristian.marussi@arm.com>
Date: Tue, 16 Mar 2021 12:48:44 +0000
Subject: firmware: arm_scmi: Remove legacy scmi_clk_ops protocol interface

Now that all the SCMI driver users have been migrated to the new interface
remove the legacy interface and all the transient code.

Link: https://lore.kernel.org/r/20210316124903.35011-20-cristian.marussi@arm.com
Tested-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_scmi/clock.c | 67 ---------------------------------------
 include/linux/scmi_protocol.h     | 15 ---------
 2 files changed, 82 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/arm_scmi/clock.c b/drivers/firmware/arm_scmi/clock.c
index 1b3b0278b73d..827cf25fc122 100644
--- a/drivers/firmware/arm_scmi/clock.c
+++ b/drivers/firmware/arm_scmi/clock.c
@@ -235,16 +235,6 @@ scmi_clock_rate_get(const struct scmi_protocol_handle *ph,
 	return ret;
 }
 
-static int
-__scmi_clock_rate_get(const struct scmi_handle *handle,
-		      u32 clk_id, u64 *value)
-{
-	const struct scmi_protocol_handle *ph =
-		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_CLOCK);
-
-	return scmi_clock_rate_get(ph, clk_id, value);
-}
-
 static int scmi_clock_rate_set(const struct scmi_protocol_handle *ph,
 			       u32 clk_id, u64 rate)
 {
@@ -280,15 +270,6 @@ static int scmi_clock_rate_set(const struct scmi_protocol_handle *ph,
 	return ret;
 }
 
-static int __scmi_clock_rate_set(const struct scmi_handle *handle,
-				 u32 clk_id, u64 rate)
-{
-	const struct scmi_protocol_handle *ph =
-		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_CLOCK);
-
-	return scmi_clock_rate_set(ph, clk_id, rate);
-}
-
 static int
 scmi_clock_config_set(const struct scmi_protocol_handle *ph, u32 clk_id,
 		      u32 config)
@@ -317,27 +298,11 @@ static int scmi_clock_enable(const struct scmi_protocol_handle *ph, u32 clk_id)
 	return scmi_clock_config_set(ph, clk_id, CLOCK_ENABLE);
 }
 
-static int __scmi_clock_enable(const struct scmi_handle *handle, u32 clk_id)
-{
-	const struct scmi_protocol_handle *ph =
-		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_CLOCK);
-
-	return scmi_clock_enable(ph, clk_id);
-}
-
 static int scmi_clock_disable(const struct scmi_protocol_handle *ph, u32 clk_id)
 {
 	return scmi_clock_config_set(ph, clk_id, 0);
 }
 
-static int __scmi_clock_disable(const struct scmi_handle *handle, u32 clk_id)
-{
-	const struct scmi_protocol_handle *ph =
-		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_CLOCK);
-
-	return scmi_clock_disable(ph, clk_id);
-}
-
 static int scmi_clock_count_get(const struct scmi_protocol_handle *ph)
 {
 	struct clock_info *ci = ph->get_priv(ph);
@@ -345,14 +310,6 @@ static int scmi_clock_count_get(const struct scmi_protocol_handle *ph)
 	return ci->num_clocks;
 }
 
-static int __scmi_clock_count_get(const struct scmi_handle *handle)
-{
-	const struct scmi_protocol_handle *ph =
-		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_CLOCK);
-
-	return scmi_clock_count_get(ph);
-}
-
 static const struct scmi_clock_info *
 scmi_clock_info_get(const struct scmi_protocol_handle *ph, u32 clk_id)
 {
@@ -365,24 +322,6 @@ scmi_clock_info_get(const struct scmi_protocol_handle *ph, u32 clk_id)
 	return clk;
 }
 
-static const struct scmi_clock_info *
-__scmi_clock_info_get(const struct scmi_handle *handle, u32 clk_id)
-{
-	const struct scmi_protocol_handle *ph =
-		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_CLOCK);
-
-	return scmi_clock_info_get(ph, clk_id);
-}
-
-static const struct scmi_clk_ops clk_ops = {
-	.count_get = __scmi_clock_count_get,
-	.info_get = __scmi_clock_info_get,
-	.rate_get = __scmi_clock_rate_get,
-	.rate_set = __scmi_clock_rate_set,
-	.enable = __scmi_clock_enable,
-	.disable = __scmi_clock_disable,
-};
-
 static const struct scmi_clk_proto_ops clk_proto_ops = {
 	.count_get = scmi_clock_count_get,
 	.info_get = scmi_clock_info_get,
@@ -397,7 +336,6 @@ static int scmi_clock_protocol_init(const struct scmi_protocol_handle *ph)
 	u32 version;
 	int clkid, ret;
 	struct clock_info *cinfo;
-	struct scmi_handle *handle;
 
 	ph->xops->version_get(ph, &version);
 
@@ -424,11 +362,6 @@ static int scmi_clock_protocol_init(const struct scmi_protocol_handle *ph)
 	}
 
 	cinfo->version = version;
-
-	/* Transient code for legacy ops interface */
-	handle = scmi_map_scmi_handle(ph);
-	handle->clk_ops = &clk_ops;
-
 	return ph->set_priv(ph, cinfo);
 }
 
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index b19797b23c5f..e2291b7787f8 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -84,19 +84,6 @@ struct scmi_clk_proto_ops {
 	int (*disable)(const struct scmi_protocol_handle *ph, u32 clk_id);
 };
 
-struct scmi_clk_ops {
-	int (*count_get)(const struct scmi_handle *hamdle);
-
-	const struct scmi_clock_info *(*info_get)
-		(const struct scmi_handle *handle, u32 clk_id);
-	int (*rate_get)(const struct scmi_handle *handle, u32 clk_id,
-			u64 *rate);
-	int (*rate_set)(const struct scmi_handle *handle, u32 clk_id,
-			u64 rate);
-	int (*enable)(const struct scmi_handle *handle, u32 clk_id);
-	int (*disable)(const struct scmi_handle *handle, u32 clk_id);
-};
-
 /**
  * struct scmi_perf_proto_ops - represents the various operations provided
  *	by SCMI Performance Protocol
@@ -618,7 +605,6 @@ struct scmi_notify_ops {
  *
  * @dev: pointer to the SCMI device
  * @version: pointer to the structure containing SCMI version information
- * @clk_ops: pointer to set of clock protocol operations
  * @sensor_ops: pointer to set of sensor protocol operations
  * @reset_ops: pointer to set of reset protocol operations
  * @voltage_ops: pointer to set of voltage protocol operations
@@ -638,7 +624,6 @@ struct scmi_notify_ops {
 struct scmi_handle {
 	struct device *dev;
 	struct scmi_revision_info *version;
-	const struct scmi_clk_ops *clk_ops;
 	const struct scmi_sensor_ops *sensor_ops;
 	const struct scmi_reset_ops *reset_ops;
 	const struct scmi_voltage_ops *voltage_ops;
-- 
cgit v1.2.3


From 7e029344223824dbf21e759bf9c0d08f633edb16 Mon Sep 17 00:00:00 2001
From: Cristian Marussi <cristian.marussi@arm.com>
Date: Tue, 16 Mar 2021 12:48:45 +0000
Subject: firmware: arm_scmi: Port reset protocol to new protocols interface

Convert internals of protocol implementation to use protocol handles and
expose a new protocol operations interface for SCMI driver using the new
get/put common operations, while keeping the old handle->reset_ops still
around to ease transition.

Remove handle->reset_priv now unused.

Link: https://lore.kernel.org/r/20210316124903.35011-21-cristian.marussi@arm.com
Tested-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_scmi/reset.c | 172 ++++++++++++++++++++++++++------------
 include/linux/scmi_protocol.h     |  14 +++-
 2 files changed, 129 insertions(+), 57 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/arm_scmi/reset.c b/drivers/firmware/arm_scmi/reset.c
index 572dc1fdcede..26d517745d50 100644
--- a/drivers/firmware/arm_scmi/reset.c
+++ b/drivers/firmware/arm_scmi/reset.c
@@ -64,46 +64,45 @@ struct scmi_reset_info {
 	struct reset_dom_info *dom_info;
 };
 
-static int scmi_reset_attributes_get(const struct scmi_handle *handle,
+static int scmi_reset_attributes_get(const struct scmi_protocol_handle *ph,
 				     struct scmi_reset_info *pi)
 {
 	int ret;
 	struct scmi_xfer *t;
 	u32 attr;
 
-	ret = scmi_xfer_get_init(handle, PROTOCOL_ATTRIBUTES,
-				 SCMI_PROTOCOL_RESET, 0, sizeof(attr), &t);
+	ret = ph->xops->xfer_get_init(ph, PROTOCOL_ATTRIBUTES,
+				      0, sizeof(attr), &t);
 	if (ret)
 		return ret;
 
-	ret = scmi_do_xfer(handle, t);
+	ret = ph->xops->do_xfer(ph, t);
 	if (!ret) {
 		attr = get_unaligned_le32(t->rx.buf);
 		pi->num_domains = attr & NUM_RESET_DOMAIN_MASK;
 	}
 
-	scmi_xfer_put(handle, t);
+	ph->xops->xfer_put(ph, t);
 	return ret;
 }
 
 static int
-scmi_reset_domain_attributes_get(const struct scmi_handle *handle, u32 domain,
-				 struct reset_dom_info *dom_info)
+scmi_reset_domain_attributes_get(const struct scmi_protocol_handle *ph,
+				 u32 domain, struct reset_dom_info *dom_info)
 {
 	int ret;
 	struct scmi_xfer *t;
 	struct scmi_msg_resp_reset_domain_attributes *attr;
 
-	ret = scmi_xfer_get_init(handle, RESET_DOMAIN_ATTRIBUTES,
-				 SCMI_PROTOCOL_RESET, sizeof(domain),
-				 sizeof(*attr), &t);
+	ret = ph->xops->xfer_get_init(ph, RESET_DOMAIN_ATTRIBUTES,
+				      sizeof(domain), sizeof(*attr), &t);
 	if (ret)
 		return ret;
 
 	put_unaligned_le32(domain, t->tx.buf);
 	attr = t->rx.buf;
 
-	ret = scmi_do_xfer(handle, t);
+	ret = ph->xops->do_xfer(ph, t);
 	if (!ret) {
 		u32 attributes = le32_to_cpu(attr->attributes);
 
@@ -115,47 +114,75 @@ scmi_reset_domain_attributes_get(const struct scmi_handle *handle, u32 domain,
 		strlcpy(dom_info->name, attr->name, SCMI_MAX_STR_SIZE);
 	}
 
-	scmi_xfer_put(handle, t);
+	ph->xops->xfer_put(ph, t);
 	return ret;
 }
 
-static int scmi_reset_num_domains_get(const struct scmi_handle *handle)
+static int scmi_reset_num_domains_get(const struct scmi_protocol_handle *ph)
 {
-	struct scmi_reset_info *pi = handle->reset_priv;
+	struct scmi_reset_info *pi = ph->get_priv(ph);
 
 	return pi->num_domains;
 }
 
-static char *scmi_reset_name_get(const struct scmi_handle *handle, u32 domain)
+static int __scmi_reset_num_domains_get(const struct scmi_handle *handle)
 {
-	struct scmi_reset_info *pi = handle->reset_priv;
+	const struct scmi_protocol_handle *ph =
+		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_RESET);
+
+	return scmi_reset_num_domains_get(ph);
+}
+
+static char *scmi_reset_name_get(const struct scmi_protocol_handle *ph,
+				 u32 domain)
+{
+	struct scmi_reset_info *pi = ph->get_priv(ph);
+
 	struct reset_dom_info *dom = pi->dom_info + domain;
 
 	return dom->name;
 }
 
-static int scmi_reset_latency_get(const struct scmi_handle *handle, u32 domain)
+static char *__scmi_reset_name_get(const struct scmi_handle *handle,
+				   u32 domain)
 {
-	struct scmi_reset_info *pi = handle->reset_priv;
+	const struct scmi_protocol_handle *ph =
+		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_RESET);
+
+	return scmi_reset_name_get(ph, domain);
+}
+
+static int scmi_reset_latency_get(const struct scmi_protocol_handle *ph,
+				  u32 domain)
+{
+	struct scmi_reset_info *pi = ph->get_priv(ph);
 	struct reset_dom_info *dom = pi->dom_info + domain;
 
 	return dom->latency_us;
 }
 
-static int scmi_domain_reset(const struct scmi_handle *handle, u32 domain,
+static int __scmi_reset_latency_get(const struct scmi_handle *handle,
+				    u32 domain)
+{
+	const struct scmi_protocol_handle *ph =
+		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_RESET);
+
+	return scmi_reset_latency_get(ph, domain);
+}
+
+static int scmi_domain_reset(const struct scmi_protocol_handle *ph, u32 domain,
 			     u32 flags, u32 state)
 {
 	int ret;
 	struct scmi_xfer *t;
 	struct scmi_msg_reset_domain_reset *dom;
-	struct scmi_reset_info *pi = handle->reset_priv;
+	struct scmi_reset_info *pi = ph->get_priv(ph);
 	struct reset_dom_info *rdom = pi->dom_info + domain;
 
 	if (rdom->async_reset)
 		flags |= ASYNCHRONOUS_RESET;
 
-	ret = scmi_xfer_get_init(handle, RESET, SCMI_PROTOCOL_RESET,
-				 sizeof(*dom), 0, &t);
+	ret = ph->xops->xfer_get_init(ph, RESET, sizeof(*dom), 0, &t);
 	if (ret)
 		return ret;
 
@@ -165,34 +192,71 @@ static int scmi_domain_reset(const struct scmi_handle *handle, u32 domain,
 	dom->reset_state = cpu_to_le32(state);
 
 	if (rdom->async_reset)
-		ret = scmi_do_xfer_with_response(handle, t);
+		ret = ph->xops->do_xfer_with_response(ph, t);
 	else
-		ret = scmi_do_xfer(handle, t);
+		ret = ph->xops->do_xfer(ph, t);
 
-	scmi_xfer_put(handle, t);
+	ph->xops->xfer_put(ph, t);
 	return ret;
 }
 
-static int scmi_reset_domain_reset(const struct scmi_handle *handle, u32 domain)
+static int scmi_reset_domain_reset(const struct scmi_protocol_handle *ph,
+				   u32 domain)
 {
-	return scmi_domain_reset(handle, domain, AUTONOMOUS_RESET,
+	return scmi_domain_reset(ph, domain, AUTONOMOUS_RESET,
 				 ARCH_COLD_RESET);
 }
 
+static int __scmi_reset_domain_reset(const struct scmi_handle *handle,
+				     u32 domain)
+{
+	const struct scmi_protocol_handle *ph =
+		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_RESET);
+
+	return scmi_reset_domain_reset(ph, domain);
+}
+
 static int
-scmi_reset_domain_assert(const struct scmi_handle *handle, u32 domain)
+scmi_reset_domain_assert(const struct scmi_protocol_handle *ph, u32 domain)
 {
-	return scmi_domain_reset(handle, domain, EXPLICIT_RESET_ASSERT,
+	return scmi_domain_reset(ph, domain, EXPLICIT_RESET_ASSERT,
 				 ARCH_COLD_RESET);
 }
 
 static int
-scmi_reset_domain_deassert(const struct scmi_handle *handle, u32 domain)
+__scmi_reset_domain_assert(const struct scmi_handle *handle, u32 domain)
+{
+	const struct scmi_protocol_handle *ph =
+		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_RESET);
+
+	return scmi_reset_domain_assert(ph, domain);
+}
+
+static int
+scmi_reset_domain_deassert(const struct scmi_protocol_handle *ph, u32 domain)
 {
-	return scmi_domain_reset(handle, domain, 0, ARCH_COLD_RESET);
+	return scmi_domain_reset(ph, domain, 0, ARCH_COLD_RESET);
+}
+
+static int
+__scmi_reset_domain_deassert(const struct scmi_handle *handle, u32 domain)
+{
+	const struct scmi_protocol_handle *ph =
+		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_RESET);
+
+	return scmi_reset_domain_deassert(ph, domain);
 }
 
 static const struct scmi_reset_ops reset_ops = {
+	.num_domains_get = __scmi_reset_num_domains_get,
+	.name_get = __scmi_reset_name_get,
+	.latency_get = __scmi_reset_latency_get,
+	.reset = __scmi_reset_domain_reset,
+	.assert = __scmi_reset_domain_assert,
+	.deassert = __scmi_reset_domain_deassert,
+};
+
+static const struct scmi_reset_proto_ops reset_proto_ops = {
 	.num_domains_get = scmi_reset_num_domains_get,
 	.name_get = scmi_reset_name_get,
 	.latency_get = scmi_reset_latency_get,
@@ -201,16 +265,15 @@ static const struct scmi_reset_ops reset_ops = {
 	.deassert = scmi_reset_domain_deassert,
 };
 
-static int scmi_reset_notify(const struct scmi_handle *handle, u32 domain_id,
-			     bool enable)
+static int scmi_reset_notify(const struct scmi_protocol_handle *ph,
+			     u32 domain_id, bool enable)
 {
 	int ret;
 	u32 evt_cntl = enable ? RESET_TP_NOTIFY_ALL : 0;
 	struct scmi_xfer *t;
 	struct scmi_msg_reset_notify *cfg;
 
-	ret = scmi_xfer_get_init(handle, RESET_NOTIFY,
-				 SCMI_PROTOCOL_RESET, sizeof(*cfg), 0, &t);
+	ret = ph->xops->xfer_get_init(ph, RESET_NOTIFY, sizeof(*cfg), 0, &t);
 	if (ret)
 		return ret;
 
@@ -218,18 +281,18 @@ static int scmi_reset_notify(const struct scmi_handle *handle, u32 domain_id,
 	cfg->id = cpu_to_le32(domain_id);
 	cfg->event_control = cpu_to_le32(evt_cntl);
 
-	ret = scmi_do_xfer(handle, t);
+	ret = ph->xops->do_xfer(ph, t);
 
-	scmi_xfer_put(handle, t);
+	ph->xops->xfer_put(ph, t);
 	return ret;
 }
 
-static int scmi_reset_set_notify_enabled(const void *handle,
+static int scmi_reset_set_notify_enabled(const void *ph,
 					 u8 evt_id, u32 src_id, bool enable)
 {
 	int ret;
 
-	ret = scmi_reset_notify(handle, src_id, enable);
+	ret = scmi_reset_notify(ph, src_id, enable);
 	if (ret)
 		pr_debug("FAIL_ENABLED - evt[%X] dom[%d] - ret:%d\n",
 			 evt_id, src_id, ret);
@@ -237,7 +300,7 @@ static int scmi_reset_set_notify_enabled(const void *handle,
 	return ret;
 }
 
-static void *scmi_reset_fill_custom_report(const void *handle,
+static void *scmi_reset_fill_custom_report(const void *ph,
 					   u8 evt_id, ktime_t timestamp,
 					   const void *payld, size_t payld_sz,
 					   void *report, u32 *src_id)
@@ -257,10 +320,10 @@ static void *scmi_reset_fill_custom_report(const void *handle,
 	return r;
 }
 
-static int scmi_reset_get_num_sources(const void *handle)
+static int scmi_reset_get_num_sources(const void *ph)
 {
 	struct scmi_reset_info *pinfo =
-		((const struct scmi_handle *)(handle))->reset_priv;
+		((const struct scmi_protocol_handle *)ph)->get_priv(ph);
 
 	if (!pinfo)
 		return -EINVAL;
@@ -289,24 +352,25 @@ static const struct scmi_protocol_events reset_protocol_events = {
 	.num_events = ARRAY_SIZE(reset_events),
 };
 
-static int scmi_reset_protocol_init(struct scmi_handle *handle)
+static int scmi_reset_protocol_init(const struct scmi_protocol_handle *ph)
 {
 	int domain;
 	u32 version;
 	struct scmi_reset_info *pinfo;
+	struct scmi_handle *handle;
 
-	scmi_version_get(handle, SCMI_PROTOCOL_RESET, &version);
+	ph->xops->version_get(ph, &version);
 
-	dev_dbg(handle->dev, "Reset Version %d.%d\n",
+	dev_dbg(ph->dev, "Reset Version %d.%d\n",
 		PROTOCOL_REV_MAJOR(version), PROTOCOL_REV_MINOR(version));
 
-	pinfo = devm_kzalloc(handle->dev, sizeof(*pinfo), GFP_KERNEL);
+	pinfo = devm_kzalloc(ph->dev, sizeof(*pinfo), GFP_KERNEL);
 	if (!pinfo)
 		return -ENOMEM;
 
-	scmi_reset_attributes_get(handle, pinfo);
+	scmi_reset_attributes_get(ph, pinfo);
 
-	pinfo->dom_info = devm_kcalloc(handle->dev, pinfo->num_domains,
+	pinfo->dom_info = devm_kcalloc(ph->dev, pinfo->num_domains,
 				       sizeof(*pinfo->dom_info), GFP_KERNEL);
 	if (!pinfo->dom_info)
 		return -ENOMEM;
@@ -314,20 +378,22 @@ static int scmi_reset_protocol_init(struct scmi_handle *handle)
 	for (domain = 0; domain < pinfo->num_domains; domain++) {
 		struct reset_dom_info *dom = pinfo->dom_info + domain;
 
-		scmi_reset_domain_attributes_get(handle, domain, dom);
+		scmi_reset_domain_attributes_get(ph, domain, dom);
 	}
 
 	pinfo->version = version;
+
+	/* Transient code for legacy ops interface */
+	handle = scmi_map_scmi_handle(ph);
 	handle->reset_ops = &reset_ops;
-	handle->reset_priv = pinfo;
 
-	return 0;
+	return ph->set_priv(ph, pinfo);
 }
 
 static const struct scmi_protocol scmi_reset = {
 	.id = SCMI_PROTOCOL_RESET,
-	.init = &scmi_reset_protocol_init,
-	.ops = &reset_ops,
+	.instance_init = &scmi_reset_protocol_init,
+	.ops = &reset_proto_ops,
 	.events = &reset_protocol_events,
 };
 
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index e2291b7787f8..234b03a3f038 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -464,7 +464,7 @@ struct scmi_sensor_ops {
 };
 
 /**
- * struct scmi_reset_ops - represents the various operations provided
+ * struct scmi_reset_proto_ops - represents the various operations provided
  *	by SCMI Reset Protocol
  *
  * @num_domains_get: get the count of reset domains provided by SCMI
@@ -474,6 +474,15 @@ struct scmi_sensor_ops {
  * @assert: explicitly assert reset signal of the specified reset domain
  * @deassert: explicitly deassert reset signal of the specified reset domain
  */
+struct scmi_reset_proto_ops {
+	int (*num_domains_get)(const struct scmi_protocol_handle *ph);
+	char *(*name_get)(const struct scmi_protocol_handle *ph, u32 domain);
+	int (*latency_get)(const struct scmi_protocol_handle *ph, u32 domain);
+	int (*reset)(const struct scmi_protocol_handle *ph, u32 domain);
+	int (*assert)(const struct scmi_protocol_handle *ph, u32 domain);
+	int (*deassert)(const struct scmi_protocol_handle *ph, u32 domain);
+};
+
 struct scmi_reset_ops {
 	int (*num_domains_get)(const struct scmi_handle *handle);
 	char *(*name_get)(const struct scmi_handle *handle, u32 domain);
@@ -614,8 +623,6 @@ struct scmi_notify_ops {
  * @notify_ops: pointer to set of notifications related operations
  * @sensor_priv: pointer to private data structure specific to sensors
  *	protocol(for internal use only)
- * @reset_priv: pointer to private data structure specific to reset
- *	protocol(for internal use only)
  * @voltage_priv: pointer to private data structure specific to voltage
  *	protocol(for internal use only)
  * @notify_priv: pointer to private data structure specific to notifications
@@ -636,7 +643,6 @@ struct scmi_handle {
 	const struct scmi_notify_ops *notify_ops;
 	/* for protocol internal use */
 	void *sensor_priv;
-	void *reset_priv;
 	void *voltage_priv;
 	void *notify_priv;
 	void *system_priv;
-- 
cgit v1.2.3


From 497ef0cbc6d166cef129b180dd292eec744a04fb Mon Sep 17 00:00:00 2001
From: Cristian Marussi <cristian.marussi@arm.com>
Date: Tue, 16 Mar 2021 12:48:47 +0000
Subject: firmware: arm_scmi: Remove legacy scmi_reset_ops protocol interface

Now that all the SCMI driver users have been migrated to the new
interface, remove the legacy interface and all the transient code.

Link: https://lore.kernel.org/r/20210316124903.35011-23-cristian.marussi@arm.com
Tested-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_scmi/reset.c | 68 ---------------------------------------
 include/linux/scmi_protocol.h     | 11 -------
 2 files changed, 79 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/arm_scmi/reset.c b/drivers/firmware/arm_scmi/reset.c
index 26d517745d50..7ea1286bfed6 100644
--- a/drivers/firmware/arm_scmi/reset.c
+++ b/drivers/firmware/arm_scmi/reset.c
@@ -125,14 +125,6 @@ static int scmi_reset_num_domains_get(const struct scmi_protocol_handle *ph)
 	return pi->num_domains;
 }
 
-static int __scmi_reset_num_domains_get(const struct scmi_handle *handle)
-{
-	const struct scmi_protocol_handle *ph =
-		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_RESET);
-
-	return scmi_reset_num_domains_get(ph);
-}
-
 static char *scmi_reset_name_get(const struct scmi_protocol_handle *ph,
 				 u32 domain)
 {
@@ -143,15 +135,6 @@ static char *scmi_reset_name_get(const struct scmi_protocol_handle *ph,
 	return dom->name;
 }
 
-static char *__scmi_reset_name_get(const struct scmi_handle *handle,
-				   u32 domain)
-{
-	const struct scmi_protocol_handle *ph =
-		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_RESET);
-
-	return scmi_reset_name_get(ph, domain);
-}
-
 static int scmi_reset_latency_get(const struct scmi_protocol_handle *ph,
 				  u32 domain)
 {
@@ -161,15 +144,6 @@ static int scmi_reset_latency_get(const struct scmi_protocol_handle *ph,
 	return dom->latency_us;
 }
 
-static int __scmi_reset_latency_get(const struct scmi_handle *handle,
-				    u32 domain)
-{
-	const struct scmi_protocol_handle *ph =
-		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_RESET);
-
-	return scmi_reset_latency_get(ph, domain);
-}
-
 static int scmi_domain_reset(const struct scmi_protocol_handle *ph, u32 domain,
 			     u32 flags, u32 state)
 {
@@ -207,15 +181,6 @@ static int scmi_reset_domain_reset(const struct scmi_protocol_handle *ph,
 				 ARCH_COLD_RESET);
 }
 
-static int __scmi_reset_domain_reset(const struct scmi_handle *handle,
-				     u32 domain)
-{
-	const struct scmi_protocol_handle *ph =
-		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_RESET);
-
-	return scmi_reset_domain_reset(ph, domain);
-}
-
 static int
 scmi_reset_domain_assert(const struct scmi_protocol_handle *ph, u32 domain)
 {
@@ -223,39 +188,12 @@ scmi_reset_domain_assert(const struct scmi_protocol_handle *ph, u32 domain)
 				 ARCH_COLD_RESET);
 }
 
-static int
-__scmi_reset_domain_assert(const struct scmi_handle *handle, u32 domain)
-{
-	const struct scmi_protocol_handle *ph =
-		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_RESET);
-
-	return scmi_reset_domain_assert(ph, domain);
-}
-
 static int
 scmi_reset_domain_deassert(const struct scmi_protocol_handle *ph, u32 domain)
 {
 	return scmi_domain_reset(ph, domain, 0, ARCH_COLD_RESET);
 }
 
-static int
-__scmi_reset_domain_deassert(const struct scmi_handle *handle, u32 domain)
-{
-	const struct scmi_protocol_handle *ph =
-		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_RESET);
-
-	return scmi_reset_domain_deassert(ph, domain);
-}
-
-static const struct scmi_reset_ops reset_ops = {
-	.num_domains_get = __scmi_reset_num_domains_get,
-	.name_get = __scmi_reset_name_get,
-	.latency_get = __scmi_reset_latency_get,
-	.reset = __scmi_reset_domain_reset,
-	.assert = __scmi_reset_domain_assert,
-	.deassert = __scmi_reset_domain_deassert,
-};
-
 static const struct scmi_reset_proto_ops reset_proto_ops = {
 	.num_domains_get = scmi_reset_num_domains_get,
 	.name_get = scmi_reset_name_get,
@@ -357,7 +295,6 @@ static int scmi_reset_protocol_init(const struct scmi_protocol_handle *ph)
 	int domain;
 	u32 version;
 	struct scmi_reset_info *pinfo;
-	struct scmi_handle *handle;
 
 	ph->xops->version_get(ph, &version);
 
@@ -382,11 +319,6 @@ static int scmi_reset_protocol_init(const struct scmi_protocol_handle *ph)
 	}
 
 	pinfo->version = version;
-
-	/* Transient code for legacy ops interface */
-	handle = scmi_map_scmi_handle(ph);
-	handle->reset_ops = &reset_ops;
-
 	return ph->set_priv(ph, pinfo);
 }
 
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index 234b03a3f038..c5bf9dc707aa 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -483,15 +483,6 @@ struct scmi_reset_proto_ops {
 	int (*deassert)(const struct scmi_protocol_handle *ph, u32 domain);
 };
 
-struct scmi_reset_ops {
-	int (*num_domains_get)(const struct scmi_handle *handle);
-	char *(*name_get)(const struct scmi_handle *handle, u32 domain);
-	int (*latency_get)(const struct scmi_handle *handle, u32 domain);
-	int (*reset)(const struct scmi_handle *handle, u32 domain);
-	int (*assert)(const struct scmi_handle *handle, u32 domain);
-	int (*deassert)(const struct scmi_handle *handle, u32 domain);
-};
-
 /**
  * struct scmi_voltage_info - describe one available SCMI Voltage Domain
  *
@@ -615,7 +606,6 @@ struct scmi_notify_ops {
  * @dev: pointer to the SCMI device
  * @version: pointer to the structure containing SCMI version information
  * @sensor_ops: pointer to set of sensor protocol operations
- * @reset_ops: pointer to set of reset protocol operations
  * @voltage_ops: pointer to set of voltage protocol operations
  * @devm_protocol_get: devres managed method to acquire a protocol and get specific
  *		       operations and a dedicated protocol handler
@@ -632,7 +622,6 @@ struct scmi_handle {
 	struct device *dev;
 	struct scmi_revision_info *version;
 	const struct scmi_sensor_ops *sensor_ops;
-	const struct scmi_reset_ops *reset_ops;
 	const struct scmi_voltage_ops *voltage_ops;
 
 	const void __must_check *
-- 
cgit v1.2.3


From 9694a7f623593425a1559bb7a82bee91dfdd843b Mon Sep 17 00:00:00 2001
From: Cristian Marussi <cristian.marussi@arm.com>
Date: Tue, 16 Mar 2021 12:48:48 +0000
Subject: firmware: arm_scmi: Port sensor protocol to new protocols interface

Convert the internals of the protocol implementation to use protocol
handles and expose a new protocol operations interface for SCMI drivers
using the new common get/put operations, while keeping the old
handle->sensor_ops around to ease the transition.

Remove handle->sensor_priv, which is now unused.

Link: https://lore.kernel.org/r/20210316124903.35011-24-cristian.marussi@arm.com
Tested-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_scmi/sensors.c | 275 +++++++++++++++++++++++-------------
 include/linux/scmi_protocol.h       |  22 ++-
 2 files changed, 194 insertions(+), 103 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/arm_scmi/sensors.c b/drivers/firmware/arm_scmi/sensors.c
index e40e49869a5d..ae9f727730c8 100644
--- a/drivers/firmware/arm_scmi/sensors.c
+++ b/drivers/firmware/arm_scmi/sensors.c
@@ -201,21 +201,21 @@ struct sensors_info {
 	struct scmi_sensor_info *sensors;
 };
 
-static int scmi_sensor_attributes_get(const struct scmi_handle *handle,
+static int scmi_sensor_attributes_get(const struct scmi_protocol_handle *ph,
 				      struct sensors_info *si)
 {
 	int ret;
 	struct scmi_xfer *t;
 	struct scmi_msg_resp_sensor_attributes *attr;
 
-	ret = scmi_xfer_get_init(handle, PROTOCOL_ATTRIBUTES,
-				 SCMI_PROTOCOL_SENSOR, 0, sizeof(*attr), &t);
+	ret = ph->xops->xfer_get_init(ph, PROTOCOL_ATTRIBUTES,
+				      0, sizeof(*attr), &t);
 	if (ret)
 		return ret;
 
 	attr = t->rx.buf;
 
-	ret = scmi_do_xfer(handle, t);
+	ret = ph->xops->do_xfer(ph, t);
 	if (!ret) {
 		si->num_sensors = le16_to_cpu(attr->num_sensors);
 		si->max_requests = attr->max_requests;
@@ -224,7 +224,7 @@ static int scmi_sensor_attributes_get(const struct scmi_handle *handle,
 		si->reg_size = le32_to_cpu(attr->reg_size);
 	}
 
-	scmi_xfer_put(handle, t);
+	ph->xops->xfer_put(ph, t);
 	return ret;
 }
 
@@ -235,7 +235,7 @@ static inline void scmi_parse_range_attrs(struct scmi_range_attrs *out,
 	out->max_range = get_unaligned_le64((void *)&in->max_range_low);
 }
 
-static int scmi_sensor_update_intervals(const struct scmi_handle *handle,
+static int scmi_sensor_update_intervals(const struct scmi_protocol_handle *ph,
 					struct scmi_sensor_info *s)
 {
 	int ret, cnt;
@@ -245,8 +245,8 @@ static int scmi_sensor_update_intervals(const struct scmi_handle *handle,
 	struct scmi_msg_resp_sensor_list_update_intervals *buf;
 	struct scmi_msg_sensor_list_update_intervals *msg;
 
-	ret = scmi_xfer_get_init(handle, SENSOR_LIST_UPDATE_INTERVALS,
-				 SCMI_PROTOCOL_SENSOR, sizeof(*msg), 0, &ti);
+	ret = ph->xops->xfer_get_init(ph, SENSOR_LIST_UPDATE_INTERVALS,
+				      sizeof(*msg), 0, &ti);
 	if (ret)
 		return ret;
 
@@ -259,7 +259,7 @@ static int scmi_sensor_update_intervals(const struct scmi_handle *handle,
 		msg->id = cpu_to_le32(s->id);
 		msg->index = cpu_to_le32(desc_index);
 
-		ret = scmi_do_xfer(handle, ti);
+		ret = ph->xops->do_xfer(ph, ti);
 		if (ret)
 			break;
 
@@ -277,7 +277,7 @@ static int scmi_sensor_update_intervals(const struct scmi_handle *handle,
 			/* segmented intervals are reported in one triplet */
 			if (s->intervals.segmented &&
 			    (num_remaining || num_returned != 3)) {
-				dev_err(handle->dev,
+				dev_err(ph->dev,
 					"Sensor ID:%d advertises an invalid segmented interval (%d)\n",
 					s->id, s->intervals.count);
 				s->intervals.segmented = false;
@@ -288,7 +288,7 @@ static int scmi_sensor_update_intervals(const struct scmi_handle *handle,
 			/* Direct allocation when exceeding pre-allocated */
 			if (s->intervals.count >= SCMI_MAX_PREALLOC_POOL) {
 				s->intervals.desc =
-					devm_kcalloc(handle->dev,
+					devm_kcalloc(ph->dev,
 						     s->intervals.count,
 						     sizeof(*s->intervals.desc),
 						     GFP_KERNEL);
@@ -300,7 +300,7 @@ static int scmi_sensor_update_intervals(const struct scmi_handle *handle,
 				}
 			}
 		} else if (desc_index + num_returned > s->intervals.count) {
-			dev_err(handle->dev,
+			dev_err(ph->dev,
 				"No. of update intervals can't exceed %d\n",
 				s->intervals.count);
 			ret = -EINVAL;
@@ -313,18 +313,18 @@ static int scmi_sensor_update_intervals(const struct scmi_handle *handle,
 
 		desc_index += num_returned;
 
-		scmi_reset_rx_to_maxsz(handle, ti);
+		ph->xops->reset_rx_to_maxsz(ph, ti);
 		/*
 		 * check for both returned and remaining to avoid infinite
 		 * loop due to buggy firmware
 		 */
 	} while (num_returned && num_remaining);
 
-	scmi_xfer_put(handle, ti);
+	ph->xops->xfer_put(ph, ti);
 	return ret;
 }
 
-static int scmi_sensor_axis_description(const struct scmi_handle *handle,
+static int scmi_sensor_axis_description(const struct scmi_protocol_handle *ph,
 					struct scmi_sensor_info *s)
 {
 	int ret, cnt;
@@ -334,13 +334,13 @@ static int scmi_sensor_axis_description(const struct scmi_handle *handle,
 	struct scmi_msg_resp_sensor_axis_description *buf;
 	struct scmi_msg_sensor_axis_description_get *msg;
 
-	s->axis = devm_kcalloc(handle->dev, s->num_axis,
+	s->axis = devm_kcalloc(ph->dev, s->num_axis,
 			       sizeof(*s->axis), GFP_KERNEL);
 	if (!s->axis)
 		return -ENOMEM;
 
-	ret = scmi_xfer_get_init(handle, SENSOR_AXIS_DESCRIPTION_GET,
-				 SCMI_PROTOCOL_SENSOR, sizeof(*msg), 0, &te);
+	ret = ph->xops->xfer_get_init(ph, SENSOR_AXIS_DESCRIPTION_GET,
+				      sizeof(*msg), 0, &te);
 	if (ret)
 		return ret;
 
@@ -354,7 +354,7 @@ static int scmi_sensor_axis_description(const struct scmi_handle *handle,
 		msg->id = cpu_to_le32(s->id);
 		msg->axis_desc_index = cpu_to_le32(desc_index);
 
-		ret = scmi_do_xfer(handle, te);
+		ret = ph->xops->do_xfer(ph, te);
 		if (ret)
 			break;
 
@@ -363,7 +363,7 @@ static int scmi_sensor_axis_description(const struct scmi_handle *handle,
 		num_remaining = NUM_AXIS_REMAINING(flags);
 
 		if (desc_index + num_returned > s->num_axis) {
-			dev_err(handle->dev, "No. of axis can't exceed %d\n",
+			dev_err(ph->dev, "No. of axis can't exceed %d\n",
 				s->num_axis);
 			break;
 		}
@@ -405,18 +405,18 @@ static int scmi_sensor_axis_description(const struct scmi_handle *handle,
 
 		desc_index += num_returned;
 
-		scmi_reset_rx_to_maxsz(handle, te);
+		ph->xops->reset_rx_to_maxsz(ph, te);
 		/*
 		 * check for both returned and remaining to avoid infinite
 		 * loop due to buggy firmware
 		 */
 	} while (num_returned && num_remaining);
 
-	scmi_xfer_put(handle, te);
+	ph->xops->xfer_put(ph, te);
 	return ret;
 }
 
-static int scmi_sensor_description_get(const struct scmi_handle *handle,
+static int scmi_sensor_description_get(const struct scmi_protocol_handle *ph,
 				       struct sensors_info *si)
 {
 	int ret, cnt;
@@ -425,8 +425,8 @@ static int scmi_sensor_description_get(const struct scmi_handle *handle,
 	struct scmi_xfer *t;
 	struct scmi_msg_resp_sensor_description *buf;
 
-	ret = scmi_xfer_get_init(handle, SENSOR_DESCRIPTION_GET,
-				 SCMI_PROTOCOL_SENSOR, sizeof(__le32), 0, &t);
+	ret = ph->xops->xfer_get_init(ph, SENSOR_DESCRIPTION_GET,
+				      sizeof(__le32), 0, &t);
 	if (ret)
 		return ret;
 
@@ -437,7 +437,8 @@ static int scmi_sensor_description_get(const struct scmi_handle *handle,
 
 		/* Set the number of sensors to be skipped/already read */
 		put_unaligned_le32(desc_index, t->tx.buf);
-		ret = scmi_do_xfer(handle, t);
+
+		ret = ph->xops->do_xfer(ph, t);
 		if (ret)
 			break;
 
@@ -445,7 +446,7 @@ static int scmi_sensor_description_get(const struct scmi_handle *handle,
 		num_remaining = le16_to_cpu(buf->num_remaining);
 
 		if (desc_index + num_returned > si->num_sensors) {
-			dev_err(handle->dev, "No. of sensors can't exceed %d",
+			dev_err(ph->dev, "No. of sensors can't exceed %d",
 				si->num_sensors);
 			break;
 		}
@@ -500,8 +501,8 @@ static int scmi_sensor_description_get(const struct scmi_handle *handle,
 				 * Since the command is optional, on error carry
 				 * on without any update interval.
 				 */
-				if (scmi_sensor_update_intervals(handle, s))
-					dev_dbg(handle->dev,
+				if (scmi_sensor_update_intervals(ph, s))
+					dev_dbg(ph->dev,
 						"Update Intervals not available for sensor ID:%d\n",
 						s->id);
 			}
@@ -535,7 +536,7 @@ static int scmi_sensor_description_get(const struct scmi_handle *handle,
 				}
 			}
 			if (s->num_axis > 0) {
-				ret = scmi_sensor_axis_description(handle, s);
+				ret = scmi_sensor_axis_description(ph, s);
 				if (ret)
 					goto out;
 			}
@@ -545,7 +546,7 @@ static int scmi_sensor_description_get(const struct scmi_handle *handle,
 
 		desc_index += num_returned;
 
-		scmi_reset_rx_to_maxsz(handle, t);
+		ph->xops->reset_rx_to_maxsz(ph, t);
 		/*
 		 * check for both returned and remaining to avoid infinite
 		 * loop due to buggy firmware
@@ -553,12 +554,12 @@ static int scmi_sensor_description_get(const struct scmi_handle *handle,
 	} while (num_returned && num_remaining);
 
 out:
-	scmi_xfer_put(handle, t);
+	ph->xops->xfer_put(ph, t);
 	return ret;
 }
 
 static inline int
-scmi_sensor_request_notify(const struct scmi_handle *handle, u32 sensor_id,
+scmi_sensor_request_notify(const struct scmi_protocol_handle *ph, u32 sensor_id,
 			   u8 message_id, bool enable)
 {
 	int ret;
@@ -566,8 +567,7 @@ scmi_sensor_request_notify(const struct scmi_handle *handle, u32 sensor_id,
 	struct scmi_xfer *t;
 	struct scmi_msg_sensor_request_notify *cfg;
 
-	ret = scmi_xfer_get_init(handle, message_id,
-				 SCMI_PROTOCOL_SENSOR, sizeof(*cfg), 0, &t);
+	ret = ph->xops->xfer_get_init(ph, message_id, sizeof(*cfg), 0, &t);
 	if (ret)
 		return ret;
 
@@ -575,40 +575,40 @@ scmi_sensor_request_notify(const struct scmi_handle *handle, u32 sensor_id,
 	cfg->id = cpu_to_le32(sensor_id);
 	cfg->event_control = cpu_to_le32(evt_cntl);
 
-	ret = scmi_do_xfer(handle, t);
+	ret = ph->xops->do_xfer(ph, t);
 
-	scmi_xfer_put(handle, t);
+	ph->xops->xfer_put(ph, t);
 	return ret;
 }
 
-static int scmi_sensor_trip_point_notify(const struct scmi_handle *handle,
+static int scmi_sensor_trip_point_notify(const struct scmi_protocol_handle *ph,
 					 u32 sensor_id, bool enable)
 {
-	return scmi_sensor_request_notify(handle, sensor_id,
+	return scmi_sensor_request_notify(ph, sensor_id,
 					  SENSOR_TRIP_POINT_NOTIFY,
 					  enable);
 }
 
 static int
-scmi_sensor_continuous_update_notify(const struct scmi_handle *handle,
+scmi_sensor_continuous_update_notify(const struct scmi_protocol_handle *ph,
 				     u32 sensor_id, bool enable)
 {
-	return scmi_sensor_request_notify(handle, sensor_id,
+	return scmi_sensor_request_notify(ph, sensor_id,
 					  SENSOR_CONTINUOUS_UPDATE_NOTIFY,
 					  enable);
 }
 
 static int
-scmi_sensor_trip_point_config(const struct scmi_handle *handle, u32 sensor_id,
-			      u8 trip_id, u64 trip_value)
+scmi_sensor_trip_point_config(const struct scmi_protocol_handle *ph,
+			      u32 sensor_id, u8 trip_id, u64 trip_value)
 {
 	int ret;
 	u32 evt_cntl = SENSOR_TP_BOTH;
 	struct scmi_xfer *t;
 	struct scmi_msg_set_sensor_trip_point *trip;
 
-	ret = scmi_xfer_get_init(handle, SENSOR_TRIP_POINT_CONFIG,
-				 SCMI_PROTOCOL_SENSOR, sizeof(*trip), 0, &t);
+	ret = ph->xops->xfer_get_init(ph, SENSOR_TRIP_POINT_CONFIG,
+				      sizeof(*trip), 0, &t);
 	if (ret)
 		return ret;
 
@@ -618,47 +618,66 @@ scmi_sensor_trip_point_config(const struct scmi_handle *handle, u32 sensor_id,
 	trip->value_low = cpu_to_le32(trip_value & 0xffffffff);
 	trip->value_high = cpu_to_le32(trip_value >> 32);
 
-	ret = scmi_do_xfer(handle, t);
+	ret = ph->xops->do_xfer(ph, t);
 
-	scmi_xfer_put(handle, t);
+	ph->xops->xfer_put(ph, t);
 	return ret;
 }
 
-static int scmi_sensor_config_get(const struct scmi_handle *handle,
+static int
+__scmi_sensor_trip_point_config(const struct scmi_handle *handle,
+				u32 sensor_id, u8 trip_id, u64 trip_value)
+{
+	const struct scmi_protocol_handle *ph =
+		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_SENSOR);
+
+	return scmi_sensor_trip_point_config(ph, sensor_id, trip_id,
+					     trip_value);
+}
+
+static int scmi_sensor_config_get(const struct scmi_protocol_handle *ph,
 				  u32 sensor_id, u32 *sensor_config)
 {
 	int ret;
 	struct scmi_xfer *t;
 
-	ret = scmi_xfer_get_init(handle, SENSOR_CONFIG_GET,
-				 SCMI_PROTOCOL_SENSOR, sizeof(__le32),
-				 sizeof(__le32), &t);
+	ret = ph->xops->xfer_get_init(ph, SENSOR_CONFIG_GET,
+				      sizeof(__le32), sizeof(__le32), &t);
 	if (ret)
 		return ret;
 
 	put_unaligned_le32(cpu_to_le32(sensor_id), t->tx.buf);
-	ret = scmi_do_xfer(handle, t);
+	ret = ph->xops->do_xfer(ph, t);
 	if (!ret) {
-		struct sensors_info *si = handle->sensor_priv;
+		struct sensors_info *si = ph->get_priv(ph);
 		struct scmi_sensor_info *s = si->sensors + sensor_id;
 
 		*sensor_config = get_unaligned_le64(t->rx.buf);
 		s->sensor_config = *sensor_config;
 	}
 
-	scmi_xfer_put(handle, t);
+	ph->xops->xfer_put(ph, t);
 	return ret;
 }
 
-static int scmi_sensor_config_set(const struct scmi_handle *handle,
+static int __scmi_sensor_config_get(const struct scmi_handle *handle,
+				    u32 sensor_id, u32 *sensor_config)
+{
+	const struct scmi_protocol_handle *ph =
+		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_SENSOR);
+
+	return scmi_sensor_config_get(ph, sensor_id, sensor_config);
+}
+
+static int scmi_sensor_config_set(const struct scmi_protocol_handle *ph,
 				  u32 sensor_id, u32 sensor_config)
 {
 	int ret;
 	struct scmi_xfer *t;
 	struct scmi_msg_sensor_config_set *msg;
 
-	ret = scmi_xfer_get_init(handle, SENSOR_CONFIG_SET,
-				 SCMI_PROTOCOL_SENSOR, sizeof(*msg), 0, &t);
+	ret = ph->xops->xfer_get_init(ph, SENSOR_CONFIG_SET,
+				      sizeof(*msg), 0, &t);
 	if (ret)
 		return ret;
 
@@ -666,21 +685,30 @@ static int scmi_sensor_config_set(const struct scmi_handle *handle,
 	msg->id = cpu_to_le32(sensor_id);
 	msg->sensor_config = cpu_to_le32(sensor_config);
 
-	ret = scmi_do_xfer(handle, t);
+	ret = ph->xops->do_xfer(ph, t);
 	if (!ret) {
-		struct sensors_info *si = handle->sensor_priv;
+		struct sensors_info *si = ph->get_priv(ph);
 		struct scmi_sensor_info *s = si->sensors + sensor_id;
 
 		s->sensor_config = sensor_config;
 	}
 
-	scmi_xfer_put(handle, t);
+	ph->xops->xfer_put(ph, t);
 	return ret;
 }
 
+static int __scmi_sensor_config_set(const struct scmi_handle *handle,
+				    u32 sensor_id, u32 sensor_config)
+{
+	const struct scmi_protocol_handle *ph =
+		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_SENSOR);
+
+	return scmi_sensor_config_set(ph, sensor_id, sensor_config);
+}
+
 /**
  * scmi_sensor_reading_get  - Read scalar sensor value
- * @handle: Platform handle
+ * @ph: Protocol handle
  * @sensor_id: Sensor ID
  * @value: The 64bit value sensor reading
  *
@@ -693,17 +721,17 @@ static int scmi_sensor_config_set(const struct scmi_handle *handle,
  *
  * Return: 0 on Success
  */
-static int scmi_sensor_reading_get(const struct scmi_handle *handle,
+static int scmi_sensor_reading_get(const struct scmi_protocol_handle *ph,
 				   u32 sensor_id, u64 *value)
 {
 	int ret;
 	struct scmi_xfer *t;
 	struct scmi_msg_sensor_reading_get *sensor;
-	struct sensors_info *si = handle->sensor_priv;
+	struct sensors_info *si = ph->get_priv(ph);
 	struct scmi_sensor_info *s = si->sensors + sensor_id;
 
-	ret = scmi_xfer_get_init(handle, SENSOR_READING_GET,
-				 SCMI_PROTOCOL_SENSOR, sizeof(*sensor), 0, &t);
+	ret = ph->xops->xfer_get_init(ph, SENSOR_READING_GET,
+				      sizeof(*sensor), 0, &t);
 	if (ret)
 		return ret;
 
@@ -711,7 +739,7 @@ static int scmi_sensor_reading_get(const struct scmi_handle *handle,
 	sensor->id = cpu_to_le32(sensor_id);
 	if (s->async) {
 		sensor->flags = cpu_to_le32(SENSOR_READ_ASYNC);
-		ret = scmi_do_xfer_with_response(handle, t);
+		ret = ph->xops->do_xfer_with_response(ph, t);
 		if (!ret) {
 			struct scmi_resp_sensor_reading_complete *resp;
 
@@ -723,15 +751,24 @@ static int scmi_sensor_reading_get(const struct scmi_handle *handle,
 		}
 	} else {
 		sensor->flags = cpu_to_le32(0);
-		ret = scmi_do_xfer(handle, t);
+		ret = ph->xops->do_xfer(ph, t);
 		if (!ret)
 			*value = get_unaligned_le64(t->rx.buf);
 	}
 
-	scmi_xfer_put(handle, t);
+	ph->xops->xfer_put(ph, t);
 	return ret;
 }
 
+static int __scmi_sensor_reading_get(const struct scmi_handle *handle,
+				     u32 sensor_id, u64 *value)
+{
+	const struct scmi_protocol_handle *ph =
+		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_SENSOR);
+
+	return scmi_sensor_reading_get(ph, sensor_id, value);
+}
+
 static inline void
 scmi_parse_sensor_readings(struct scmi_sensor_reading *out,
 			   const struct scmi_sensor_reading_resp *in)
@@ -742,7 +779,7 @@ scmi_parse_sensor_readings(struct scmi_sensor_reading *out,
 
 /**
  * scmi_sensor_reading_get_timestamped  - Read multiple-axis timestamped values
- * @handle: Platform handle
+ * @ph: Protocol handle
  * @sensor_id: Sensor ID
  * @count: The length of the provided @readings array
  * @readings: An array of elements each representing a timestamped per-axis
@@ -755,22 +792,22 @@ scmi_parse_sensor_readings(struct scmi_sensor_reading *out,
  * Return: 0 on Success
  */
 static int
-scmi_sensor_reading_get_timestamped(const struct scmi_handle *handle,
+scmi_sensor_reading_get_timestamped(const struct scmi_protocol_handle *ph,
 				    u32 sensor_id, u8 count,
 				    struct scmi_sensor_reading *readings)
 {
 	int ret;
 	struct scmi_xfer *t;
 	struct scmi_msg_sensor_reading_get *sensor;
-	struct sensors_info *si = handle->sensor_priv;
+	struct sensors_info *si = ph->get_priv(ph);
 	struct scmi_sensor_info *s = si->sensors + sensor_id;
 
 	if (!count || !readings ||
 	    (!s->num_axis && count > 1) || (s->num_axis && count > s->num_axis))
 		return -EINVAL;
 
-	ret = scmi_xfer_get_init(handle, SENSOR_READING_GET,
-				 SCMI_PROTOCOL_SENSOR, sizeof(*sensor), 0, &t);
+	ret = ph->xops->xfer_get_init(ph, SENSOR_READING_GET,
+				      sizeof(*sensor), 0, &t);
 	if (ret)
 		return ret;
 
@@ -778,7 +815,7 @@ scmi_sensor_reading_get_timestamped(const struct scmi_handle *handle,
 	sensor->id = cpu_to_le32(sensor_id);
 	if (s->async) {
 		sensor->flags = cpu_to_le32(SENSOR_READ_ASYNC);
-		ret = scmi_do_xfer_with_response(handle, t);
+		ret = ph->xops->do_xfer_with_response(ph, t);
 		if (!ret) {
 			int i;
 			struct scmi_resp_sensor_reading_complete_v3 *resp;
@@ -794,7 +831,7 @@ scmi_sensor_reading_get_timestamped(const struct scmi_handle *handle,
 		}
 	} else {
 		sensor->flags = cpu_to_le32(0);
-		ret = scmi_do_xfer(handle, t);
+		ret = ph->xops->do_xfer(ph, t);
 		if (!ret) {
 			int i;
 			struct scmi_sensor_reading_resp *resp_readings;
@@ -806,26 +843,65 @@ scmi_sensor_reading_get_timestamped(const struct scmi_handle *handle,
 		}
 	}
 
-	scmi_xfer_put(handle, t);
+	ph->xops->xfer_put(ph, t);
 	return ret;
 }
 
+static int
+__scmi_sensor_reading_get_timestamped(const struct scmi_handle *handle,
+				      u32 sensor_id, u8 count,
+				      struct scmi_sensor_reading *readings)
+{
+	const struct scmi_protocol_handle *ph =
+		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_SENSOR);
+
+	return scmi_sensor_reading_get_timestamped(ph, sensor_id, count,
+						   readings);
+}
+
 static const struct scmi_sensor_info *
-scmi_sensor_info_get(const struct scmi_handle *handle, u32 sensor_id)
+scmi_sensor_info_get(const struct scmi_protocol_handle *ph, u32 sensor_id)
 {
-	struct sensors_info *si = handle->sensor_priv;
+	struct sensors_info *si = ph->get_priv(ph);
 
 	return si->sensors + sensor_id;
 }
 
-static int scmi_sensor_count_get(const struct scmi_handle *handle)
+static const struct scmi_sensor_info *
+__scmi_sensor_info_get(const struct scmi_handle *handle, u32 sensor_id)
 {
-	struct sensors_info *si = handle->sensor_priv;
+	const struct scmi_protocol_handle *ph =
+		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_SENSOR);
+
+	return scmi_sensor_info_get(ph, sensor_id);
+}
+
+static int scmi_sensor_count_get(const struct scmi_protocol_handle *ph)
+{
+	struct sensors_info *si = ph->get_priv(ph);
 
 	return si->num_sensors;
 }
 
+static int __scmi_sensor_count_get(const struct scmi_handle *handle)
+{
+	const struct scmi_protocol_handle *ph =
+		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_SENSOR);
+
+	return scmi_sensor_count_get(ph);
+}
+
 static const struct scmi_sensor_ops sensor_ops = {
+	.count_get = __scmi_sensor_count_get,
+	.info_get = __scmi_sensor_info_get,
+	.trip_point_config = __scmi_sensor_trip_point_config,
+	.reading_get = __scmi_sensor_reading_get,
+	.reading_get_timestamped = __scmi_sensor_reading_get_timestamped,
+	.config_get = __scmi_sensor_config_get,
+	.config_set = __scmi_sensor_config_set,
+};
+
+static const struct scmi_sensor_proto_ops sensor_proto_ops = {
 	.count_get = scmi_sensor_count_get,
 	.info_get = scmi_sensor_info_get,
 	.trip_point_config = scmi_sensor_trip_point_config,
@@ -835,18 +911,17 @@ static const struct scmi_sensor_ops sensor_ops = {
 	.config_set = scmi_sensor_config_set,
 };
 
-static int scmi_sensor_set_notify_enabled(const void *handle,
+static int scmi_sensor_set_notify_enabled(const void *ph,
 					  u8 evt_id, u32 src_id, bool enable)
 {
 	int ret;
 
 	switch (evt_id) {
 	case SCMI_EVENT_SENSOR_TRIP_POINT_EVENT:
-		ret = scmi_sensor_trip_point_notify(handle, src_id, enable);
+		ret = scmi_sensor_trip_point_notify(ph, src_id, enable);
 		break;
 	case SCMI_EVENT_SENSOR_UPDATE:
-		ret = scmi_sensor_continuous_update_notify(handle, src_id,
-							   enable);
+		ret = scmi_sensor_continuous_update_notify(ph, src_id, enable);
 		break;
 	default:
 		ret = -EINVAL;
@@ -860,7 +935,7 @@ static int scmi_sensor_set_notify_enabled(const void *handle,
 	return ret;
 }
 
-static void *scmi_sensor_fill_custom_report(const void *handle,
+static void *scmi_sensor_fill_custom_report(const void *ph,
 					    u8 evt_id, ktime_t timestamp,
 					    const void *payld, size_t payld_sz,
 					    void *report, u32 *src_id)
@@ -891,7 +966,7 @@ static void *scmi_sensor_fill_custom_report(const void *handle,
 		const struct scmi_sensor_update_notify_payld *p = payld;
 		struct scmi_sensor_update_report *r = report;
 		struct sensors_info *sinfo =
-			((const struct scmi_handle *)(handle))->sensor_priv;
+			((const struct scmi_protocol_handle *)ph)->get_priv(ph);
 
 		/* payld_sz is variable for this event */
 		r->sensor_id = le32_to_cpu(p->sensor_id);
@@ -921,10 +996,10 @@ static void *scmi_sensor_fill_custom_report(const void *handle,
 	return rep;
 }
 
-static int scmi_sensor_get_num_sources(const void *handle)
+static int scmi_sensor_get_num_sources(const void *ph)
 {
 	struct sensors_info *si =
-		((const struct scmi_handle *)(handle))->sensor_priv;
+		((const struct scmi_protocol_handle *)ph)->get_priv(ph);
 
 	return si->num_sensors;
 }
@@ -960,44 +1035,46 @@ static const struct scmi_protocol_events sensor_protocol_events = {
 	.num_events = ARRAY_SIZE(sensor_events),
 };
 
-static int scmi_sensors_protocol_init(struct scmi_handle *handle)
+static int scmi_sensors_protocol_init(const struct scmi_protocol_handle *ph)
 {
 	u32 version;
 	int ret;
 	struct sensors_info *sinfo;
+	struct scmi_handle *handle;
 
-	scmi_version_get(handle, SCMI_PROTOCOL_SENSOR, &version);
+	ph->xops->version_get(ph, &version);
 
-	dev_dbg(handle->dev, "Sensor Version %d.%d\n",
+	dev_dbg(ph->dev, "Sensor Version %d.%d\n",
 		PROTOCOL_REV_MAJOR(version), PROTOCOL_REV_MINOR(version));
 
-	sinfo = devm_kzalloc(handle->dev, sizeof(*sinfo), GFP_KERNEL);
+	sinfo = devm_kzalloc(ph->dev, sizeof(*sinfo), GFP_KERNEL);
 	if (!sinfo)
 		return -ENOMEM;
 	sinfo->version = version;
 
-	ret = scmi_sensor_attributes_get(handle, sinfo);
+	ret = scmi_sensor_attributes_get(ph, sinfo);
 	if (ret)
 		return ret;
-	sinfo->sensors = devm_kcalloc(handle->dev, sinfo->num_sensors,
+	sinfo->sensors = devm_kcalloc(ph->dev, sinfo->num_sensors,
 				      sizeof(*sinfo->sensors), GFP_KERNEL);
 	if (!sinfo->sensors)
 		return -ENOMEM;
 
-	ret = scmi_sensor_description_get(handle, sinfo);
+	ret = scmi_sensor_description_get(ph, sinfo);
 	if (ret)
 		return ret;
 
-	handle->sensor_priv = sinfo;
+	/* Transient code for legacy ops interface */
+	handle = scmi_map_scmi_handle(ph);
 	handle->sensor_ops = &sensor_ops;
 
-	return 0;
+	return ph->set_priv(ph, sinfo);
 }
 
 static const struct scmi_protocol scmi_sensors = {
 	.id = SCMI_PROTOCOL_SENSOR,
-	.init = &scmi_sensors_protocol_init,
-	.ops = &sensor_ops,
+	.instance_init = &scmi_sensors_protocol_init,
+	.ops = &sensor_proto_ops,
 	.events = &sensor_protocol_events,
 };
 
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index c5bf9dc707aa..ead06db7be18 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -431,7 +431,7 @@ enum scmi_sensor_class {
 };
 
 /**
- * struct scmi_sensor_ops - represents the various operations provided
+ * struct scmi_sensor_proto_ops - represents the various operations provided
  *	by SCMI Sensor Protocol
  *
  * @count_get: get the count of sensors provided by SCMI
@@ -446,6 +446,23 @@ enum scmi_sensor_class {
  * @config_get: Get sensor current configuration
  * @config_set: Set sensor current configuration
  */
+struct scmi_sensor_proto_ops {
+	int (*count_get)(const struct scmi_protocol_handle *ph);
+	const struct scmi_sensor_info *(*info_get)
+		(const struct scmi_protocol_handle *ph, u32 sensor_id);
+	int (*trip_point_config)(const struct scmi_protocol_handle *ph,
+				 u32 sensor_id, u8 trip_id, u64 trip_value);
+	int (*reading_get)(const struct scmi_protocol_handle *ph, u32 sensor_id,
+			   u64 *value);
+	int (*reading_get_timestamped)(const struct scmi_protocol_handle *ph,
+				       u32 sensor_id, u8 count,
+				       struct scmi_sensor_reading *readings);
+	int (*config_get)(const struct scmi_protocol_handle *ph,
+			  u32 sensor_id, u32 *sensor_config);
+	int (*config_set)(const struct scmi_protocol_handle *ph,
+			  u32 sensor_id, u32 sensor_config);
+};
+
 struct scmi_sensor_ops {
 	int (*count_get)(const struct scmi_handle *handle);
 	const struct scmi_sensor_info *(*info_get)
@@ -611,8 +628,6 @@ struct scmi_notify_ops {
  *		       operations and a dedicated protocol handler
  * @devm_protocol_put: devres managed method to release a protocol
  * @notify_ops: pointer to set of notifications related operations
- * @sensor_priv: pointer to private data structure specific to sensors
- *	protocol(for internal use only)
  * @voltage_priv: pointer to private data structure specific to voltage
  *	protocol(for internal use only)
  * @notify_priv: pointer to private data structure specific to notifications
@@ -631,7 +646,6 @@ struct scmi_handle {
 
 	const struct scmi_notify_ops *notify_ops;
 	/* for protocol internal use */
-	void *sensor_priv;
 	void *voltage_priv;
 	void *notify_priv;
 	void *system_priv;
-- 
cgit v1.2.3


From f3690d9729105963661d06c8cfd752ff1fa2499c Mon Sep 17 00:00:00 2001
From: Cristian Marussi <cristian.marussi@arm.com>
Date: Tue, 16 Mar 2021 12:48:51 +0000
Subject: firmware: arm_scmi: Remove legacy scmi_sensor_ops protocol interface

Now that all the SCMI driver users have been migrated to the new interface,
remove the legacy interface and all the transient code.

Link: https://lore.kernel.org/r/20210316124903.35011-27-cristian.marussi@arm.com
Tested-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_scmi/sensors.c | 82 -------------------------------------
 include/linux/scmi_protocol.h       | 19 ---------
 2 files changed, 101 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/arm_scmi/sensors.c b/drivers/firmware/arm_scmi/sensors.c
index ae9f727730c8..2f5b06135468 100644
--- a/drivers/firmware/arm_scmi/sensors.c
+++ b/drivers/firmware/arm_scmi/sensors.c
@@ -624,17 +624,6 @@ scmi_sensor_trip_point_config(const struct scmi_protocol_handle *ph,
 	return ret;
 }
 
-static int
-__scmi_sensor_trip_point_config(const struct scmi_handle *handle,
-				u32 sensor_id, u8 trip_id, u64 trip_value)
-{
-	const struct scmi_protocol_handle *ph =
-		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_SENSOR);
-
-	return scmi_sensor_trip_point_config(ph, sensor_id, trip_id,
-					     trip_value);
-}
-
 static int scmi_sensor_config_get(const struct scmi_protocol_handle *ph,
 				  u32 sensor_id, u32 *sensor_config)
 {
@@ -660,15 +649,6 @@ static int scmi_sensor_config_get(const struct scmi_protocol_handle *ph,
 	return ret;
 }
 
-static int __scmi_sensor_config_get(const struct scmi_handle *handle,
-				    u32 sensor_id, u32 *sensor_config)
-{
-	const struct scmi_protocol_handle *ph =
-		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_SENSOR);
-
-	return scmi_sensor_config_get(ph, sensor_id, sensor_config);
-}
-
 static int scmi_sensor_config_set(const struct scmi_protocol_handle *ph,
 				  u32 sensor_id, u32 sensor_config)
 {
@@ -697,15 +677,6 @@ static int scmi_sensor_config_set(const struct scmi_protocol_handle *ph,
 	return ret;
 }
 
-static int __scmi_sensor_config_set(const struct scmi_handle *handle,
-				    u32 sensor_id, u32 sensor_config)
-{
-	const struct scmi_protocol_handle *ph =
-		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_SENSOR);
-
-	return scmi_sensor_config_set(ph, sensor_id, sensor_config);
-}
-
 /**
  * scmi_sensor_reading_get  - Read scalar sensor value
  * @ph: Protocol handle
@@ -760,15 +731,6 @@ static int scmi_sensor_reading_get(const struct scmi_protocol_handle *ph,
 	return ret;
 }
 
-static int __scmi_sensor_reading_get(const struct scmi_handle *handle,
-				     u32 sensor_id, u64 *value)
-{
-	const struct scmi_protocol_handle *ph =
-		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_SENSOR);
-
-	return scmi_sensor_reading_get(ph, sensor_id, value);
-}
-
 static inline void
 scmi_parse_sensor_readings(struct scmi_sensor_reading *out,
 			   const struct scmi_sensor_reading_resp *in)
@@ -847,18 +809,6 @@ scmi_sensor_reading_get_timestamped(const struct scmi_protocol_handle *ph,
 	return ret;
 }
 
-static int
-__scmi_sensor_reading_get_timestamped(const struct scmi_handle *handle,
-				      u32 sensor_id, u8 count,
-				      struct scmi_sensor_reading *readings)
-{
-	const struct scmi_protocol_handle *ph =
-		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_SENSOR);
-
-	return scmi_sensor_reading_get_timestamped(ph, sensor_id, count,
-						   readings);
-}
-
 static const struct scmi_sensor_info *
 scmi_sensor_info_get(const struct scmi_protocol_handle *ph, u32 sensor_id)
 {
@@ -867,15 +817,6 @@ scmi_sensor_info_get(const struct scmi_protocol_handle *ph, u32 sensor_id)
 	return si->sensors + sensor_id;
 }
 
-static const struct scmi_sensor_info *
-__scmi_sensor_info_get(const struct scmi_handle *handle, u32 sensor_id)
-{
-	const struct scmi_protocol_handle *ph =
-		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_SENSOR);
-
-	return scmi_sensor_info_get(ph, sensor_id);
-}
-
 static int scmi_sensor_count_get(const struct scmi_protocol_handle *ph)
 {
 	struct sensors_info *si = ph->get_priv(ph);
@@ -883,24 +824,6 @@ static int scmi_sensor_count_get(const struct scmi_protocol_handle *ph)
 	return si->num_sensors;
 }
 
-static int __scmi_sensor_count_get(const struct scmi_handle *handle)
-{
-	const struct scmi_protocol_handle *ph =
-		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_SENSOR);
-
-	return scmi_sensor_count_get(ph);
-}
-
-static const struct scmi_sensor_ops sensor_ops = {
-	.count_get = __scmi_sensor_count_get,
-	.info_get = __scmi_sensor_info_get,
-	.trip_point_config = __scmi_sensor_trip_point_config,
-	.reading_get = __scmi_sensor_reading_get,
-	.reading_get_timestamped = __scmi_sensor_reading_get_timestamped,
-	.config_get = __scmi_sensor_config_get,
-	.config_set = __scmi_sensor_config_set,
-};
-
 static const struct scmi_sensor_proto_ops sensor_proto_ops = {
 	.count_get = scmi_sensor_count_get,
 	.info_get = scmi_sensor_info_get,
@@ -1040,7 +963,6 @@ static int scmi_sensors_protocol_init(const struct scmi_protocol_handle *ph)
 	u32 version;
 	int ret;
 	struct sensors_info *sinfo;
-	struct scmi_handle *handle;
 
 	ph->xops->version_get(ph, &version);
 
@@ -1064,10 +986,6 @@ static int scmi_sensors_protocol_init(const struct scmi_protocol_handle *ph)
 	if (ret)
 		return ret;
 
-	/* Transient code for legacy ops interface */
-	handle = scmi_map_scmi_handle(ph);
-	handle->sensor_ops = &sensor_ops;
-
 	return ph->set_priv(ph, sinfo);
 }
 
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index ead06db7be18..3ec0ac30fe60 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -463,23 +463,6 @@ struct scmi_sensor_proto_ops {
 			  u32 sensor_id, u32 sensor_config);
 };
 
-struct scmi_sensor_ops {
-	int (*count_get)(const struct scmi_handle *handle);
-	const struct scmi_sensor_info *(*info_get)
-		(const struct scmi_handle *handle, u32 sensor_id);
-	int (*trip_point_config)(const struct scmi_handle *handle,
-				 u32 sensor_id, u8 trip_id, u64 trip_value);
-	int (*reading_get)(const struct scmi_handle *handle, u32 sensor_id,
-			   u64 *value);
-	int (*reading_get_timestamped)(const struct scmi_handle *handle,
-				       u32 sensor_id, u8 count,
-				       struct scmi_sensor_reading *readings);
-	int (*config_get)(const struct scmi_handle *handle,
-			  u32 sensor_id, u32 *sensor_config);
-	int (*config_set)(const struct scmi_handle *handle,
-			  u32 sensor_id, u32 sensor_config);
-};
-
 /**
  * struct scmi_reset_proto_ops - represents the various operations provided
  *	by SCMI Reset Protocol
@@ -622,7 +605,6 @@ struct scmi_notify_ops {
  *
  * @dev: pointer to the SCMI device
  * @version: pointer to the structure containing SCMI version information
- * @sensor_ops: pointer to set of sensor protocol operations
  * @voltage_ops: pointer to set of voltage protocol operations
  * @devm_protocol_get: devres managed method to acquire a protocol and get specific
  *		       operations and a dedicated protocol handler
@@ -636,7 +618,6 @@ struct scmi_notify_ops {
 struct scmi_handle {
 	struct device *dev;
 	struct scmi_revision_info *version;
-	const struct scmi_sensor_ops *sensor_ops;
 	const struct scmi_voltage_ops *voltage_ops;
 
 	const void __must_check *
-- 
cgit v1.2.3


From b46d852718c1ba725e0a8b06bd0a039f85465838 Mon Sep 17 00:00:00 2001
From: Cristian Marussi <cristian.marussi@arm.com>
Date: Tue, 16 Mar 2021 12:48:52 +0000
Subject: firmware: arm_scmi: Port systempower protocol to new protocols
 interface

Convert internals of protocol implementation to use protocol handles and
expose a new protocol operations interface for SCMI driver using the new
get/put common operations.

Remove handle->system_priv, which is now unused.

Link: https://lore.kernel.org/r/20210316124903.35011-28-cristian.marussi@arm.com
Tested-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_scmi/system.c | 30 ++++++++++++++----------------
 include/linux/scmi_protocol.h      |  1 -
 2 files changed, 14 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/arm_scmi/system.c b/drivers/firmware/arm_scmi/system.c
index 9d016dff4be5..ca6fb4698963 100644
--- a/drivers/firmware/arm_scmi/system.c
+++ b/drivers/firmware/arm_scmi/system.c
@@ -32,40 +32,40 @@ struct scmi_system_info {
 	u32 version;
 };
 
-static int scmi_system_request_notify(const struct scmi_handle *handle,
+static int scmi_system_request_notify(const struct scmi_protocol_handle *ph,
 				      bool enable)
 {
 	int ret;
 	struct scmi_xfer *t;
 	struct scmi_system_power_state_notify *notify;
 
-	ret = scmi_xfer_get_init(handle, SYSTEM_POWER_STATE_NOTIFY,
-				 SCMI_PROTOCOL_SYSTEM, sizeof(*notify), 0, &t);
+	ret = ph->xops->xfer_get_init(ph, SYSTEM_POWER_STATE_NOTIFY,
+				      sizeof(*notify), 0, &t);
 	if (ret)
 		return ret;
 
 	notify = t->tx.buf;
 	notify->notify_enable = enable ? cpu_to_le32(BIT(0)) : 0;
 
-	ret = scmi_do_xfer(handle, t);
+	ret = ph->xops->do_xfer(ph, t);
 
-	scmi_xfer_put(handle, t);
+	ph->xops->xfer_put(ph, t);
 	return ret;
 }
 
-static int scmi_system_set_notify_enabled(const void *handle,
+static int scmi_system_set_notify_enabled(const void *ph,
 					  u8 evt_id, u32 src_id, bool enable)
 {
 	int ret;
 
-	ret = scmi_system_request_notify(handle, enable);
+	ret = scmi_system_request_notify(ph, enable);
 	if (ret)
 		pr_debug("FAIL_ENABLE - evt[%X] - ret:%d\n", evt_id, ret);
 
 	return ret;
 }
 
-static void *scmi_system_fill_custom_report(const void *handle,
+static void *scmi_system_fill_custom_report(const void *ph,
 					    u8 evt_id, ktime_t timestamp,
 					    const void *payld, size_t payld_sz,
 					    void *report, u32 *src_id)
@@ -109,29 +109,27 @@ static const struct scmi_protocol_events system_protocol_events = {
 	.num_sources = SCMI_SYSTEM_NUM_SOURCES,
 };
 
-static int scmi_system_protocol_init(struct scmi_handle *handle)
+static int scmi_system_protocol_init(const struct scmi_protocol_handle *ph)
 {
 	u32 version;
 	struct scmi_system_info *pinfo;
 
-	scmi_version_get(handle, SCMI_PROTOCOL_SYSTEM, &version);
+	ph->xops->version_get(ph, &version);
 
-	dev_dbg(handle->dev, "System Power Version %d.%d\n",
+	dev_dbg(ph->dev, "System Power Version %d.%d\n",
 		PROTOCOL_REV_MAJOR(version), PROTOCOL_REV_MINOR(version));
 
-	pinfo = devm_kzalloc(handle->dev, sizeof(*pinfo), GFP_KERNEL);
+	pinfo = devm_kzalloc(ph->dev, sizeof(*pinfo), GFP_KERNEL);
 	if (!pinfo)
 		return -ENOMEM;
 
 	pinfo->version = version;
-	handle->system_priv = pinfo;
-
-	return 0;
+	return ph->set_priv(ph, pinfo);
 }
 
 static const struct scmi_protocol scmi_system = {
 	.id = SCMI_PROTOCOL_SYSTEM,
-	.init = &scmi_system_protocol_init,
+	.instance_init = &scmi_system_protocol_init,
 	.ops = NULL,
 	.events = &system_protocol_events,
 };
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index 3ec0ac30fe60..17b82c76cf7a 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -629,7 +629,6 @@ struct scmi_handle {
 	/* for protocol internal use */
 	void *voltage_priv;
 	void *notify_priv;
-	void *system_priv;
 };
 
 enum scmi_std_protocol {
-- 
cgit v1.2.3


From fe4894d968f4333f3d425221f03add8666881d72 Mon Sep 17 00:00:00 2001
From: Cristian Marussi <cristian.marussi@arm.com>
Date: Tue, 16 Mar 2021 12:48:53 +0000
Subject: firmware: arm_scmi: Port voltage protocol to new protocols interface

Convert internals of protocol implementation to use protocol handles and
expose a new protocol operations interface for SCMI driver using the new
get/put common operations, while keeping the old handle->voltage_ops still
around to ease transition.

Remove handle->voltage_priv, which is now unused.

Link: https://lore.kernel.org/r/20210316124903.35011-29-cristian.marussi@arm.com
Tested-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_scmi/voltage.c | 179 ++++++++++++++++++++++++------------
 include/linux/scmi_protocol.h       |  24 +++--
 2 files changed, 136 insertions(+), 67 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/arm_scmi/voltage.c b/drivers/firmware/arm_scmi/voltage.c
index b16f8d42fa0a..d32320ec8f38 100644
--- a/drivers/firmware/arm_scmi/voltage.c
+++ b/drivers/firmware/arm_scmi/voltage.c
@@ -59,23 +59,23 @@ struct voltage_info {
 	struct scmi_voltage_info *domains;
 };
 
-static int scmi_protocol_attributes_get(const struct scmi_handle *handle,
+static int scmi_protocol_attributes_get(const struct scmi_protocol_handle *ph,
 					struct voltage_info *vinfo)
 {
 	int ret;
 	struct scmi_xfer *t;
 
-	ret = scmi_xfer_get_init(handle, PROTOCOL_ATTRIBUTES,
-				 SCMI_PROTOCOL_VOLTAGE, 0, sizeof(__le32), &t);
+	ret = ph->xops->xfer_get_init(ph, PROTOCOL_ATTRIBUTES, 0,
+				      sizeof(__le32), &t);
 	if (ret)
 		return ret;
 
-	ret = scmi_do_xfer(handle, t);
+	ret = ph->xops->do_xfer(ph, t);
 	if (!ret)
 		vinfo->num_domains =
 			NUM_VOLTAGE_DOMAINS(get_unaligned_le32(t->rx.buf));
 
-	scmi_xfer_put(handle, t);
+	ph->xops->xfer_put(ph, t);
 	return ret;
 }
 
@@ -109,24 +109,23 @@ static int scmi_init_voltage_levels(struct device *dev,
 	return 0;
 }
 
-static int scmi_voltage_descriptors_get(const struct scmi_handle *handle,
+static int scmi_voltage_descriptors_get(const struct scmi_protocol_handle *ph,
 					struct voltage_info *vinfo)
 {
 	int ret, dom;
 	struct scmi_xfer *td, *tl;
-	struct device *dev = handle->dev;
+	struct device *dev = ph->dev;
 	struct scmi_msg_resp_domain_attributes *resp_dom;
 	struct scmi_msg_resp_describe_levels *resp_levels;
 
-	ret = scmi_xfer_get_init(handle, VOLTAGE_DOMAIN_ATTRIBUTES,
-				 SCMI_PROTOCOL_VOLTAGE, sizeof(__le32),
-				 sizeof(*resp_dom), &td);
+	ret = ph->xops->xfer_get_init(ph, VOLTAGE_DOMAIN_ATTRIBUTES,
+				      sizeof(__le32), sizeof(*resp_dom), &td);
 	if (ret)
 		return ret;
 	resp_dom = td->rx.buf;
 
-	ret = scmi_xfer_get_init(handle, VOLTAGE_DESCRIBE_LEVELS,
-				 SCMI_PROTOCOL_VOLTAGE, sizeof(__le64), 0, &tl);
+	ret = ph->xops->xfer_get_init(ph, VOLTAGE_DESCRIBE_LEVELS,
+				      sizeof(__le64), 0, &tl);
 	if (ret)
 		goto outd;
 	resp_levels = tl->rx.buf;
@@ -139,7 +138,7 @@ static int scmi_voltage_descriptors_get(const struct scmi_handle *handle,
 
 		/* Retrieve domain attributes at first ... */
 		put_unaligned_le32(dom, td->tx.buf);
-		ret = scmi_do_xfer(handle, td);
+		ret = ph->xops->do_xfer(ph, td);
 		/* Skip domain on comms error */
 		if (ret)
 			continue;
@@ -157,7 +156,7 @@ static int scmi_voltage_descriptors_get(const struct scmi_handle *handle,
 
 			cmd->domain_id = cpu_to_le32(v->id);
 			cmd->level_index = desc_index;
-			ret = scmi_do_xfer(handle, tl);
+			ret = ph->xops->do_xfer(ph, tl);
 			if (ret)
 				break;
 
@@ -176,7 +175,7 @@ static int scmi_voltage_descriptors_get(const struct scmi_handle *handle,
 			}
 
 			if (desc_index + num_returned > v->num_levels) {
-				dev_err(handle->dev,
+				dev_err(ph->dev,
 					"No. of voltage levels can't exceed %d\n",
 					v->num_levels);
 				ret = -EINVAL;
@@ -195,7 +194,7 @@ static int scmi_voltage_descriptors_get(const struct scmi_handle *handle,
 
 			desc_index += num_returned;
 
-			scmi_reset_rx_to_maxsz(handle, tl);
+			ph->xops->reset_rx_to_maxsz(ph, tl);
 			/* check both to avoid infinite loop due to buggy fw */
 		} while (num_returned && num_remaining);
 
@@ -204,55 +203,52 @@ static int scmi_voltage_descriptors_get(const struct scmi_handle *handle,
 			devm_kfree(dev, v->levels_uv);
 		}
 
-		scmi_reset_rx_to_maxsz(handle, td);
+		ph->xops->reset_rx_to_maxsz(ph, td);
 	}
 
-	scmi_xfer_put(handle, tl);
+	ph->xops->xfer_put(ph, tl);
 outd:
-	scmi_xfer_put(handle, td);
+	ph->xops->xfer_put(ph, td);
 
 	return ret;
 }
 
-static int __scmi_voltage_get_u32(const struct scmi_handle *handle,
+static int __scmi_voltage_get_u32(const struct scmi_protocol_handle *ph,
 				  u8 cmd_id, u32 domain_id, u32 *value)
 {
 	int ret;
 	struct scmi_xfer *t;
-	struct voltage_info *vinfo = handle->voltage_priv;
+	struct voltage_info *vinfo = ph->get_priv(ph);
 
 	if (domain_id >= vinfo->num_domains)
 		return -EINVAL;
 
-	ret = scmi_xfer_get_init(handle, cmd_id,
-				 SCMI_PROTOCOL_VOLTAGE,
-				 sizeof(__le32), 0, &t);
+	ret = ph->xops->xfer_get_init(ph, cmd_id, sizeof(__le32), 0, &t);
 	if (ret)
 		return ret;
 
 	put_unaligned_le32(domain_id, t->tx.buf);
-	ret = scmi_do_xfer(handle, t);
+	ret = ph->xops->do_xfer(ph, t);
 	if (!ret)
 		*value = get_unaligned_le32(t->rx.buf);
 
-	scmi_xfer_put(handle, t);
+	ph->xops->xfer_put(ph, t);
 	return ret;
 }
 
-static int scmi_voltage_config_set(const struct scmi_handle *handle,
+static int scmi_voltage_config_set(const struct scmi_protocol_handle *ph,
 				   u32 domain_id, u32 config)
 {
 	int ret;
 	struct scmi_xfer *t;
-	struct voltage_info *vinfo = handle->voltage_priv;
+	struct voltage_info *vinfo = ph->get_priv(ph);
 	struct scmi_msg_cmd_config_set *cmd;
 
 	if (domain_id >= vinfo->num_domains)
 		return -EINVAL;
 
-	ret = scmi_xfer_get_init(handle, VOLTAGE_CONFIG_SET,
-				 SCMI_PROTOCOL_VOLTAGE,
-				 sizeof(*cmd), 0, &t);
+	ret = ph->xops->xfer_get_init(ph, VOLTAGE_CONFIG_SET,
+				     sizeof(*cmd), 0, &t);
 	if (ret)
 		return ret;
 
@@ -260,33 +256,50 @@ static int scmi_voltage_config_set(const struct scmi_handle *handle,
 	cmd->domain_id = cpu_to_le32(domain_id);
 	cmd->config = cpu_to_le32(config & GENMASK(3, 0));
 
-	ret = scmi_do_xfer(handle, t);
+	ret = ph->xops->do_xfer(ph, t);
 
-	scmi_xfer_put(handle, t);
+	ph->xops->xfer_put(ph, t);
 	return ret;
 }
 
-static int scmi_voltage_config_get(const struct scmi_handle *handle,
+static int __scmi_voltage_config_set(const struct scmi_handle *handle,
+				     u32 domain_id, u32 config)
+{
+	const struct scmi_protocol_handle *ph =
+		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_VOLTAGE);
+
+	return scmi_voltage_config_set(ph, domain_id, config);
+}
+
+static int scmi_voltage_config_get(const struct scmi_protocol_handle *ph,
 				   u32 domain_id, u32 *config)
 {
-	return __scmi_voltage_get_u32(handle, VOLTAGE_CONFIG_GET,
+	return __scmi_voltage_get_u32(ph, VOLTAGE_CONFIG_GET,
 				      domain_id, config);
 }
 
-static int scmi_voltage_level_set(const struct scmi_handle *handle,
+static int __scmi_voltage_config_get(const struct scmi_handle *handle,
+				     u32 domain_id, u32 *config)
+{
+	const struct scmi_protocol_handle *ph =
+		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_VOLTAGE);
+
+	return scmi_voltage_config_get(ph, domain_id, config);
+}
+
+static int scmi_voltage_level_set(const struct scmi_protocol_handle *ph,
 				  u32 domain_id, u32 flags, s32 volt_uV)
 {
 	int ret;
 	struct scmi_xfer *t;
-	struct voltage_info *vinfo = handle->voltage_priv;
+	struct voltage_info *vinfo = ph->get_priv(ph);
 	struct scmi_msg_cmd_level_set *cmd;
 
 	if (domain_id >= vinfo->num_domains)
 		return -EINVAL;
 
-	ret = scmi_xfer_get_init(handle, VOLTAGE_LEVEL_SET,
-				 SCMI_PROTOCOL_VOLTAGE,
-				 sizeof(*cmd), 0, &t);
+	ret = ph->xops->xfer_get_init(ph, VOLTAGE_LEVEL_SET,
+				      sizeof(*cmd), 0, &t);
 	if (ret)
 		return ret;
 
@@ -295,23 +308,41 @@ static int scmi_voltage_level_set(const struct scmi_handle *handle,
 	cmd->flags = cpu_to_le32(flags);
 	cmd->voltage_level = cpu_to_le32(volt_uV);
 
-	ret = scmi_do_xfer(handle, t);
+	ret = ph->xops->do_xfer(ph, t);
 
-	scmi_xfer_put(handle, t);
+	ph->xops->xfer_put(ph, t);
 	return ret;
 }
 
-static int scmi_voltage_level_get(const struct scmi_handle *handle,
+static int __scmi_voltage_level_set(const struct scmi_handle *handle,
+				    u32 domain_id, u32 flags, s32 volt_uV)
+{
+	const struct scmi_protocol_handle *ph =
+		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_VOLTAGE);
+
+	return scmi_voltage_level_set(ph, domain_id, flags, volt_uV);
+}
+
+static int scmi_voltage_level_get(const struct scmi_protocol_handle *ph,
 				  u32 domain_id, s32 *volt_uV)
 {
-	return __scmi_voltage_get_u32(handle, VOLTAGE_LEVEL_GET,
+	return __scmi_voltage_get_u32(ph, VOLTAGE_LEVEL_GET,
 				      domain_id, (u32 *)volt_uV);
 }
 
+static int __scmi_voltage_level_get(const struct scmi_handle *handle,
+				    u32 domain_id, s32 *volt_uV)
+{
+	const struct scmi_protocol_handle *ph =
+		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_VOLTAGE);
+
+	return scmi_voltage_level_get(ph, domain_id, volt_uV);
+}
+
 static const struct scmi_voltage_info * __must_check
-scmi_voltage_info_get(const struct scmi_handle *handle, u32 domain_id)
+scmi_voltage_info_get(const struct scmi_protocol_handle *ph, u32 domain_id)
 {
-	struct voltage_info *vinfo = handle->voltage_priv;
+	struct voltage_info *vinfo = ph->get_priv(ph);
 
 	if (domain_id >= vinfo->num_domains ||
 	    !vinfo->domains[domain_id].num_levels)
@@ -320,14 +351,40 @@ scmi_voltage_info_get(const struct scmi_handle *handle, u32 domain_id)
 	return vinfo->domains + domain_id;
 }
 
-static int scmi_voltage_domains_num_get(const struct scmi_handle *handle)
+static const struct scmi_voltage_info * __must_check
+__scmi_voltage_info_get(const struct scmi_handle *handle, u32 domain_id)
+{
+	const struct scmi_protocol_handle *ph =
+		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_VOLTAGE);
+
+	return scmi_voltage_info_get(ph, domain_id);
+}
+
+static int scmi_voltage_domains_num_get(const struct scmi_protocol_handle *ph)
 {
-	struct voltage_info *vinfo = handle->voltage_priv;
+	struct voltage_info *vinfo = ph->get_priv(ph);
 
 	return vinfo->num_domains;
 }
 
+static int __scmi_voltage_domains_num_get(const struct scmi_handle *handle)
+{
+	const struct scmi_protocol_handle *ph =
+		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_VOLTAGE);
+
+	return scmi_voltage_domains_num_get(ph);
+}
+
 static struct scmi_voltage_ops voltage_ops = {
+	.num_domains_get = __scmi_voltage_domains_num_get,
+	.info_get = __scmi_voltage_info_get,
+	.config_set = __scmi_voltage_config_set,
+	.config_get = __scmi_voltage_config_get,
+	.level_set = __scmi_voltage_level_set,
+	.level_get = __scmi_voltage_level_get,
+};
+
+static struct scmi_voltage_proto_ops voltage_proto_ops = {
 	.num_domains_get = scmi_voltage_domains_num_get,
 	.info_get = scmi_voltage_info_get,
 	.config_set = scmi_voltage_config_set,
@@ -336,51 +393,53 @@ static struct scmi_voltage_ops voltage_ops = {
 	.level_get = scmi_voltage_level_get,
 };
 
-static int scmi_voltage_protocol_init(struct scmi_handle *handle)
+static int scmi_voltage_protocol_init(const struct scmi_protocol_handle *ph)
 {
 	int ret;
 	u32 version;
 	struct voltage_info *vinfo;
+	struct scmi_handle *handle;
 
-	ret = scmi_version_get(handle, SCMI_PROTOCOL_VOLTAGE, &version);
+	ret = ph->xops->version_get(ph, &version);
 	if (ret)
 		return ret;
 
-	dev_dbg(handle->dev, "Voltage Version %d.%d\n",
+	dev_dbg(ph->dev, "Voltage Version %d.%d\n",
 		PROTOCOL_REV_MAJOR(version), PROTOCOL_REV_MINOR(version));
 
-	vinfo = devm_kzalloc(handle->dev, sizeof(*vinfo), GFP_KERNEL);
+	vinfo = devm_kzalloc(ph->dev, sizeof(*vinfo), GFP_KERNEL);
 	if (!vinfo)
 		return -ENOMEM;
 	vinfo->version = version;
 
-	ret = scmi_protocol_attributes_get(handle, vinfo);
+	ret = scmi_protocol_attributes_get(ph, vinfo);
 	if (ret)
 		return ret;
 
 	if (vinfo->num_domains) {
-		vinfo->domains = devm_kcalloc(handle->dev, vinfo->num_domains,
+		vinfo->domains = devm_kcalloc(ph->dev, vinfo->num_domains,
 					      sizeof(*vinfo->domains),
 					      GFP_KERNEL);
 		if (!vinfo->domains)
 			return -ENOMEM;
-		ret = scmi_voltage_descriptors_get(handle, vinfo);
+		ret = scmi_voltage_descriptors_get(ph, vinfo);
 		if (ret)
 			return ret;
 	} else {
-		dev_warn(handle->dev, "No Voltage domains found.\n");
+		dev_warn(ph->dev, "No Voltage domains found.\n");
 	}
 
+	/* Transient code for legacy ops interface */
+	handle = scmi_map_scmi_handle(ph);
 	handle->voltage_ops = &voltage_ops;
-	handle->voltage_priv = vinfo;
 
-	return 0;
+	return ph->set_priv(ph, vinfo);
 }
 
 static const struct scmi_protocol scmi_voltage = {
 	.id = SCMI_PROTOCOL_VOLTAGE,
-	.init = &scmi_voltage_protocol_init,
-	.ops = &voltage_ops,
+	.instance_init = &scmi_voltage_protocol_init,
+	.ops = &voltage_proto_ops,
 };
 
 DEFINE_SCMI_PROTOCOL_REGISTER_UNREGISTER(voltage, scmi_voltage)
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index 17b82c76cf7a..00fdf0c5786b 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -515,7 +515,7 @@ struct scmi_voltage_info {
 };
 
 /**
- * struct scmi_voltage_ops - represents the various operations provided
+ * struct scmi_voltage_proto_ops - represents the various operations provided
  * by SCMI Voltage Protocol
  *
  * @num_domains_get: get the count of voltage domains provided by SCMI
@@ -525,14 +525,28 @@ struct scmi_voltage_info {
  * @level_set: set the voltage level for the specified domain
  * @level_get: get the voltage level of the specified domain
  */
+struct scmi_voltage_proto_ops {
+	int (*num_domains_get)(const struct scmi_protocol_handle *ph);
+	const struct scmi_voltage_info __must_check *(*info_get)
+		(const struct scmi_protocol_handle *ph, u32 domain_id);
+	int (*config_set)(const struct scmi_protocol_handle *ph, u32 domain_id,
+			  u32 config);
+#define	SCMI_VOLTAGE_ARCH_STATE_OFF		0x0
+#define	SCMI_VOLTAGE_ARCH_STATE_ON		0x7
+	int (*config_get)(const struct scmi_protocol_handle *ph, u32 domain_id,
+			  u32 *config);
+	int (*level_set)(const struct scmi_protocol_handle *ph, u32 domain_id,
+			 u32 flags, s32 volt_uV);
+	int (*level_get)(const struct scmi_protocol_handle *ph, u32 domain_id,
+			 s32 *volt_uV);
+};
+
 struct scmi_voltage_ops {
 	int (*num_domains_get)(const struct scmi_handle *handle);
 	const struct scmi_voltage_info __must_check *(*info_get)
 		(const struct scmi_handle *handle, u32 domain_id);
 	int (*config_set)(const struct scmi_handle *handle, u32 domain_id,
 			  u32 config);
-#define	SCMI_VOLTAGE_ARCH_STATE_OFF		0x0
-#define	SCMI_VOLTAGE_ARCH_STATE_ON		0x7
 	int (*config_get)(const struct scmi_handle *handle, u32 domain_id,
 			  u32 *config);
 	int (*level_set)(const struct scmi_handle *handle, u32 domain_id,
@@ -610,8 +624,6 @@ struct scmi_notify_ops {
  *		       operations and a dedicated protocol handler
  * @devm_protocol_put: devres managed method to release a protocol
  * @notify_ops: pointer to set of notifications related operations
- * @voltage_priv: pointer to private data structure specific to voltage
- *	protocol(for internal use only)
  * @notify_priv: pointer to private data structure specific to notifications
  *	(for internal use only)
  */
@@ -626,8 +638,6 @@ struct scmi_handle {
 	void (*devm_protocol_put)(struct scmi_device *sdev, u8 proto);
 
 	const struct scmi_notify_ops *notify_ops;
-	/* for protocol internal use */
-	void *voltage_priv;
 	void *notify_priv;
 };
 
-- 
cgit v1.2.3


From c3ed5e953ef0cdd599e70558e2b26696f1fbe77d Mon Sep 17 00:00:00 2001
From: Cristian Marussi <cristian.marussi@arm.com>
Date: Tue, 16 Mar 2021 12:48:55 +0000
Subject: firmware: arm_scmi: Remove legacy scmi_voltage_ops protocol interface

Now that all the SCMI driver users have been migrated to the new interface,
remove the legacy interface and all the transient code.

Link: https://lore.kernel.org/r/20210316124903.35011-31-cristian.marussi@arm.com
Tested-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_scmi/voltage.c | 67 -------------------------------------
 include/linux/scmi_protocol.h       | 16 ---------
 2 files changed, 83 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/arm_scmi/voltage.c b/drivers/firmware/arm_scmi/voltage.c
index d32320ec8f38..364bc14aa5fe 100644
--- a/drivers/firmware/arm_scmi/voltage.c
+++ b/drivers/firmware/arm_scmi/voltage.c
@@ -262,15 +262,6 @@ static int scmi_voltage_config_set(const struct scmi_protocol_handle *ph,
 	return ret;
 }
 
-static int __scmi_voltage_config_set(const struct scmi_handle *handle,
-				     u32 domain_id, u32 config)
-{
-	const struct scmi_protocol_handle *ph =
-		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_VOLTAGE);
-
-	return scmi_voltage_config_set(ph, domain_id, config);
-}
-
 static int scmi_voltage_config_get(const struct scmi_protocol_handle *ph,
 				   u32 domain_id, u32 *config)
 {
@@ -278,15 +269,6 @@ static int scmi_voltage_config_get(const struct scmi_protocol_handle *ph,
 				      domain_id, config);
 }
 
-static int __scmi_voltage_config_get(const struct scmi_handle *handle,
-				     u32 domain_id, u32 *config)
-{
-	const struct scmi_protocol_handle *ph =
-		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_VOLTAGE);
-
-	return scmi_voltage_config_get(ph, domain_id, config);
-}
-
 static int scmi_voltage_level_set(const struct scmi_protocol_handle *ph,
 				  u32 domain_id, u32 flags, s32 volt_uV)
 {
@@ -314,15 +296,6 @@ static int scmi_voltage_level_set(const struct scmi_protocol_handle *ph,
 	return ret;
 }
 
-static int __scmi_voltage_level_set(const struct scmi_handle *handle,
-				    u32 domain_id, u32 flags, s32 volt_uV)
-{
-	const struct scmi_protocol_handle *ph =
-		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_VOLTAGE);
-
-	return scmi_voltage_level_set(ph, domain_id, flags, volt_uV);
-}
-
 static int scmi_voltage_level_get(const struct scmi_protocol_handle *ph,
 				  u32 domain_id, s32 *volt_uV)
 {
@@ -330,15 +303,6 @@ static int scmi_voltage_level_get(const struct scmi_protocol_handle *ph,
 				      domain_id, (u32 *)volt_uV);
 }
 
-static int __scmi_voltage_level_get(const struct scmi_handle *handle,
-				    u32 domain_id, s32 *volt_uV)
-{
-	const struct scmi_protocol_handle *ph =
-		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_VOLTAGE);
-
-	return scmi_voltage_level_get(ph, domain_id, volt_uV);
-}
-
 static const struct scmi_voltage_info * __must_check
 scmi_voltage_info_get(const struct scmi_protocol_handle *ph, u32 domain_id)
 {
@@ -351,15 +315,6 @@ scmi_voltage_info_get(const struct scmi_protocol_handle *ph, u32 domain_id)
 	return vinfo->domains + domain_id;
 }
 
-static const struct scmi_voltage_info * __must_check
-__scmi_voltage_info_get(const struct scmi_handle *handle, u32 domain_id)
-{
-	const struct scmi_protocol_handle *ph =
-		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_VOLTAGE);
-
-	return scmi_voltage_info_get(ph, domain_id);
-}
-
 static int scmi_voltage_domains_num_get(const struct scmi_protocol_handle *ph)
 {
 	struct voltage_info *vinfo = ph->get_priv(ph);
@@ -367,23 +322,6 @@ static int scmi_voltage_domains_num_get(const struct scmi_protocol_handle *ph)
 	return vinfo->num_domains;
 }
 
-static int __scmi_voltage_domains_num_get(const struct scmi_handle *handle)
-{
-	const struct scmi_protocol_handle *ph =
-		scmi_map_protocol_handle(handle, SCMI_PROTOCOL_VOLTAGE);
-
-	return scmi_voltage_domains_num_get(ph);
-}
-
-static struct scmi_voltage_ops voltage_ops = {
-	.num_domains_get = __scmi_voltage_domains_num_get,
-	.info_get = __scmi_voltage_info_get,
-	.config_set = __scmi_voltage_config_set,
-	.config_get = __scmi_voltage_config_get,
-	.level_set = __scmi_voltage_level_set,
-	.level_get = __scmi_voltage_level_get,
-};
-
 static struct scmi_voltage_proto_ops voltage_proto_ops = {
 	.num_domains_get = scmi_voltage_domains_num_get,
 	.info_get = scmi_voltage_info_get,
@@ -398,7 +336,6 @@ static int scmi_voltage_protocol_init(const struct scmi_protocol_handle *ph)
 	int ret;
 	u32 version;
 	struct voltage_info *vinfo;
-	struct scmi_handle *handle;
 
 	ret = ph->xops->version_get(ph, &version);
 	if (ret)
@@ -429,10 +366,6 @@ static int scmi_voltage_protocol_init(const struct scmi_protocol_handle *ph)
 		dev_warn(ph->dev, "No Voltage domains found.\n");
 	}
 
-	/* Transient code for legacy ops interface */
-	handle = scmi_map_scmi_handle(ph);
-	handle->voltage_ops = &voltage_ops;
-
 	return ph->set_priv(ph, vinfo);
 }
 
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index 00fdf0c5786b..284dda52006e 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -541,20 +541,6 @@ struct scmi_voltage_proto_ops {
 			 s32 *volt_uV);
 };
 
-struct scmi_voltage_ops {
-	int (*num_domains_get)(const struct scmi_handle *handle);
-	const struct scmi_voltage_info __must_check *(*info_get)
-		(const struct scmi_handle *handle, u32 domain_id);
-	int (*config_set)(const struct scmi_handle *handle, u32 domain_id,
-			  u32 config);
-	int (*config_get)(const struct scmi_handle *handle, u32 domain_id,
-			  u32 *config);
-	int (*level_set)(const struct scmi_handle *handle, u32 domain_id,
-			 u32 flags, s32 volt_uV);
-	int (*level_get)(const struct scmi_handle *handle, u32 domain_id,
-			 s32 *volt_uV);
-};
-
 /**
  * struct scmi_notify_ops  - represents notifications' operations provided by
  * SCMI core
@@ -619,7 +605,6 @@ struct scmi_notify_ops {
  *
  * @dev: pointer to the SCMI device
  * @version: pointer to the structure containing SCMI version information
- * @voltage_ops: pointer to set of voltage protocol operations
  * @devm_protocol_get: devres managed method to acquire a protocol and get specific
  *		       operations and a dedicated protocol handler
  * @devm_protocol_put: devres managed method to release a protocol
@@ -630,7 +615,6 @@ struct scmi_notify_ops {
 struct scmi_handle {
 	struct device *dev;
 	struct scmi_revision_info *version;
-	const struct scmi_voltage_ops *voltage_ops;
 
 	const void __must_check *
 		(*devm_protocol_get)(struct scmi_device *sdev, u8 proto,
-- 
cgit v1.2.3


From a02d7c93c1f3cc892b69b50069ae757c92c03b0d Mon Sep 17 00:00:00 2001
From: Cristian Marussi <cristian.marussi@arm.com>
Date: Tue, 16 Mar 2021 12:49:00 +0000
Subject: firmware: arm_scmi: Make notify_priv really private

Notification private data is currently accessible via handle->notify_priv,
but this data was meant to be private to the notification core support and
not accessible by SCMI drivers. Make it private by hiding it inside the
instance descriptor struct scmi_info and making it accessible only via
dedicated helpers.

Link: https://lore.kernel.org/r/20210316124903.35011-36-cristian.marussi@arm.com
Tested-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_scmi/common.h |  4 ++++
 drivers/firmware/arm_scmi/driver.c | 21 ++++++++++++++++++
 drivers/firmware/arm_scmi/notify.c | 45 +++++++++++++-------------------------
 include/linux/scmi_protocol.h      |  3 ---
 4 files changed, 40 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/arm_scmi/common.h b/drivers/firmware/arm_scmi/common.h
index c4a1262fb18d..c093f332cdcd 100644
--- a/drivers/firmware/arm_scmi/common.h
+++ b/drivers/firmware/arm_scmi/common.h
@@ -343,4 +343,8 @@ void shmem_clear_channel(struct scmi_shared_mem __iomem *shmem);
 bool shmem_poll_done(struct scmi_shared_mem __iomem *shmem,
 		     struct scmi_xfer *xfer);
 
+void scmi_notification_instance_data_set(const struct scmi_handle *handle,
+					 void *priv);
+void *scmi_notification_instance_data_get(const struct scmi_handle *handle);
+
 #endif /* _SCMI_COMMON_H */
diff --git a/drivers/firmware/arm_scmi/driver.c b/drivers/firmware/arm_scmi/driver.c
index 27213bf768c0..563a64131035 100644
--- a/drivers/firmware/arm_scmi/driver.c
+++ b/drivers/firmware/arm_scmi/driver.c
@@ -113,6 +113,7 @@ struct scmi_protocol_instance {
  * @protocols_mtx: A mutex to protect protocols instances initialization.
  * @protocols_imp: List of protocols implemented, currently maximum of
  *	MAX_PROTOCOLS_IMP elements allocated by the base protocol
+ * @notify_priv: Pointer to private data structure specific to notifications.
  * @node: List head
  * @users: Number of users of this instance
  */
@@ -129,6 +130,7 @@ struct scmi_info {
 	/* Ensure mutual exclusive access to protocols instance array */
 	struct mutex protocols_mtx;
 	u8 *protocols_imp;
+	void *notify_priv;
 	struct list_head node;
 	int users;
 };
@@ -170,6 +172,25 @@ static inline void scmi_dump_header_dbg(struct device *dev,
 		hdr->id, hdr->seq, hdr->protocol_id);
 }
 
+void scmi_notification_instance_data_set(const struct scmi_handle *handle,
+					 void *priv)
+{
+	struct scmi_info *info = handle_to_scmi_info(handle);
+
+	info->notify_priv = priv;
+	/* Ensure updated protocol private date are visible */
+	smp_wmb();
+}
+
+void *scmi_notification_instance_data_get(const struct scmi_handle *handle)
+{
+	struct scmi_info *info = handle_to_scmi_info(handle);
+
+	/* Ensure protocols_private_data has been updated */
+	smp_rmb();
+	return info->notify_priv;
+}
+
 /**
  * scmi_xfer_get() - Allocate one message
  *
diff --git a/drivers/firmware/arm_scmi/notify.c b/drivers/firmware/arm_scmi/notify.c
index 023c93deb14b..9ca019dd0aeb 100644
--- a/drivers/firmware/arm_scmi/notify.c
+++ b/drivers/firmware/arm_scmi/notify.c
@@ -582,11 +582,9 @@ int scmi_notify(const struct scmi_handle *handle, u8 proto_id, u8 evt_id,
 	struct scmi_event_header eh;
 	struct scmi_notify_instance *ni;
 
-	/* Ensure notify_priv is updated */
-	smp_rmb();
-	if (!handle->notify_priv)
+	ni = scmi_notification_instance_data_get(handle);
+	if (!ni)
 		return 0;
-	ni = handle->notify_priv;
 
 	r_evt = SCMI_GET_REVT(ni, proto_id, evt_id);
 	if (!r_evt)
@@ -762,11 +760,9 @@ int scmi_register_protocol_events(const struct scmi_handle *handle, u8 proto_id,
 	    (!ee->num_sources && !ee->ops->get_num_sources))
 		return -EINVAL;
 
-	/* Ensure notify_priv is updated */
-	smp_rmb();
-	if (!handle->notify_priv)
+	ni = scmi_notification_instance_data_get(handle);
+	if (!ni)
 		return -ENOMEM;
-	ni = handle->notify_priv;
 
 	/* num_sources cannot be <= 0 */
 	if (ee->num_sources) {
@@ -846,12 +842,10 @@ void scmi_deregister_protocol_events(const struct scmi_handle *handle,
 	struct scmi_notify_instance *ni;
 	struct scmi_registered_events_desc *pd;
 
-	/* Ensure notify_priv is updated */
-	smp_rmb();
-	if (!handle->notify_priv)
+	ni = scmi_notification_instance_data_get(handle);
+	if (!ni)
 		return;
 
-	ni = handle->notify_priv;
 	pd = ni->registered_protocols[proto_id];
 	if (!pd)
 		return;
@@ -1354,11 +1348,9 @@ static int scmi_register_notifier(const struct scmi_handle *handle,
 	struct scmi_event_handler *hndl;
 	struct scmi_notify_instance *ni;
 
-	/* Ensure notify_priv is updated */
-	smp_rmb();
-	if (!handle->notify_priv)
+	ni = scmi_notification_instance_data_get(handle);
+	if (!ni)
 		return -ENODEV;
-	ni = handle->notify_priv;
 
 	evt_key = MAKE_HASH_KEY(proto_id, evt_id,
 				src_id ? *src_id : SRC_ID_MASK);
@@ -1402,11 +1394,9 @@ static int scmi_unregister_notifier(const struct scmi_handle *handle,
 	struct scmi_event_handler *hndl;
 	struct scmi_notify_instance *ni;
 
-	/* Ensure notify_priv is updated */
-	smp_rmb();
-	if (!handle->notify_priv)
+	ni = scmi_notification_instance_data_get(handle);
+	if (!ni)
 		return -ENODEV;
-	ni = handle->notify_priv;
 
 	evt_key = MAKE_HASH_KEY(proto_id, evt_id,
 				src_id ? *src_id : SRC_ID_MASK);
@@ -1681,8 +1671,8 @@ int scmi_notification_init(struct scmi_handle *handle)
 
 	INIT_WORK(&ni->init_work, scmi_protocols_late_init);
 
+	scmi_notification_instance_data_set(handle, ni);
 	handle->notify_ops = &notify_ops;
-	handle->notify_priv = ni;
 	/* Ensure handle is up to date */
 	smp_wmb();
 
@@ -1694,7 +1684,7 @@ int scmi_notification_init(struct scmi_handle *handle)
 
 err:
 	dev_warn(handle->dev, "Initialization Failed.\n");
-	devres_release_group(handle->dev, NULL);
+	devres_release_group(handle->dev, gid);
 	return -ENOMEM;
 }
 
@@ -1706,15 +1696,10 @@ void scmi_notification_exit(struct scmi_handle *handle)
 {
 	struct scmi_notify_instance *ni;
 
-	/* Ensure notify_priv is updated */
-	smp_rmb();
-	if (!handle->notify_priv)
+	ni = scmi_notification_instance_data_get(handle);
+	if (!ni)
 		return;
-	ni = handle->notify_priv;
-
-	handle->notify_priv = NULL;
-	/* Ensure handle is up to date */
-	smp_wmb();
+	scmi_notification_instance_data_set(handle, NULL);
 
 	/* Destroy while letting pending work complete */
 	destroy_workqueue(ni->notify_wq);
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index 284dda52006e..114890bd7af0 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -609,8 +609,6 @@ struct scmi_notify_ops {
  *		       operations and a dedicated protocol handler
  * @devm_protocol_put: devres managed method to release a protocol
  * @notify_ops: pointer to set of notifications related operations
- * @notify_priv: pointer to private data structure specific to notifications
- *	(for internal use only)
  */
 struct scmi_handle {
 	struct device *dev;
@@ -622,7 +620,6 @@ struct scmi_handle {
 	void (*devm_protocol_put)(struct scmi_device *sdev, u8 proto);
 
 	const struct scmi_notify_ops *notify_ops;
-	void *notify_priv;
 };
 
 enum scmi_std_protocol {
-- 
cgit v1.2.3


From aa1fd3e4cb976248651c319bb0ef929345958cf4 Mon Sep 17 00:00:00 2001
From: Cristian Marussi <cristian.marussi@arm.com>
Date: Tue, 16 Mar 2021 12:49:01 +0000
Subject: firmware: arm_scmi: Rename non devres notify_ops

Rename the non-devres managed notify_ops to use a naming pattern which
exposes the verb of the performed action as the last token.

No functional change.

Link: https://lore.kernel.org/r/20210316124903.35011-37-cristian.marussi@arm.com
Tested-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_scmi/notify.c | 18 +++++++++---------
 include/linux/scmi_protocol.h      |  8 ++++----
 2 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/arm_scmi/notify.c b/drivers/firmware/arm_scmi/notify.c
index 9ca019dd0aeb..d860bebd984a 100644
--- a/drivers/firmware/arm_scmi/notify.c
+++ b/drivers/firmware/arm_scmi/notify.c
@@ -1307,7 +1307,7 @@ static int scmi_event_handler_enable_events(struct scmi_event_handler *hndl)
 }
 
 /**
- * scmi_register_notifier()  - Register a notifier_block for an event
+ * scmi_notifier_register()  - Register a notifier_block for an event
  * @handle: The handle identifying the platform instance against which the
  *	    callback is registered
  * @proto_id: Protocol ID
@@ -1339,7 +1339,7 @@ static int scmi_event_handler_enable_events(struct scmi_event_handler *hndl)
  *
  * Return: 0 on Success
  */
-static int scmi_register_notifier(const struct scmi_handle *handle,
+static int scmi_notifier_register(const struct scmi_handle *handle,
 				  u8 proto_id, u8 evt_id, const u32 *src_id,
 				  struct notifier_block *nb)
 {
@@ -1371,7 +1371,7 @@ static int scmi_register_notifier(const struct scmi_handle *handle,
 }
 
 /**
- * scmi_unregister_notifier()  - Unregister a notifier_block for an event
+ * scmi_notifier_unregister()  - Unregister a notifier_block for an event
  * @handle: The handle identifying the platform instance against which the
  *	    callback is unregistered
  * @proto_id: Protocol ID
@@ -1386,7 +1386,7 @@ static int scmi_register_notifier(const struct scmi_handle *handle,
  *
  * Return: 0 on Success
  */
-static int scmi_unregister_notifier(const struct scmi_handle *handle,
+static int scmi_notifier_unregister(const struct scmi_handle *handle,
 				    u8 proto_id, u8 evt_id, const u32 *src_id,
 				    struct notifier_block *nb)
 {
@@ -1412,7 +1412,7 @@ static int scmi_unregister_notifier(const struct scmi_handle *handle,
 	scmi_put_handler(ni, hndl);
 
 	/*
-	 * This balances the initial get issued in @scmi_register_notifier.
+	 * This balances the initial get issued in @scmi_notifier_register.
 	 * If this notifier_block happened to be the last known user callback
 	 * for this event, the handler is here freed and the event's generation
 	 * stopped.
@@ -1440,7 +1440,7 @@ static void scmi_devm_release_notifier(struct device *dev, void *res)
 {
 	struct scmi_notifier_devres *dres = res;
 
-	scmi_unregister_notifier(dres->handle, dres->proto_id, dres->evt_id,
+	scmi_notifier_unregister(dres->handle, dres->proto_id, dres->evt_id,
 				 dres->src_id, dres->nb);
 }
 
@@ -1471,7 +1471,7 @@ static int scmi_devm_notifier_register(struct scmi_device *sdev,
 	if (!dres)
 		return -ENOMEM;
 
-	ret = scmi_register_notifier(sdev->handle, proto_id,
+	ret = scmi_notifier_register(sdev->handle, proto_id,
 				     evt_id, src_id, nb);
 	if (ret) {
 		devres_free(dres);
@@ -1609,8 +1609,8 @@ static void scmi_protocols_late_init(struct work_struct *work)
 static const struct scmi_notify_ops notify_ops = {
 	.devm_event_notifier_register = scmi_devm_notifier_register,
 	.devm_event_notifier_unregister = scmi_devm_notifier_unregister,
-	.register_event_notifier = scmi_register_notifier,
-	.unregister_event_notifier = scmi_unregister_notifier,
+	.event_notifier_register = scmi_notifier_register,
+	.event_notifier_unregister = scmi_notifier_unregister,
 };
 
 /**
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index 114890bd7af0..b80496d519f3 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -548,8 +548,8 @@ struct scmi_voltage_proto_ops {
  *				  the requested event
  * @devm_event_notifier_unregister: Managed unregistration of a notifier_block
  *				    for the requested event
- * @register_event_notifier: Register a notifier_block for the requested event
- * @unregister_event_notifier: Unregister a notifier_block for the requested
+ * @event_notifier_register: Register a notifier_block for the requested event
+ * @event_notifier_unregister: Unregister a notifier_block for the requested
  *			       event
  *
  * A user can register/unregister its own notifier_block against the wanted
@@ -590,11 +590,11 @@ struct scmi_notify_ops {
 					      u8 proto_id, u8 evt_id,
 					      const u32 *src_id,
 					      struct notifier_block *nb);
-	int (*register_event_notifier)(const struct scmi_handle *handle,
+	int (*event_notifier_register)(const struct scmi_handle *handle,
 				       u8 proto_id, u8 evt_id,
 				       const u32 *src_id,
 				       struct notifier_block *nb);
-	int (*unregister_event_notifier)(const struct scmi_handle *handle,
+	int (*event_notifier_unregister)(const struct scmi_handle *handle,
 					 u8 proto_id, u8 evt_id,
 					 const u32 *src_id,
 					 struct notifier_block *nb);
-- 
cgit v1.2.3


From f5800e0bf6f9b7ff7dfa4db8b0681bd062eb16a4 Mon Sep 17 00:00:00 2001
From: Cristian Marussi <cristian.marussi@arm.com>
Date: Tue, 16 Mar 2021 12:49:02 +0000
Subject: firmware: arm_scmi: Add protocol modularization support

Extend the SCMI protocols accounting mechanism to address possible module
usage and add support for defining new protocols as loadable modules.

Keep the standard protocols built into the SCMI core.

Link: https://lore.kernel.org/r/20210316124903.35011-38-cristian.marussi@arm.com
Tested-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_scmi/base.c    |  2 ++
 drivers/firmware/arm_scmi/bus.c     | 11 ++++++++++-
 drivers/firmware/arm_scmi/clock.c   |  2 ++
 drivers/firmware/arm_scmi/common.h  |  4 ++++
 drivers/firmware/arm_scmi/driver.c  | 17 +++++++++++++----
 drivers/firmware/arm_scmi/perf.c    |  2 ++
 drivers/firmware/arm_scmi/power.c   |  2 ++
 drivers/firmware/arm_scmi/reset.c   |  2 ++
 drivers/firmware/arm_scmi/sensors.c |  2 ++
 drivers/firmware/arm_scmi/system.c  |  2 ++
 drivers/firmware/arm_scmi/voltage.c |  2 ++
 include/linux/scmi_protocol.h       | 12 ++++++++++++
 12 files changed, 55 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/arm_scmi/base.c b/drivers/firmware/arm_scmi/base.c
index 0d28f0f825b6..de416f9e7921 100644
--- a/drivers/firmware/arm_scmi/base.c
+++ b/drivers/firmware/arm_scmi/base.c
@@ -7,6 +7,7 @@
 
 #define pr_fmt(fmt) "SCMI Notifications BASE - " fmt
 
+#include <linux/module.h>
 #include <linux/scmi_protocol.h>
 
 #include "common.h"
@@ -373,6 +374,7 @@ static int scmi_base_protocol_init(const struct scmi_protocol_handle *ph)
 
 static const struct scmi_protocol scmi_base = {
 	.id = SCMI_PROTOCOL_BASE,
+	.owner = NULL,
 	.instance_init = &scmi_base_protocol_init,
 	.ops = NULL,
 	.events = &base_protocol_events,
diff --git a/drivers/firmware/arm_scmi/bus.c b/drivers/firmware/arm_scmi/bus.c
index 992d50e10111..aadf2da21561 100644
--- a/drivers/firmware/arm_scmi/bus.c
+++ b/drivers/firmware/arm_scmi/bus.c
@@ -56,7 +56,7 @@ const struct scmi_protocol *scmi_protocol_get(int protocol_id)
 	const struct scmi_protocol *proto;
 
 	proto = idr_find(&scmi_protocols, protocol_id);
-	if (!proto) {
+	if (!proto || !try_module_get(proto->owner)) {
 		pr_warn("SCMI Protocol 0x%x not found!\n", protocol_id);
 		return NULL;
 	}
@@ -66,6 +66,15 @@ const struct scmi_protocol *scmi_protocol_get(int protocol_id)
 	return proto;
 }
 
+void scmi_protocol_put(int protocol_id)
+{
+	const struct scmi_protocol *proto;
+
+	proto = idr_find(&scmi_protocols, protocol_id);
+	if (proto)
+		module_put(proto->owner);
+}
+
 static int scmi_dev_probe(struct device *dev)
 {
 	struct scmi_driver *scmi_drv = to_scmi_driver(dev->driver);
diff --git a/drivers/firmware/arm_scmi/clock.c b/drivers/firmware/arm_scmi/clock.c
index 827cf25fc122..35b56c8ba0c0 100644
--- a/drivers/firmware/arm_scmi/clock.c
+++ b/drivers/firmware/arm_scmi/clock.c
@@ -5,6 +5,7 @@
  * Copyright (C) 2018-2021 ARM Ltd.
  */
 
+#include <linux/module.h>
 #include <linux/sort.h>
 
 #include "common.h"
@@ -367,6 +368,7 @@ static int scmi_clock_protocol_init(const struct scmi_protocol_handle *ph)
 
 static const struct scmi_protocol scmi_clock = {
 	.id = SCMI_PROTOCOL_CLOCK,
+	.owner = THIS_MODULE,
 	.instance_init = &scmi_clock_protocol_init,
 	.ops = &clk_proto_ops,
 };
diff --git a/drivers/firmware/arm_scmi/common.h b/drivers/firmware/arm_scmi/common.h
index c093f332cdcd..762dd5419700 100644
--- a/drivers/firmware/arm_scmi/common.h
+++ b/drivers/firmware/arm_scmi/common.h
@@ -14,6 +14,7 @@
 #include <linux/device.h>
 #include <linux/errno.h>
 #include <linux/kernel.h>
+#include <linux/module.h>
 #include <linux/scmi_protocol.h>
 #include <linux/types.h>
 
@@ -214,6 +215,7 @@ typedef int (*scmi_prot_init_ph_fn_t)(const struct scmi_protocol_handle *);
 /**
  * struct scmi_protocol  - Protocol descriptor
  * @id: Protocol ID.
+ * @owner: Module reference if any.
  * @instance_init: Mandatory protocol initialization function.
  * @instance_deinit: Optional protocol de-initialization function.
  * @ops: Optional reference to the operations provided by the protocol and
@@ -222,6 +224,7 @@ typedef int (*scmi_prot_init_ph_fn_t)(const struct scmi_protocol_handle *);
  */
 struct scmi_protocol {
 	const u8				id;
+	struct module				*owner;
 	const scmi_prot_init_ph_fn_t		instance_init;
 	const scmi_prot_init_ph_fn_t		instance_deinit;
 	const void				*ops;
@@ -257,6 +260,7 @@ void __exit scmi_##name##_unregister(void)			\
 }
 
 const struct scmi_protocol *scmi_protocol_get(int protocol_id);
+void scmi_protocol_put(int protocol_id);
 
 int scmi_protocol_acquire(const struct scmi_handle *handle, u8 protocol_id);
 void scmi_protocol_release(const struct scmi_handle *handle, u8 protocol_id);
diff --git a/drivers/firmware/arm_scmi/driver.c b/drivers/firmware/arm_scmi/driver.c
index 563a64131035..c25c20c150dc 100644
--- a/drivers/firmware/arm_scmi/driver.c
+++ b/drivers/firmware/arm_scmi/driver.c
@@ -660,7 +660,8 @@ scmi_revision_area_get(const struct scmi_protocol_handle *ph)
  *
  * Context: Assumes to be called with @protocols_mtx already acquired.
  * Return: A reference to a freshly allocated and initialized protocol instance
- *	   or ERR_PTR on failure.
+ *	   or ERR_PTR on failure. On failure the @proto reference is first put
+ *	   using scmi_protocol_put() before releasing the whole devres group.
  */
 static struct scmi_protocol_instance *
 scmi_alloc_init_protocol_instance(struct scmi_info *info,
@@ -673,8 +674,10 @@ scmi_alloc_init_protocol_instance(struct scmi_info *info,
 
 	/* Protocol specific devres group */
 	gid = devres_open_group(handle->dev, NULL, GFP_KERNEL);
-	if (!gid)
+	if (!gid) {
+		scmi_protocol_put(proto->id);
 		goto out;
+	}
 
 	pi = devm_kzalloc(handle->dev, sizeof(*pi), GFP_KERNEL);
 	if (!pi)
@@ -718,6 +721,8 @@ scmi_alloc_init_protocol_instance(struct scmi_info *info,
 	return pi;
 
 clean:
+	/* Take care to put the protocol module's owner before releasing all */
+	scmi_protocol_put(proto->id);
 	devres_release_group(handle->dev, gid);
 out:
 	return ERR_PTR(ret);
@@ -732,7 +737,9 @@ out:
  * instance, allocate and initialize all the needed structures while handling
  * resource allocation with a dedicated per-protocol devres subgroup.
  *
- * Return: A reference to an initialized protocol instance or error on failure.
+ * Return: A reference to an initialized protocol instance or error on failure:
+ *	   in particular returns -EPROBE_DEFER when the desired protocol could
+ *	   NOT be found.
  */
 static struct scmi_protocol_instance * __must_check
 scmi_get_protocol_instance(const struct scmi_handle *handle, u8 protocol_id)
@@ -753,7 +760,7 @@ scmi_get_protocol_instance(const struct scmi_handle *handle, u8 protocol_id)
 		if (proto)
 			pi = scmi_alloc_init_protocol_instance(info, proto);
 		else
-			pi = ERR_PTR(-ENODEV);
+			pi = ERR_PTR(-EPROBE_DEFER);
 	}
 	mutex_unlock(&info->protocols_mtx);
 
@@ -804,6 +811,8 @@ void scmi_protocol_release(const struct scmi_handle *handle, u8 protocol_id)
 
 		idr_remove(&info->protocols, protocol_id);
 
+		scmi_protocol_put(protocol_id);
+
 		devres_release_group(handle->dev, gid);
 		dev_dbg(handle->dev, "De-Initialized protocol: 0x%X\n",
 			protocol_id);
diff --git a/drivers/firmware/arm_scmi/perf.c b/drivers/firmware/arm_scmi/perf.c
index 059d6214f93b..f4cd5193b961 100644
--- a/drivers/firmware/arm_scmi/perf.c
+++ b/drivers/firmware/arm_scmi/perf.c
@@ -11,6 +11,7 @@
 #include <linux/of.h>
 #include <linux/io.h>
 #include <linux/io-64-nonatomic-hi-lo.h>
+#include <linux/module.h>
 #include <linux/platform_device.h>
 #include <linux/pm_opp.h>
 #include <linux/scmi_protocol.h>
@@ -909,6 +910,7 @@ static int scmi_perf_protocol_init(const struct scmi_protocol_handle *ph)
 
 static const struct scmi_protocol scmi_perf = {
 	.id = SCMI_PROTOCOL_PERF,
+	.owner = THIS_MODULE,
 	.instance_init = &scmi_perf_protocol_init,
 	.ops = &perf_proto_ops,
 	.events = &perf_protocol_events,
diff --git a/drivers/firmware/arm_scmi/power.c b/drivers/firmware/arm_scmi/power.c
index e0987f0a8fb2..ad2ab080f344 100644
--- a/drivers/firmware/arm_scmi/power.c
+++ b/drivers/firmware/arm_scmi/power.c
@@ -7,6 +7,7 @@
 
 #define pr_fmt(fmt) "SCMI Notifications POWER - " fmt
 
+#include <linux/module.h>
 #include <linux/scmi_protocol.h>
 
 #include "common.h"
@@ -312,6 +313,7 @@ static int scmi_power_protocol_init(const struct scmi_protocol_handle *ph)
 
 static const struct scmi_protocol scmi_power = {
 	.id = SCMI_PROTOCOL_POWER,
+	.owner = THIS_MODULE,
 	.instance_init = &scmi_power_protocol_init,
 	.ops = &power_proto_ops,
 	.events = &power_protocol_events,
diff --git a/drivers/firmware/arm_scmi/reset.c b/drivers/firmware/arm_scmi/reset.c
index 48b487302db8..9bf2478ec6d1 100644
--- a/drivers/firmware/arm_scmi/reset.c
+++ b/drivers/firmware/arm_scmi/reset.c
@@ -7,6 +7,7 @@
 
 #define pr_fmt(fmt) "SCMI Notifications RESET - " fmt
 
+#include <linux/module.h>
 #include <linux/scmi_protocol.h>
 
 #include "common.h"
@@ -324,6 +325,7 @@ static int scmi_reset_protocol_init(const struct scmi_protocol_handle *ph)
 
 static const struct scmi_protocol scmi_reset = {
 	.id = SCMI_PROTOCOL_RESET,
+	.owner = THIS_MODULE,
 	.instance_init = &scmi_reset_protocol_init,
 	.ops = &reset_proto_ops,
 	.events = &reset_protocol_events,
diff --git a/drivers/firmware/arm_scmi/sensors.c b/drivers/firmware/arm_scmi/sensors.c
index 935c4b08829d..2c88aa221559 100644
--- a/drivers/firmware/arm_scmi/sensors.c
+++ b/drivers/firmware/arm_scmi/sensors.c
@@ -8,6 +8,7 @@
 #define pr_fmt(fmt) "SCMI Notifications SENSOR - " fmt
 
 #include <linux/bitfield.h>
+#include <linux/module.h>
 #include <linux/scmi_protocol.h>
 
 #include "common.h"
@@ -990,6 +991,7 @@ static int scmi_sensors_protocol_init(const struct scmi_protocol_handle *ph)
 
 static const struct scmi_protocol scmi_sensors = {
 	.id = SCMI_PROTOCOL_SENSOR,
+	.owner = THIS_MODULE,
 	.instance_init = &scmi_sensors_protocol_init,
 	.ops = &sensor_proto_ops,
 	.events = &sensor_protocol_events,
diff --git a/drivers/firmware/arm_scmi/system.c b/drivers/firmware/arm_scmi/system.c
index 3631a9f94d27..e5175ef73b40 100644
--- a/drivers/firmware/arm_scmi/system.c
+++ b/drivers/firmware/arm_scmi/system.c
@@ -7,6 +7,7 @@
 
 #define pr_fmt(fmt) "SCMI Notifications SYSTEM - " fmt
 
+#include <linux/module.h>
 #include <linux/scmi_protocol.h>
 
 #include "common.h"
@@ -130,6 +131,7 @@ static int scmi_system_protocol_init(const struct scmi_protocol_handle *ph)
 
 static const struct scmi_protocol scmi_system = {
 	.id = SCMI_PROTOCOL_SYSTEM,
+	.owner = THIS_MODULE,
 	.instance_init = &scmi_system_protocol_init,
 	.ops = NULL,
 	.events = &system_protocol_events,
diff --git a/drivers/firmware/arm_scmi/voltage.c b/drivers/firmware/arm_scmi/voltage.c
index 364bc14aa5fe..a5048956a0be 100644
--- a/drivers/firmware/arm_scmi/voltage.c
+++ b/drivers/firmware/arm_scmi/voltage.c
@@ -5,6 +5,7 @@
  * Copyright (C) 2020-2021 ARM Ltd.
  */
 
+#include <linux/module.h>
 #include <linux/scmi_protocol.h>
 
 #include "common.h"
@@ -371,6 +372,7 @@ static int scmi_voltage_protocol_init(const struct scmi_protocol_handle *ph)
 
 static const struct scmi_protocol scmi_voltage = {
 	.id = SCMI_PROTOCOL_VOLTAGE,
+	.owner = THIS_MODULE,
 	.instance_init = &scmi_voltage_protocol_init,
 	.ops = &voltage_proto_ops,
 };
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index b80496d519f3..79d0a1237e6c 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -704,6 +704,18 @@ static inline void scmi_driver_unregister(struct scmi_driver *driver) {}
 #define module_scmi_driver(__scmi_driver)	\
 	module_driver(__scmi_driver, scmi_register, scmi_unregister)
 
+/**
+ * module_scmi_protocol() - Helper macro for registering a scmi protocol
+ * @__scmi_protocol: scmi_protocol structure
+ *
+ * Helper macro for scmi protocols to set up proper module init / exit
+ * functions.  Replaces module_init() and module_exit() and keeps people from
+ * printing pointless things to the kernel log when their protocol is loaded.
+ */
+#define module_scmi_protocol(__scmi_protocol)	\
+	module_driver(__scmi_protocol,		\
+		      scmi_protocol_register, scmi_protocol_unregister)
+
 struct scmi_protocol;
 int scmi_protocol_register(const struct scmi_protocol *proto);
 void scmi_protocol_unregister(const struct scmi_protocol *proto);
-- 
cgit v1.2.3


From 01ad444e3be719f8ad13f136a9b0d301806183c8 Mon Sep 17 00:00:00 2001
From: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Date: Tue, 30 Mar 2021 15:47:17 +0100
Subject: soundwire: export sdw_compare_devid, sdw_extract_slave_id and
 sdw_slave_add

Exporting these three functions makes sense as they can be used by
other controllers like Qualcomm during auto-enumeration!

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Link: https://lore.kernel.org/r/20210330144719.13284-8-srinivas.kandagatla@linaro.org
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/soundwire/bus.c       | 4 +++-
 drivers/soundwire/slave.c     | 1 +
 include/linux/soundwire/sdw.h | 2 ++
 3 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/soundwire/bus.c b/drivers/soundwire/bus.c
index 9bd83c91a873..a9e0aa72654d 100644
--- a/drivers/soundwire/bus.c
+++ b/drivers/soundwire/bus.c
@@ -603,7 +603,7 @@ static struct sdw_slave *sdw_get_slave(struct sdw_bus *bus, int i)
 	return NULL;
 }
 
-static int sdw_compare_devid(struct sdw_slave *slave, struct sdw_slave_id id)
+int sdw_compare_devid(struct sdw_slave *slave, struct sdw_slave_id id)
 {
 	if (slave->id.mfg_id != id.mfg_id ||
 	    slave->id.part_id != id.part_id ||
@@ -614,6 +614,7 @@ static int sdw_compare_devid(struct sdw_slave *slave, struct sdw_slave_id id)
 
 	return 0;
 }
+EXPORT_SYMBOL(sdw_compare_devid);
 
 /* called with bus_lock held */
 static int sdw_get_device_num(struct sdw_slave *slave)
@@ -698,6 +699,7 @@ void sdw_extract_slave_id(struct sdw_bus *bus,
 		"SDW Slave class_id 0x%02x, mfg_id 0x%04x, part_id 0x%04x, unique_id 0x%x, version 0x%x\n",
 		id->class_id, id->mfg_id, id->part_id, id->unique_id, id->sdw_version);
 }
+EXPORT_SYMBOL(sdw_extract_slave_id);
 
 static int sdw_program_device_num(struct sdw_bus *bus)
 {
diff --git a/drivers/soundwire/slave.c b/drivers/soundwire/slave.c
index 112b21967c7a..0eed38a79c6d 100644
--- a/drivers/soundwire/slave.c
+++ b/drivers/soundwire/slave.c
@@ -88,6 +88,7 @@ int sdw_slave_add(struct sdw_bus *bus,
 
 	return ret;
 }
+EXPORT_SYMBOL(sdw_slave_add);
 
 #if IS_ENABLED(CONFIG_ACPI)
 
diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index 350436db6ddb..5ff9a8f37e91 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -1039,5 +1039,7 @@ int sdw_write_no_pm(struct sdw_slave *slave, u32 addr, u8 value);
 int sdw_read_no_pm(struct sdw_slave *slave, u32 addr);
 int sdw_nread(struct sdw_slave *slave, u32 addr, size_t count, u8 *val);
 int sdw_nwrite(struct sdw_slave *slave, u32 addr, size_t count, u8 *val);
+int sdw_compare_devid(struct sdw_slave *slave, struct sdw_slave_id id);
+void sdw_extract_slave_id(struct sdw_bus *bus, u64 addr, struct sdw_slave_id *id);
 
 #endif /* __SOUNDWIRE_H */
-- 
cgit v1.2.3


From a24f98176d1efae2c37d3438c57a624d530d9c33 Mon Sep 17 00:00:00 2001
From: Mikko Perttunen <mperttunen@nvidia.com>
Date: Mon, 29 Mar 2021 16:38:27 +0300
Subject: gpu: host1x: Use different lock classes for each client

To avoid false lockdep warnings, give each client lock a different
lock class, passed from the initialization site by macro.

Signed-off-by: Mikko Perttunen <mperttunen@nvidia.com>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/host1x/bus.c | 10 ++++++----
 include/linux/host1x.h   |  9 ++++++++-
 2 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpu/host1x/bus.c b/drivers/gpu/host1x/bus.c
index 347fb962b6c9..68a766ff0e9d 100644
--- a/drivers/gpu/host1x/bus.c
+++ b/drivers/gpu/host1x/bus.c
@@ -705,8 +705,9 @@ void host1x_driver_unregister(struct host1x_driver *driver)
 EXPORT_SYMBOL(host1x_driver_unregister);
 
 /**
- * host1x_client_register() - register a host1x client
+ * __host1x_client_register() - register a host1x client
  * @client: host1x client
+ * @key: lock class key for the client-specific mutex
  *
  * Registers a host1x client with each host1x controller instance. Note that
  * each client will only match their parent host1x controller and will only be
@@ -715,13 +716,14 @@ EXPORT_SYMBOL(host1x_driver_unregister);
  * device and call host1x_device_init(), which will in turn call each client's
  * &host1x_client_ops.init implementation.
  */
-int host1x_client_register(struct host1x_client *client)
+int __host1x_client_register(struct host1x_client *client,
+			     struct lock_class_key *key)
 {
 	struct host1x *host1x;
 	int err;
 
 	INIT_LIST_HEAD(&client->list);
-	mutex_init(&client->lock);
+	__mutex_init(&client->lock, "host1x client lock", key);
 	client->usecount = 0;
 
 	mutex_lock(&devices_lock);
@@ -742,7 +744,7 @@ int host1x_client_register(struct host1x_client *client)
 
 	return 0;
 }
-EXPORT_SYMBOL(host1x_client_register);
+EXPORT_SYMBOL(__host1x_client_register);
 
 /**
  * host1x_client_unregister() - unregister a host1x client
diff --git a/include/linux/host1x.h b/include/linux/host1x.h
index ce59a6a6a008..9eb77c87a83b 100644
--- a/include/linux/host1x.h
+++ b/include/linux/host1x.h
@@ -320,7 +320,14 @@ static inline struct host1x_device *to_host1x_device(struct device *dev)
 int host1x_device_init(struct host1x_device *device);
 int host1x_device_exit(struct host1x_device *device);
 
-int host1x_client_register(struct host1x_client *client);
+int __host1x_client_register(struct host1x_client *client,
+			     struct lock_class_key *key);
+#define host1x_client_register(client)			\
+	({						\
+		static struct lock_class_key __key;	\
+		__host1x_client_register(client, &__key);	\
+	})
+
 int host1x_client_unregister(struct host1x_client *client);
 
 int host1x_client_suspend(struct host1x_client *client);
-- 
cgit v1.2.3


From df59d0a461bc5935232bf56a279e4d7a71c566a5 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 30 Mar 2021 13:40:27 -0400
Subject: XArray: Add xa_limit_16b

A 16-bit limit is a more common limit than I had realised.  Make it
generally available.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 include/linux/xarray.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index 92c0160b3352..a91e3d90df8a 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -229,9 +229,10 @@ static inline int xa_err(void *entry)
  *
  * This structure is used either directly or via the XA_LIMIT() macro
  * to communicate the range of IDs that are valid for allocation.
- * Two common ranges are predefined for you:
+ * Three common ranges are predefined for you:
  * * xa_limit_32b	- [0 - UINT_MAX]
  * * xa_limit_31b	- [0 - INT_MAX]
+ * * xa_limit_16b	- [0 - USHRT_MAX]
  */
 struct xa_limit {
 	u32 max;
@@ -242,6 +243,7 @@ struct xa_limit {
 
 #define xa_limit_32b	XA_LIMIT(0, UINT_MAX)
 #define xa_limit_31b	XA_LIMIT(0, INT_MAX)
+#define xa_limit_16b	XA_LIMIT(0, USHRT_MAX)
 
 typedef unsigned __bitwise xa_mark_t;
 #define XA_MARK_0		((__force xa_mark_t)0U)
-- 
cgit v1.2.3


From 86cec7ece3e62517e2bc0fd796a8a8da4193e7e5 Mon Sep 17 00:00:00 2001
From: Mikko Perttunen <mperttunen@nvidia.com>
Date: Mon, 29 Mar 2021 16:38:28 +0300
Subject: gpu: host1x: Allow syncpoints without associated client

Syncpoints don't need to be associated with any client,
so remove the property, and expose host1x_syncpt_alloc.
This will allow allocating syncpoints without prior knowledge
of the engine that they will be used with.

Signed-off-by: Mikko Perttunen <mperttunen@nvidia.com>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/host1x/syncpt.c | 37 +++++++++++++++++++++++++------------
 drivers/gpu/host1x/syncpt.h |  1 -
 include/linux/host1x.h      |  3 +++
 3 files changed, 28 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpu/host1x/syncpt.c b/drivers/gpu/host1x/syncpt.c
index fce7892d5137..9a113016d482 100644
--- a/drivers/gpu/host1x/syncpt.c
+++ b/drivers/gpu/host1x/syncpt.c
@@ -42,13 +42,28 @@ static void host1x_syncpt_base_free(struct host1x_syncpt_base *base)
 		base->requested = false;
 }
 
-static struct host1x_syncpt *host1x_syncpt_alloc(struct host1x *host,
-						 struct host1x_client *client,
-						 unsigned long flags)
+/**
+ * host1x_syncpt_alloc() - allocate a syncpoint
+ * @host: host1x device data
+ * @flags: bitfield of HOST1X_SYNCPT_* flags
+ * @name: name for the syncpoint for use in debug prints
+ *
+ * Allocates a hardware syncpoint for the caller's use. The caller then has
+ * the sole authority to mutate the syncpoint's value until it is freed again.
+ *
+ * If no free syncpoints are available, or a NULL name was specified, returns
+ * NULL.
+ */
+struct host1x_syncpt *host1x_syncpt_alloc(struct host1x *host,
+					  unsigned long flags,
+					  const char *name)
 {
 	struct host1x_syncpt *sp = host->syncpt;
+	char *full_name;
 	unsigned int i;
-	char *name;
+
+	if (!name)
+		return NULL;
 
 	mutex_lock(&host->syncpt_mutex);
 
@@ -64,13 +79,11 @@ static struct host1x_syncpt *host1x_syncpt_alloc(struct host1x *host,
 			goto unlock;
 	}
 
-	name = kasprintf(GFP_KERNEL, "%02u-%s", sp->id,
-			 client ? dev_name(client->dev) : NULL);
-	if (!name)
+	full_name = kasprintf(GFP_KERNEL, "%u-%s", sp->id, name);
+	if (!full_name)
 		goto free_base;
 
-	sp->client = client;
-	sp->name = name;
+	sp->name = full_name;
 
 	if (flags & HOST1X_SYNCPT_CLIENT_MANAGED)
 		sp->client_managed = true;
@@ -87,6 +100,7 @@ unlock:
 	mutex_unlock(&host->syncpt_mutex);
 	return NULL;
 }
+EXPORT_SYMBOL(host1x_syncpt_alloc);
 
 /**
  * host1x_syncpt_id() - retrieve syncpoint ID
@@ -401,7 +415,7 @@ int host1x_syncpt_init(struct host1x *host)
 	host1x_hw_syncpt_enable_protection(host);
 
 	/* Allocate sync point to use for clearing waits for expired fences */
-	host->nop_sp = host1x_syncpt_alloc(host, NULL, 0);
+	host->nop_sp = host1x_syncpt_alloc(host, 0, "reserved-nop");
 	if (!host->nop_sp)
 		return -ENOMEM;
 
@@ -423,7 +437,7 @@ struct host1x_syncpt *host1x_syncpt_request(struct host1x_client *client,
 {
 	struct host1x *host = dev_get_drvdata(client->host->parent);
 
-	return host1x_syncpt_alloc(host, client, flags);
+	return host1x_syncpt_alloc(host, flags, dev_name(client->dev));
 }
 EXPORT_SYMBOL(host1x_syncpt_request);
 
@@ -447,7 +461,6 @@ void host1x_syncpt_free(struct host1x_syncpt *sp)
 	host1x_syncpt_base_free(sp->base);
 	kfree(sp->name);
 	sp->base = NULL;
-	sp->client = NULL;
 	sp->name = NULL;
 	sp->client_managed = false;
 
diff --git a/drivers/gpu/host1x/syncpt.h b/drivers/gpu/host1x/syncpt.h
index 8e1d04dacaa0..3aa6b25b1b9c 100644
--- a/drivers/gpu/host1x/syncpt.h
+++ b/drivers/gpu/host1x/syncpt.h
@@ -33,7 +33,6 @@ struct host1x_syncpt {
 	const char *name;
 	bool client_managed;
 	struct host1x *host;
-	struct host1x_client *client;
 	struct host1x_syncpt_base *base;
 
 	/* interrupt data */
diff --git a/include/linux/host1x.h b/include/linux/host1x.h
index 9eb77c87a83b..7137ce0e35d4 100644
--- a/include/linux/host1x.h
+++ b/include/linux/host1x.h
@@ -154,6 +154,9 @@ int host1x_syncpt_wait(struct host1x_syncpt *sp, u32 thresh, long timeout,
 struct host1x_syncpt *host1x_syncpt_request(struct host1x_client *client,
 					    unsigned long flags);
 void host1x_syncpt_free(struct host1x_syncpt *sp);
+struct host1x_syncpt *host1x_syncpt_alloc(struct host1x *host,
+					  unsigned long flags,
+					  const char *name);
 
 struct host1x_syncpt_base *host1x_syncpt_get_base(struct host1x_syncpt *sp);
 u32 host1x_syncpt_base_id(struct host1x_syncpt_base *base);
-- 
cgit v1.2.3


From 913d55037616659c04763e756f948fcbaef0bbee Mon Sep 17 00:00:00 2001
From: He Fengqing <hefengqing@huawei.com>
Date: Tue, 30 Mar 2021 02:48:43 +0000
Subject: bpf: Remove unused bpf_load_pointer

Remove unused bpf_load_pointer function in filter.h. The last user of it has
been removed with 24dea04767e6 ("bpf, x32: remove ld_abs/ld_ind").

Signed-off-by: He Fengqing <hefengqing@huawei.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/bpf/20210330024843.3479844-1-hefengqing@huawei.com
---
 include/linux/filter.h | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index eecfd82db648..9a09547bc7ba 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1246,15 +1246,6 @@ static inline u16 bpf_anc_helper(const struct sock_filter *ftest)
 void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb,
 					   int k, unsigned int size);
 
-static inline void *bpf_load_pointer(const struct sk_buff *skb, int k,
-				     unsigned int size, void *buffer)
-{
-	if (k >= 0)
-		return skb_header_pointer(skb, k, size, buffer);
-
-	return bpf_internal_load_pointer_neg_helper(skb, k, size);
-}
-
 static inline int bpf_tell_extensions(void)
 {
 	return SKF_AD_MAX;
-- 
cgit v1.2.3


From 78352f73dc5047f3f744764cc45912498c52f3c9 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Tue, 30 Mar 2021 12:28:52 +0200
Subject: udp: never accept GSO_FRAGLIST packets

Currently the UDP protocol delivers GSO_FRAGLIST packets to
the sockets without the expected segmentation.

This change addresses the issue by introducing and maintaining
a couple of new fields to explicitly accept SKB_GSO_UDP_L4
or GSO_FRAGLIST packets. Additionally, it updates udp_unexpected_gso()
accordingly.

UDP sockets enabling UDP_GRO still keep accept_udp_fraglist
zeroed.

v1 -> v2:
 - use 2 bits instead of a whole GSO bitmask (Willem)

Fixes: 9fd1ff5d2ac7 ("udp: Support UDP fraglist GRO/GSO.")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/udp.h | 16 +++++++++++++---
 net/ipv4/udp.c      |  3 +++
 2 files changed, 16 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index aa84597bdc33..ae58ff3b6b5b 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -51,7 +51,9 @@ struct udp_sock {
 					   * different encapsulation layer set
 					   * this
 					   */
-			 gro_enabled:1;	/* Can accept GRO packets */
+			 gro_enabled:1,	/* Request GRO aggregation */
+			 accept_udp_l4:1,
+			 accept_udp_fraglist:1;
 	/*
 	 * Following member retains the information to create a UDP header
 	 * when the socket is uncorked.
@@ -131,8 +133,16 @@ static inline void udp_cmsg_recv(struct msghdr *msg, struct sock *sk,
 
 static inline bool udp_unexpected_gso(struct sock *sk, struct sk_buff *skb)
 {
-	return !udp_sk(sk)->gro_enabled && skb_is_gso(skb) &&
-	       skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4;
+	if (!skb_is_gso(skb))
+		return false;
+
+	if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 && !udp_sk(sk)->accept_udp_l4)
+		return true;
+
+	if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST && !udp_sk(sk)->accept_udp_fraglist)
+		return true;
+
+	return false;
 }
 
 #define udp_portaddr_for_each_entry(__sk, list) \
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index fe85dcf8c008..c0695ce42dc5 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2666,9 +2666,12 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
 
 	case UDP_GRO:
 		lock_sock(sk);
+
+		/* when enabling GRO, accept the related GSO packet type */
 		if (valbool)
 			udp_tunnel_encap_enable(sk->sk_socket);
 		up->gro_enabled = valbool;
+		up->accept_udp_l4 = valbool;
 		release_sock(sk);
 		break;
 
-- 
cgit v1.2.3


From d18931a92a0b5feddd8a39d097b90ae2867db02f Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Tue, 30 Mar 2021 12:28:53 +0200
Subject: vxlan: allow L4 GRO passthrough

When passing up a UDP GSO packet with L4 aggregation, there is
no need to segment it at the vxlan level. We can propagate the
packet untouched and let it be segmented later, if needed.

Introduce a helper to let the UDP socket accept any
L4 aggregation and use it in the vxlan driver.

v1 -> v2:
 - updated to use the newly introduced UDP socket 'accept*' fields

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c | 1 +
 include/linux/udp.h | 6 ++++++
 2 files changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 7665817f3cb6..39ee1300cdd9 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -3484,6 +3484,7 @@ static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
 	if (err < 0)
 		return ERR_PTR(err);
 
+	udp_allow_gso(sock->sk);
 	return sock;
 }
 
diff --git a/include/linux/udp.h b/include/linux/udp.h
index ae58ff3b6b5b..ae66dadd8543 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -145,6 +145,12 @@ static inline bool udp_unexpected_gso(struct sock *sk, struct sk_buff *skb)
 	return false;
 }
 
+static inline void udp_allow_gso(struct sock *sk)
+{
+	udp_sk(sk)->accept_udp_l4 = 1;		/* accept SKB_GSO_UDP_L4 skbs */
+	udp_sk(sk)->accept_udp_fraglist = 1;	/* accept SKB_GSO_FRAGLIST skbs */
+}
+
 #define udp_portaddr_for_each_entry(__sk, list) \
 	hlist_for_each_entry(__sk, list, __sk_common.skc_portaddr_node)
 
-- 
cgit v1.2.3


From 61431a5907fc36d0738e9a547c7e1556349a03e9 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 30 Mar 2021 16:43:43 -0700
Subject: net: ensure mac header is set in virtio_net_hdr_to_skb()

Commit 924a9bc362a5 ("net: check if protocol extracted by virtio_net_hdr_set_proto is correct")
added a call to dev_parse_header_protocol() but mac_header is not yet set.

This means that eth_hdr() reads complete garbage, and syzbot complained about it [1]

This patch resets mac_header earlier, to get more coverage about this change.

Audit of virtio_net_hdr_to_skb() callers shows that this change should be safe.

[1]

BUG: KASAN: use-after-free in eth_header_parse_protocol+0xdc/0xe0 net/ethernet/eth.c:282
Read of size 2 at addr ffff888017a6200b by task syz-executor313/8409

CPU: 1 PID: 8409 Comm: syz-executor313 Not tainted 5.12.0-rc2-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:79 [inline]
 dump_stack+0x141/0x1d7 lib/dump_stack.c:120
 print_address_description.constprop.0.cold+0x5b/0x2f8 mm/kasan/report.c:232
 __kasan_report mm/kasan/report.c:399 [inline]
 kasan_report.cold+0x7c/0xd8 mm/kasan/report.c:416
 eth_header_parse_protocol+0xdc/0xe0 net/ethernet/eth.c:282
 dev_parse_header_protocol include/linux/netdevice.h:3177 [inline]
 virtio_net_hdr_to_skb.constprop.0+0x99d/0xcd0 include/linux/virtio_net.h:83
 packet_snd net/packet/af_packet.c:2994 [inline]
 packet_sendmsg+0x2325/0x52b0 net/packet/af_packet.c:3031
 sock_sendmsg_nosec net/socket.c:654 [inline]
 sock_sendmsg+0xcf/0x120 net/socket.c:674
 sock_no_sendpage+0xf3/0x130 net/core/sock.c:2860
 kernel_sendpage.part.0+0x1ab/0x350 net/socket.c:3631
 kernel_sendpage net/socket.c:3628 [inline]
 sock_sendpage+0xe5/0x140 net/socket.c:947
 pipe_to_sendpage+0x2ad/0x380 fs/splice.c:364
 splice_from_pipe_feed fs/splice.c:418 [inline]
 __splice_from_pipe+0x43e/0x8a0 fs/splice.c:562
 splice_from_pipe fs/splice.c:597 [inline]
 generic_splice_sendpage+0xd4/0x140 fs/splice.c:746
 do_splice_from fs/splice.c:767 [inline]
 do_splice+0xb7e/0x1940 fs/splice.c:1079
 __do_splice+0x134/0x250 fs/splice.c:1144
 __do_sys_splice fs/splice.c:1350 [inline]
 __se_sys_splice fs/splice.c:1332 [inline]
 __x64_sys_splice+0x198/0x250 fs/splice.c:1332
 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46

Fixes: 924a9bc362a5 ("net: check if protocol extracted by virtio_net_hdr_set_proto is correct")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Balazs Nemeth <bnemeth@redhat.com>
Cc: Willem de Bruijn <willemb@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/virtio_net.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index 6b5fcfa1e555..98775d7fa696 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -62,6 +62,8 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
 			return -EINVAL;
 	}
 
+	skb_reset_mac_header(skb);
+
 	if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
 		u16 start = __virtio16_to_cpu(little_endian, hdr->csum_start);
 		u16 off = __virtio16_to_cpu(little_endian, hdr->csum_offset);
-- 
cgit v1.2.3


From 6e085e0ac9cf16298b5fefe0b1893f98ef765812 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Wed, 9 Dec 2020 14:09:24 +0800
Subject: arm/arm64: Probe for the presence of KVM hypervisor

Although the SMCCC specification provides some limited functionality for
describing the presence of hypervisor and firmware services, this is
generally applicable only to functions designated as "Arm Architecture
Service Functions" and no portable discovery mechanism is provided for
standard hypervisor services, despite having a designated range of
function identifiers reserved by the specification.

In an attempt to avoid the need for additional firmware changes every
time a new function is added, introduce a UID to identify the service
provider as being compatible with KVM. Once this has been established,
additional services can be discovered via a feature bitmap.

Reviewed-by: Steven Price <steven.price@arm.com>
Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Jianyong Wu <jianyong.wu@arm.com>
[maz: move code to its own file, plug it into PSCI]
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20201209060932.212364-2-jianyong.wu@arm.com
---
 arch/arm/include/asm/hypervisor.h   |  3 +++
 arch/arm64/include/asm/hypervisor.h |  3 +++
 drivers/firmware/psci/psci.c        |  2 ++
 drivers/firmware/smccc/Makefile     |  2 +-
 drivers/firmware/smccc/kvm_guest.c  | 50 +++++++++++++++++++++++++++++++++++++
 drivers/firmware/smccc/smccc.c      |  1 +
 include/linux/arm-smccc.h           | 25 +++++++++++++++++++
 7 files changed, 85 insertions(+), 1 deletion(-)
 create mode 100644 drivers/firmware/smccc/kvm_guest.c

(limited to 'include/linux')

diff --git a/arch/arm/include/asm/hypervisor.h b/arch/arm/include/asm/hypervisor.h
index df8524365637..bd61502b9715 100644
--- a/arch/arm/include/asm/hypervisor.h
+++ b/arch/arm/include/asm/hypervisor.h
@@ -4,4 +4,7 @@
 
 #include <asm/xen/hypervisor.h>
 
+void kvm_init_hyp_services(void);
+bool kvm_arm_hyp_service_available(u32 func_id);
+
 #endif
diff --git a/arch/arm64/include/asm/hypervisor.h b/arch/arm64/include/asm/hypervisor.h
index f9cc1d021791..0ae427f352c8 100644
--- a/arch/arm64/include/asm/hypervisor.h
+++ b/arch/arm64/include/asm/hypervisor.h
@@ -4,4 +4,7 @@
 
 #include <asm/xen/hypervisor.h>
 
+void kvm_init_hyp_services(void);
+bool kvm_arm_hyp_service_available(u32 func_id);
+
 #endif
diff --git a/drivers/firmware/psci/psci.c b/drivers/firmware/psci/psci.c
index f5fc429cae3f..69e296f02902 100644
--- a/drivers/firmware/psci/psci.c
+++ b/drivers/firmware/psci/psci.c
@@ -23,6 +23,7 @@
 
 #include <asm/cpuidle.h>
 #include <asm/cputype.h>
+#include <asm/hypervisor.h>
 #include <asm/system_misc.h>
 #include <asm/smp_plat.h>
 #include <asm/suspend.h>
@@ -498,6 +499,7 @@ static int __init psci_probe(void)
 		psci_init_cpu_suspend();
 		psci_init_system_suspend();
 		psci_init_system_reset2();
+		kvm_init_hyp_services();
 	}
 
 	return 0;
diff --git a/drivers/firmware/smccc/Makefile b/drivers/firmware/smccc/Makefile
index 72ab84042832..40d19144a860 100644
--- a/drivers/firmware/smccc/Makefile
+++ b/drivers/firmware/smccc/Makefile
@@ -1,4 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0
 #
-obj-$(CONFIG_HAVE_ARM_SMCCC_DISCOVERY)	+= smccc.o
+obj-$(CONFIG_HAVE_ARM_SMCCC_DISCOVERY)	+= smccc.o kvm_guest.o
 obj-$(CONFIG_ARM_SMCCC_SOC_ID)	+= soc_id.o
diff --git a/drivers/firmware/smccc/kvm_guest.c b/drivers/firmware/smccc/kvm_guest.c
new file mode 100644
index 000000000000..2d3e866decaa
--- /dev/null
+++ b/drivers/firmware/smccc/kvm_guest.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define pr_fmt(fmt) "smccc: KVM: " fmt
+
+#include <linux/arm-smccc.h>
+#include <linux/bitmap.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+
+#include <asm/hypervisor.h>
+
+static DECLARE_BITMAP(__kvm_arm_hyp_services, ARM_SMCCC_KVM_NUM_FUNCS) __ro_after_init = { };
+
+void __init kvm_init_hyp_services(void)
+{
+	struct arm_smccc_res res;
+	u32 val[4];
+
+	if (arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_HVC)
+		return;
+
+	arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID, &res);
+	if (res.a0 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0 ||
+	    res.a1 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1 ||
+	    res.a2 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2 ||
+	    res.a3 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3)
+		return;
+
+	memset(&res, 0, sizeof(res));
+	arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID, &res);
+
+	val[0] = lower_32_bits(res.a0);
+	val[1] = lower_32_bits(res.a1);
+	val[2] = lower_32_bits(res.a2);
+	val[3] = lower_32_bits(res.a3);
+
+	bitmap_from_arr32(__kvm_arm_hyp_services, val, ARM_SMCCC_KVM_NUM_FUNCS);
+
+	pr_info("hypervisor services detected (0x%08lx 0x%08lx 0x%08lx 0x%08lx)\n",
+		 res.a3, res.a2, res.a1, res.a0);
+}
+
+bool kvm_arm_hyp_service_available(u32 func_id)
+{
+	if (func_id >= ARM_SMCCC_KVM_NUM_FUNCS)
+		return false;
+
+	return test_bit(func_id, __kvm_arm_hyp_services);
+}
+EXPORT_SYMBOL_GPL(kvm_arm_hyp_service_available);
diff --git a/drivers/firmware/smccc/smccc.c b/drivers/firmware/smccc/smccc.c
index d52bfc5ed5e4..028f81d702cc 100644
--- a/drivers/firmware/smccc/smccc.c
+++ b/drivers/firmware/smccc/smccc.c
@@ -8,6 +8,7 @@
 #include <linux/cache.h>
 #include <linux/init.h>
 #include <linux/arm-smccc.h>
+#include <linux/kernel.h>
 #include <asm/archrandom.h>
 
 static u32 smccc_version = ARM_SMCCC_VERSION_1_0;
diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h
index 62c54234576c..1a27bd9493fe 100644
--- a/include/linux/arm-smccc.h
+++ b/include/linux/arm-smccc.h
@@ -55,6 +55,8 @@
 #define ARM_SMCCC_OWNER_TRUSTED_OS	50
 #define ARM_SMCCC_OWNER_TRUSTED_OS_END	63
 
+#define ARM_SMCCC_FUNC_QUERY_CALL_UID  0xff01
+
 #define ARM_SMCCC_QUIRK_NONE		0
 #define ARM_SMCCC_QUIRK_QCOM_A6		1 /* Save/restore register a6 */
 
@@ -87,6 +89,29 @@
 			   ARM_SMCCC_SMC_32,				\
 			   0, 0x7fff)
 
+#define ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID				\
+	ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,				\
+			   ARM_SMCCC_SMC_32,				\
+			   ARM_SMCCC_OWNER_VENDOR_HYP,			\
+			   ARM_SMCCC_FUNC_QUERY_CALL_UID)
+
+/* KVM UID value: 28b46fb6-2ec5-11e9-a9ca-4b564d003a74 */
+#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0	0xb66fb428U
+#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1	0xe911c52eU
+#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2	0x564bcaa9U
+#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3	0x743a004dU
+
+/* KVM "vendor specific" services */
+#define ARM_SMCCC_KVM_FUNC_FEATURES		0
+#define ARM_SMCCC_KVM_FUNC_FEATURES_2		127
+#define ARM_SMCCC_KVM_NUM_FUNCS			128
+
+#define ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID			\
+	ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,				\
+			   ARM_SMCCC_SMC_32,				\
+			   ARM_SMCCC_OWNER_VENDOR_HYP,			\
+			   ARM_SMCCC_KVM_FUNC_FEATURES)
+
 #define SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED	1
 
 /* Paravirtualised time calls (defined by ARM DEN0057A) */
-- 
cgit v1.2.3


From 66ac7985b2af310aaca14869d6e43b0290e98c07 Mon Sep 17 00:00:00 2001
From: Carl Yin <carl.yin@quectel.com>
Date: Mon, 29 Mar 2021 18:28:21 -0700
Subject: bus: mhi: core: Add support for Flash Programmer execution
 environment

MHI WWAN modems support downloading firmware to NAND or eMMC
using the Firehose protocol. The process is as follows:
1. Modem boots up, enters AMSS execution environment and the
device later enters EDL (Emergency Download) mode through any
mechanism host can use such as a diag command.
2. Modem enters SYS_ERROR, MHI host handles SYS_ERROR transition.
3. EDL image for device to enter 'Flash Programmer' execution
environment is then flashed via BHI interface from host.
4. Modem enters MHI READY -> M0 and sends the Flash Programmer
execution environment change to host.
5. Following that, EDL/FIREHOSE channels (34, 35) are made
available from the host.
6. A user space tool then downloads the firmware image to the modem
over the EDL channels using the Firehose protocol. Link to USB flashing
tool: https://git.linaro.org/landing-teams/working/qualcomm/qdl.git/

Make the necessary changes to allow for this sequence to occur and
allow using the Flash Programmer execution environment.

Signed-off-by: Carl Yin <carl.yin@quectel.com>
Co-developed-by: Bhaumik Bhatt <bbhatt@codeaurora.org>
Signed-off-by: Bhaumik Bhatt <bbhatt@codeaurora.org>
Reviewed-by: Loic Poulain <loic.poulain@linaro.org>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Link: https://lore.kernel.org/r/1617067704-28850-5-git-send-email-bbhatt@codeaurora.org
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
---
 drivers/bus/mhi/core/init.c     | 2 ++
 drivers/bus/mhi/core/internal.h | 1 +
 drivers/bus/mhi/core/main.c     | 3 +++
 drivers/bus/mhi/core/pm.c       | 6 ++++++
 include/linux/mhi.h             | 4 +++-
 5 files changed, 15 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/bus/mhi/core/init.c b/drivers/bus/mhi/core/init.c
index 706484ffe169..ee67712f66c0 100644
--- a/drivers/bus/mhi/core/init.c
+++ b/drivers/bus/mhi/core/init.c
@@ -29,6 +29,7 @@ const char * const mhi_ee_str[MHI_EE_MAX] = {
 	[MHI_EE_WFW] = "WFW",
 	[MHI_EE_PTHRU] = "PASS THRU",
 	[MHI_EE_EDL] = "EDL",
+	[MHI_EE_FP] = "FLASH PROGRAMMER",
 	[MHI_EE_DISABLE_TRANSITION] = "DISABLE",
 	[MHI_EE_NOT_SUPPORTED] = "NOT SUPPORTED",
 };
@@ -38,6 +39,7 @@ const char * const dev_state_tran_str[DEV_ST_TRANSITION_MAX] = {
 	[DEV_ST_TRANSITION_READY] = "READY",
 	[DEV_ST_TRANSITION_SBL] = "SBL",
 	[DEV_ST_TRANSITION_MISSION_MODE] = "MISSION_MODE",
+	[DEV_ST_TRANSITION_FP] = "FLASH_PROGRAMMER",
 	[DEV_ST_TRANSITION_SYS_ERR] = "SYS_ERR",
 	[DEV_ST_TRANSITION_DISABLE] = "DISABLE",
 };
diff --git a/drivers/bus/mhi/core/internal.h b/drivers/bus/mhi/core/internal.h
index 6f37439e5247..b1b5f2b82393 100644
--- a/drivers/bus/mhi/core/internal.h
+++ b/drivers/bus/mhi/core/internal.h
@@ -386,6 +386,7 @@ enum dev_st_transition {
 	DEV_ST_TRANSITION_READY,
 	DEV_ST_TRANSITION_SBL,
 	DEV_ST_TRANSITION_MISSION_MODE,
+	DEV_ST_TRANSITION_FP,
 	DEV_ST_TRANSITION_SYS_ERR,
 	DEV_ST_TRANSITION_DISABLE,
 	DEV_ST_TRANSITION_MAX,
diff --git a/drivers/bus/mhi/core/main.c b/drivers/bus/mhi/core/main.c
index d377d07f269e..8b68657c76c3 100644
--- a/drivers/bus/mhi/core/main.c
+++ b/drivers/bus/mhi/core/main.c
@@ -831,6 +831,9 @@ int mhi_process_ctrl_ev_ring(struct mhi_controller *mhi_cntrl,
 			case MHI_EE_AMSS:
 				st = DEV_ST_TRANSITION_MISSION_MODE;
 				break;
+			case MHI_EE_FP:
+				st = DEV_ST_TRANSITION_FP;
+				break;
 			case MHI_EE_RDDM:
 				mhi_cntrl->status_cb(mhi_cntrl, MHI_CB_EE_RDDM);
 				write_lock_irq(&mhi_cntrl->pm_lock);
diff --git a/drivers/bus/mhi/core/pm.c b/drivers/bus/mhi/core/pm.c
index b65222eda5e6..7ac9dfe54895 100644
--- a/drivers/bus/mhi/core/pm.c
+++ b/drivers/bus/mhi/core/pm.c
@@ -782,6 +782,12 @@ void mhi_pm_st_worker(struct work_struct *work)
 		case DEV_ST_TRANSITION_MISSION_MODE:
 			mhi_pm_mission_mode_transition(mhi_cntrl);
 			break;
+		case DEV_ST_TRANSITION_FP:
+			write_lock_irq(&mhi_cntrl->pm_lock);
+			mhi_cntrl->ee = MHI_EE_FP;
+			write_unlock_irq(&mhi_cntrl->pm_lock);
+			mhi_create_devices(mhi_cntrl);
+			break;
 		case DEV_ST_TRANSITION_READY:
 			mhi_ready_state_transition(mhi_cntrl);
 			break;
diff --git a/include/linux/mhi.h b/include/linux/mhi.h
index 8f5bf409f663..b16afd36b444 100644
--- a/include/linux/mhi.h
+++ b/include/linux/mhi.h
@@ -117,6 +117,7 @@ struct mhi_link_info {
  * @MHI_EE_WFW: WLAN firmware mode
  * @MHI_EE_PTHRU: Passthrough
  * @MHI_EE_EDL: Embedded downloader
+ * @MHI_EE_FP: Flash Programmer Environment
  */
 enum mhi_ee_type {
 	MHI_EE_PBL,
@@ -126,7 +127,8 @@ enum mhi_ee_type {
 	MHI_EE_WFW,
 	MHI_EE_PTHRU,
 	MHI_EE_EDL,
-	MHI_EE_MAX_SUPPORTED = MHI_EE_EDL,
+	MHI_EE_FP,
+	MHI_EE_MAX_SUPPORTED = MHI_EE_FP,
 	MHI_EE_DISABLE_TRANSITION, /* local EE, not related to mhi spec */
 	MHI_EE_NOT_SUPPORTED,
 	MHI_EE_MAX,
-- 
cgit v1.2.3


From 4c5014456305482412b35a081ca0fb4fefd69764 Mon Sep 17 00:00:00 2001
From: Guru Das Srinagesh <gurus@codeaurora.org>
Date: Wed, 24 Mar 2021 12:28:53 -0700
Subject: regmap-irq: Introduce virtual regs to handle more config regs

Add "virtual" registers support to handle any irq configuration
registers in addition to the ones the framework currently supports
(status, mask, unmask, wake, type and ack). These are non-standard
registers that further configure irq type on some devices, so enable the
framework to add a variable number of them.

Signed-off-by: Guru Das Srinagesh <gurus@codeaurora.org>
Link: https://lore.kernel.org/r/a1787067004b0e11cb960319082764397469215a.1616613838.git.gurus@codeaurora.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/regmap/regmap-irq.c | 38 +++++++++++++++++++++++++++++++++++++-
 include/linux/regmap.h           |  5 +++++
 2 files changed, 42 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/base/regmap/regmap-irq.c b/drivers/base/regmap/regmap-irq.c
index e1d8fc9ef040..d1ade76a6c93 100644
--- a/drivers/base/regmap/regmap-irq.c
+++ b/drivers/base/regmap/regmap-irq.c
@@ -38,6 +38,7 @@ struct regmap_irq_chip_data {
 	unsigned int *wake_buf;
 	unsigned int *type_buf;
 	unsigned int *type_buf_def;
+	unsigned int **virt_buf;
 
 	unsigned int irq_reg_stride;
 	unsigned int type_reg_stride;
@@ -94,7 +95,7 @@ static void regmap_irq_sync_unlock(struct irq_data *data)
 {
 	struct regmap_irq_chip_data *d = irq_data_get_irq_chip_data(data);
 	struct regmap *map = d->map;
-	int i, ret;
+	int i, j, ret;
 	u32 reg;
 	u32 unmask_offset;
 	u32 val;
@@ -218,6 +219,20 @@ static void regmap_irq_sync_unlock(struct irq_data *data)
 		}
 	}
 
+	if (d->chip->num_virt_regs) {
+		for (i = 0; i < d->chip->num_virt_regs; i++) {
+			for (j = 0; j < d->chip->num_regs; j++) {
+				reg = sub_irq_reg(d, d->chip->virt_reg_base[i],
+						  j);
+				ret = regmap_write(map, reg, d->virt_buf[i][j]);
+				if (ret != 0)
+					dev_err(d->map->dev,
+						"Failed to write virt 0x%x: %d\n",
+						reg, ret);
+			}
+		}
+	}
+
 	if (d->chip->runtime_pm)
 		pm_runtime_put(map->dev);
 
@@ -691,6 +706,24 @@ int regmap_add_irq_chip_fwnode(struct fwnode_handle *fwnode,
 			goto err_alloc;
 	}
 
+	if (chip->num_virt_regs) {
+		/*
+		 * Create virt_buf[chip->num_virt_regs][chip->num_regs]
+		 */
+		d->virt_buf = kcalloc(chip->num_virt_regs, sizeof(*d->virt_buf),
+				      GFP_KERNEL);
+		if (!d->virt_buf)
+			goto err_alloc;
+
+		for (i = 0; i < chip->num_virt_regs; i++) {
+			d->virt_buf[i] = kcalloc(chip->num_regs,
+						 sizeof(unsigned int),
+						 GFP_KERNEL);
+			if (!d->virt_buf[i])
+				goto err_alloc;
+		}
+	}
+
 	d->irq_chip = regmap_irq_chip;
 	d->irq_chip.name = chip->name;
 	d->irq = irq;
@@ -863,6 +896,9 @@ err_alloc:
 	kfree(d->mask_buf);
 	kfree(d->status_buf);
 	kfree(d->status_reg_buf);
+	for (i = 0; d->virt_buf && i < chip->num_virt_regs; i++)
+		kfree(d->virt_buf[i]);
+	kfree(d->virt_buf);
 	kfree(d);
 	return ret;
 }
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index 18910bd809f7..97ec73383e47 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -1393,6 +1393,7 @@ struct regmap_irq_sub_irq_map {
  *               Using zero value is possible with @use_ack bit.
  * @wake_base:   Base address for wake enables.  If zero unsupported.
  * @type_base:   Base address for irq type.  If zero unsupported.
+ * @virt_reg_base:   Base addresses for extra config regs.
  * @irq_reg_stride:  Stride to use for chips where registers are not contiguous.
  * @init_ack_masked: Ack all masked interrupts once during initalization.
  * @mask_invert: Inverted mask register: cleared bits are masked out.
@@ -1417,6 +1418,8 @@ struct regmap_irq_sub_irq_map {
  *               assigned based on the index in the array of the interrupt.
  * @num_irqs:    Number of descriptors.
  * @num_type_reg:    Number of type registers.
+ * @num_virt_regs:   Number of non-standard irq configuration registers.
+ *		     If zero unsupported.
  * @type_reg_stride: Stride to use for chips where type registers are not
  *			contiguous.
  * @handle_pre_irq:  Driver specific callback to handle interrupt from device
@@ -1444,6 +1447,7 @@ struct regmap_irq_chip {
 	unsigned int ack_base;
 	unsigned int wake_base;
 	unsigned int type_base;
+	unsigned int *virt_reg_base;
 	unsigned int irq_reg_stride;
 	bool mask_writeonly:1;
 	bool init_ack_masked:1;
@@ -1464,6 +1468,7 @@ struct regmap_irq_chip {
 	int num_irqs;
 
 	int num_type_reg;
+	int num_virt_regs;
 	unsigned int type_reg_stride;
 
 	int (*handle_pre_irq)(void *irq_drv_data);
-- 
cgit v1.2.3


From 394409aafd017adfcffd075595cb01cc456a9327 Mon Sep 17 00:00:00 2001
From: Guru Das Srinagesh <gurus@codeaurora.org>
Date: Wed, 24 Mar 2021 12:28:54 -0700
Subject: regmap-irq: Add driver callback to configure virtual regs

Enable drivers to configure and modify "virtual" registers, which are
non-standard registers that further configure irq type on some devices.
Since they are non-standard, enable drivers to configure them according
to their particular idiosyncrasies by specifying an optional callback
function while registering with the framework.

Signed-off-by: Guru Das Srinagesh <gurus@codeaurora.org>
Link: https://lore.kernel.org/r/07e058cdec2297d15c95c825aa0263064d962d5a.1616613838.git.gurus@codeaurora.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/regmap/regmap-irq.c | 5 +++++
 include/linux/regmap.h           | 4 ++++
 2 files changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/regmap/regmap-irq.c b/drivers/base/regmap/regmap-irq.c
index d1ade76a6c93..e6343ccc6aa1 100644
--- a/drivers/base/regmap/regmap-irq.c
+++ b/drivers/base/regmap/regmap-irq.c
@@ -333,6 +333,11 @@ static int regmap_irq_set_type(struct irq_data *data, unsigned int type)
 	default:
 		return -EINVAL;
 	}
+
+	if (d->chip->set_type_virt)
+		return d->chip->set_type_virt(d->virt_buf, type, data->hwirq,
+					      reg);
+
 	return 0;
 }
 
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index 97ec73383e47..f87a11a5cc4a 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -1426,6 +1426,8 @@ struct regmap_irq_sub_irq_map {
  *		     before regmap_irq_handler process the interrupts.
  * @handle_post_irq: Driver specific callback to handle interrupt from device
  *		     after handling the interrupts in regmap_irq_handler().
+ * @set_type_virt:   Driver specific callback to extend regmap_irq_set_type()
+ *		     and configure virt regs.
  * @irq_drv_data:    Driver specific IRQ data which is passed as parameter when
  *		     driver specific pre/post interrupt handler is called.
  *
@@ -1473,6 +1475,8 @@ struct regmap_irq_chip {
 
 	int (*handle_pre_irq)(void *irq_drv_data);
 	int (*handle_post_irq)(void *irq_drv_data);
+	int (*set_type_virt)(unsigned int **buf, unsigned int type,
+			     unsigned long hwirq, int reg);
 	void *irq_drv_data;
 };
 
-- 
cgit v1.2.3


From 2aed4f5ab04af922a7cf1b616701845c9ed2473f Mon Sep 17 00:00:00 2001
From: Mikko Perttunen <mperttunen@nvidia.com>
Date: Mon, 29 Mar 2021 16:38:32 +0300
Subject: gpu: host1x: Cleanup and refcounting for syncpoints

Add reference counting for allocated syncpoints to allow keeping
them allocated while jobs are referencing them. Additionally,
clean up various places using syncpoint IDs to use host1x_syncpt
pointers instead.

Signed-off-by: Mikko Perttunen <mperttunen@nvidia.com>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/drm/tegra/dc.c             |  4 +-
 drivers/gpu/drm/tegra/drm.c            | 14 +++----
 drivers/gpu/drm/tegra/gr2d.c           |  4 +-
 drivers/gpu/drm/tegra/gr3d.c           |  4 +-
 drivers/gpu/drm/tegra/vic.c            |  4 +-
 drivers/gpu/host1x/cdma.c              | 11 ++---
 drivers/gpu/host1x/dev.h               |  7 ++--
 drivers/gpu/host1x/hw/cdma_hw.c        |  2 +-
 drivers/gpu/host1x/hw/channel_hw.c     | 10 ++---
 drivers/gpu/host1x/hw/debug_hw.c       |  2 +-
 drivers/gpu/host1x/job.c               |  5 ++-
 drivers/gpu/host1x/syncpt.c            | 75 ++++++++++++++++++++++++++--------
 drivers/gpu/host1x/syncpt.h            |  3 ++
 drivers/staging/media/tegra-video/vi.c |  6 +--
 include/linux/host1x.h                 |  8 ++--
 15 files changed, 99 insertions(+), 60 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpu/drm/tegra/dc.c b/drivers/gpu/drm/tegra/dc.c
index c9385cfd0fc1..cfda71e151cc 100644
--- a/drivers/gpu/drm/tegra/dc.c
+++ b/drivers/gpu/drm/tegra/dc.c
@@ -2141,7 +2141,7 @@ cleanup:
 		drm_plane_cleanup(primary);
 
 	host1x_client_iommu_detach(client);
-	host1x_syncpt_free(dc->syncpt);
+	host1x_syncpt_put(dc->syncpt);
 
 	return err;
 }
@@ -2166,7 +2166,7 @@ static int tegra_dc_exit(struct host1x_client *client)
 	}
 
 	host1x_client_iommu_detach(client);
-	host1x_syncpt_free(dc->syncpt);
+	host1x_syncpt_put(dc->syncpt);
 
 	return 0;
 }
diff --git a/drivers/gpu/drm/tegra/drm.c b/drivers/gpu/drm/tegra/drm.c
index 90709c38c993..ce5bdc58d315 100644
--- a/drivers/gpu/drm/tegra/drm.c
+++ b/drivers/gpu/drm/tegra/drm.c
@@ -174,7 +174,7 @@ int tegra_drm_submit(struct tegra_drm_context *context,
 	struct drm_tegra_syncpt syncpt;
 	struct host1x *host1x = dev_get_drvdata(drm->dev->parent);
 	struct drm_gem_object **refs;
-	struct host1x_syncpt *sp;
+	struct host1x_syncpt *sp = NULL;
 	struct host1x_job *job;
 	unsigned int num_refs;
 	int err;
@@ -301,8 +301,8 @@ int tegra_drm_submit(struct tegra_drm_context *context,
 		goto fail;
 	}
 
-	/* check whether syncpoint ID is valid */
-	sp = host1x_syncpt_get(host1x, syncpt.id);
+	/* Syncpoint ref will be dropped on job release. */
+	sp = host1x_syncpt_get_by_id(host1x, syncpt.id);
 	if (!sp) {
 		err = -ENOENT;
 		goto fail;
@@ -311,7 +311,7 @@ int tegra_drm_submit(struct tegra_drm_context *context,
 	job->is_addr_reg = context->client->ops->is_addr_reg;
 	job->is_valid_class = context->client->ops->is_valid_class;
 	job->syncpt_incrs = syncpt.incrs;
-	job->syncpt_id = syncpt.id;
+	job->syncpt = sp;
 	job->timeout = 10000;
 
 	if (args->timeout && args->timeout < 10000)
@@ -383,7 +383,7 @@ static int tegra_syncpt_read(struct drm_device *drm, void *data,
 	struct drm_tegra_syncpt_read *args = data;
 	struct host1x_syncpt *sp;
 
-	sp = host1x_syncpt_get(host, args->id);
+	sp = host1x_syncpt_get_by_id_noref(host, args->id);
 	if (!sp)
 		return -EINVAL;
 
@@ -398,7 +398,7 @@ static int tegra_syncpt_incr(struct drm_device *drm, void *data,
 	struct drm_tegra_syncpt_incr *args = data;
 	struct host1x_syncpt *sp;
 
-	sp = host1x_syncpt_get(host1x, args->id);
+	sp = host1x_syncpt_get_by_id_noref(host1x, args->id);
 	if (!sp)
 		return -EINVAL;
 
@@ -412,7 +412,7 @@ static int tegra_syncpt_wait(struct drm_device *drm, void *data,
 	struct drm_tegra_syncpt_wait *args = data;
 	struct host1x_syncpt *sp;
 
-	sp = host1x_syncpt_get(host1x, args->id);
+	sp = host1x_syncpt_get_by_id_noref(host1x, args->id);
 	if (!sp)
 		return -EINVAL;
 
diff --git a/drivers/gpu/drm/tegra/gr2d.c b/drivers/gpu/drm/tegra/gr2d.c
index adbe2ddcda19..de288cba3905 100644
--- a/drivers/gpu/drm/tegra/gr2d.c
+++ b/drivers/gpu/drm/tegra/gr2d.c
@@ -67,7 +67,7 @@ static int gr2d_init(struct host1x_client *client)
 detach:
 	host1x_client_iommu_detach(client);
 free:
-	host1x_syncpt_free(client->syncpts[0]);
+	host1x_syncpt_put(client->syncpts[0]);
 put:
 	host1x_channel_put(gr2d->channel);
 	return err;
@@ -86,7 +86,7 @@ static int gr2d_exit(struct host1x_client *client)
 		return err;
 
 	host1x_client_iommu_detach(client);
-	host1x_syncpt_free(client->syncpts[0]);
+	host1x_syncpt_put(client->syncpts[0]);
 	host1x_channel_put(gr2d->channel);
 
 	return 0;
diff --git a/drivers/gpu/drm/tegra/gr3d.c b/drivers/gpu/drm/tegra/gr3d.c
index b0b8154e8104..24442ade0da3 100644
--- a/drivers/gpu/drm/tegra/gr3d.c
+++ b/drivers/gpu/drm/tegra/gr3d.c
@@ -76,7 +76,7 @@ static int gr3d_init(struct host1x_client *client)
 detach:
 	host1x_client_iommu_detach(client);
 free:
-	host1x_syncpt_free(client->syncpts[0]);
+	host1x_syncpt_put(client->syncpts[0]);
 put:
 	host1x_channel_put(gr3d->channel);
 	return err;
@@ -94,7 +94,7 @@ static int gr3d_exit(struct host1x_client *client)
 		return err;
 
 	host1x_client_iommu_detach(client);
-	host1x_syncpt_free(client->syncpts[0]);
+	host1x_syncpt_put(client->syncpts[0]);
 	host1x_channel_put(gr3d->channel);
 
 	return 0;
diff --git a/drivers/gpu/drm/tegra/vic.c b/drivers/gpu/drm/tegra/vic.c
index 77e128832920..72aea1cc0cfa 100644
--- a/drivers/gpu/drm/tegra/vic.c
+++ b/drivers/gpu/drm/tegra/vic.c
@@ -214,7 +214,7 @@ static int vic_init(struct host1x_client *client)
 	return 0;
 
 free_syncpt:
-	host1x_syncpt_free(client->syncpts[0]);
+	host1x_syncpt_put(client->syncpts[0]);
 free_channel:
 	host1x_channel_put(vic->channel);
 detach:
@@ -238,7 +238,7 @@ static int vic_exit(struct host1x_client *client)
 	if (err < 0)
 		return err;
 
-	host1x_syncpt_free(client->syncpts[0]);
+	host1x_syncpt_put(client->syncpts[0]);
 	host1x_channel_put(vic->channel);
 	host1x_client_iommu_detach(client);
 
diff --git a/drivers/gpu/host1x/cdma.c b/drivers/gpu/host1x/cdma.c
index e8d3fda91d8a..6e6ca774f68d 100644
--- a/drivers/gpu/host1x/cdma.c
+++ b/drivers/gpu/host1x/cdma.c
@@ -273,15 +273,13 @@ static int host1x_cdma_wait_pushbuffer_space(struct host1x *host1x,
 static void cdma_start_timer_locked(struct host1x_cdma *cdma,
 				    struct host1x_job *job)
 {
-	struct host1x *host = cdma_to_host1x(cdma);
-
 	if (cdma->timeout.client) {
 		/* timer already started */
 		return;
 	}
 
 	cdma->timeout.client = job->client;
-	cdma->timeout.syncpt = host1x_syncpt_get(host, job->syncpt_id);
+	cdma->timeout.syncpt = job->syncpt;
 	cdma->timeout.syncpt_val = job->syncpt_end;
 	cdma->timeout.start_ktime = ktime_get();
 
@@ -312,7 +310,6 @@ static void stop_cdma_timer_locked(struct host1x_cdma *cdma)
 static void update_cdma_locked(struct host1x_cdma *cdma)
 {
 	bool signal = false;
-	struct host1x *host1x = cdma_to_host1x(cdma);
 	struct host1x_job *job, *n;
 
 	/* If CDMA is stopped, queue is cleared and we can return */
@@ -324,8 +321,7 @@ static void update_cdma_locked(struct host1x_cdma *cdma)
 	 * to consume as many sync queue entries as possible without blocking
 	 */
 	list_for_each_entry_safe(job, n, &cdma->sync_queue, list) {
-		struct host1x_syncpt *sp =
-			host1x_syncpt_get(host1x, job->syncpt_id);
+		struct host1x_syncpt *sp = job->syncpt;
 
 		/* Check whether this syncpt has completed, and bail if not */
 		if (!host1x_syncpt_is_expired(sp, job->syncpt_end)) {
@@ -499,8 +495,7 @@ int host1x_cdma_begin(struct host1x_cdma *cdma, struct host1x_job *job)
 		if (!cdma->timeout.initialized) {
 			int err;
 
-			err = host1x_hw_cdma_timeout_init(host1x, cdma,
-							  job->syncpt_id);
+			err = host1x_hw_cdma_timeout_init(host1x, cdma);
 			if (err) {
 				mutex_unlock(&cdma->lock);
 				return err;
diff --git a/drivers/gpu/host1x/dev.h b/drivers/gpu/host1x/dev.h
index f781a9b0f39d..63010ae37a97 100644
--- a/drivers/gpu/host1x/dev.h
+++ b/drivers/gpu/host1x/dev.h
@@ -37,7 +37,7 @@ struct host1x_cdma_ops {
 	void (*start)(struct host1x_cdma *cdma);
 	void (*stop)(struct host1x_cdma *cdma);
 	void (*flush)(struct  host1x_cdma *cdma);
-	int (*timeout_init)(struct host1x_cdma *cdma, unsigned int syncpt);
+	int (*timeout_init)(struct host1x_cdma *cdma);
 	void (*timeout_destroy)(struct host1x_cdma *cdma);
 	void (*freeze)(struct host1x_cdma *cdma);
 	void (*resume)(struct host1x_cdma *cdma, u32 getptr);
@@ -261,10 +261,9 @@ static inline void host1x_hw_cdma_flush(struct host1x *host,
 }
 
 static inline int host1x_hw_cdma_timeout_init(struct host1x *host,
-					      struct host1x_cdma *cdma,
-					      unsigned int syncpt)
+					      struct host1x_cdma *cdma)
 {
-	return host->cdma_op->timeout_init(cdma, syncpt);
+	return host->cdma_op->timeout_init(cdma);
 }
 
 static inline void host1x_hw_cdma_timeout_destroy(struct host1x *host,
diff --git a/drivers/gpu/host1x/hw/cdma_hw.c b/drivers/gpu/host1x/hw/cdma_hw.c
index 2f3bf94cf365..e49cd5b8f735 100644
--- a/drivers/gpu/host1x/hw/cdma_hw.c
+++ b/drivers/gpu/host1x/hw/cdma_hw.c
@@ -295,7 +295,7 @@ static void cdma_timeout_handler(struct work_struct *work)
 /*
  * Init timeout resources
  */
-static int cdma_timeout_init(struct host1x_cdma *cdma, unsigned int syncpt)
+static int cdma_timeout_init(struct host1x_cdma *cdma)
 {
 	INIT_DELAYED_WORK(&cdma->timeout.wq, cdma_timeout_handler);
 	cdma->timeout.initialized = true;
diff --git a/drivers/gpu/host1x/hw/channel_hw.c b/drivers/gpu/host1x/hw/channel_hw.c
index 5eaa29d171c9..d4c28faf27d1 100644
--- a/drivers/gpu/host1x/hw/channel_hw.c
+++ b/drivers/gpu/host1x/hw/channel_hw.c
@@ -86,8 +86,7 @@ static void submit_gathers(struct host1x_job *job)
 
 static inline void synchronize_syncpt_base(struct host1x_job *job)
 {
-	struct host1x *host = dev_get_drvdata(job->channel->dev->parent);
-	struct host1x_syncpt *sp = host->syncpt + job->syncpt_id;
+	struct host1x_syncpt *sp = job->syncpt;
 	unsigned int id;
 	u32 value;
 
@@ -118,7 +117,7 @@ static void host1x_channel_set_streamid(struct host1x_channel *channel)
 static int channel_submit(struct host1x_job *job)
 {
 	struct host1x_channel *ch = job->channel;
-	struct host1x_syncpt *sp;
+	struct host1x_syncpt *sp = job->syncpt;
 	u32 user_syncpt_incrs = job->syncpt_incrs;
 	u32 prev_max = 0;
 	u32 syncval;
@@ -126,10 +125,9 @@ static int channel_submit(struct host1x_job *job)
 	struct host1x_waitlist *completed_waiter = NULL;
 	struct host1x *host = dev_get_drvdata(ch->dev->parent);
 
-	sp = host->syncpt + job->syncpt_id;
 	trace_host1x_channel_submit(dev_name(ch->dev),
 				    job->num_gathers, job->num_relocs,
-				    job->syncpt_id, job->syncpt_incrs);
+				    job->syncpt->id, job->syncpt_incrs);
 
 	/* before error checks, return current max */
 	prev_max = job->syncpt_end = host1x_syncpt_read_max(sp);
@@ -163,7 +161,7 @@ static int channel_submit(struct host1x_job *job)
 		host1x_cdma_push(&ch->cdma,
 				 host1x_opcode_setclass(HOST1X_CLASS_HOST1X,
 					host1x_uclass_wait_syncpt_r(), 1),
-				 host1x_class_host_wait_syncpt(job->syncpt_id,
+				 host1x_class_host_wait_syncpt(job->syncpt->id,
 					host1x_syncpt_read_max(sp)));
 	}
 
diff --git a/drivers/gpu/host1x/hw/debug_hw.c b/drivers/gpu/host1x/hw/debug_hw.c
index f31bcfa1b837..ceb48229d14b 100644
--- a/drivers/gpu/host1x/hw/debug_hw.c
+++ b/drivers/gpu/host1x/hw/debug_hw.c
@@ -204,7 +204,7 @@ static void show_channel_gathers(struct output *o, struct host1x_cdma *cdma)
 		unsigned int i;
 
 		host1x_debug_output(o, "\n%p: JOB, syncpt_id=%d, syncpt_val=%d, first_get=%08x, timeout=%d num_slots=%d, num_handles=%d\n",
-				    job, job->syncpt_id, job->syncpt_end,
+				    job, job->syncpt->id, job->syncpt_end,
 				    job->first_get, job->timeout,
 				    job->num_slots, job->num_unpins);
 
diff --git a/drivers/gpu/host1x/job.c b/drivers/gpu/host1x/job.c
index 82d0a60ba3f7..adbdc225de8d 100644
--- a/drivers/gpu/host1x/job.c
+++ b/drivers/gpu/host1x/job.c
@@ -79,6 +79,9 @@ static void job_free(struct kref *ref)
 {
 	struct host1x_job *job = container_of(ref, struct host1x_job, ref);
 
+	if (job->syncpt)
+		host1x_syncpt_put(job->syncpt);
+
 	kfree(job);
 }
 
@@ -674,7 +677,7 @@ EXPORT_SYMBOL(host1x_job_unpin);
  */
 void host1x_job_dump(struct device *dev, struct host1x_job *job)
 {
-	dev_dbg(dev, "    SYNCPT_ID   %d\n", job->syncpt_id);
+	dev_dbg(dev, "    SYNCPT_ID   %d\n", job->syncpt->id);
 	dev_dbg(dev, "    SYNCPT_VAL  %d\n", job->syncpt_end);
 	dev_dbg(dev, "    FIRST_GET   0x%x\n", job->first_get);
 	dev_dbg(dev, "    TIMEOUT     %d\n", job->timeout);
diff --git a/drivers/gpu/host1x/syncpt.c b/drivers/gpu/host1x/syncpt.c
index 8da4bbce8b9d..7bb5de8c3d63 100644
--- a/drivers/gpu/host1x/syncpt.c
+++ b/drivers/gpu/host1x/syncpt.c
@@ -90,6 +90,8 @@ struct host1x_syncpt *host1x_syncpt_alloc(struct host1x *host,
 	else
 		sp->client_managed = false;
 
+	kref_init(&sp->ref);
+
 	mutex_unlock(&host->syncpt_mutex);
 	return sp;
 
@@ -383,7 +385,7 @@ int host1x_syncpt_init(struct host1x *host)
  * host1x client drivers can use this function to allocate a syncpoint for
  * subsequent use. A syncpoint returned by this function will be reserved for
  * use by the client exclusively. When no longer using a syncpoint, a host1x
- * client driver needs to release it using host1x_syncpt_free().
+ * client driver needs to release it using host1x_syncpt_put().
  */
 struct host1x_syncpt *host1x_syncpt_request(struct host1x_client *client,
 					    unsigned long flags)
@@ -394,20 +396,9 @@ struct host1x_syncpt *host1x_syncpt_request(struct host1x_client *client,
 }
 EXPORT_SYMBOL(host1x_syncpt_request);
 
-/**
- * host1x_syncpt_free() - free a requested syncpoint
- * @sp: host1x syncpoint
- *
- * Release a syncpoint previously allocated using host1x_syncpt_request(). A
- * host1x client driver should call this when the syncpoint is no longer in
- * use. Note that client drivers must ensure that the syncpoint doesn't remain
- * under the control of hardware after calling this function, otherwise two
- * clients may end up trying to access the same syncpoint concurrently.
- */
-void host1x_syncpt_free(struct host1x_syncpt *sp)
+static void syncpt_release(struct kref *ref)
 {
-	if (!sp)
-		return;
+	struct host1x_syncpt *sp = container_of(ref, struct host1x_syncpt, ref);
 
 	mutex_lock(&sp->host->syncpt_mutex);
 
@@ -419,7 +410,23 @@ void host1x_syncpt_free(struct host1x_syncpt *sp)
 
 	mutex_unlock(&sp->host->syncpt_mutex);
 }
-EXPORT_SYMBOL(host1x_syncpt_free);
+
+/**
+ * host1x_syncpt_put() - drop a reference to a syncpoint
+ * @sp: host1x syncpoint
+ *
+ * Drop a reference to a syncpoint, e.g. one obtained from
+ * host1x_syncpt_request(). A host1x client driver should call this when the
+ * syncpoint is no longer in use; it is released with the last reference.
+ */
+void host1x_syncpt_put(struct host1x_syncpt *sp)
+{
+	if (!sp)
+		return;
+
+	kref_put(&sp->ref, syncpt_release);
+}
+EXPORT_SYMBOL(host1x_syncpt_put);
 
 void host1x_syncpt_deinit(struct host1x *host)
 {
@@ -486,16 +493,48 @@ unsigned int host1x_syncpt_nb_mlocks(struct host1x *host)
 }
 
 /**
- * host1x_syncpt_get() - obtain a syncpoint by ID
+ * host1x_syncpt_get_by_id() - obtain a syncpoint by ID
+ * @host: host1x controller
+ * @id: syncpoint ID
+ */
+struct host1x_syncpt *host1x_syncpt_get_by_id(struct host1x *host,
+					      unsigned int id)
+{
+	if (id >= host->info->nb_pts)
+		return NULL;
+
+	if (kref_get_unless_zero(&host->syncpt[id].ref))
+		return &host->syncpt[id];
+	else
+		return NULL;
+}
+EXPORT_SYMBOL(host1x_syncpt_get_by_id);
+
+/**
+ * host1x_syncpt_get_by_id_noref() - obtain a syncpoint by ID but don't
+ * 	increase the refcount.
  * @host: host1x controller
  * @id: syncpoint ID
  */
-struct host1x_syncpt *host1x_syncpt_get(struct host1x *host, unsigned int id)
+struct host1x_syncpt *host1x_syncpt_get_by_id_noref(struct host1x *host,
+						    unsigned int id)
 {
 	if (id >= host->info->nb_pts)
 		return NULL;
 
-	return host->syncpt + id;
+	return &host->syncpt[id];
+}
+EXPORT_SYMBOL(host1x_syncpt_get_by_id_noref);
+
+/**
+ * host1x_syncpt_get() - increment syncpoint refcount
+ * @sp: syncpoint
+ */
+struct host1x_syncpt *host1x_syncpt_get(struct host1x_syncpt *sp)
+{
+	kref_get(&sp->ref);
+
+	return sp;
 }
 EXPORT_SYMBOL(host1x_syncpt_get);
 
diff --git a/drivers/gpu/host1x/syncpt.h b/drivers/gpu/host1x/syncpt.h
index 3aa6b25b1b9c..a6766f8d55ee 100644
--- a/drivers/gpu/host1x/syncpt.h
+++ b/drivers/gpu/host1x/syncpt.h
@@ -11,6 +11,7 @@
 #include <linux/atomic.h>
 #include <linux/host1x.h>
 #include <linux/kernel.h>
+#include <linux/kref.h>
 #include <linux/sched.h>
 
 #include "intr.h"
@@ -26,6 +27,8 @@ struct host1x_syncpt_base {
 };
 
 struct host1x_syncpt {
+	struct kref ref;
+
 	unsigned int id;
 	atomic_t min_val;
 	atomic_t max_val;
diff --git a/drivers/staging/media/tegra-video/vi.c b/drivers/staging/media/tegra-video/vi.c
index 7a09061cda57..df5ca3596470 100644
--- a/drivers/staging/media/tegra-video/vi.c
+++ b/drivers/staging/media/tegra-video/vi.c
@@ -1131,8 +1131,8 @@ static void tegra_channel_host1x_syncpts_free(struct tegra_vi_channel *chan)
 	int i;
 
 	for (i = 0; i < chan->numgangports; i++) {
-		host1x_syncpt_free(chan->mw_ack_sp[i]);
-		host1x_syncpt_free(chan->frame_start_sp[i]);
+		host1x_syncpt_put(chan->mw_ack_sp[i]);
+		host1x_syncpt_put(chan->frame_start_sp[i]);
 	}
 }
 
@@ -1177,7 +1177,7 @@ static int tegra_channel_host1x_syncpt_init(struct tegra_vi_channel *chan)
 		mw_sp = host1x_syncpt_request(&vi->client, flags);
 		if (!mw_sp) {
 			dev_err(vi->dev, "failed to request memory ack syncpoint\n");
-			host1x_syncpt_free(fs_sp);
+			host1x_syncpt_put(fs_sp);
 			ret = -ENOMEM;
 			goto free_syncpts;
 		}
diff --git a/include/linux/host1x.h b/include/linux/host1x.h
index 7137ce0e35d4..107aea29bccb 100644
--- a/include/linux/host1x.h
+++ b/include/linux/host1x.h
@@ -142,7 +142,9 @@ struct host1x_syncpt_base;
 struct host1x_syncpt;
 struct host1x;
 
-struct host1x_syncpt *host1x_syncpt_get(struct host1x *host, u32 id);
+struct host1x_syncpt *host1x_syncpt_get_by_id(struct host1x *host, u32 id);
+struct host1x_syncpt *host1x_syncpt_get_by_id_noref(struct host1x *host, u32 id);
+struct host1x_syncpt *host1x_syncpt_get(struct host1x_syncpt *sp);
 u32 host1x_syncpt_id(struct host1x_syncpt *sp);
 u32 host1x_syncpt_read_min(struct host1x_syncpt *sp);
 u32 host1x_syncpt_read_max(struct host1x_syncpt *sp);
@@ -153,7 +155,7 @@ int host1x_syncpt_wait(struct host1x_syncpt *sp, u32 thresh, long timeout,
 		       u32 *value);
 struct host1x_syncpt *host1x_syncpt_request(struct host1x_client *client,
 					    unsigned long flags);
-void host1x_syncpt_free(struct host1x_syncpt *sp);
+void host1x_syncpt_put(struct host1x_syncpt *sp);
 struct host1x_syncpt *host1x_syncpt_alloc(struct host1x *host,
 					  unsigned long flags,
 					  const char *name);
@@ -221,7 +223,7 @@ struct host1x_job {
 	dma_addr_t *reloc_addr_phys;
 
 	/* Sync point id, number of increments and end related to the submit */
-	u32 syncpt_id;
+	struct host1x_syncpt *syncpt;
 	u32 syncpt_incrs;
 	u32 syncpt_end;
 
-- 
cgit v1.2.3


From f5ba33fb9690566c382624637125827b5512e766 Mon Sep 17 00:00:00 2001
From: Mikko Perttunen <mperttunen@nvidia.com>
Date: Mon, 29 Mar 2021 16:38:34 +0300
Subject: gpu: host1x: Reserve VBLANK syncpoints at initialization

On T20-T148 chips, the bootloader can set up a boot splash
screen with DC configured to increment syncpoint 26/27
at VBLANK. Because of this we shouldn't allow these syncpoints
to be allocated until DC has been reset and will no longer
increment them in the background.

As such, on these chips, reserve those two syncpoints at
initialization, and only mark them free once the DC
driver has indicated it's safe to do so.

Signed-off-by: Mikko Perttunen <mperttunen@nvidia.com>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/drm/tegra/dc.c  |  6 ++++++
 drivers/gpu/host1x/dev.c    |  6 ++++++
 drivers/gpu/host1x/dev.h    |  6 ++++++
 drivers/gpu/host1x/syncpt.c | 35 ++++++++++++++++++++++++++++++++++-
 include/linux/host1x.h      |  3 +++
 5 files changed, 55 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/gpu/drm/tegra/dc.c b/drivers/gpu/drm/tegra/dc.c
index cfda71e151cc..40bf8f33a2ae 100644
--- a/drivers/gpu/drm/tegra/dc.c
+++ b/drivers/gpu/drm/tegra/dc.c
@@ -2045,6 +2045,12 @@ static int tegra_dc_init(struct host1x_client *client)
 	struct drm_plane *cursor = NULL;
 	int err;
 
+	/*
+	 * DC has been reset by now, so VBLANK syncpoint can be released
+	 * for general use.
+	 */
+	host1x_syncpt_release_vblank_reservation(client, 26 + dc->pipe);
+
 	/*
 	 * XXX do not register DCs with no window groups because we cannot
 	 * assign a primary plane to them, which in turn will cause KMS to
diff --git a/drivers/gpu/host1x/dev.c b/drivers/gpu/host1x/dev.c
index d0ebb70e2fdd..fbb6447b8659 100644
--- a/drivers/gpu/host1x/dev.c
+++ b/drivers/gpu/host1x/dev.c
@@ -77,6 +77,7 @@ static const struct host1x_info host1x01_info = {
 	.has_hypervisor = false,
 	.num_sid_entries = 0,
 	.sid_table = NULL,
+	.reserve_vblank_syncpts = true,
 };
 
 static const struct host1x_info host1x02_info = {
@@ -91,6 +92,7 @@ static const struct host1x_info host1x02_info = {
 	.has_hypervisor = false,
 	.num_sid_entries = 0,
 	.sid_table = NULL,
+	.reserve_vblank_syncpts = true,
 };
 
 static const struct host1x_info host1x04_info = {
@@ -105,6 +107,7 @@ static const struct host1x_info host1x04_info = {
 	.has_hypervisor = false,
 	.num_sid_entries = 0,
 	.sid_table = NULL,
+	.reserve_vblank_syncpts = false,
 };
 
 static const struct host1x_info host1x05_info = {
@@ -119,6 +122,7 @@ static const struct host1x_info host1x05_info = {
 	.has_hypervisor = false,
 	.num_sid_entries = 0,
 	.sid_table = NULL,
+	.reserve_vblank_syncpts = false,
 };
 
 static const struct host1x_sid_entry tegra186_sid_table[] = {
@@ -142,6 +146,7 @@ static const struct host1x_info host1x06_info = {
 	.has_hypervisor = true,
 	.num_sid_entries = ARRAY_SIZE(tegra186_sid_table),
 	.sid_table = tegra186_sid_table,
+	.reserve_vblank_syncpts = false,
 };
 
 static const struct host1x_sid_entry tegra194_sid_table[] = {
@@ -165,6 +170,7 @@ static const struct host1x_info host1x07_info = {
 	.has_hypervisor = true,
 	.num_sid_entries = ARRAY_SIZE(tegra194_sid_table),
 	.sid_table = tegra194_sid_table,
+	.reserve_vblank_syncpts = false,
 };
 
 static const struct of_device_id host1x_of_match[] = {
diff --git a/drivers/gpu/host1x/dev.h b/drivers/gpu/host1x/dev.h
index 63010ae37a97..fa6d4bc46e98 100644
--- a/drivers/gpu/host1x/dev.h
+++ b/drivers/gpu/host1x/dev.h
@@ -101,6 +101,12 @@ struct host1x_info {
 	bool has_hypervisor; /* has hypervisor registers */
 	unsigned int num_sid_entries;
 	const struct host1x_sid_entry *sid_table;
+	/*
+	 * On T20-T148, the boot chain may set up DC to increment syncpoints
+	 * 26/27 on VBLANK. As such we cannot use these syncpoints until
+	 * the display driver disables VBLANK increments.
+	 */
+	bool reserve_vblank_syncpts;
 };
 
 struct host1x {
diff --git a/drivers/gpu/host1x/syncpt.c b/drivers/gpu/host1x/syncpt.c
index 877c5ab40cbd..e648ebbb2027 100644
--- a/drivers/gpu/host1x/syncpt.c
+++ b/drivers/gpu/host1x/syncpt.c
@@ -67,7 +67,7 @@ struct host1x_syncpt *host1x_syncpt_alloc(struct host1x *host,
 
 	mutex_lock(&host->syncpt_mutex);
 
-	for (i = 0; i < host->info->nb_pts && sp->name; i++, sp++)
+	for (i = 0; i < host->info->nb_pts && kref_read(&sp->ref); i++, sp++)
 		;
 
 	if (i >= host->info->nb_pts)
@@ -374,6 +374,11 @@ int host1x_syncpt_init(struct host1x *host)
 	if (!host->nop_sp)
 		return -ENOMEM;
 
+	if (host->info->reserve_vblank_syncpts) {
+		kref_init(&host->syncpt[26].ref);
+		kref_init(&host->syncpt[27].ref);
+	}
+
 	return 0;
 }
 
@@ -559,3 +564,31 @@ u32 host1x_syncpt_base_id(struct host1x_syncpt_base *base)
 	return base->id;
 }
 EXPORT_SYMBOL(host1x_syncpt_base_id);
+
+static void do_nothing(struct kref *ref)
+{
+}
+
+/**
+ * host1x_syncpt_release_vblank_reservation() - Make VBLANK syncpoint
+ *   available for allocation
+ *
+ * @client: host1x bus client
+ * @syncpt_id: syncpoint ID to make available
+ *
+ * Makes VBLANK<i> syncpoint available for allocation if it was
+ * reserved at initialization time. This should be called by the display
+ * driver after it has ensured that any VBLANK increment programming configured
+ * by the boot chain has been disabled.
+ */
+void host1x_syncpt_release_vblank_reservation(struct host1x_client *client,
+					      u32 syncpt_id)
+{
+	struct host1x *host = dev_get_drvdata(client->host->parent);
+
+	if (!host->info->reserve_vblank_syncpts)
+		return;
+
+	kref_put(&host->syncpt[syncpt_id].ref, do_nothing);
+}
+EXPORT_SYMBOL(host1x_syncpt_release_vblank_reservation);
diff --git a/include/linux/host1x.h b/include/linux/host1x.h
index 107aea29bccb..e0a41c2b4c7a 100644
--- a/include/linux/host1x.h
+++ b/include/linux/host1x.h
@@ -163,6 +163,9 @@ struct host1x_syncpt *host1x_syncpt_alloc(struct host1x *host,
 struct host1x_syncpt_base *host1x_syncpt_get_base(struct host1x_syncpt *sp);
 u32 host1x_syncpt_base_id(struct host1x_syncpt_base *base);
 
+void host1x_syncpt_release_vblank_reservation(struct host1x_client *client,
+					      u32 syncpt_id);
+
 /*
  * host1x channel
  */
-- 
cgit v1.2.3


From 933deb8c7b8e3f83e3dbd0b08e3cad51350d44c4 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Fri, 26 Mar 2021 15:51:37 +0100
Subject: gpu: host1x: Add early init and late exit callbacks

These callbacks can be used by client drivers to run code during early
init and during late exit. Early init callbacks are run prior to the
regular init callbacks while late exit callbacks run after the regular
exit callbacks.

Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/host1x/bus.c | 31 +++++++++++++++++++++++++++++++
 include/linux/host1x.h   |  4 ++++
 2 files changed, 35 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/gpu/host1x/bus.c b/drivers/gpu/host1x/bus.c
index 68a766ff0e9d..46f69c532b6b 100644
--- a/drivers/gpu/host1x/bus.c
+++ b/drivers/gpu/host1x/bus.c
@@ -196,6 +196,17 @@ int host1x_device_init(struct host1x_device *device)
 
 	mutex_lock(&device->clients_lock);
 
+	list_for_each_entry(client, &device->clients, list) {
+		if (client->ops && client->ops->early_init) {
+			err = client->ops->early_init(client);
+			if (err < 0) {
+				dev_err(&device->dev, "failed to early initialize %s: %d\n",
+					dev_name(client->dev), err);
+				goto teardown_late;
+			}
+		}
+	}
+
 	list_for_each_entry(client, &device->clients, list) {
 		if (client->ops && client->ops->init) {
 			err = client->ops->init(client);
@@ -217,6 +228,14 @@ teardown:
 		if (client->ops->exit)
 			client->ops->exit(client);
 
+	/* reset client to end of list for late teardown */
+	client = list_entry(&device->clients, struct host1x_client, list);
+
+teardown_late:
+	list_for_each_entry_continue_reverse(client, &device->clients, list)
+		if (client->ops->late_exit)
+			client->ops->late_exit(client);
+
 	mutex_unlock(&device->clients_lock);
 	return err;
 }
@@ -251,6 +270,18 @@ int host1x_device_exit(struct host1x_device *device)
 		}
 	}
 
+	list_for_each_entry_reverse(client, &device->clients, list) {
+		if (client->ops && client->ops->late_exit) {
+			err = client->ops->late_exit(client);
+			if (err < 0) {
+				dev_err(&device->dev, "failed to late cleanup %s: %d\n",
+					dev_name(client->dev), err);
+				mutex_unlock(&device->clients_lock);
+				return err;
+			}
+		}
+	}
+
 	mutex_unlock(&device->clients_lock);
 
 	return 0;
diff --git a/include/linux/host1x.h b/include/linux/host1x.h
index e0a41c2b4c7a..232e1bd507a7 100644
--- a/include/linux/host1x.h
+++ b/include/linux/host1x.h
@@ -25,14 +25,18 @@ u64 host1x_get_dma_mask(struct host1x *host1x);
 
 /**
  * struct host1x_client_ops - host1x client operations
+ * @early_init: host1x client early initialization code
  * @init: host1x client initialization code
  * @exit: host1x client tear down code
+ * @late_exit: host1x client late tear down code
  * @suspend: host1x client suspend code
  * @resume: host1x client resume code
  */
 struct host1x_client_ops {
+	int (*early_init)(struct host1x_client *client);
 	int (*init)(struct host1x_client *client);
 	int (*exit)(struct host1x_client *client);
+	int (*late_exit)(struct host1x_client *client);
 	int (*suspend)(struct host1x_client *client);
 	int (*resume)(struct host1x_client *client);
 };
-- 
cgit v1.2.3


From 1fd3dde5e270ad08f1406f921c9a2cda154fcea9 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Tue, 30 Mar 2021 12:43:16 -0500
Subject: PCI: Add pci_disable_parity()

Add pci_disable_parity() to disable reporting of parity errors for a
device by clearing PCI_COMMAND_PARITY.

The device will still set PCI_STATUS_DETECTED_PARITY when it detects
a parity error or receives a Poisoned TLP, but it will not set
PCI_STATUS_PARITY, which means it will not assert PERR#
(conventional PCI) or report Poisoned TLPs (PCIe).

Based-on: https://lore.kernel.org/linux-arm-kernel/d375987c-ea4f-dd98-4ef8-99b2fbfe7c33@gmail.com/
Based-on-patch-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://lore.kernel.org/r/20210330174318.1289680-2-helgaas@kernel.org
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/pci.c   | 17 +++++++++++++++++
 include/linux/pci.h |  1 +
 2 files changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 16a17215f633..b1845e5e5c8f 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4453,6 +4453,23 @@ void pci_clear_mwi(struct pci_dev *dev)
 }
 EXPORT_SYMBOL(pci_clear_mwi);
 
+/**
+ * pci_disable_parity - disable parity error reporting for device
+ * @dev: the PCI device to operate on
+ *
+ * Clear PCI_COMMAND_PARITY so @dev stops reporting parity errors
+ */
+void pci_disable_parity(struct pci_dev *dev)
+{
+	u16 cmd;
+
+	pci_read_config_word(dev, PCI_COMMAND, &cmd);
+	if (cmd & PCI_COMMAND_PARITY) {
+		cmd &= ~PCI_COMMAND_PARITY;
+		pci_write_config_word(dev, PCI_COMMAND, cmd);
+	}
+}
+
 /**
  * pci_intx - enables/disables PCI INTx for device dev
  * @pdev: the PCI device to operate on
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 86c799c97b77..4eaa773115da 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1201,6 +1201,7 @@ int __must_check pci_set_mwi(struct pci_dev *dev);
 int __must_check pcim_set_mwi(struct pci_dev *dev);
 int pci_try_set_mwi(struct pci_dev *dev);
 void pci_clear_mwi(struct pci_dev *dev);
+void pci_disable_parity(struct pci_dev *dev);
 void pci_intx(struct pci_dev *dev, int enable);
 bool pci_check_and_mask_intx(struct pci_dev *dev);
 bool pci_check_and_unmask_intx(struct pci_dev *dev);
-- 
cgit v1.2.3


From 9af723be863904c746a6a6bf4f3686087b16b9ff Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 30 Dec 2020 12:43:34 -0500
Subject: svcrdma: Remove sc_read_complete_q

Now that svc_rdma_recvfrom() waits for Read completion,
sc_read_complete_q is no longer used.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h          |  2 --
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c  | 57 ++++----------------------------
 net/sunrpc/xprtrdma/svc_rdma_transport.c |  1 -
 3 files changed, 6 insertions(+), 54 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 6e621e1f56b8..b72f75091404 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -106,7 +106,6 @@ struct svcxprt_rdma {
 
 	wait_queue_head_t    sc_send_wait;	/* SQ exhaustion waitlist */
 	unsigned long	     sc_flags;
-	struct list_head     sc_read_complete_q;
 	struct work_struct   sc_work;
 
 	struct llist_head    sc_recv_ctxts;
@@ -140,7 +139,6 @@ struct svc_rdma_recv_ctxt {
 	bool			rc_temp;
 	u32			rc_byte_len;
 	unsigned int		rc_page_count;
-	unsigned int		rc_hdr_count;
 	u32			rc_inv_rkey;
 	__be32			rc_msgtype;
 
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index ae932435e1d3..88d80d701241 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -89,8 +89,7 @@
  * svc_rdma_recvfrom call returns.
  *
  * During the second svc_rdma_recvfrom call, RDMA Read sink pages
- * are transferred from the svc_rdma_recv_ctxt to the second svc_rqst
- * (see rdma_read_complete() below).
+ * are transferred from the svc_rdma_recv_ctxt to the second svc_rqst.
  */
 
 #include <linux/slab.h>
@@ -379,10 +378,6 @@ void svc_rdma_flush_recv_queues(struct svcxprt_rdma *rdma)
 {
 	struct svc_rdma_recv_ctxt *ctxt;
 
-	while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_read_complete_q))) {
-		list_del(&ctxt->rc_list);
-		svc_rdma_recv_ctxt_put(rdma, ctxt);
-	}
 	while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_rq_dto_q))) {
 		list_del(&ctxt->rc_list);
 		svc_rdma_recv_ctxt_put(rdma, ctxt);
@@ -720,35 +715,6 @@ out_inval:
 	return -EINVAL;
 }
 
-static void rdma_read_complete(struct svc_rqst *rqstp,
-			       struct svc_rdma_recv_ctxt *head)
-{
-	int page_no;
-
-	/* Move Read chunk pages to rqstp so that they will be released
-	 * when svc_process is done with them.
-	 */
-	for (page_no = 0; page_no < head->rc_page_count; page_no++) {
-		put_page(rqstp->rq_pages[page_no]);
-		rqstp->rq_pages[page_no] = head->rc_pages[page_no];
-	}
-	head->rc_page_count = 0;
-
-	/* Point rq_arg.pages past header */
-	rqstp->rq_arg.pages = &rqstp->rq_pages[head->rc_hdr_count];
-	rqstp->rq_arg.page_len = head->rc_arg.page_len;
-
-	/* rq_respages starts after the last arg page */
-	rqstp->rq_respages = &rqstp->rq_pages[page_no];
-	rqstp->rq_next_page = rqstp->rq_respages + 1;
-
-	/* Rebuild rq_arg head and tail. */
-	rqstp->rq_arg.head[0] = head->rc_arg.head[0];
-	rqstp->rq_arg.tail[0] = head->rc_arg.tail[0];
-	rqstp->rq_arg.len = head->rc_arg.len;
-	rqstp->rq_arg.buflen = head->rc_arg.buflen;
-}
-
 static void svc_rdma_send_error(struct svcxprt_rdma *rdma,
 				struct svc_rdma_recv_ctxt *rctxt,
 				int status)
@@ -834,13 +800,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
 	rqstp->rq_xprt_ctxt = NULL;
 
 	spin_lock(&rdma_xprt->sc_rq_dto_lock);
-	ctxt = svc_rdma_next_recv_ctxt(&rdma_xprt->sc_read_complete_q);
-	if (ctxt) {
-		list_del(&ctxt->rc_list);
-		spin_unlock(&rdma_xprt->sc_rq_dto_lock);
-		rdma_read_complete(rqstp, ctxt);
-		goto complete;
-	}
 	ctxt = svc_rdma_next_recv_ctxt(&rdma_xprt->sc_rq_dto_q);
 	if (!ctxt) {
 		/* No new incoming requests, terminate the loop */
@@ -880,21 +839,17 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
 	svc_rdma_get_inv_rkey(rdma_xprt, ctxt);
 
 	if (!pcl_is_empty(&ctxt->rc_read_pcl) ||
-	    !pcl_is_empty(&ctxt->rc_call_pcl))
-		goto out_readlist;
+	    !pcl_is_empty(&ctxt->rc_call_pcl)) {
+		ret = svc_rdma_process_read_list(rdma_xprt, rqstp, ctxt);
+		if (ret < 0)
+			goto out_readfail;
+	}
 
-complete:
 	rqstp->rq_xprt_ctxt = ctxt;
 	rqstp->rq_prot = IPPROTO_MAX;
 	svc_xprt_copy_addrs(rqstp, xprt);
 	return rqstp->rq_arg.len;
 
-out_readlist:
-	ret = svc_rdma_process_read_list(rdma_xprt, rqstp, ctxt);
-	if (ret < 0)
-		goto out_readfail;
-	goto complete;
-
 out_err:
 	svc_rdma_send_error(rdma_xprt, ctxt, ret);
 	svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 3646216211c5..d94b7759ada1 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -136,7 +136,6 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv,
 	svc_xprt_init(net, &svc_rdma_class, &cma_xprt->sc_xprt, serv);
 	INIT_LIST_HEAD(&cma_xprt->sc_accept_q);
 	INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
-	INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
 	INIT_LIST_HEAD(&cma_xprt->sc_send_ctxts);
 	init_llist_head(&cma_xprt->sc_recv_ctxts);
 	INIT_LIST_HEAD(&cma_xprt->sc_rw_ctxts);
-- 
cgit v1.2.3


From 5533c4f4b996b7fc36d16b5e0807ebbc08c93af4 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 13 Jan 2021 09:31:50 -0500
Subject: svcrdma: Remove svc_rdma_recv_ctxt::rc_pages and ::rc_arg

These fields are no longer used.

The size of struct svc_rdma_recv_ctxt is now less than 300 bytes on
x86_64, down from 2440 bytes.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_rdma.h         |  3 ---
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c |  5 -----
 net/sunrpc/xprtrdma/svc_rdma_rw.c       | 12 ------------
 3 files changed, 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index b72f75091404..3184465de3a0 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -134,7 +134,6 @@ struct svc_rdma_recv_ctxt {
 	struct rpc_rdma_cid	rc_cid;
 	struct ib_sge		rc_recv_sge;
 	void			*rc_recv_buf;
-	struct xdr_buf		rc_arg;
 	struct xdr_stream	rc_stream;
 	bool			rc_temp;
 	u32			rc_byte_len;
@@ -148,8 +147,6 @@ struct svc_rdma_recv_ctxt {
 	struct svc_rdma_chunk	*rc_cur_result_payload;
 	struct svc_rdma_pcl	rc_write_pcl;
 	struct svc_rdma_pcl	rc_reply_pcl;
-
-	struct page		*rc_pages[RPCSVC_MAXPAGES];
 };
 
 struct svc_rdma_send_ctxt {
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 88d80d701241..232860ea683b 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -227,11 +227,6 @@ out_empty:
 void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma,
 			    struct svc_rdma_recv_ctxt *ctxt)
 {
-	unsigned int i;
-
-	for (i = 0; i < ctxt->rc_page_count; i++)
-		put_page(ctxt->rc_pages[i]);
-
 	pcl_free(&ctxt->rc_call_pcl);
 	pcl_free(&ctxt->rc_read_pcl);
 	pcl_free(&ctxt->rc_write_pcl);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index 36959a14375e..5238bc829235 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -1081,18 +1081,6 @@ int svc_rdma_process_read_list(struct svcxprt_rdma *rdma,
 	struct svc_rdma_chunk_ctxt *cc;
 	int ret;
 
-	/* The request (with page list) is constructed in
-	 * head->rc_arg. Pages involved with RDMA Read I/O are
-	 * transferred there.
-	 */
-	head->rc_arg.head[0] = rqstp->rq_arg.head[0];
-	head->rc_arg.tail[0] = rqstp->rq_arg.tail[0];
-	head->rc_arg.pages = head->rc_pages;
-	head->rc_arg.page_base = 0;
-	head->rc_arg.page_len = 0;
-	head->rc_arg.len = rqstp->rq_arg.len;
-	head->rc_arg.buflen = rqstp->rq_arg.buflen;
-
 	info = svc_rdma_read_info_alloc(rdma);
 	if (!info)
 		return -ENOMEM;
-- 
cgit v1.2.3


From 5c701e71961af0ec8227ea615f1646dbe98aea1a Mon Sep 17 00:00:00 2001
From: Wan Jiabing <wanjiabing@vivo.com>
Date: Sat, 27 Mar 2021 10:54:47 +0800
Subject: netfilter: ipset: Remove duplicate declaration

struct ip_set is declared twice. One is declared at 79th line,
so remove the duplicate.

Signed-off-by: Wan Jiabing <wanjiabing@vivo.com>
Acked-by: Jozsef Kadlecsik <kadlec@netfilter.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/ipset/ip_set.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 46d9a0c26c67..10279c4830ac 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -124,8 +124,6 @@ struct ip_set_ext {
 	bool target;
 };
 
-struct ip_set;
-
 #define ext_timeout(e, s)	\
 ((unsigned long *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_TIMEOUT]))
 #define ext_counter(e, s)	\
-- 
cgit v1.2.3


From 19c28b1374fb1073a9ec873a6c10bf5f16b10b9d Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 30 Mar 2021 16:58:37 +0200
Subject: netfilter: add helper function to set up the nfnetlink header and use
 it

This patch adds a helper function to set up the netlink and nfnetlink headers.
Update existing codebase to use it.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nfnetlink.h  |  27 ++++++++++
 net/netfilter/ipset/ip_set_core.c    |  17 ++----
 net/netfilter/nf_conntrack_netlink.c |  77 ++++++++------------------
 net/netfilter/nf_tables_api.c        | 102 ++++++++++-------------------------
 net/netfilter/nf_tables_trace.c      |   9 +---
 net/netfilter/nfnetlink_acct.c       |  11 ++--
 net/netfilter/nfnetlink_cthelper.c   |  11 ++--
 net/netfilter/nfnetlink_cttimeout.c  |  22 +++-----
 net/netfilter/nfnetlink_log.c        |  11 ++--
 net/netfilter/nfnetlink_queue.c      |  12 ++---
 net/netfilter/nft_compat.c           |  11 ++--
 11 files changed, 102 insertions(+), 208 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h
index f6267e2883f2..791d516e1e88 100644
--- a/include/linux/netfilter/nfnetlink.h
+++ b/include/linux/netfilter/nfnetlink.h
@@ -57,6 +57,33 @@ static inline u16 nfnl_msg_type(u8 subsys, u8 msg_type)
 	return subsys << 8 | msg_type;
 }
 
+static inline void nfnl_fill_hdr(struct nlmsghdr *nlh, u8 family, u8 version,
+				 __be16 res_id)
+{
+	struct nfgenmsg *nfmsg;
+
+	nfmsg = nlmsg_data(nlh);
+	nfmsg->nfgen_family = family;
+	nfmsg->version = version;
+	nfmsg->res_id = res_id;
+}
+
+static inline struct nlmsghdr *nfnl_msg_put(struct sk_buff *skb, u32 portid,
+					    u32 seq, int type, int flags,
+					    u8 family, u8 version,
+					    __be16 res_id)
+{
+	struct nlmsghdr *nlh;
+
+	nlh = nlmsg_put(skb, portid, seq, type, sizeof(struct nfgenmsg), flags);
+	if (!nlh)
+		return NULL;
+
+	nfnl_fill_hdr(nlh, family, version, res_id);
+
+	return nlh;
+}
+
 void nfnl_lock(__u8 subsys_id);
 void nfnl_unlock(__u8 subsys_id);
 #ifdef CONFIG_PROVE_LOCKING
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 89009c82a6b2..359ff8ec236a 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -963,20 +963,9 @@ static struct nlmsghdr *
 start_msg(struct sk_buff *skb, u32 portid, u32 seq, unsigned int flags,
 	  enum ipset_cmd cmd)
 {
-	struct nlmsghdr *nlh;
-	struct nfgenmsg *nfmsg;
-
-	nlh = nlmsg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_IPSET, cmd),
-			sizeof(*nfmsg), flags);
-	if (!nlh)
-		return NULL;
-
-	nfmsg = nlmsg_data(nlh);
-	nfmsg->nfgen_family = NFPROTO_IPV4;
-	nfmsg->version = NFNETLINK_V0;
-	nfmsg->res_id = 0;
-
-	return nlh;
+	return nfnl_msg_put(skb, portid, seq,
+			    nfnl_msg_type(NFNL_SUBSYS_IPSET, cmd), flags,
+			    NFPROTO_IPV4, NFNETLINK_V0, 0);
 }
 
 /* Create a set */
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 1d519b0e51a5..c67a6ec22a74 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -555,22 +555,17 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
 {
 	const struct nf_conntrack_zone *zone;
 	struct nlmsghdr *nlh;
-	struct nfgenmsg *nfmsg;
 	struct nlattr *nest_parms;
 	unsigned int event;
 
 	if (portid)
 		flags |= NLM_F_MULTI;
 	event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_NEW);
-	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
-	if (nlh == NULL)
+	nlh = nfnl_msg_put(skb, portid, seq, event, flags, nf_ct_l3num(ct),
+			   NFNETLINK_V0, 0);
+	if (!nlh)
 		goto nlmsg_failure;
 
-	nfmsg = nlmsg_data(nlh);
-	nfmsg->nfgen_family = nf_ct_l3num(ct);
-	nfmsg->version      = NFNETLINK_V0;
-	nfmsg->res_id	    = 0;
-
 	zone = nf_ct_zone(ct);
 
 	nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG);
@@ -713,7 +708,6 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
 	const struct nf_conntrack_zone *zone;
 	struct net *net;
 	struct nlmsghdr *nlh;
-	struct nfgenmsg *nfmsg;
 	struct nlattr *nest_parms;
 	struct nf_conn *ct = item->ct;
 	struct sk_buff *skb;
@@ -743,15 +737,11 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
 		goto errout;
 
 	type = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, type);
-	nlh = nlmsg_put(skb, item->portid, 0, type, sizeof(*nfmsg), flags);
-	if (nlh == NULL)
+	nlh = nfnl_msg_put(skb, item->portid, 0, type, flags, nf_ct_l3num(ct),
+			   NFNETLINK_V0, 0);
+	if (!nlh)
 		goto nlmsg_failure;
 
-	nfmsg = nlmsg_data(nlh);
-	nfmsg->nfgen_family = nf_ct_l3num(ct);
-	nfmsg->version	= NFNETLINK_V0;
-	nfmsg->res_id	= 0;
-
 	zone = nf_ct_zone(ct);
 
 	nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG);
@@ -2490,20 +2480,15 @@ ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq,
 				__u16 cpu, const struct ip_conntrack_stat *st)
 {
 	struct nlmsghdr *nlh;
-	struct nfgenmsg *nfmsg;
 	unsigned int flags = portid ? NLM_F_MULTI : 0, event;
 
 	event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK,
 			      IPCTNL_MSG_CT_GET_STATS_CPU);
-	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
-	if (nlh == NULL)
+	nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC,
+			   NFNETLINK_V0, htons(cpu));
+	if (!nlh)
 		goto nlmsg_failure;
 
-	nfmsg = nlmsg_data(nlh);
-	nfmsg->nfgen_family = AF_UNSPEC;
-	nfmsg->version      = NFNETLINK_V0;
-	nfmsg->res_id	    = htons(cpu);
-
 	if (nla_put_be32(skb, CTA_STATS_FOUND, htonl(st->found)) ||
 	    nla_put_be32(skb, CTA_STATS_INVALID, htonl(st->invalid)) ||
 	    nla_put_be32(skb, CTA_STATS_INSERT, htonl(st->insert)) ||
@@ -2575,20 +2560,15 @@ ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
 			    struct net *net)
 {
 	struct nlmsghdr *nlh;
-	struct nfgenmsg *nfmsg;
 	unsigned int flags = portid ? NLM_F_MULTI : 0, event;
 	unsigned int nr_conntracks = atomic_read(&net->ct.count);
 
 	event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_GET_STATS);
-	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
-	if (nlh == NULL)
+	nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC,
+			   NFNETLINK_V0, 0);
+	if (!nlh)
 		goto nlmsg_failure;
 
-	nfmsg = nlmsg_data(nlh);
-	nfmsg->nfgen_family = AF_UNSPEC;
-	nfmsg->version      = NFNETLINK_V0;
-	nfmsg->res_id	    = 0;
-
 	if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks)))
 		goto nla_put_failure;
 
@@ -3085,19 +3065,14 @@ ctnetlink_exp_fill_info(struct sk_buff *skb, u32 portid, u32 seq,
 			int event, const struct nf_conntrack_expect *exp)
 {
 	struct nlmsghdr *nlh;
-	struct nfgenmsg *nfmsg;
 	unsigned int flags = portid ? NLM_F_MULTI : 0;
 
 	event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_EXP, event);
-	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
-	if (nlh == NULL)
+	nlh = nfnl_msg_put(skb, portid, seq, event, flags,
+			   exp->tuple.src.l3num, NFNETLINK_V0, 0);
+	if (!nlh)
 		goto nlmsg_failure;
 
-	nfmsg = nlmsg_data(nlh);
-	nfmsg->nfgen_family = exp->tuple.src.l3num;
-	nfmsg->version	    = NFNETLINK_V0;
-	nfmsg->res_id	    = 0;
-
 	if (ctnetlink_exp_dump_expect(skb, exp) < 0)
 		goto nla_put_failure;
 
@@ -3117,7 +3092,6 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item)
 	struct nf_conntrack_expect *exp = item->exp;
 	struct net *net = nf_ct_exp_net(exp);
 	struct nlmsghdr *nlh;
-	struct nfgenmsg *nfmsg;
 	struct sk_buff *skb;
 	unsigned int type, group;
 	int flags = 0;
@@ -3140,15 +3114,11 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item)
 		goto errout;
 
 	type = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_EXP, type);
-	nlh = nlmsg_put(skb, item->portid, 0, type, sizeof(*nfmsg), flags);
-	if (nlh == NULL)
+	nlh = nfnl_msg_put(skb, item->portid, 0, type, flags,
+			   exp->tuple.src.l3num, NFNETLINK_V0, 0);
+	if (!nlh)
 		goto nlmsg_failure;
 
-	nfmsg = nlmsg_data(nlh);
-	nfmsg->nfgen_family = exp->tuple.src.l3num;
-	nfmsg->version	    = NFNETLINK_V0;
-	nfmsg->res_id	    = 0;
-
 	if (ctnetlink_exp_dump_expect(skb, exp) < 0)
 		goto nla_put_failure;
 
@@ -3716,20 +3686,15 @@ ctnetlink_exp_stat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, int cpu,
 			     const struct ip_conntrack_stat *st)
 {
 	struct nlmsghdr *nlh;
-	struct nfgenmsg *nfmsg;
 	unsigned int flags = portid ? NLM_F_MULTI : 0, event;
 
 	event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK,
 			      IPCTNL_MSG_EXP_GET_STATS_CPU);
-	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
-	if (nlh == NULL)
+	nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC,
+			   NFNETLINK_V0, htons(cpu));
+	if (!nlh)
 		goto nlmsg_failure;
 
-	nfmsg = nlmsg_data(nlh);
-	nfmsg->nfgen_family = AF_UNSPEC;
-	nfmsg->version      = NFNETLINK_V0;
-	nfmsg->res_id	    = htons(cpu);
-
 	if (nla_put_be32(skb, CTA_STATS_EXP_NEW, htonl(st->expect_new)) ||
 	    nla_put_be32(skb, CTA_STATS_EXP_CREATE, htonl(st->expect_create)) ||
 	    nla_put_be32(skb, CTA_STATS_EXP_DELETE, htonl(st->expect_delete)))
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index e894d70b5d5f..005f1c620fc0 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -707,18 +707,13 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net,
 				     int family, const struct nft_table *table)
 {
 	struct nlmsghdr *nlh;
-	struct nfgenmsg *nfmsg;
 
 	event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
-	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags);
-	if (nlh == NULL)
+	nlh = nfnl_msg_put(skb, portid, seq, event, flags, family,
+			   NFNETLINK_V0, nft_base_seq(net));
+	if (!nlh)
 		goto nla_put_failure;
 
-	nfmsg = nlmsg_data(nlh);
-	nfmsg->nfgen_family	= family;
-	nfmsg->version		= NFNETLINK_V0;
-	nfmsg->res_id		= nft_base_seq(net);
-
 	if (nla_put_string(skb, NFTA_TABLE_NAME, table->name) ||
 	    nla_put_be32(skb, NFTA_TABLE_FLAGS, htonl(table->flags)) ||
 	    nla_put_be32(skb, NFTA_TABLE_USE, htonl(table->use)) ||
@@ -1468,18 +1463,13 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net,
 				     const struct nft_chain *chain)
 {
 	struct nlmsghdr *nlh;
-	struct nfgenmsg *nfmsg;
 
 	event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
-	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags);
-	if (nlh == NULL)
+	nlh = nfnl_msg_put(skb, portid, seq, event, flags, family,
+			   NFNETLINK_V0, nft_base_seq(net));
+	if (!nlh)
 		goto nla_put_failure;
 
-	nfmsg = nlmsg_data(nlh);
-	nfmsg->nfgen_family	= family;
-	nfmsg->version		= NFNETLINK_V0;
-	nfmsg->res_id		= nft_base_seq(net);
-
 	if (nla_put_string(skb, NFTA_CHAIN_TABLE, table->name))
 		goto nla_put_failure;
 	if (nla_put_be64(skb, NFTA_CHAIN_HANDLE, cpu_to_be64(chain->handle),
@@ -2825,20 +2815,15 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net,
 				    const struct nft_rule *prule)
 {
 	struct nlmsghdr *nlh;
-	struct nfgenmsg *nfmsg;
 	const struct nft_expr *expr, *next;
 	struct nlattr *list;
 	u16 type = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
 
-	nlh = nlmsg_put(skb, portid, seq, type, sizeof(struct nfgenmsg), flags);
-	if (nlh == NULL)
+	nlh = nfnl_msg_put(skb, portid, seq, type, flags, family, NFNETLINK_V0,
+			   nft_base_seq(net));
+	if (!nlh)
 		goto nla_put_failure;
 
-	nfmsg = nlmsg_data(nlh);
-	nfmsg->nfgen_family	= family;
-	nfmsg->version		= NFNETLINK_V0;
-	nfmsg->res_id		= nft_base_seq(net);
-
 	if (nla_put_string(skb, NFTA_RULE_TABLE, table->name))
 		goto nla_put_failure;
 	if (nla_put_string(skb, NFTA_RULE_CHAIN, chain->name))
@@ -3809,7 +3794,6 @@ static int nf_tables_fill_set_concat(struct sk_buff *skb,
 static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
 			      const struct nft_set *set, u16 event, u16 flags)
 {
-	struct nfgenmsg *nfmsg;
 	struct nlmsghdr *nlh;
 	u32 portid = ctx->portid;
 	struct nlattr *nest;
@@ -3817,16 +3801,11 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
 	int i;
 
 	event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
-	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg),
-			flags);
-	if (nlh == NULL)
+	nlh = nfnl_msg_put(skb, portid, seq, event, flags, ctx->family,
+			   NFNETLINK_V0, nft_base_seq(ctx->net));
+	if (!nlh)
 		goto nla_put_failure;
 
-	nfmsg = nlmsg_data(nlh);
-	nfmsg->nfgen_family	= ctx->family;
-	nfmsg->version		= NFNETLINK_V0;
-	nfmsg->res_id		= nft_base_seq(ctx->net);
-
 	if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name))
 		goto nla_put_failure;
 	if (nla_put_string(skb, NFTA_SET_NAME, set->name))
@@ -4795,7 +4774,6 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
 	struct nft_set *set;
 	struct nft_set_dump_args args;
 	bool set_found = false;
-	struct nfgenmsg *nfmsg;
 	struct nlmsghdr *nlh;
 	struct nlattr *nest;
 	u32 portid, seq;
@@ -4828,16 +4806,11 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
 	portid = NETLINK_CB(cb->skb).portid;
 	seq    = cb->nlh->nlmsg_seq;
 
-	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg),
-			NLM_F_MULTI);
-	if (nlh == NULL)
+	nlh = nfnl_msg_put(skb, portid, seq, event, NLM_F_MULTI,
+			   table->family, NFNETLINK_V0, nft_base_seq(net));
+	if (!nlh)
 		goto nla_put_failure;
 
-	nfmsg = nlmsg_data(nlh);
-	nfmsg->nfgen_family = table->family;
-	nfmsg->version      = NFNETLINK_V0;
-	nfmsg->res_id	    = nft_base_seq(net);
-
 	if (nla_put_string(skb, NFTA_SET_ELEM_LIST_TABLE, table->name))
 		goto nla_put_failure;
 	if (nla_put_string(skb, NFTA_SET_ELEM_LIST_SET, set->name))
@@ -4894,22 +4867,16 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb,
 				       const struct nft_set *set,
 				       const struct nft_set_elem *elem)
 {
-	struct nfgenmsg *nfmsg;
 	struct nlmsghdr *nlh;
 	struct nlattr *nest;
 	int err;
 
 	event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
-	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg),
-			flags);
-	if (nlh == NULL)
+	nlh = nfnl_msg_put(skb, portid, seq, event, flags, ctx->family,
+			   NFNETLINK_V0, nft_base_seq(ctx->net));
+	if (!nlh)
 		goto nla_put_failure;
 
-	nfmsg = nlmsg_data(nlh);
-	nfmsg->nfgen_family	= ctx->family;
-	nfmsg->version		= NFNETLINK_V0;
-	nfmsg->res_id		= nft_base_seq(ctx->net);
-
 	if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name))
 		goto nla_put_failure;
 	if (nla_put_string(skb, NFTA_SET_NAME, set->name))
@@ -6227,19 +6194,14 @@ static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net,
 				   int family, const struct nft_table *table,
 				   struct nft_object *obj, bool reset)
 {
-	struct nfgenmsg *nfmsg;
 	struct nlmsghdr *nlh;
 
 	event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
-	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags);
-	if (nlh == NULL)
+	nlh = nfnl_msg_put(skb, portid, seq, event, flags, family,
+			   NFNETLINK_V0, nft_base_seq(net));
+	if (!nlh)
 		goto nla_put_failure;
 
-	nfmsg = nlmsg_data(nlh);
-	nfmsg->nfgen_family	= family;
-	nfmsg->version		= NFNETLINK_V0;
-	nfmsg->res_id		= nft_base_seq(net);
-
 	if (nla_put_string(skb, NFTA_OBJ_TABLE, table->name) ||
 	    nla_put_string(skb, NFTA_OBJ_NAME, obj->key.name) ||
 	    nla_put_be32(skb, NFTA_OBJ_TYPE, htonl(obj->ops->type->type)) ||
@@ -7139,20 +7101,15 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net,
 					 struct list_head *hook_list)
 {
 	struct nlattr *nest, *nest_devs;
-	struct nfgenmsg *nfmsg;
 	struct nft_hook *hook;
 	struct nlmsghdr *nlh;
 
 	event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
-	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags);
-	if (nlh == NULL)
+	nlh = nfnl_msg_put(skb, portid, seq, event, flags, family,
+			   NFNETLINK_V0, nft_base_seq(net));
+	if (!nlh)
 		goto nla_put_failure;
 
-	nfmsg = nlmsg_data(nlh);
-	nfmsg->nfgen_family	= family;
-	nfmsg->version		= NFNETLINK_V0;
-	nfmsg->res_id		= nft_base_seq(net);
-
 	if (nla_put_string(skb, NFTA_FLOWTABLE_TABLE, flowtable->table->name) ||
 	    nla_put_string(skb, NFTA_FLOWTABLE_NAME, flowtable->name) ||
 	    nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use)) ||
@@ -7385,19 +7342,14 @@ static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net,
 				   u32 portid, u32 seq)
 {
 	struct nlmsghdr *nlh;
-	struct nfgenmsg *nfmsg;
 	char buf[TASK_COMM_LEN];
 	int event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWGEN);
 
-	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), 0);
-	if (nlh == NULL)
+	nlh = nfnl_msg_put(skb, portid, seq, event, 0, AF_UNSPEC,
+			   NFNETLINK_V0, nft_base_seq(net));
+	if (!nlh)
 		goto nla_put_failure;
 
-	nfmsg = nlmsg_data(nlh);
-	nfmsg->nfgen_family	= AF_UNSPEC;
-	nfmsg->version		= NFNETLINK_V0;
-	nfmsg->res_id		= nft_base_seq(net);
-
 	if (nla_put_be32(skb, NFTA_GEN_ID, htonl(net->nft.base_seq)) ||
 	    nla_put_be32(skb, NFTA_GEN_PROC_PID, htonl(task_pid_nr(current))) ||
 	    nla_put_string(skb, NFTA_GEN_PROC_NAME, get_task_comm(buf, current)))
diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c
index 87b36da5cd98..0cf3278007ba 100644
--- a/net/netfilter/nf_tables_trace.c
+++ b/net/netfilter/nf_tables_trace.c
@@ -183,7 +183,6 @@ static bool nft_trace_have_verdict_chain(struct nft_traceinfo *info)
 void nft_trace_notify(struct nft_traceinfo *info)
 {
 	const struct nft_pktinfo *pkt = info->pkt;
-	struct nfgenmsg *nfmsg;
 	struct nlmsghdr *nlh;
 	struct sk_buff *skb;
 	unsigned int size;
@@ -219,15 +218,11 @@ void nft_trace_notify(struct nft_traceinfo *info)
 		return;
 
 	event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_TRACE);
-	nlh = nlmsg_put(skb, 0, 0, event, sizeof(struct nfgenmsg), 0);
+	nlh = nfnl_msg_put(skb, 0, 0, event, 0, info->basechain->type->family,
+			   NFNETLINK_V0, 0);
 	if (!nlh)
 		goto nla_put_failure;
 
-	nfmsg = nlmsg_data(nlh);
-	nfmsg->nfgen_family	= info->basechain->type->family;
-	nfmsg->version		= NFNETLINK_V0;
-	nfmsg->res_id		= 0;
-
 	if (nla_put_be32(skb, NFTA_TRACE_NFPROTO, htonl(nft_pf(pkt))))
 		goto nla_put_failure;
 
diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c
index 0fa1653b5f19..bb930f3b06c7 100644
--- a/net/netfilter/nfnetlink_acct.c
+++ b/net/netfilter/nfnetlink_acct.c
@@ -145,21 +145,16 @@ nfnl_acct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
 		   int event, struct nf_acct *acct)
 {
 	struct nlmsghdr *nlh;
-	struct nfgenmsg *nfmsg;
 	unsigned int flags = portid ? NLM_F_MULTI : 0;
 	u64 pkts, bytes;
 	u32 old_flags;
 
 	event = nfnl_msg_type(NFNL_SUBSYS_ACCT, event);
-	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
-	if (nlh == NULL)
+	nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC,
+			   NFNETLINK_V0, 0);
+	if (!nlh)
 		goto nlmsg_failure;
 
-	nfmsg = nlmsg_data(nlh);
-	nfmsg->nfgen_family = AF_UNSPEC;
-	nfmsg->version = NFNETLINK_V0;
-	nfmsg->res_id = 0;
-
 	if (nla_put_string(skb, NFACCT_NAME, acct->name))
 		goto nla_put_failure;
 
diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c
index 0f94fce1d3ed..22f6f7fcc724 100644
--- a/net/netfilter/nfnetlink_cthelper.c
+++ b/net/netfilter/nfnetlink_cthelper.c
@@ -526,20 +526,15 @@ nfnl_cthelper_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
 			int event, struct nf_conntrack_helper *helper)
 {
 	struct nlmsghdr *nlh;
-	struct nfgenmsg *nfmsg;
 	unsigned int flags = portid ? NLM_F_MULTI : 0;
 	int status;
 
 	event = nfnl_msg_type(NFNL_SUBSYS_CTHELPER, event);
-	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
-	if (nlh == NULL)
+	nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC,
+			   NFNETLINK_V0, 0);
+	if (!nlh)
 		goto nlmsg_failure;
 
-	nfmsg = nlmsg_data(nlh);
-	nfmsg->nfgen_family = AF_UNSPEC;
-	nfmsg->version = NFNETLINK_V0;
-	nfmsg->res_id = 0;
-
 	if (nla_put_string(skb, NFCTH_NAME, helper->name))
 		goto nla_put_failure;
 
diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index 89a381f7f945..de831a257512 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -160,22 +160,17 @@ ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
 		       int event, struct ctnl_timeout *timeout)
 {
 	struct nlmsghdr *nlh;
-	struct nfgenmsg *nfmsg;
 	unsigned int flags = portid ? NLM_F_MULTI : 0;
 	const struct nf_conntrack_l4proto *l4proto = timeout->timeout.l4proto;
 	struct nlattr *nest_parms;
 	int ret;
 
 	event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event);
-	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
-	if (nlh == NULL)
+	nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC,
+			   NFNETLINK_V0, 0);
+	if (!nlh)
 		goto nlmsg_failure;
 
-	nfmsg = nlmsg_data(nlh);
-	nfmsg->nfgen_family = AF_UNSPEC;
-	nfmsg->version = NFNETLINK_V0;
-	nfmsg->res_id = 0;
-
 	if (nla_put_string(skb, CTA_TIMEOUT_NAME, timeout->name) ||
 	    nla_put_be16(skb, CTA_TIMEOUT_L3PROTO,
 			 htons(timeout->timeout.l3num)) ||
@@ -382,21 +377,16 @@ cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid,
 			    const unsigned int *timeouts)
 {
 	struct nlmsghdr *nlh;
-	struct nfgenmsg *nfmsg;
 	unsigned int flags = portid ? NLM_F_MULTI : 0;
 	struct nlattr *nest_parms;
 	int ret;
 
 	event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event);
-	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
-	if (nlh == NULL)
+	nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC,
+			   NFNETLINK_V0, 0);
+	if (!nlh)
 		goto nlmsg_failure;
 
-	nfmsg = nlmsg_data(nlh);
-	nfmsg->nfgen_family = AF_UNSPEC;
-	nfmsg->version = NFNETLINK_V0;
-	nfmsg->res_id = 0;
-
 	if (nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(l3num)) ||
 	    nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, l4proto->l4proto))
 		goto nla_put_failure;
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 26776b88a539..d5f458d0ff3d 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -456,20 +456,15 @@ __build_packet_message(struct nfnl_log_net *log,
 {
 	struct nfulnl_msg_packet_hdr pmsg;
 	struct nlmsghdr *nlh;
-	struct nfgenmsg *nfmsg;
 	sk_buff_data_t old_tail = inst->skb->tail;
 	struct sock *sk;
 	const unsigned char *hwhdrp;
 
-	nlh = nlmsg_put(inst->skb, 0, 0,
-			nfnl_msg_type(NFNL_SUBSYS_ULOG, NFULNL_MSG_PACKET),
-			sizeof(struct nfgenmsg), 0);
+	nlh = nfnl_msg_put(inst->skb, 0, 0,
+			   nfnl_msg_type(NFNL_SUBSYS_ULOG, NFULNL_MSG_PACKET),
+			   0, pf, NFNETLINK_V0, htons(inst->group_num));
 	if (!nlh)
 		return -1;
-	nfmsg = nlmsg_data(nlh);
-	nfmsg->nfgen_family = pf;
-	nfmsg->version = NFNETLINK_V0;
-	nfmsg->res_id = htons(inst->group_num);
 
 	memset(&pmsg, 0, sizeof(pmsg));
 	pmsg.hw_protocol	= skb->protocol;
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 48a07914fd94..37e81d895e61 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -383,7 +383,6 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
 	struct nlattr *nla;
 	struct nfqnl_msg_packet_hdr *pmsg;
 	struct nlmsghdr *nlh;
-	struct nfgenmsg *nfmsg;
 	struct sk_buff *entskb = entry->skb;
 	struct net_device *indev;
 	struct net_device *outdev;
@@ -471,18 +470,15 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
 		goto nlmsg_failure;
 	}
 
-	nlh = nlmsg_put(skb, 0, 0,
-			nfnl_msg_type(NFNL_SUBSYS_QUEUE, NFQNL_MSG_PACKET),
-			sizeof(struct nfgenmsg), 0);
+	nlh = nfnl_msg_put(skb, 0, 0,
+			   nfnl_msg_type(NFNL_SUBSYS_QUEUE, NFQNL_MSG_PACKET),
+			   0, entry->state.pf, NFNETLINK_V0,
+			   htons(queue->queue_num));
 	if (!nlh) {
 		skb_tx_error(entskb);
 		kfree_skb(skb);
 		goto nlmsg_failure;
 	}
-	nfmsg = nlmsg_data(nlh);
-	nfmsg->nfgen_family = entry->state.pf;
-	nfmsg->version = NFNETLINK_V0;
-	nfmsg->res_id = htons(queue->queue_num);
 
 	nla = __nla_reserve(skb, NFQA_PACKET_HDR, sizeof(*pmsg));
 	pmsg = nla_data(nla);
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 8e56f353ff35..b8dbd20a6a4c 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -591,19 +591,14 @@ nfnl_compat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
 		      int rev, int target)
 {
 	struct nlmsghdr *nlh;
-	struct nfgenmsg *nfmsg;
 	unsigned int flags = portid ? NLM_F_MULTI : 0;
 
 	event = nfnl_msg_type(NFNL_SUBSYS_NFT_COMPAT, event);
-	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
-	if (nlh == NULL)
+	nlh = nfnl_msg_put(skb, portid, seq, event, flags, family,
+			   NFNETLINK_V0, 0);
+	if (!nlh)
 		goto nlmsg_failure;
 
-	nfmsg = nlmsg_data(nlh);
-	nfmsg->nfgen_family = family;
-	nfmsg->version = NFNETLINK_V0;
-	nfmsg->res_id = 0;
-
 	if (nla_put_string(skb, NFTA_COMPAT_NAME, name) ||
 	    nla_put_be32(skb, NFTA_COMPAT_REV, htonl(rev)) ||
 	    nla_put_be32(skb, NFTA_COMPAT_TYPE, htonl(target)))
-- 
cgit v1.2.3


From 53366a9f917a8601dcad0fd9768d5956cd2f99a6 Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan@kernel.org>
Date: Tue, 30 Mar 2021 16:38:17 +0200
Subject: USB: serial: drop unused suspending flag

The suspending flag was added back in 2009 but no users ever followed.
Remove it.

Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Johan Hovold <johan@kernel.org>
---
 drivers/usb/serial/usb-serial.c | 8 +-------
 include/linux/usb/serial.h      | 1 -
 2 files changed, 1 insertion(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c
index 27e3bb58c872..2a38810a3979 100644
--- a/drivers/usb/serial/usb-serial.c
+++ b/drivers/usb/serial/usb-serial.c
@@ -1114,8 +1114,6 @@ int usb_serial_suspend(struct usb_interface *intf, pm_message_t message)
 	struct usb_serial *serial = usb_get_intfdata(intf);
 	int i, r = 0;
 
-	serial->suspending = 1;
-
 	/*
 	 * serial->type->suspend() MUST return 0 in system sleep context,
 	 * otherwise, the resume callback has to recover device from
@@ -1123,10 +1121,8 @@ int usb_serial_suspend(struct usb_interface *intf, pm_message_t message)
 	 */
 	if (serial->type->suspend) {
 		r = serial->type->suspend(serial, message);
-		if (r < 0) {
-			serial->suspending = 0;
+		if (r < 0)
 			goto err_out;
-		}
 	}
 
 	for (i = 0; i < serial->num_ports; ++i)
@@ -1151,7 +1147,6 @@ int usb_serial_resume(struct usb_interface *intf)
 
 	usb_serial_unpoison_port_urbs(serial);
 
-	serial->suspending = 0;
 	if (serial->type->resume)
 		rv = serial->type->resume(serial);
 	else
@@ -1168,7 +1163,6 @@ static int usb_serial_reset_resume(struct usb_interface *intf)
 
 	usb_serial_unpoison_port_urbs(serial);
 
-	serial->suspending = 0;
 	if (serial->type->reset_resume) {
 		rv = serial->type->reset_resume(serial);
 	} else {
diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h
index 952272002e48..7efba6caaadc 100644
--- a/include/linux/usb/serial.h
+++ b/include/linux/usb/serial.h
@@ -146,7 +146,6 @@ struct usb_serial {
 	struct usb_serial_driver	*type;
 	struct usb_interface		*interface;
 	unsigned char			disconnected:1;
-	unsigned char			suspending:1;
 	unsigned char			attached:1;
 	unsigned char			minors_reserved:1;
 	unsigned char			num_ports;
-- 
cgit v1.2.3


From 5de03c99691d5b0b6253fda1d1d3bbc8239aadb8 Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan@kernel.org>
Date: Tue, 30 Mar 2021 16:38:19 +0200
Subject: USB: serial: add support for multi-interface functions

A single USB function can be implemented using a group of interfaces and
this is for example commonly used for Communication Class devices.

Add support for multi-interface functions to USB serial core and export
an interface that allows drivers to claim a second sibling interface.
The interface could easily be extended to allow claiming further
interfaces if ever needed.

When a driver claims a sibling interface in probe(), core allocates
resources for any bulk in, bulk out, interrupt in and interrupt out
endpoints found also on the sibling interface.

Disconnect is implemented so that unbinding either interface will
release the other interface while disconnect() is called precisely once.

Similarly, suspend() is called when the first sibling interface is
suspended and resume() is called when the last sibling interface is
resumed by USB core.

Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Johan Hovold <johan@kernel.org>
---
 drivers/usb/serial/usb-serial.c | 84 +++++++++++++++++++++++++++++++++++------
 include/linux/usb/serial.h      |  7 ++++
 2 files changed, 80 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c
index d981809c4ed3..aaae71a0bbff 100644
--- a/drivers/usb/serial/usb-serial.c
+++ b/drivers/usb/serial/usb-serial.c
@@ -121,6 +121,44 @@ static void release_minors(struct usb_serial *serial)
 	serial->minors_reserved = 0;
 }
 
+int usb_serial_claim_interface(struct usb_serial *serial, struct usb_interface *intf)
+{
+	struct usb_driver *driver = serial->type->usb_driver;
+	int ret;
+
+	if (serial->sibling)
+		return -EBUSY;
+
+	ret = usb_driver_claim_interface(driver, intf, serial);
+	if (ret) {
+		dev_err(&serial->interface->dev,
+				"failed to claim sibling interface: %d\n", ret);
+		return ret;
+	}
+
+	serial->sibling = intf;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(usb_serial_claim_interface);
+
+static void release_sibling(struct usb_serial *serial, struct usb_interface *intf)
+{
+	struct usb_driver *driver = serial->type->usb_driver;
+	struct usb_interface *sibling;
+
+	if (!serial->sibling)
+		return;
+
+	if (intf == serial->sibling)
+		sibling = serial->interface;
+	else
+		sibling = serial->sibling;
+
+	usb_set_intfdata(sibling, NULL);
+	usb_driver_release_interface(driver, sibling);
+}
+
 static void destroy_serial(struct kref *kref)
 {
 	struct usb_serial *serial;
@@ -742,13 +780,14 @@ static void store_endpoint(struct usb_serial *serial,
 }
 
 static void find_endpoints(struct usb_serial *serial,
-					struct usb_serial_endpoints *epds)
+					struct usb_serial_endpoints *epds,
+					struct usb_interface *intf)
 {
 	struct usb_host_interface *iface_desc;
 	struct usb_endpoint_descriptor *epd;
 	unsigned int i;
 
-	iface_desc = serial->interface->cur_altsetting;
+	iface_desc = intf->cur_altsetting;
 	for (i = 0; i < iface_desc->desc.bNumEndpoints; ++i) {
 		epd = &iface_desc->endpoint[i].desc;
 		store_endpoint(serial, epds, epd);
@@ -917,7 +956,7 @@ static int usb_serial_probe(struct usb_interface *interface,
 
 		if (retval) {
 			dev_dbg(ddev, "sub driver rejected device\n");
-			goto err_put_serial;
+			goto err_release_sibling;
 		}
 	}
 
@@ -925,10 +964,12 @@ static int usb_serial_probe(struct usb_interface *interface,
 	epds = kzalloc(sizeof(*epds), GFP_KERNEL);
 	if (!epds) {
 		retval = -ENOMEM;
-		goto err_put_serial;
+		goto err_release_sibling;
 	}
 
-	find_endpoints(serial, epds);
+	find_endpoints(serial, epds, interface);
+	if (serial->sibling)
+		find_endpoints(serial, epds, serial->sibling);
 
 	if (epds->num_bulk_in < type->num_bulk_in ||
 			epds->num_bulk_out < type->num_bulk_out ||
@@ -1076,7 +1117,8 @@ exit:
 
 err_free_epds:
 	kfree(epds);
-err_put_serial:
+err_release_sibling:
+	release_sibling(serial, interface);
 	usb_serial_put(serial);
 err_put_module:
 	module_put(type->driver.owner);
@@ -1092,6 +1134,10 @@ static void usb_serial_disconnect(struct usb_interface *interface)
 	struct usb_serial_port *port;
 	struct tty_struct *tty;
 
+	/* sibling interface is cleaning up */
+	if (!serial)
+		return;
+
 	usb_serial_console_disconnect(serial);
 
 	mutex_lock(&serial->disc_mutex);
@@ -1115,6 +1161,8 @@ static void usb_serial_disconnect(struct usb_interface *interface)
 	if (serial->type->disconnect)
 		serial->type->disconnect(serial);
 
+	release_sibling(serial, interface);
+
 	/* let the last holder of this object cause it to be cleaned up */
 	usb_serial_put(serial);
 	dev_info(dev, "device disconnected\n");
@@ -1123,7 +1171,11 @@ static void usb_serial_disconnect(struct usb_interface *interface)
 int usb_serial_suspend(struct usb_interface *intf, pm_message_t message)
 {
 	struct usb_serial *serial = usb_get_intfdata(intf);
-	int i, r = 0;
+	int i, r;
+
+	/* suspend when called for first sibling interface */
+	if (serial->suspend_count++)
+		return 0;
 
 	/*
 	 * serial->type->suspend() MUST return 0 in system sleep context,
@@ -1132,14 +1184,16 @@ int usb_serial_suspend(struct usb_interface *intf, pm_message_t message)
 	 */
 	if (serial->type->suspend) {
 		r = serial->type->suspend(serial, message);
-		if (r < 0)
-			goto err_out;
+		if (r < 0) {
+			serial->suspend_count--;
+			return r;
+		}
 	}
 
 	for (i = 0; i < serial->num_ports; ++i)
 		usb_serial_port_poison_urbs(serial->port[i]);
-err_out:
-	return r;
+
+	return 0;
 }
 EXPORT_SYMBOL(usb_serial_suspend);
 
@@ -1156,6 +1210,10 @@ int usb_serial_resume(struct usb_interface *intf)
 	struct usb_serial *serial = usb_get_intfdata(intf);
 	int rv;
 
+	/* resume when called for last sibling interface */
+	if (--serial->suspend_count)
+		return 0;
+
 	usb_serial_unpoison_port_urbs(serial);
 
 	if (serial->type->resume)
@@ -1172,6 +1230,10 @@ static int usb_serial_reset_resume(struct usb_interface *intf)
 	struct usb_serial *serial = usb_get_intfdata(intf);
 	int rv;
 
+	/* resume when called for last sibling interface */
+	if (--serial->suspend_count)
+		return 0;
+
 	usb_serial_unpoison_port_urbs(serial);
 
 	if (serial->type->reset_resume) {
diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h
index 7efba6caaadc..e9b90577f50b 100644
--- a/include/linux/usb/serial.h
+++ b/include/linux/usb/serial.h
@@ -130,6 +130,8 @@ static inline void usb_set_serial_port_data(struct usb_serial_port *port,
  * @dev: pointer to the struct usb_device for this device
  * @type: pointer to the struct usb_serial_driver for this device
  * @interface: pointer to the struct usb_interface for this device
+ * @sibling: pointer to the struct usb_interface of any sibling interface
+ * @suspend_count: number of suspended (sibling) interfaces
  * @num_ports: the number of ports this device has
  * @num_interrupt_in: number of interrupt in endpoints we have
  * @num_interrupt_out: number of interrupt out endpoints we have
@@ -145,6 +147,8 @@ struct usb_serial {
 	struct usb_device		*dev;
 	struct usb_serial_driver	*type;
 	struct usb_interface		*interface;
+	struct usb_interface		*sibling;
+	unsigned int			suspend_count;
 	unsigned char			disconnected:1;
 	unsigned char			attached:1;
 	unsigned char			minors_reserved:1;
@@ -334,6 +338,9 @@ static inline void usb_serial_console_disconnect(struct usb_serial *serial) {}
 /* Functions needed by other parts of the usbserial core */
 struct usb_serial_port *usb_serial_port_get_by_minor(unsigned int minor);
 void usb_serial_put(struct usb_serial *serial);
+
+int usb_serial_claim_interface(struct usb_serial *serial, struct usb_interface *intf);
+
 int usb_serial_generic_open(struct tty_struct *tty, struct usb_serial_port *port);
 int usb_serial_generic_write_start(struct usb_serial_port *port, gfp_t mem_flags);
 int usb_serial_generic_write(struct tty_struct *tty, struct usb_serial_port *port,
-- 
cgit v1.2.3


From 26dbc7e299c7ebbb6a95e2c620b21b5280b37c57 Mon Sep 17 00:00:00 2001
From: Andrew Scull <ascull@google.com>
Date: Thu, 18 Mar 2021 14:33:08 +0000
Subject: bug: Factor out a getter for a bug's file line

There is some non-trivial config-based logic to get the file name and
line number associated with a bug. Factor this out to a getter that can
be reused.

Signed-off-by: Andrew Scull <ascull@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Acked-by: Will Deacon <will@kernel.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20210318143311.839894-3-ascull@google.com
---
 include/linux/bug.h |  3 +++
 lib/bug.c           | 27 +++++++++++++++++----------
 2 files changed, 20 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bug.h b/include/linux/bug.h
index f639bd0122f3..e3841bee4c8d 100644
--- a/include/linux/bug.h
+++ b/include/linux/bug.h
@@ -36,6 +36,9 @@ static inline int is_warning_bug(const struct bug_entry *bug)
 	return bug->flags & BUGFLAG_WARNING;
 }
 
+void bug_get_file_line(struct bug_entry *bug, const char **file,
+		       unsigned int *line);
+
 struct bug_entry *find_bug(unsigned long bugaddr);
 
 enum bug_trap_type report_bug(unsigned long bug_addr, struct pt_regs *regs);
diff --git a/lib/bug.c b/lib/bug.c
index b92da1f6e21b..e65398082cbc 100644
--- a/lib/bug.c
+++ b/lib/bug.c
@@ -127,6 +127,22 @@ static inline struct bug_entry *module_find_bug(unsigned long bugaddr)
 }
 #endif
 
+void bug_get_file_line(struct bug_entry *bug, const char **file,
+		       unsigned int *line)
+{
+	*file = NULL;
+	*line = 0;
+
+#ifdef CONFIG_DEBUG_BUGVERBOSE
+#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS
+	*file = bug->file;
+#else
+	*file = (const char *)bug + bug->file_disp;
+#endif
+	*line = bug->line;
+#endif
+}
+
 struct bug_entry *find_bug(unsigned long bugaddr)
 {
 	struct bug_entry *bug;
@@ -153,17 +169,8 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
 
 	disable_trace_on_warning();
 
-	file = NULL;
-	line = 0;
+	bug_get_file_line(bug, &file, &line);
 
-#ifdef CONFIG_DEBUG_BUGVERBOSE
-#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS
-	file = bug->file;
-#else
-	file = (const char *)bug + bug->file_disp;
-#endif
-	line = bug->line;
-#endif
 	warning = (bug->flags & BUGFLAG_WARNING) != 0;
 	once = (bug->flags & BUGFLAG_ONCE) != 0;
 	done = (bug->flags & BUGFLAG_DONE) != 0;
-- 
cgit v1.2.3


From b01fd6e802b6d0a635176f943315670b679d8d7b Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Tue, 30 Mar 2021 19:32:23 -0700
Subject: skmsg: Introduce a spinlock to protect ingress_msg

Currently we rely on lock_sock to protect ingress_msg, but that lock
is too big for this. We can simply use a spinlock to protect this
list, as is done for other skb queues.

__tcp_bpf_recvmsg() is still special because of peeking, so
it still has to use lock_sock.

Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Jakub Sitnicki <jakub@cloudflare.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20210331023237.41094-3-xiyou.wangcong@gmail.com
---
 include/linux/skmsg.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 net/core/skmsg.c      |  3 +++
 net/ipv4/tcp_bpf.c    | 18 ++++++------------
 3 files changed, 55 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 6c09d94be2e9..f2d45a73b2b2 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -89,6 +89,7 @@ struct sk_psock {
 #endif
 	struct sk_buff_head		ingress_skb;
 	struct list_head		ingress_msg;
+	spinlock_t			ingress_lock;
 	unsigned long			state;
 	struct list_head		link;
 	spinlock_t			link_lock;
@@ -284,7 +285,45 @@ static inline struct sk_psock *sk_psock(const struct sock *sk)
 static inline void sk_psock_queue_msg(struct sk_psock *psock,
 				      struct sk_msg *msg)
 {
+	spin_lock_bh(&psock->ingress_lock);
 	list_add_tail(&msg->list, &psock->ingress_msg);
+	spin_unlock_bh(&psock->ingress_lock);
+}
+
+static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock)
+{
+	struct sk_msg *msg;
+
+	spin_lock_bh(&psock->ingress_lock);
+	msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list);
+	if (msg)
+		list_del(&msg->list);
+	spin_unlock_bh(&psock->ingress_lock);
+	return msg;
+}
+
+static inline struct sk_msg *sk_psock_peek_msg(struct sk_psock *psock)
+{
+	struct sk_msg *msg;
+
+	spin_lock_bh(&psock->ingress_lock);
+	msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list);
+	spin_unlock_bh(&psock->ingress_lock);
+	return msg;
+}
+
+static inline struct sk_msg *sk_psock_next_msg(struct sk_psock *psock,
+					       struct sk_msg *msg)
+{
+	struct sk_msg *ret;
+
+	spin_lock_bh(&psock->ingress_lock);
+	if (list_is_last(&msg->list, &psock->ingress_msg))
+		ret = NULL;
+	else
+		ret = list_next_entry(msg, list);
+	spin_unlock_bh(&psock->ingress_lock);
+	return ret;
 }
 
 static inline bool sk_psock_queue_empty(const struct sk_psock *psock)
@@ -292,6 +331,13 @@ static inline bool sk_psock_queue_empty(const struct sk_psock *psock)
 	return psock ? list_empty(&psock->ingress_msg) : true;
 }
 
+static inline void kfree_sk_msg(struct sk_msg *msg)
+{
+	if (msg->skb)
+		consume_skb(msg->skb);
+	kfree(msg);
+}
+
 static inline void sk_psock_report_error(struct sk_psock *psock, int err)
 {
 	struct sock *sk = psock->sk;
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index bebf84ed4e30..305dddc51857 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -592,6 +592,7 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node)
 
 	INIT_WORK(&psock->work, sk_psock_backlog);
 	INIT_LIST_HEAD(&psock->ingress_msg);
+	spin_lock_init(&psock->ingress_lock);
 	skb_queue_head_init(&psock->ingress_skb);
 
 	sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED);
@@ -638,7 +639,9 @@ static void sk_psock_zap_ingress(struct sk_psock *psock)
 		skb_bpf_redirect_clear(skb);
 		kfree_skb(skb);
 	}
+	spin_lock_bh(&psock->ingress_lock);
 	__sk_psock_purge_ingress_msg(psock);
+	spin_unlock_bh(&psock->ingress_lock);
 }
 
 static void sk_psock_link_destroy(struct sk_psock *psock)
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 17c322b875fd..ae980716d896 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -18,9 +18,7 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
 	struct sk_msg *msg_rx;
 	int i, copied = 0;
 
-	msg_rx = list_first_entry_or_null(&psock->ingress_msg,
-					  struct sk_msg, list);
-
+	msg_rx = sk_psock_peek_msg(psock);
 	while (copied != len) {
 		struct scatterlist *sge;
 
@@ -68,22 +66,18 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
 		} while (i != msg_rx->sg.end);
 
 		if (unlikely(peek)) {
-			if (msg_rx == list_last_entry(&psock->ingress_msg,
-						      struct sk_msg, list))
+			msg_rx = sk_psock_next_msg(psock, msg_rx);
+			if (!msg_rx)
 				break;
-			msg_rx = list_next_entry(msg_rx, list);
 			continue;
 		}
 
 		msg_rx->sg.start = i;
 		if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) {
-			list_del(&msg_rx->list);
-			if (msg_rx->skb)
-				consume_skb(msg_rx->skb);
-			kfree(msg_rx);
+			msg_rx = sk_psock_dequeue_msg(psock);
+			kfree_sk_msg(msg_rx);
 		}
-		msg_rx = list_first_entry_or_null(&psock->ingress_msg,
-						  struct sk_msg, list);
+		msg_rx = sk_psock_peek_msg(psock);
 	}
 
 	return copied;
-- 
cgit v1.2.3


From 0739cd28f2645e814586c7536ba5da9825cb8029 Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Tue, 30 Mar 2021 19:32:24 -0700
Subject: net: Introduce skb_send_sock() for sock_map

We only have skb_send_sock_locked(), which requires callers
to use lock_sock(). Introduce a variant, skb_send_sock(),
which locks on its own, so callers do not need to lock it
anymore. This will save us from adding a ->sendmsg_locked
for each protocol.

To reuse the code, pass function pointers to __skb_send_sock()
and build skb_send_sock() and skb_send_sock_locked() on top.

Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/bpf/20210331023237.41094-4-xiyou.wangcong@gmail.com
---
 include/linux/skbuff.h |  1 +
 net/core/skbuff.c      | 55 +++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 49 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c8def85fcc22..dbf820a50a39 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3626,6 +3626,7 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
 		    unsigned int flags);
 int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
 			 int len);
+int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len);
 void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to);
 unsigned int skb_zerocopy_headlen(const struct sk_buff *from);
 int skb_zerocopy(struct sk_buff *to, struct sk_buff *from,
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index e8320b5d651a..3ad9e8425ab2 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2500,9 +2500,32 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
 }
 EXPORT_SYMBOL_GPL(skb_splice_bits);
 
-/* Send skb data on a socket. Socket must be locked. */
-int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
-			 int len)
+static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg,
+			    struct kvec *vec, size_t num, size_t size)
+{
+	struct socket *sock = sk->sk_socket;
+
+	if (!sock)
+		return -EINVAL;
+	return kernel_sendmsg(sock, msg, vec, num, size);
+}
+
+static int sendpage_unlocked(struct sock *sk, struct page *page, int offset,
+			     size_t size, int flags)
+{
+	struct socket *sock = sk->sk_socket;
+
+	if (!sock)
+		return -EINVAL;
+	return kernel_sendpage(sock, page, offset, size, flags);
+}
+
+typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg,
+			    struct kvec *vec, size_t num, size_t size);
+typedef int (*sendpage_func)(struct sock *sk, struct page *page, int offset,
+			     size_t size, int flags);
+static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset,
+			   int len, sendmsg_func sendmsg, sendpage_func sendpage)
 {
 	unsigned int orig_len = len;
 	struct sk_buff *head = skb;
@@ -2522,7 +2545,8 @@ do_frag_list:
 		memset(&msg, 0, sizeof(msg));
 		msg.msg_flags = MSG_DONTWAIT;
 
-		ret = kernel_sendmsg_locked(sk, &msg, &kv, 1, slen);
+		ret = INDIRECT_CALL_2(sendmsg, kernel_sendmsg_locked,
+				      sendmsg_unlocked, sk, &msg, &kv, 1, slen);
 		if (ret <= 0)
 			goto error;
 
@@ -2553,9 +2577,11 @@ do_frag_list:
 		slen = min_t(size_t, len, skb_frag_size(frag) - offset);
 
 		while (slen) {
-			ret = kernel_sendpage_locked(sk, skb_frag_page(frag),
-						     skb_frag_off(frag) + offset,
-						     slen, MSG_DONTWAIT);
+			ret = INDIRECT_CALL_2(sendpage, kernel_sendpage_locked,
+					      sendpage_unlocked, sk,
+					      skb_frag_page(frag),
+					      skb_frag_off(frag) + offset,
+					      slen, MSG_DONTWAIT);
 			if (ret <= 0)
 				goto error;
 
@@ -2587,8 +2613,23 @@ out:
 error:
 	return orig_len == len ? ret : orig_len - len;
 }
+
+/* Send skb data on a socket. Socket must be locked. */
+int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
+			 int len)
+{
+	return __skb_send_sock(sk, skb, offset, len, kernel_sendmsg_locked,
+			       kernel_sendpage_locked);
+}
 EXPORT_SYMBOL_GPL(skb_send_sock_locked);
 
+/* Send skb data on a socket. Socket must be unlocked. */
+int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len)
+{
+	return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked,
+			       sendpage_unlocked);
+}
+
 /**
  *	skb_store_bits - store bits from kernel buffer to skb
  *	@skb: destination buffer
-- 
cgit v1.2.3


From 799aa7f98d53e0f541fa6b4dc9aa47b4ff2178e3 Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Tue, 30 Mar 2021 19:32:25 -0700
Subject: skmsg: Avoid lock_sock() in sk_psock_backlog()

We do not have to lock the sock to avoid losing sk_socket;
instead, we can purge all the ingress queues when we close
the socket. Sending or receiving packets after orphaning
the socket makes no sense.

We do purge these queues when the psock refcnt reaches zero, but
here we want to purge them explicitly in sock_map_close().
There are also some nasty race conditions on testing bit
SK_PSOCK_TX_ENABLED and queuing/canceling the psock work;
we can expand psock->ingress_lock a bit to protect them too.

As noticed by John, we still have to lock the psock->work,
because the same work item could be running concurrently on
different CPUs.

Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20210331023237.41094-5-xiyou.wangcong@gmail.com
---
 include/linux/skmsg.h |  2 ++
 net/core/skmsg.c      | 50 ++++++++++++++++++++++++++++++++++----------------
 net/core/sock_map.c   |  1 +
 3 files changed, 37 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index f2d45a73b2b2..7382c4b518d7 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -99,6 +99,7 @@ struct sk_psock {
 	void (*saved_write_space)(struct sock *sk);
 	void (*saved_data_ready)(struct sock *sk);
 	struct proto			*sk_proto;
+	struct mutex			work_mutex;
 	struct sk_psock_work_state	work_state;
 	struct work_struct		work;
 	union {
@@ -347,6 +348,7 @@ static inline void sk_psock_report_error(struct sk_psock *psock, int err)
 }
 
 struct sk_psock *sk_psock_init(struct sock *sk, int node);
+void sk_psock_stop(struct sk_psock *psock, bool wait);
 
 #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
 int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock);
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 305dddc51857..9c25020086a9 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -497,7 +497,7 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
 	if (!ingress) {
 		if (!sock_writeable(psock->sk))
 			return -EAGAIN;
-		return skb_send_sock_locked(psock->sk, skb, off, len);
+		return skb_send_sock(psock->sk, skb, off, len);
 	}
 	return sk_psock_skb_ingress(psock, skb);
 }
@@ -511,8 +511,7 @@ static void sk_psock_backlog(struct work_struct *work)
 	u32 len, off;
 	int ret;
 
-	/* Lock sock to avoid losing sk_socket during loop. */
-	lock_sock(psock->sk);
+	mutex_lock(&psock->work_mutex);
 	if (state->skb) {
 		skb = state->skb;
 		len = state->len;
@@ -529,7 +528,7 @@ start:
 		skb_bpf_redirect_clear(skb);
 		do {
 			ret = -EIO;
-			if (likely(psock->sk->sk_socket))
+			if (!sock_flag(psock->sk, SOCK_DEAD))
 				ret = sk_psock_handle_skb(psock, skb, off,
 							  len, ingress);
 			if (ret <= 0) {
@@ -553,7 +552,7 @@ start:
 			kfree_skb(skb);
 	}
 end:
-	release_sock(psock->sk);
+	mutex_unlock(&psock->work_mutex);
 }
 
 struct sk_psock *sk_psock_init(struct sock *sk, int node)
@@ -591,6 +590,7 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node)
 	spin_lock_init(&psock->link_lock);
 
 	INIT_WORK(&psock->work, sk_psock_backlog);
+	mutex_init(&psock->work_mutex);
 	INIT_LIST_HEAD(&psock->ingress_msg);
 	spin_lock_init(&psock->ingress_lock);
 	skb_queue_head_init(&psock->ingress_skb);
@@ -631,7 +631,7 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
 	}
 }
 
-static void sk_psock_zap_ingress(struct sk_psock *psock)
+static void __sk_psock_zap_ingress(struct sk_psock *psock)
 {
 	struct sk_buff *skb;
 
@@ -639,9 +639,7 @@ static void sk_psock_zap_ingress(struct sk_psock *psock)
 		skb_bpf_redirect_clear(skb);
 		kfree_skb(skb);
 	}
-	spin_lock_bh(&psock->ingress_lock);
 	__sk_psock_purge_ingress_msg(psock);
-	spin_unlock_bh(&psock->ingress_lock);
 }
 
 static void sk_psock_link_destroy(struct sk_psock *psock)
@@ -654,6 +652,18 @@ static void sk_psock_link_destroy(struct sk_psock *psock)
 	}
 }
 
+void sk_psock_stop(struct sk_psock *psock, bool wait)
+{
+	spin_lock_bh(&psock->ingress_lock);
+	sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
+	sk_psock_cork_free(psock);
+	__sk_psock_zap_ingress(psock);
+	spin_unlock_bh(&psock->ingress_lock);
+
+	if (wait)
+		cancel_work_sync(&psock->work);
+}
+
 static void sk_psock_done_strp(struct sk_psock *psock);
 
 static void sk_psock_destroy_deferred(struct work_struct *gc)
@@ -665,12 +675,12 @@ static void sk_psock_destroy_deferred(struct work_struct *gc)
 	sk_psock_done_strp(psock);
 
 	cancel_work_sync(&psock->work);
+	mutex_destroy(&psock->work_mutex);
 
 	psock_progs_drop(&psock->progs);
 
 	sk_psock_link_destroy(psock);
 	sk_psock_cork_free(psock);
-	sk_psock_zap_ingress(psock);
 
 	if (psock->sk_redir)
 		sock_put(psock->sk_redir);
@@ -688,8 +698,7 @@ static void sk_psock_destroy(struct rcu_head *rcu)
 
 void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
 {
-	sk_psock_cork_free(psock);
-	sk_psock_zap_ingress(psock);
+	sk_psock_stop(psock, false);
 
 	write_lock_bh(&sk->sk_callback_lock);
 	sk_psock_restore_proto(sk, psock);
@@ -699,7 +708,6 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
 	else if (psock->progs.stream_verdict)
 		sk_psock_stop_verdict(sk, psock);
 	write_unlock_bh(&sk->sk_callback_lock);
-	sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
 
 	call_rcu(&psock->rcu, sk_psock_destroy);
 }
@@ -770,14 +778,20 @@ static void sk_psock_skb_redirect(struct sk_buff *skb)
 	 * error that caused the pipe to break. We can't send a packet on
 	 * a socket that is in this state so we drop the skb.
 	 */
-	if (!psock_other || sock_flag(sk_other, SOCK_DEAD) ||
-	    !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) {
+	if (!psock_other || sock_flag(sk_other, SOCK_DEAD)) {
+		kfree_skb(skb);
+		return;
+	}
+	spin_lock_bh(&psock_other->ingress_lock);
+	if (!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) {
+		spin_unlock_bh(&psock_other->ingress_lock);
 		kfree_skb(skb);
 		return;
 	}
 
 	skb_queue_tail(&psock_other->ingress_skb, skb);
 	schedule_work(&psock_other->work);
+	spin_unlock_bh(&psock_other->ingress_lock);
 }
 
 static void sk_psock_tls_verdict_apply(struct sk_buff *skb, struct sock *sk, int verdict)
@@ -845,8 +859,12 @@ static void sk_psock_verdict_apply(struct sk_psock *psock,
 			err = sk_psock_skb_ingress_self(psock, skb);
 		}
 		if (err < 0) {
-			skb_queue_tail(&psock->ingress_skb, skb);
-			schedule_work(&psock->work);
+			spin_lock_bh(&psock->ingress_lock);
+			if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
+				skb_queue_tail(&psock->ingress_skb, skb);
+				schedule_work(&psock->work);
+			}
+			spin_unlock_bh(&psock->ingress_lock);
 		}
 		break;
 	case __SK_REDIRECT:
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index dd53a7771d7e..e564fdeaada1 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -1540,6 +1540,7 @@ void sock_map_close(struct sock *sk, long timeout)
 	saved_close = psock->saved_close;
 	sock_map_remove_links(sk, psock);
 	rcu_read_unlock();
+	sk_psock_stop(psock, true);
 	release_sock(sk);
 	saved_close(sk, timeout);
 }
-- 
cgit v1.2.3


From 7786dfc41a74e0567557b5c4a28fc8482f5f5691 Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Tue, 30 Mar 2021 19:32:26 -0700
Subject: skmsg: Use rcu work for destroying psock

The RCU callback sk_psock_destroy() only queues work psock->gc,
so we can just switch to rcu work to simplify the code.

Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20210331023237.41094-6-xiyou.wangcong@gmail.com
---
 include/linux/skmsg.h |  5 +----
 net/core/skmsg.c      | 17 +++++------------
 2 files changed, 6 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 7382c4b518d7..e7aba150539d 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -102,10 +102,7 @@ struct sk_psock {
 	struct mutex			work_mutex;
 	struct sk_psock_work_state	work_state;
 	struct work_struct		work;
-	union {
-		struct rcu_head		rcu;
-		struct work_struct	gc;
-	};
+	struct rcu_work			rwork;
 };
 
 int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 9c25020086a9..d43d43905d2c 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -666,10 +666,10 @@ void sk_psock_stop(struct sk_psock *psock, bool wait)
 
 static void sk_psock_done_strp(struct sk_psock *psock);
 
-static void sk_psock_destroy_deferred(struct work_struct *gc)
+static void sk_psock_destroy(struct work_struct *work)
 {
-	struct sk_psock *psock = container_of(gc, struct sk_psock, gc);
-
+	struct sk_psock *psock = container_of(to_rcu_work(work),
+					      struct sk_psock, rwork);
 	/* No sk_callback_lock since already detached. */
 
 	sk_psock_done_strp(psock);
@@ -688,14 +688,6 @@ static void sk_psock_destroy_deferred(struct work_struct *gc)
 	kfree(psock);
 }
 
-static void sk_psock_destroy(struct rcu_head *rcu)
-{
-	struct sk_psock *psock = container_of(rcu, struct sk_psock, rcu);
-
-	INIT_WORK(&psock->gc, sk_psock_destroy_deferred);
-	schedule_work(&psock->gc);
-}
-
 void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
 {
 	sk_psock_stop(psock, false);
@@ -709,7 +701,8 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
 		sk_psock_stop_verdict(sk, psock);
 	write_unlock_bh(&sk->sk_callback_lock);
 
-	call_rcu(&psock->rcu, sk_psock_destroy);
+	INIT_RCU_WORK(&psock->rwork, sk_psock_destroy);
+	queue_rcu_work(system_wq, &psock->rwork);
 }
 EXPORT_SYMBOL_GPL(sk_psock_drop);
 
-- 
cgit v1.2.3


From a7ba4558e69a3c2ae4ca521f015832ef44799538 Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Tue, 30 Mar 2021 19:32:30 -0700
Subject: sock_map: Introduce BPF_SK_SKB_VERDICT

Reusing BPF_SK_SKB_STREAM_VERDICT is possible, but its name is
confusing and, more importantly, we still want to distinguish the
two program types from user space. So we can just reuse the stream
verdict code but introduce a new type of eBPF program, skb_verdict.
Users are not allowed to attach stream_verdict and skb_verdict
programs to the same map.

Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20210331023237.41094-10-xiyou.wangcong@gmail.com
---
 include/linux/skmsg.h          |  2 ++
 include/uapi/linux/bpf.h       |  1 +
 kernel/bpf/syscall.c           |  1 +
 net/core/skmsg.c               |  4 +++-
 net/core/sock_map.c            | 28 ++++++++++++++++++++++++++++
 tools/bpf/bpftool/common.c     |  1 +
 tools/bpf/bpftool/prog.c       |  1 +
 tools/include/uapi/linux/bpf.h |  1 +
 8 files changed, 38 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index e7aba150539d..c83dbc2d81d9 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -58,6 +58,7 @@ struct sk_psock_progs {
 	struct bpf_prog			*msg_parser;
 	struct bpf_prog			*stream_parser;
 	struct bpf_prog			*stream_verdict;
+	struct bpf_prog			*skb_verdict;
 };
 
 enum sk_psock_state_bits {
@@ -487,6 +488,7 @@ static inline void psock_progs_drop(struct sk_psock_progs *progs)
 	psock_set_prog(&progs->msg_parser, NULL);
 	psock_set_prog(&progs->stream_parser, NULL);
 	psock_set_prog(&progs->stream_verdict, NULL);
+	psock_set_prog(&progs->skb_verdict, NULL);
 }
 
 int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 598716742593..49371eba98ba 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -957,6 +957,7 @@ enum bpf_attach_type {
 	BPF_XDP_CPUMAP,
 	BPF_SK_LOOKUP,
 	BPF_XDP,
+	BPF_SK_SKB_VERDICT,
 	__MAX_BPF_ATTACH_TYPE
 };
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 9603de81811a..6428634da57e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2948,6 +2948,7 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
 		return BPF_PROG_TYPE_SK_MSG;
 	case BPF_SK_SKB_STREAM_PARSER:
 	case BPF_SK_SKB_STREAM_VERDICT:
+	case BPF_SK_SKB_VERDICT:
 		return BPF_PROG_TYPE_SK_SKB;
 	case BPF_LIRC_MODE2:
 		return BPF_PROG_TYPE_LIRC_MODE2;
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 656eceab73bc..a045812d7c78 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -697,7 +697,7 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
 	rcu_assign_sk_user_data(sk, NULL);
 	if (psock->progs.stream_parser)
 		sk_psock_stop_strp(sk, psock);
-	else if (psock->progs.stream_verdict)
+	else if (psock->progs.stream_verdict || psock->progs.skb_verdict)
 		sk_psock_stop_verdict(sk, psock);
 	write_unlock_bh(&sk->sk_callback_lock);
 
@@ -1024,6 +1024,8 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb,
 	}
 	skb_set_owner_r(skb, sk);
 	prog = READ_ONCE(psock->progs.stream_verdict);
+	if (!prog)
+		prog = READ_ONCE(psock->progs.skb_verdict);
 	if (likely(prog)) {
 		skb_dst_drop(skb);
 		skb_bpf_redirect_clear(skb);
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 42d797291d34..c2a0411e08a8 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -156,6 +156,8 @@ static void sock_map_del_link(struct sock *sk,
 				strp_stop = true;
 			if (psock->saved_data_ready && stab->progs.stream_verdict)
 				verdict_stop = true;
+			if (psock->saved_data_ready && stab->progs.skb_verdict)
+				verdict_stop = true;
 			list_del(&link->list);
 			sk_psock_free_link(link);
 		}
@@ -232,6 +234,7 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk)
 	struct sk_psock_progs *progs = sock_map_progs(map);
 	struct bpf_prog *stream_verdict = NULL;
 	struct bpf_prog *stream_parser = NULL;
+	struct bpf_prog *skb_verdict = NULL;
 	struct bpf_prog *msg_parser = NULL;
 	struct sk_psock *psock;
 	int ret;
@@ -268,6 +271,15 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk)
 		}
 	}
 
+	skb_verdict = READ_ONCE(progs->skb_verdict);
+	if (skb_verdict) {
+		skb_verdict = bpf_prog_inc_not_zero(skb_verdict);
+		if (IS_ERR(skb_verdict)) {
+			ret = PTR_ERR(skb_verdict);
+			goto out_put_msg_parser;
+		}
+	}
+
 no_progs:
 	psock = sock_map_psock_get_checked(sk);
 	if (IS_ERR(psock)) {
@@ -278,6 +290,9 @@ no_progs:
 	if (psock) {
 		if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) ||
 		    (stream_parser  && READ_ONCE(psock->progs.stream_parser)) ||
+		    (skb_verdict && READ_ONCE(psock->progs.skb_verdict)) ||
+		    (skb_verdict && READ_ONCE(psock->progs.stream_verdict)) ||
+		    (stream_verdict && READ_ONCE(psock->progs.skb_verdict)) ||
 		    (stream_verdict && READ_ONCE(psock->progs.stream_verdict))) {
 			sk_psock_put(sk, psock);
 			ret = -EBUSY;
@@ -309,6 +324,9 @@ no_progs:
 	} else if (!stream_parser && stream_verdict && !psock->saved_data_ready) {
 		psock_set_prog(&psock->progs.stream_verdict, stream_verdict);
 		sk_psock_start_verdict(sk,psock);
+	} else if (!stream_verdict && skb_verdict && !psock->saved_data_ready) {
+		psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
+		sk_psock_start_verdict(sk, psock);
 	}
 	write_unlock_bh(&sk->sk_callback_lock);
 	return 0;
@@ -317,6 +335,9 @@ out_unlock_drop:
 out_drop:
 	sk_psock_put(sk, psock);
 out_progs:
+	if (skb_verdict)
+		bpf_prog_put(skb_verdict);
+out_put_msg_parser:
 	if (msg_parser)
 		bpf_prog_put(msg_parser);
 out_put_stream_parser:
@@ -1442,8 +1463,15 @@ static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
 		break;
 #endif
 	case BPF_SK_SKB_STREAM_VERDICT:
+		if (progs->skb_verdict)
+			return -EBUSY;
 		pprog = &progs->stream_verdict;
 		break;
+	case BPF_SK_SKB_VERDICT:
+		if (progs->stream_verdict)
+			return -EBUSY;
+		pprog = &progs->skb_verdict;
+		break;
 	default:
 		return -EOPNOTSUPP;
 	}
diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c
index 65303664417e..1828bba19020 100644
--- a/tools/bpf/bpftool/common.c
+++ b/tools/bpf/bpftool/common.c
@@ -57,6 +57,7 @@ const char * const attach_type_name[__MAX_BPF_ATTACH_TYPE] = {
 
 	[BPF_SK_SKB_STREAM_PARSER]	= "sk_skb_stream_parser",
 	[BPF_SK_SKB_STREAM_VERDICT]	= "sk_skb_stream_verdict",
+	[BPF_SK_SKB_VERDICT]		= "sk_skb_verdict",
 	[BPF_SK_MSG_VERDICT]		= "sk_msg_verdict",
 	[BPF_LIRC_MODE2]		= "lirc_mode2",
 	[BPF_FLOW_DISSECTOR]		= "flow_dissector",
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index f2b915b20546..3f067d2d7584 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -76,6 +76,7 @@ enum dump_mode {
 static const char * const attach_type_strings[] = {
 	[BPF_SK_SKB_STREAM_PARSER] = "stream_parser",
 	[BPF_SK_SKB_STREAM_VERDICT] = "stream_verdict",
+	[BPF_SK_SKB_VERDICT] = "skb_verdict",
 	[BPF_SK_MSG_VERDICT] = "msg_verdict",
 	[BPF_FLOW_DISSECTOR] = "flow_dissector",
 	[__MAX_BPF_ATTACH_TYPE] = NULL,
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index ab9f2233607c..69902603012c 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -957,6 +957,7 @@ enum bpf_attach_type {
 	BPF_XDP_CPUMAP,
 	BPF_SK_LOOKUP,
 	BPF_XDP,
+	BPF_SK_SKB_VERDICT,
 	__MAX_BPF_ATTACH_TYPE
 };
 
-- 
cgit v1.2.3


From 8a59f9d1e3d4340659fdfee8879dc09a6f2546e1 Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Tue, 30 Mar 2021 19:32:31 -0700
Subject: sock: Introduce sk->sk_prot->psock_update_sk_prot()

Currently sockmap calls into each protocol to update the struct
proto and replace it. This certainly won't work when the protocol
is implemented as a module, for example, AF_UNIX.

Introduce a new ops sk->sk_prot->psock_update_sk_prot(), so each
protocol can implement its own way to replace the struct proto.
This also helps get rid of symbol dependencies on CONFIG_INET.

Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210331023237.41094-11-xiyou.wangcong@gmail.com
---
 include/linux/skmsg.h | 18 +++---------------
 include/net/sock.h    |  3 +++
 include/net/tcp.h     |  1 +
 include/net/udp.h     |  1 +
 net/core/skmsg.c      |  5 -----
 net/core/sock_map.c   | 24 ++++--------------------
 net/ipv4/tcp_bpf.c    | 24 +++++++++++++++++++++---
 net/ipv4/tcp_ipv4.c   |  3 +++
 net/ipv4/udp.c        |  3 +++
 net/ipv4/udp_bpf.c    | 15 +++++++++++++--
 net/ipv6/tcp_ipv6.c   |  3 +++
 net/ipv6/udp.c        |  3 +++
 12 files changed, 58 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index c83dbc2d81d9..5e800ddc2dc6 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -99,6 +99,7 @@ struct sk_psock {
 	void (*saved_close)(struct sock *sk, long timeout);
 	void (*saved_write_space)(struct sock *sk);
 	void (*saved_data_ready)(struct sock *sk);
+	int  (*psock_update_sk_prot)(struct sock *sk, bool restore);
 	struct proto			*sk_proto;
 	struct mutex			work_mutex;
 	struct sk_psock_work_state	work_state;
@@ -395,25 +396,12 @@ static inline void sk_psock_cork_free(struct sk_psock *psock)
 	}
 }
 
-static inline void sk_psock_update_proto(struct sock *sk,
-					 struct sk_psock *psock,
-					 struct proto *ops)
-{
-	/* Pairs with lockless read in sk_clone_lock() */
-	WRITE_ONCE(sk->sk_prot, ops);
-}
-
 static inline void sk_psock_restore_proto(struct sock *sk,
 					  struct sk_psock *psock)
 {
 	sk->sk_prot->unhash = psock->saved_unhash;
-	if (inet_csk_has_ulp(sk)) {
-		tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space);
-	} else {
-		sk->sk_write_space = psock->saved_write_space;
-		/* Pairs with lockless read in sk_clone_lock() */
-		WRITE_ONCE(sk->sk_prot, psock->sk_proto);
-	}
+	if (psock->psock_update_sk_prot)
+		psock->psock_update_sk_prot(sk, true);
 }
 
 static inline void sk_psock_set_state(struct sk_psock *psock,
diff --git a/include/net/sock.h b/include/net/sock.h
index 0b6266fd6bf6..8b4155e756c2 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1184,6 +1184,9 @@ struct proto {
 	void			(*unhash)(struct sock *sk);
 	void			(*rehash)(struct sock *sk);
 	int			(*get_port)(struct sock *sk, unsigned short snum);
+#ifdef CONFIG_BPF_SYSCALL
+	int			(*psock_update_sk_prot)(struct sock *sk, bool restore);
+#endif
 
 	/* Keeping track of sockets in use */
 #ifdef CONFIG_PROC_FS
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 075de26f449d..2efa4e5ea23d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2203,6 +2203,7 @@ struct sk_psock;
 
 #ifdef CONFIG_BPF_SYSCALL
 struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock);
+int tcp_bpf_update_proto(struct sock *sk, bool restore);
 void tcp_bpf_clone(const struct sock *sk, struct sock *newsk);
 #endif /* CONFIG_BPF_SYSCALL */
 
diff --git a/include/net/udp.h b/include/net/udp.h
index d4d064c59232..df7cc1edc200 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -518,6 +518,7 @@ static inline struct sk_buff *udp_rcv_segment(struct sock *sk,
 #ifdef CONFIG_BPF_SYSCALL
 struct sk_psock;
 struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock);
+int udp_bpf_update_proto(struct sock *sk, bool restore);
 #endif
 
 #endif	/* _UDP_H */
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index a045812d7c78..9fc83f7cc1a0 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -562,11 +562,6 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node)
 
 	write_lock_bh(&sk->sk_callback_lock);
 
-	if (inet_csk_has_ulp(sk)) {
-		psock = ERR_PTR(-EINVAL);
-		goto out;
-	}
-
 	if (sk->sk_user_data) {
 		psock = ERR_PTR(-EBUSY);
 		goto out;
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index c2a0411e08a8..2915c7c8778b 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -185,26 +185,10 @@ static void sock_map_unref(struct sock *sk, void *link_raw)
 
 static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock)
 {
-	struct proto *prot;
-
-	switch (sk->sk_type) {
-	case SOCK_STREAM:
-		prot = tcp_bpf_get_proto(sk, psock);
-		break;
-
-	case SOCK_DGRAM:
-		prot = udp_bpf_get_proto(sk, psock);
-		break;
-
-	default:
+	if (!sk->sk_prot->psock_update_sk_prot)
 		return -EINVAL;
-	}
-
-	if (IS_ERR(prot))
-		return PTR_ERR(prot);
-
-	sk_psock_update_proto(sk, psock, prot);
-	return 0;
+	psock->psock_update_sk_prot = sk->sk_prot->psock_update_sk_prot;
+	return sk->sk_prot->psock_update_sk_prot(sk, false);
 }
 
 static struct sk_psock *sock_map_psock_get_checked(struct sock *sk)
@@ -556,7 +540,7 @@ static bool sock_map_redirect_allowed(const struct sock *sk)
 
 static bool sock_map_sk_is_suitable(const struct sock *sk)
 {
-	return sk_is_tcp(sk) || sk_is_udp(sk);
+	return !!sk->sk_prot->psock_update_sk_prot;
 }
 
 static bool sock_map_sk_state_allowed(const struct sock *sk)
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index ae980716d896..ac8cfbaeacd2 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -595,20 +595,38 @@ static int tcp_bpf_assert_proto_ops(struct proto *ops)
 	       ops->sendpage == tcp_sendpage ? 0 : -ENOTSUPP;
 }
 
-struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock)
+int tcp_bpf_update_proto(struct sock *sk, bool restore)
 {
+	struct sk_psock *psock = sk_psock(sk);
 	int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
 	int config = psock->progs.msg_parser   ? TCP_BPF_TX   : TCP_BPF_BASE;
 
+	if (restore) {
+		if (inet_csk_has_ulp(sk)) {
+			tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space);
+		} else {
+			sk->sk_write_space = psock->saved_write_space;
+			/* Pairs with lockless read in sk_clone_lock() */
+			WRITE_ONCE(sk->sk_prot, psock->sk_proto);
+		}
+		return 0;
+	}
+
+	if (inet_csk_has_ulp(sk))
+		return -EINVAL;
+
 	if (sk->sk_family == AF_INET6) {
 		if (tcp_bpf_assert_proto_ops(psock->sk_proto))
-			return ERR_PTR(-EINVAL);
+			return -EINVAL;
 
 		tcp_bpf_check_v6_needs_rebuild(psock->sk_proto);
 	}
 
-	return &tcp_bpf_prots[family][config];
+	/* Pairs with lockless read in sk_clone_lock() */
+	WRITE_ONCE(sk->sk_prot, &tcp_bpf_prots[family][config]);
+	return 0;
 }
+EXPORT_SYMBOL_GPL(tcp_bpf_update_proto);
 
 /* If a child got cloned from a listening socket that had tcp_bpf
  * protocol callbacks installed, we need to restore the callbacks to
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index daad4f99db32..dfc6d1c0e710 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2806,6 +2806,9 @@ struct proto tcp_prot = {
 	.hash			= inet_hash,
 	.unhash			= inet_unhash,
 	.get_port		= inet_csk_get_port,
+#ifdef CONFIG_BPF_SYSCALL
+	.psock_update_sk_prot	= tcp_bpf_update_proto,
+#endif
 	.enter_memory_pressure	= tcp_enter_memory_pressure,
 	.leave_memory_pressure	= tcp_leave_memory_pressure,
 	.stream_memory_free	= tcp_stream_memory_free,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 4a0478b17243..38952aaee3a1 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2849,6 +2849,9 @@ struct proto udp_prot = {
 	.unhash			= udp_lib_unhash,
 	.rehash			= udp_v4_rehash,
 	.get_port		= udp_v4_get_port,
+#ifdef CONFIG_BPF_SYSCALL
+	.psock_update_sk_prot	= udp_bpf_update_proto,
+#endif
 	.memory_allocated	= &udp_memory_allocated,
 	.sysctl_mem		= sysctl_udp_mem,
 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_udp_wmem_min),
diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c
index 7a94791efc1a..6001f93cd3a0 100644
--- a/net/ipv4/udp_bpf.c
+++ b/net/ipv4/udp_bpf.c
@@ -41,12 +41,23 @@ static int __init udp_bpf_v4_build_proto(void)
 }
 core_initcall(udp_bpf_v4_build_proto);
 
-struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock)
+int udp_bpf_update_proto(struct sock *sk, bool restore)
 {
 	int family = sk->sk_family == AF_INET ? UDP_BPF_IPV4 : UDP_BPF_IPV6;
+	struct sk_psock *psock = sk_psock(sk);
+
+	if (restore) {
+		sk->sk_write_space = psock->saved_write_space;
+		/* Pairs with lockless read in sk_clone_lock() */
+		WRITE_ONCE(sk->sk_prot, psock->sk_proto);
+		return 0;
+	}
 
 	if (sk->sk_family == AF_INET6)
 		udp_bpf_check_v6_needs_rebuild(psock->sk_proto);
 
-	return &udp_bpf_prots[family];
+	/* Pairs with lockless read in sk_clone_lock() */
+	WRITE_ONCE(sk->sk_prot, &udp_bpf_prots[family]);
+	return 0;
 }
+EXPORT_SYMBOL_GPL(udp_bpf_update_proto);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index d0f007741e8e..bff22d6ef516 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2139,6 +2139,9 @@ struct proto tcpv6_prot = {
 	.hash			= inet6_hash,
 	.unhash			= inet_unhash,
 	.get_port		= inet_csk_get_port,
+#ifdef CONFIG_BPF_SYSCALL
+	.psock_update_sk_prot	= tcp_bpf_update_proto,
+#endif
 	.enter_memory_pressure	= tcp_enter_memory_pressure,
 	.leave_memory_pressure	= tcp_leave_memory_pressure,
 	.stream_memory_free	= tcp_stream_memory_free,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index d25e5a9252fd..ef2c75bb4771 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1713,6 +1713,9 @@ struct proto udpv6_prot = {
 	.unhash			= udp_lib_unhash,
 	.rehash			= udp_v6_rehash,
 	.get_port		= udp_v6_get_port,
+#ifdef CONFIG_BPF_SYSCALL
+	.psock_update_sk_prot	= udp_bpf_update_proto,
+#endif
 	.memory_allocated	= &udp_memory_allocated,
 	.sysctl_mem		= sysctl_udp_mem,
 	.sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_udp_wmem_min),
-- 
cgit v1.2.3


From 2bc793e3272a13e337416c057cb81c5396ad91d1 Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Tue, 30 Mar 2021 19:32:33 -0700
Subject: skmsg: Extract __tcp_bpf_recvmsg() and tcp_bpf_wait_data()

Although these two functions are only used by TCP, they are not
specific to TCP at all: both operate on skmsg and ingress_msg,
so they fit in net/core/skmsg.c very well.

And we will need them for non-TCP, so rename and move them to
skmsg.c and export them to modules.

Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210331023237.41094-13-xiyou.wangcong@gmail.com
---
 include/linux/skmsg.h |   4 ++
 include/net/tcp.h     |   2 -
 net/core/skmsg.c      |  98 +++++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_bpf.c    | 100 +-------------------------------------------------
 net/tls/tls_sw.c      |   4 +-
 5 files changed, 106 insertions(+), 102 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 5e800ddc2dc6..f78e90a04a69 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -125,6 +125,10 @@ int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
 			      struct sk_msg *msg, u32 bytes);
 int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
 			     struct sk_msg *msg, u32 bytes);
+int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags,
+		     long timeo, int *err);
+int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
+		   int len, int flags);
 
 static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes)
 {
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 2efa4e5ea23d..31b1696c62ba 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2209,8 +2209,6 @@ void tcp_bpf_clone(const struct sock *sk, struct sock *newsk);
 
 int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes,
 			  int flags);
-int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
-		      struct msghdr *msg, int len, int flags);
 #endif /* CONFIG_NET_SOCK_MSG */
 
 #if !defined(CONFIG_BPF_SYSCALL) || !defined(CONFIG_NET_SOCK_MSG)
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 9fc83f7cc1a0..92a83c02562a 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -399,6 +399,104 @@ out:
 }
 EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter);
 
+int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags,
+		     long timeo, int *err)
+{
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+	int ret = 0;
+
+	if (sk->sk_shutdown & RCV_SHUTDOWN)
+		return 1;
+
+	if (!timeo)
+		return ret;
+
+	add_wait_queue(sk_sleep(sk), &wait);
+	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	ret = sk_wait_event(sk, &timeo,
+			    !list_empty(&psock->ingress_msg) ||
+			    !skb_queue_empty(&sk->sk_receive_queue), &wait);
+	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	remove_wait_queue(sk_sleep(sk), &wait);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(sk_msg_wait_data);
+
+/* Receive sk_msg from psock->ingress_msg to @msg. */
+int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
+		   int len, int flags)
+{
+	struct iov_iter *iter = &msg->msg_iter;
+	int peek = flags & MSG_PEEK;
+	struct sk_msg *msg_rx;
+	int i, copied = 0;
+
+	msg_rx = sk_psock_peek_msg(psock);
+	while (copied != len) {
+		struct scatterlist *sge;
+
+		if (unlikely(!msg_rx))
+			break;
+
+		i = msg_rx->sg.start;
+		do {
+			struct page *page;
+			int copy;
+
+			sge = sk_msg_elem(msg_rx, i);
+			copy = sge->length;
+			page = sg_page(sge);
+			if (copied + copy > len)
+				copy = len - copied;
+			copy = copy_page_to_iter(page, sge->offset, copy, iter);
+			if (!copy)
+				return copied ? copied : -EFAULT;
+
+			copied += copy;
+			if (likely(!peek)) {
+				sge->offset += copy;
+				sge->length -= copy;
+				if (!msg_rx->skb)
+					sk_mem_uncharge(sk, copy);
+				msg_rx->sg.size -= copy;
+
+				if (!sge->length) {
+					sk_msg_iter_var_next(i);
+					if (!msg_rx->skb)
+						put_page(page);
+				}
+			} else {
+				/* Lets not optimize peek case if copy_page_to_iter
+				 * didn't copy the entire length lets just break.
+				 */
+				if (copy != sge->length)
+					return copied;
+				sk_msg_iter_var_next(i);
+			}
+
+			if (copied == len)
+				break;
+		} while (i != msg_rx->sg.end);
+
+		if (unlikely(peek)) {
+			msg_rx = sk_psock_next_msg(psock, msg_rx);
+			if (!msg_rx)
+				break;
+			continue;
+		}
+
+		msg_rx->sg.start = i;
+		if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) {
+			msg_rx = sk_psock_dequeue_msg(psock);
+			kfree_sk_msg(msg_rx);
+		}
+		msg_rx = sk_psock_peek_msg(psock);
+	}
+
+	return copied;
+}
+EXPORT_SYMBOL_GPL(sk_msg_recvmsg);
+
 static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk,
 						  struct sk_buff *skb)
 {
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index ac8cfbaeacd2..3d622a0d0753 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -10,80 +10,6 @@
 #include <net/inet_common.h>
 #include <net/tls.h>
 
-int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
-		      struct msghdr *msg, int len, int flags)
-{
-	struct iov_iter *iter = &msg->msg_iter;
-	int peek = flags & MSG_PEEK;
-	struct sk_msg *msg_rx;
-	int i, copied = 0;
-
-	msg_rx = sk_psock_peek_msg(psock);
-	while (copied != len) {
-		struct scatterlist *sge;
-
-		if (unlikely(!msg_rx))
-			break;
-
-		i = msg_rx->sg.start;
-		do {
-			struct page *page;
-			int copy;
-
-			sge = sk_msg_elem(msg_rx, i);
-			copy = sge->length;
-			page = sg_page(sge);
-			if (copied + copy > len)
-				copy = len - copied;
-			copy = copy_page_to_iter(page, sge->offset, copy, iter);
-			if (!copy)
-				return copied ? copied : -EFAULT;
-
-			copied += copy;
-			if (likely(!peek)) {
-				sge->offset += copy;
-				sge->length -= copy;
-				if (!msg_rx->skb)
-					sk_mem_uncharge(sk, copy);
-				msg_rx->sg.size -= copy;
-
-				if (!sge->length) {
-					sk_msg_iter_var_next(i);
-					if (!msg_rx->skb)
-						put_page(page);
-				}
-			} else {
-				/* Lets not optimize peek case if copy_page_to_iter
-				 * didn't copy the entire length lets just break.
-				 */
-				if (copy != sge->length)
-					return copied;
-				sk_msg_iter_var_next(i);
-			}
-
-			if (copied == len)
-				break;
-		} while (i != msg_rx->sg.end);
-
-		if (unlikely(peek)) {
-			msg_rx = sk_psock_next_msg(psock, msg_rx);
-			if (!msg_rx)
-				break;
-			continue;
-		}
-
-		msg_rx->sg.start = i;
-		if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) {
-			msg_rx = sk_psock_dequeue_msg(psock);
-			kfree_sk_msg(msg_rx);
-		}
-		msg_rx = sk_psock_peek_msg(psock);
-	}
-
-	return copied;
-}
-EXPORT_SYMBOL_GPL(__tcp_bpf_recvmsg);
-
 static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock,
 			   struct sk_msg *msg, u32 apply_bytes, int flags)
 {
@@ -237,28 +163,6 @@ static bool tcp_bpf_stream_read(const struct sock *sk)
 	return !empty;
 }
 
-static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock,
-			     int flags, long timeo, int *err)
-{
-	DEFINE_WAIT_FUNC(wait, woken_wake_function);
-	int ret = 0;
-
-	if (sk->sk_shutdown & RCV_SHUTDOWN)
-		return 1;
-
-	if (!timeo)
-		return ret;
-
-	add_wait_queue(sk_sleep(sk), &wait);
-	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
-	ret = sk_wait_event(sk, &timeo,
-			    !list_empty(&psock->ingress_msg) ||
-			    !skb_queue_empty(&sk->sk_receive_queue), &wait);
-	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
-	remove_wait_queue(sk_sleep(sk), &wait);
-	return ret;
-}
-
 static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		    int nonblock, int flags, int *addr_len)
 {
@@ -278,13 +182,13 @@ static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 	}
 	lock_sock(sk);
 msg_bytes_ready:
-	copied = __tcp_bpf_recvmsg(sk, psock, msg, len, flags);
+	copied = sk_msg_recvmsg(sk, psock, msg, len, flags);
 	if (!copied) {
 		int data, err = 0;
 		long timeo;
 
 		timeo = sock_rcvtimeo(sk, nonblock);
-		data = tcp_bpf_wait_data(sk, psock, flags, timeo, &err);
+		data = sk_msg_wait_data(sk, psock, flags, timeo, &err);
 		if (data) {
 			if (!sk_psock_queue_empty(psock))
 				goto msg_bytes_ready;
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 01d933ae5f16..1dcb34dfd56b 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1789,8 +1789,8 @@ int tls_sw_recvmsg(struct sock *sk,
 		skb = tls_wait_data(sk, psock, flags, timeo, &err);
 		if (!skb) {
 			if (psock) {
-				int ret = __tcp_bpf_recvmsg(sk, psock,
-							    msg, len, flags);
+				int ret = sk_msg_recvmsg(sk, psock, msg, len,
+							 flags);
 
 				if (ret > 0) {
 					decrypted += ret;
-- 
cgit v1.2.3


From 1decdb335c366fc0a1bae0db55c138c613cc9a1f Mon Sep 17 00:00:00 2001
From: Wan Jiabing <wanjiabing@vivo.com>
Date: Tue, 30 Mar 2021 11:40:55 +0800
Subject: tracing: Remove duplicate struct declaration in trace_events.h

struct trace_array is declared twice. One has been declared
at forward declaration. Remove the duplicate.

Link: https://lkml.kernel.org/r/20210330034056.2266969-1-wanjiabing@vivo.com

Signed-off-by: Wan Jiabing <wanjiabing@vivo.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/trace_events.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 36e27c1f42e0..ad413b382a3c 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -404,7 +404,6 @@ trace_get_fields(struct trace_event_call *event_call)
 	return event_call->class->get_fields(event_call);
 }
 
-struct trace_array;
 struct trace_subsystem_dir;
 
 enum {
-- 
cgit v1.2.3


From f3ef7202ef7c705d640d1aeec3b286a641ac9186 Mon Sep 17 00:00:00 2001
From: "Yordan Karadzhov (VMware)" <y.karadz@gmail.com>
Date: Mon, 29 Mar 2021 16:03:31 +0300
Subject: tracing: Remove unused argument from "ring_buffer_time_stamp()

The "cpu" parameter is not being used by the function.

Link: https://lkml.kernel.org/r/20210329130331.199402-1-y.karadz@gmail.com

Signed-off-by: Yordan Karadzhov (VMware) <y.karadz@gmail.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/ring_buffer.h | 2 +-
 kernel/trace/ring_buffer.c  | 2 +-
 kernel/trace/trace.c        | 8 ++++----
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 057b7ed4fe24..dac53fd3afea 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -181,7 +181,7 @@ unsigned long ring_buffer_commit_overrun_cpu(struct trace_buffer *buffer, int cp
 unsigned long ring_buffer_dropped_events_cpu(struct trace_buffer *buffer, int cpu);
 unsigned long ring_buffer_read_events_cpu(struct trace_buffer *buffer, int cpu);
 
-u64 ring_buffer_time_stamp(struct trace_buffer *buffer, int cpu);
+u64 ring_buffer_time_stamp(struct trace_buffer *buffer);
 void ring_buffer_normalize_time_stamp(struct trace_buffer *buffer,
 				      int cpu, u64 *ts);
 void ring_buffer_set_clock(struct trace_buffer *buffer,
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index f4216df58e31..2c0ee6484990 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1080,7 +1080,7 @@ static inline u64 rb_time_stamp(struct trace_buffer *buffer)
 	return ts << DEBUG_SHIFT;
 }
 
-u64 ring_buffer_time_stamp(struct trace_buffer *buffer, int cpu)
+u64 ring_buffer_time_stamp(struct trace_buffer *buffer)
 {
 	u64 time;
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3834de91fb97..507a30bf26e4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -771,7 +771,7 @@ static u64 buffer_ftrace_now(struct array_buffer *buf, int cpu)
 	if (!buf->buffer)
 		return trace_clock_local();
 
-	ts = ring_buffer_time_stamp(buf->buffer, cpu);
+	ts = ring_buffer_time_stamp(buf->buffer);
 	ring_buffer_normalize_time_stamp(buf->buffer, cpu, &ts);
 
 	return ts;
@@ -7174,7 +7174,7 @@ static int tracing_time_stamp_mode_open(struct inode *inode, struct file *file)
 u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_event *rbe)
 {
 	if (rbe == this_cpu_read(trace_buffered_event))
-		return ring_buffer_time_stamp(buffer, smp_processor_id());
+		return ring_buffer_time_stamp(buffer);
 
 	return ring_buffer_event_time_stamp(buffer, rbe);
 }
@@ -8088,7 +8088,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
 		trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n",
 								t, usec_rem);
 
-		t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer, cpu));
+		t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer));
 		usec_rem = do_div(t, USEC_PER_SEC);
 		trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem);
 	} else {
@@ -8097,7 +8097,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
 				ring_buffer_oldest_event_ts(trace_buf->buffer, cpu));
 
 		trace_seq_printf(s, "now ts: %llu\n",
-				ring_buffer_time_stamp(trace_buf->buffer, cpu));
+				ring_buffer_time_stamp(trace_buf->buffer));
 	}
 
 	cnt = ring_buffer_dropped_events_cpu(trace_buf->buffer, cpu);
-- 
cgit v1.2.3


From 4578be130a6470d85ff05b13b75a00e6224eeeeb Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Thu, 1 Apr 2021 09:06:29 -0700
Subject: overflow: Correct check_shl_overflow() comment

A 'false' return means the value was safely set, so the comment should
say 'true' for when it is not considered safe.

Cc: Jason Gunthorpe <jgg@ziepe.ca>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Fixes: 0c66847793d1 ("overflow.h: Add arithmetic shift helper")
Link: https://lore.kernel.org/r/20210401160629.1941787-1-kbusch@kernel.org
---
 include/linux/overflow.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/overflow.h b/include/linux/overflow.h
index ef74051d5cfe..0f12345c21fb 100644
--- a/include/linux/overflow.h
+++ b/include/linux/overflow.h
@@ -235,7 +235,7 @@ static inline bool __must_check __must_check_overflow(bool overflow)
  * - 'a << s' sets the sign bit, if any, in '*d'.
  *
  * '*d' will hold the results of the attempted shift, but is not
- * considered "safe for use" if false is returned.
+ * considered "safe for use" if true is returned.
  */
 #define check_shl_overflow(a, s, d) __must_check_overflow(({		\
 	typeof(a) _a = a;						\
-- 
cgit v1.2.3


From cad3193fe9d1f0af4d05ed86693f99984409b188 Mon Sep 17 00:00:00 2001
From: Michael Walle <michael@walle.cc>
Date: Mon, 22 Mar 2021 00:51:39 +0100
Subject: mtd: spi-nor: implement OTP support for Winbond and similar flashes

Use the new OTP ops to implement OTP access on Winbond flashes. Most
Winbond flashes provide up to four different OTP regions ("Security
Registers").

Winbond devices use a special opcode to read and write to the OTP
regions, just like the RDSFDP opcode. In fact, it seems that the
(undocumented) first OTP area of the newer flashes is the actual SFDP
table.

On a side note, Winbond devices also allow erasing the OTP regions as
long as the area isn't locked down.

Signed-off-by: Michael Walle <michael@walle.cc>
Signed-off-by: Tudor Ambarus <tudor.ambarus@microchip.com>
Link: https://lore.kernel.org/r/20210321235140.8308-3-michael@walle.cc
---
 drivers/mtd/spi-nor/core.c  |   2 +-
 drivers/mtd/spi-nor/core.h  |   6 ++
 drivers/mtd/spi-nor/otp.c   | 164 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mtd/spi-nor.h |   8 +++
 4 files changed, 179 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/spi-nor/core.c b/drivers/mtd/spi-nor/core.c
index 9e27ea727628..8cf3cf92129e 100644
--- a/drivers/mtd/spi-nor/core.c
+++ b/drivers/mtd/spi-nor/core.c
@@ -1034,7 +1034,7 @@ static int spi_nor_write_16bit_sr_and_check(struct spi_nor *nor, u8 sr1)
  *
  * Return: 0 on success, -errno otherwise.
  */
-static int spi_nor_write_16bit_cr_and_check(struct spi_nor *nor, u8 cr)
+int spi_nor_write_16bit_cr_and_check(struct spi_nor *nor, u8 cr)
 {
 	int ret;
 	u8 *sr_cr = nor->bouncebuf;
diff --git a/drivers/mtd/spi-nor/core.h b/drivers/mtd/spi-nor/core.h
index cfbc43c5cc57..e9b6b2e76cdb 100644
--- a/drivers/mtd/spi-nor/core.h
+++ b/drivers/mtd/spi-nor/core.h
@@ -495,6 +495,7 @@ int spi_nor_read_sr(struct spi_nor *nor, u8 *sr);
 int spi_nor_read_cr(struct spi_nor *nor, u8 *cr);
 int spi_nor_write_sr(struct spi_nor *nor, const u8 *sr, size_t len);
 int spi_nor_write_sr_and_check(struct spi_nor *nor, u8 sr1);
+int spi_nor_write_16bit_cr_and_check(struct spi_nor *nor, u8 cr);
 
 int spi_nor_xread_sr(struct spi_nor *nor, u8 *sr);
 ssize_t spi_nor_read_data(struct spi_nor *nor, loff_t from, size_t len,
@@ -502,6 +503,11 @@ ssize_t spi_nor_read_data(struct spi_nor *nor, loff_t from, size_t len,
 ssize_t spi_nor_write_data(struct spi_nor *nor, loff_t to, size_t len,
 			   const u8 *buf);
 
+int spi_nor_otp_read_secr(struct spi_nor *nor, loff_t addr, size_t len, u8 *buf);
+int spi_nor_otp_write_secr(struct spi_nor *nor, loff_t addr, size_t len, u8 *buf);
+int spi_nor_otp_lock_sr2(struct spi_nor *nor, unsigned int region);
+int spi_nor_otp_is_locked_sr2(struct spi_nor *nor, unsigned int region);
+
 int spi_nor_hwcaps_read2cmd(u32 hwcaps);
 u8 spi_nor_convert_3to4_read(u8 opcode);
 void spi_nor_set_read_settings(struct spi_nor_read_command *read,
diff --git a/drivers/mtd/spi-nor/otp.c b/drivers/mtd/spi-nor/otp.c
index 075b7290a95d..5021d40dffbf 100644
--- a/drivers/mtd/spi-nor/otp.c
+++ b/drivers/mtd/spi-nor/otp.c
@@ -14,6 +14,170 @@
 #define spi_nor_otp_region_len(nor) ((nor)->params->otp.org->len)
 #define spi_nor_otp_n_regions(nor) ((nor)->params->otp.org->n_regions)
 
+/**
+ * spi_nor_otp_read_secr() - read OTP data
+ * @nor:	pointer to 'struct spi_nor'
+ * @addr:       offset to read from
+ * @len:        number of bytes to read
+ * @buf:        pointer to dst buffer
+ *
+ * Read OTP data from one region by using the SPINOR_OP_RSECR commands. This
+ * method is used on GigaDevice and Winbond flashes.
+ *
+ * Return: number of bytes read successfully, -errno otherwise
+ */
+int spi_nor_otp_read_secr(struct spi_nor *nor, loff_t addr, size_t len, u8 *buf)
+{
+	u8 addr_width, read_opcode, read_dummy;
+	struct spi_mem_dirmap_desc *rdesc;
+	enum spi_nor_protocol read_proto;
+	int ret;
+
+	read_opcode = nor->read_opcode;
+	addr_width = nor->addr_width;
+	read_dummy = nor->read_dummy;
+	read_proto = nor->read_proto;
+	rdesc = nor->dirmap.rdesc;
+
+	nor->read_opcode = SPINOR_OP_RSECR;
+	nor->addr_width = 3;
+	nor->read_dummy = 8;
+	nor->read_proto = SNOR_PROTO_1_1_1;
+	nor->dirmap.rdesc = NULL;
+
+	ret = spi_nor_read_data(nor, addr, len, buf);
+
+	nor->read_opcode = read_opcode;
+	nor->addr_width = addr_width;
+	nor->read_dummy = read_dummy;
+	nor->read_proto = read_proto;
+	nor->dirmap.rdesc = rdesc;
+
+	return ret;
+}
+
+/**
+ * spi_nor_otp_write_secr() - write OTP data
+ * @nor:        pointer to 'struct spi_nor'
+ * @addr:       offset to write to
+ * @len:        number of bytes to write
+ * @buf:        pointer to src buffer
+ *
+ * Write OTP data to one region by using the SPINOR_OP_PSECR commands. This
+ * method is used on GigaDevice and Winbond flashes.
+ *
+ * Please note, the write must not span multiple OTP regions.
+ *
+ * Return: number of bytes written successfully, -errno otherwise
+ */
+int spi_nor_otp_write_secr(struct spi_nor *nor, loff_t addr, size_t len, u8 *buf)
+{
+	enum spi_nor_protocol write_proto;
+	struct spi_mem_dirmap_desc *wdesc;
+	u8 addr_width, program_opcode;
+	int ret, written;
+
+	program_opcode = nor->program_opcode;
+	addr_width = nor->addr_width;
+	write_proto = nor->write_proto;
+	wdesc = nor->dirmap.wdesc;
+
+	nor->program_opcode = SPINOR_OP_PSECR;
+	nor->addr_width = 3;
+	nor->write_proto = SNOR_PROTO_1_1_1;
+	nor->dirmap.wdesc = NULL;
+
+	/*
+	 * We only support a write to one single page. For now all winbond
+	 * flashes only have one page per OTP region.
+	 */
+	ret = spi_nor_write_enable(nor);
+	if (ret)
+		goto out;
+
+	written = spi_nor_write_data(nor, addr, len, buf);
+	if (written < 0)
+		goto out;
+
+	ret = spi_nor_wait_till_ready(nor);
+
+out:
+	nor->program_opcode = program_opcode;
+	nor->addr_width = addr_width;
+	nor->write_proto = write_proto;
+	nor->dirmap.wdesc = wdesc;
+
+	return ret ?: written;
+}
+
+static int spi_nor_otp_lock_bit_cr(unsigned int region)
+{
+	static const int lock_bits[] = { SR2_LB1, SR2_LB2, SR2_LB3 };
+
+	if (region >= ARRAY_SIZE(lock_bits))
+		return -EINVAL;
+
+	return lock_bits[region];
+}
+
+/**
+ * spi_nor_otp_lock_sr2() - lock the OTP region
+ * @nor:        pointer to 'struct spi_nor'
+ * @region:     OTP region
+ *
+ * Lock the OTP region by writing the status register-2. This method is used on
+ * GigaDevice and Winbond flashes.
+ *
+ * Return: 0 on success, -errno otherwise.
+ */
+int spi_nor_otp_lock_sr2(struct spi_nor *nor, unsigned int region)
+{
+	u8 *cr = nor->bouncebuf;
+	int ret, lock_bit;
+
+	lock_bit = spi_nor_otp_lock_bit_cr(region);
+	if (lock_bit < 0)
+		return lock_bit;
+
+	ret = spi_nor_read_cr(nor, cr);
+	if (ret)
+		return ret;
+
+	/* no need to write the register if region is already locked */
+	if (cr[0] & lock_bit)
+		return 0;
+
+	cr[0] |= lock_bit;
+
+	return spi_nor_write_16bit_cr_and_check(nor, cr[0]);
+}
+
+/**
+ * spi_nor_otp_is_locked_sr2() - get the OTP region lock status
+ * @nor:        pointer to 'struct spi_nor'
+ * @region:     OTP region
+ *
+ * Retrieve the OTP region lock bit by reading the status register-2. This
+ * method is used on GigaDevice and Winbond flashes.
+ *
+ * Return: non-zero if the region is locked, 0 if it is not, -errno otherwise.
+ */
+int spi_nor_otp_is_locked_sr2(struct spi_nor *nor, unsigned int region)
+{
+	u8 *cr = nor->bouncebuf;
+	int ret, lock_bit;
+
+	lock_bit = spi_nor_otp_lock_bit_cr(region);
+	if (lock_bit < 0)
+		return lock_bit;
+
+	ret = spi_nor_read_cr(nor, cr);
+	if (ret)
+		return ret;
+
+	return cr[0] & lock_bit;
+}
+
 static loff_t spi_nor_otp_region_start(const struct spi_nor *nor, unsigned int region)
 {
 	const struct spi_nor_otp_organization *org = nor->params->otp.org;
diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h
index a0d572855444..98ed91b529ea 100644
--- a/include/linux/mtd/spi-nor.h
+++ b/include/linux/mtd/spi-nor.h
@@ -107,6 +107,11 @@
 #define SPINOR_OP_RD_EVCR      0x65    /* Read EVCR register */
 #define SPINOR_OP_WD_EVCR      0x61    /* Write EVCR register */
 
+/* Used for GigaDevice and Winbond flashes. */
+#define SPINOR_OP_ESECR		0x44	/* Erase Security registers */
+#define SPINOR_OP_PSECR		0x42	/* Program Security registers */
+#define SPINOR_OP_RSECR		0x48	/* Read Security registers */
+
 /* Status Register bits. */
 #define SR_WIP			BIT(0)	/* Write in progress */
 #define SR_WEL			BIT(1)	/* Write enable latch */
@@ -138,6 +143,9 @@
 
 /* Status Register 2 bits. */
 #define SR2_QUAD_EN_BIT1	BIT(1)
+#define SR2_LB1			BIT(3)	/* Security Register Lock Bit 1 */
+#define SR2_LB2			BIT(4)	/* Security Register Lock Bit 2 */
+#define SR2_LB3			BIT(5)	/* Security Register Lock Bit 3 */
 #define SR2_QUAD_EN_BIT7	BIT(7)
 
 /* Supported SPI protocols */
-- 
cgit v1.2.3


From a28e824fb8270eda43fd0f65c2a5fdf33f55c5eb Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders@chromium.org>
Date: Tue, 30 Mar 2021 12:12:37 +0100
Subject: nvmem: core: Add functions to make number reading easy

Sometimes the clients of nvmem just want to get a number out of
nvmem. They don't want to think about exactly how many bytes the nvmem
cell took up. They just want the number. Let's make it easy.

In general this concept is useful because nvmem space is precious and
usually the fewest bits are allocated that will hold a given value on
a given system. However, even though small numbers might be fine on
one system that doesn't mean that logically the number couldn't be
bigger. Imagine nvmem containing a max frequency for a component. On
one system perhaps that fits in 16 bits. On another system it might
fit in 32 bits. The code reading this number doesn't care--it just
wants the number.

We'll provide two functions: nvmem_cell_read_variable_le_u32() and
nvmem_cell_read_variable_le_u64().

Comparing these to the existing functions like nvmem_cell_read_u32():
* These new functions have no problems if the value was stored in
  nvmem in fewer bytes. It's OK to use these functions as long as the
  value stored will fit in 32-bits (or 64-bits).
* These functions avoid problems that the earlier APIs had with bit
  offsets. For instance, you can't use nvmem_cell_read_u32() to read a
  value that has nbits=32 and bit_offset=4 because the nvmem cell must be
  at least 5 bytes big to hold this value. The new API accounts for
  this and works fine.
* These functions make it very explicit that they assume that the
  number was stored in little endian format. The old functions made
  this assumption whenever bit_offset was non-zero (see
  nvmem_shift_read_buffer_in_place()) but didn't whenever the
  bit_offset was zero.

NOTE: it's assumed that we don't need an 8-bit or 16-bit version of
this function. The 32-bit version of the function can be used to read
8-bit or 16-bit data.

At the moment, I'm only adding the "unsigned" versions of these
functions, but if it ends up being useful someone could add a "signed"
version that did 2's complement sign extension.

At the moment, I'm only adding the "little endian" versions of these
functions. Adding the "big endian" version would require adding "big
endian" support to nvmem_shift_read_buffer_in_place().

Signed-off-by: Douglas Anderson <dianders@chromium.org>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Link: https://lore.kernel.org/r/20210330111241.19401-7-srinivas.kandagatla@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/nvmem/core.c           | 95 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/nvmem-consumer.h |  4 ++
 2 files changed, 99 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c
index a5ab1e0c74cf..635e3131eb5f 100644
--- a/drivers/nvmem/core.c
+++ b/drivers/nvmem/core.c
@@ -1606,6 +1606,101 @@ int nvmem_cell_read_u64(struct device *dev, const char *cell_id, u64 *val)
 }
 EXPORT_SYMBOL_GPL(nvmem_cell_read_u64);
 
+static void *nvmem_cell_read_variable_common(struct device *dev,
+					     const char *cell_id,
+					     size_t max_len, size_t *len)
+{
+	struct nvmem_cell *cell;
+	int nbits;
+	void *buf;
+
+	cell = nvmem_cell_get(dev, cell_id);
+	if (IS_ERR(cell))
+		return cell;
+
+	nbits = cell->nbits;
+	buf = nvmem_cell_read(cell, len);
+	nvmem_cell_put(cell);
+	if (IS_ERR(buf))
+		return buf;
+
+	/*
+	 * If nbits is set then nvmem_cell_read() can significantly exaggerate
+	 * the length of the real data. Throw away the extra junk.
+	 */
+	if (nbits)
+		*len = DIV_ROUND_UP(nbits, 8);
+
+	if (*len > max_len) {
+		kfree(buf);
+		return ERR_PTR(-ERANGE);
+	}
+
+	return buf;
+}
+
+/**
+ * nvmem_cell_read_variable_le_u32() - Read up to 32-bits of data as a little endian number.
+ *
+ * @dev: Device that requests the nvmem cell.
+ * @cell_id: Name of nvmem cell to read.
+ * @val: pointer to output value.
+ *
+ * Return: 0 on success or negative errno.
+ */
+int nvmem_cell_read_variable_le_u32(struct device *dev, const char *cell_id,
+				    u32 *val)
+{
+	size_t len;
+	u8 *buf;
+	size_t i;
+
+	buf = nvmem_cell_read_variable_common(dev, cell_id, sizeof(*val), &len);
+	if (IS_ERR(buf))
+		return PTR_ERR(buf);
+
+	/* Assemble LE bytes; widen to u32 first so bit 31 never hits a signed int */
+	*val = 0;
+	for (i = 0; i < len; i++)
+		*val |= (u32)buf[i] << (8 * i);
+
+	kfree(buf);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nvmem_cell_read_variable_le_u32);
+
+/**
+ * nvmem_cell_read_variable_le_u64() - Read up to 64-bits of data as a little endian number.
+ *
+ * @dev: Device that requests the nvmem cell.
+ * @cell_id: Name of nvmem cell to read.
+ * @val: pointer to output value.
+ *
+ * Return: 0 on success or negative errno.
+ */
+int nvmem_cell_read_variable_le_u64(struct device *dev, const char *cell_id,
+				    u64 *val)
+{
+	size_t len;
+	u8 *buf;
+	size_t i;
+
+	buf = nvmem_cell_read_variable_common(dev, cell_id, sizeof(*val), &len);
+	if (IS_ERR(buf))
+		return PTR_ERR(buf);
+
+	/* Assemble LE bytes; widen to u64 first or bytes 4..7 shift past int width */
+	*val = 0;
+	for (i = 0; i < len; i++)
+		*val |= (u64)buf[i] << (8 * i);
+
+	kfree(buf);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nvmem_cell_read_variable_le_u64);
+
 /**
  * nvmem_device_cell_read() - Read a given nvmem device and cell
  *
diff --git a/include/linux/nvmem-consumer.h b/include/linux/nvmem-consumer.h
index 052293f4cbdb..923dada24eb4 100644
--- a/include/linux/nvmem-consumer.h
+++ b/include/linux/nvmem-consumer.h
@@ -65,6 +65,10 @@ int nvmem_cell_read_u8(struct device *dev, const char *cell_id, u8 *val);
 int nvmem_cell_read_u16(struct device *dev, const char *cell_id, u16 *val);
 int nvmem_cell_read_u32(struct device *dev, const char *cell_id, u32 *val);
 int nvmem_cell_read_u64(struct device *dev, const char *cell_id, u64 *val);
+int nvmem_cell_read_variable_le_u32(struct device *dev, const char *cell_id,
+				    u32 *val);
+int nvmem_cell_read_variable_le_u64(struct device *dev, const char *cell_id,
+				    u64 *val);
 
 /* direct nvmem device read/write interface */
 struct nvmem_device *nvmem_device_get(struct device *dev, const char *name);
-- 
cgit v1.2.3


From a7f3d3d3600c8ed119eb0d2483de0062ce2e3707 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Fri, 26 Mar 2021 22:03:05 +0100
Subject: dma-mapping: add unlikely hint to error path in dma_mapping_error

Zillions of drivers use the unlikely() hint when checking the result of
dma_mapping_error(). This is an inline function anyway, so we can move
the hint into the function and remove it from drivers over time.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/dma-mapping.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index e9d19b974f26..183e7103a66d 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -95,7 +95,7 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
 	debug_dma_mapping_error(dev, dma_addr);
 
-	if (dma_addr == DMA_MAPPING_ERROR)
+	if (unlikely(dma_addr == DMA_MAPPING_ERROR))
 		return -ENOMEM;
 	return 0;
 }
-- 
cgit v1.2.3


From 1768289b44bae847612751d418fc5c5e680b5e5c Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 31 Mar 2021 18:05:25 +0300
Subject: driver core: platform: Declare early_platform_cleanup() prototype
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Compiler is not happy:

  CC      drivers/base/platform.o
drivers/base/platform.c:1557:20: warning: no previous prototype for ‘early_platform_cleanup’ [-Wmissing-prototypes]
 1557 | void __weak __init early_platform_cleanup(void) { }
      |                    ^~~~~~~~~~~~~~~~~~~~~~

Declare early_platform_cleanup() prototype in the header to make everyone happy.

Fixes: eecd37e105f0 ("drivers: Fix boot problem on SuperH")
Cc: Guenter Roeck <linux@roeck-us.net>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20210331150525.59223-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/platform_device.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h
index 3f23f6e430bf..cd81e060863c 100644
--- a/include/linux/platform_device.h
+++ b/include/linux/platform_device.h
@@ -359,4 +359,7 @@ static inline int is_sh_early_platform_device(struct platform_device *pdev)
 }
 #endif /* CONFIG_SUPERH */
 
+/* For now only SuperH uses it */
+void early_platform_cleanup(void);
+
 #endif /* _PLATFORM_DEVICE_H_ */
-- 
cgit v1.2.3


From d699ae4fc27496d01e8bc5ab2106bd79d1e7be92 Mon Sep 17 00:00:00 2001
From: Alexander Lochmann <alexander.lochmann@tu-dortmund.de>
Date: Thu, 11 Feb 2021 10:51:55 +0100
Subject: ext4: updated locking documentation for journal_t

Some members of journal_t are allowed to be read without any lock
being held if consistency doesn't matter.  Based on LockDoc's
findings, we extended the locking documentation of those members.
Each one of them is marked with a short comment: "no lock for quick
racy checks".

Signed-off-by: Alexander Lochmann <alexander.lochmann@tu-dortmund.de>
Signed-off-by: Horst Schirmeier <horst.schirmeier@tu-dortmund.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/ad82c7a9-a624-4ed5-5ada-a6410c44c0b3@tu-dortmund.de
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 include/linux/jbd2.h | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 99d3cd051ac3..74710bb231a6 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -768,7 +768,8 @@ enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY};
 struct journal_s
 {
 	/**
-	 * @j_flags: General journaling state flags [j_state_lock]
+	 * @j_flags: General journaling state flags [j_state_lock,
+	 * no lock for quick racy checks]
 	 */
 	unsigned long		j_flags;
 
@@ -808,7 +809,8 @@ struct journal_s
 	/**
 	 * @j_barrier_count:
 	 *
-	 * Number of processes waiting to create a barrier lock [j_state_lock]
+	 * Number of processes waiting to create a barrier lock [j_state_lock,
+	 * no lock for quick racy checks]
 	 */
 	int			j_barrier_count;
 
@@ -821,7 +823,8 @@ struct journal_s
 	 * @j_running_transaction:
 	 *
 	 * Transactions: The current running transaction...
-	 * [j_state_lock] [caller holding open handle]
+	 * [j_state_lock, no lock for quick racy checks] [caller holding
+	 * open handle]
 	 */
 	transaction_t		*j_running_transaction;
 
@@ -1033,7 +1036,7 @@ struct journal_s
 	 * @j_commit_sequence:
 	 *
 	 * Sequence number of the most recently committed transaction
-	 * [j_state_lock].
+	 * [j_state_lock, no lock for quick racy checks]
 	 */
 	tid_t			j_commit_sequence;
 
@@ -1041,7 +1044,7 @@ struct journal_s
 	 * @j_commit_request:
 	 *
 	 * Sequence number of the most recent transaction wanting commit
-	 * [j_state_lock]
+	 * [j_state_lock, no lock for quick racy checks]
 	 */
 	tid_t			j_commit_request;
 
-- 
cgit v1.2.3


From 3042b1b45c4106feff063932d4fd481c5009dbe1 Mon Sep 17 00:00:00 2001
From: Alexander Lochmann <alexander.lochmann@tu-dortmund.de>
Date: Thu, 11 Feb 2021 18:14:10 +0100
Subject: Updated locking documentation for transaction_t

Some members of transaction_t are allowed to be read without any lock
being held if accessed from the correct context.  We used LockDoc's
findings to determine those members.  Each of them is marked with a
short comment: "no locks needed for jbd2 thread".

Signed-off-by: Alexander Lochmann <alexander.lochmann@tu-dortmund.de>
Signed-off-by: Horst Schirmeier <horst.schirmeier@tu-dortmund.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20210211171410.17984-1-alexander.lochmann@tu-dortmund.de
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 include/linux/jbd2.h | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 74710bb231a6..b9aa85081a40 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -594,18 +594,22 @@ struct transaction_s
 	 */
 	unsigned long		t_log_start;
 
-	/* Number of buffers on the t_buffers list [j_list_lock] */
+	/* 
+	 * Number of buffers on the t_buffers list [j_list_lock, no locks
+	 * needed for jbd2 thread]
+	 */
 	int			t_nr_buffers;
 
 	/*
 	 * Doubly-linked circular list of all buffers reserved but not yet
-	 * modified by this transaction [j_list_lock]
+	 * modified by this transaction [j_list_lock, no locks needed for
+	 * jbd2 thread]
 	 */
 	struct journal_head	*t_reserved_list;
 
 	/*
 	 * Doubly-linked circular list of all metadata buffers owned by this
-	 * transaction [j_list_lock]
+	 * transaction [j_list_lock, no locks needed for jbd2 thread]
 	 */
 	struct journal_head	*t_buffers;
 
@@ -629,9 +633,11 @@ struct transaction_s
 	struct journal_head	*t_checkpoint_io_list;
 
 	/*
-	 * Doubly-linked circular list of metadata buffers being shadowed by log
-	 * IO.  The IO buffers on the iobuf list and the shadow buffers on this
-	 * list match each other one for one at all times. [j_list_lock]
+	 * Doubly-linked circular list of metadata buffers being
+	 * shadowed by log IO.  The IO buffers on the iobuf list and
+	 * the shadow buffers on this list match each other one for
+	 * one at all times. [j_list_lock, no locks needed for jbd2
+	 * thread]
 	 */
 	struct journal_head	*t_shadow_list;
 
-- 
cgit v1.2.3


From f06c609645ecd043c79380fac94145926603fb33 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 2 Apr 2021 19:17:46 +0200
Subject: block: remove the unused RQF_ALLOCED flag

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-debugfs.c | 1 -
 include/linux/blkdev.h | 2 --
 2 files changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 9ebb344e2585..271f6596435b 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -302,7 +302,6 @@ static const char *const rqf_name[] = {
 	RQF_NAME(QUIET),
 	RQF_NAME(ELVPRIV),
 	RQF_NAME(IO_STAT),
-	RQF_NAME(ALLOCED),
 	RQF_NAME(PM),
 	RQF_NAME(HASHED),
 	RQF_NAME(STATS),
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index bc6bc8383b43..158aefae1030 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -85,8 +85,6 @@ typedef __u32 __bitwise req_flags_t;
 #define RQF_ELVPRIV		((__force req_flags_t)(1 << 12))
 /* account into disk and partition IO statistics */
 #define RQF_IO_STAT		((__force req_flags_t)(1 << 13))
-/* request came from our alloc pool */
-#define RQF_ALLOCED		((__force req_flags_t)(1 << 14))
 /* runtime pm request */
 #define RQF_PM			((__force req_flags_t)(1 << 15))
 /* on IO scheduler merge hash */
-- 
cgit v1.2.3


From e3baacf54275647a018ee35bff3bc775a8a2a01a Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Date: Mon, 29 Mar 2021 15:57:33 +0300
Subject: regulator: helpers: Export helper voltage listing

Some drivers need to translate voltage values to selectors prior to
regulator registration. Currently a regulator_desc based list_voltages
helper is only exported for regulators using the linear_ranges. Export a
similar helper also for regulators using a simple linear mapping.

Signed-off-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Link: https://lore.kernel.org/r/1200ef7a50c84327ada019b85f6527b4fc9b5ce1.1617020713.git.matti.vaittinen@fi.rohmeurope.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/helpers.c      | 36 ++++++++++++++++++++++++++++--------
 include/linux/regulator/driver.h |  2 ++
 2 files changed, 30 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/regulator/helpers.c b/drivers/regulator/helpers.c
index f42b394a0c46..3e19ecbf7267 100644
--- a/drivers/regulator/helpers.c
+++ b/drivers/regulator/helpers.c
@@ -508,6 +508,33 @@ int regulator_map_voltage_pickable_linear_range(struct regulator_dev *rdev,
 }
 EXPORT_SYMBOL_GPL(regulator_map_voltage_pickable_linear_range);
 
+/**
+ * regulator_desc_list_voltage_linear - List voltages with simple calculation
+ *
+ * @desc: Regulator desc for regulator whose voltages are to be listed
+ * @selector: Selector to convert into a voltage
+ *
+ * Regulators with a simple linear mapping between voltages and
+ * selectors can set min_uV and uV_step in the regulator descriptor
+ * and then use this function prior to regulator registration to list
+ * the voltages. This is useful when voltages need to be listed during
+ * device-tree parsing.
+ */
+int regulator_desc_list_voltage_linear(const struct regulator_desc *desc,
+				       unsigned int selector)
+{
+	if (selector >= desc->n_voltages)
+		return -EINVAL;
+
+	if (selector < desc->linear_min_sel)
+		return 0;
+
+	selector -= desc->linear_min_sel;
+
+	return desc->min_uV + (desc->uV_step * selector);
+}
+EXPORT_SYMBOL_GPL(regulator_desc_list_voltage_linear);
+
 /**
  * regulator_list_voltage_linear - List voltages with simple calculation
  *
@@ -521,14 +548,7 @@ EXPORT_SYMBOL_GPL(regulator_map_voltage_pickable_linear_range);
 int regulator_list_voltage_linear(struct regulator_dev *rdev,
 				  unsigned int selector)
 {
-	if (selector >= rdev->desc->n_voltages)
-		return -EINVAL;
-	if (selector < rdev->desc->linear_min_sel)
-		return 0;
-
-	selector -= rdev->desc->linear_min_sel;
-
-	return rdev->desc->min_uV + (rdev->desc->uV_step * selector);
+	return regulator_desc_list_voltage_linear(rdev->desc, selector);
 }
 EXPORT_SYMBOL_GPL(regulator_list_voltage_linear);
 
diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h
index d7c77ee370f3..39a540111645 100644
--- a/include/linux/regulator/driver.h
+++ b/include/linux/regulator/driver.h
@@ -543,4 +543,6 @@ void *regulator_get_init_drvdata(struct regulator_init_data *reg_init_data);
 int regulator_desc_list_voltage_linear_range(const struct regulator_desc *desc,
 					     unsigned int selector);
 
+int regulator_desc_list_voltage_linear(const struct regulator_desc *desc,
+				       unsigned int selector);
 #endif
-- 
cgit v1.2.3


From fb8fee9efdcf084d9e31ba14cc4734d97e5dd972 Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Date: Mon, 29 Mar 2021 15:59:04 +0300
Subject: regulator: Add regmap helper for ramp-delay setting

Quite a few regulator ICs do support setting ramp-delay by writing a value
matching the delay to a ramp-delay register.

Provide a simple helper for table-based delay setting.

Signed-off-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Link: https://lore.kernel.org/r/f101f1db564cf32cb58719c77af0b00d7236bb89.1617020713.git.matti.vaittinen@fi.rohmeurope.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/helpers.c      | 65 ++++++++++++++++++++++++++++++++++++++++
 include/linux/regulator/driver.h |  5 ++++
 2 files changed, 70 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/regulator/helpers.c b/drivers/regulator/helpers.c
index 3e19ecbf7267..0e16e31c968f 100644
--- a/drivers/regulator/helpers.c
+++ b/drivers/regulator/helpers.c
@@ -901,3 +901,68 @@ bool regulator_is_equal(struct regulator *reg1, struct regulator *reg2)
 	return reg1->rdev == reg2->rdev;
 }
 EXPORT_SYMBOL_GPL(regulator_is_equal);
+
+static int find_closest_bigger(unsigned int target, const unsigned int *table,
+			       unsigned int num_sel, unsigned int *sel)
+{
+	unsigned int s, tmp, max, maxsel = 0;
+	bool found = false;
+
+	max = table[0];
+
+	for (s = 0; s < num_sel; s++) {
+		if (table[s] > max) {
+			max = table[s];
+			maxsel = s;
+		}
+		if (table[s] >= target) {
+			if (!found || table[s] - target < tmp - target) {
+				tmp = table[s];
+				*sel = s;
+				found = true;
+				if (tmp == target)
+					break;
+			}
+		}
+	}
+
+	if (!found) {
+		*sel = maxsel;
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/**
+ * regulator_set_ramp_delay_regmap - set_ramp_delay() helper
+ *
+ * @rdev: regulator to operate on
+ *
+ * Regulators that use regmap for their register I/O can set the ramp_reg,
+ * ramp_mask, ramp_delay_table and n_ramp_values fields in their descriptor
+ * and then use this as their set_ramp_delay operation, saving some code.
+ */
+int regulator_set_ramp_delay_regmap(struct regulator_dev *rdev, int ramp_delay)
+{
+	int ret;
+	unsigned int sel;
+
+	if (!rdev->desc->n_ramp_values)
+		return -EINVAL;
+
+	ret = find_closest_bigger(ramp_delay, rdev->desc->ramp_delay_table,
+				  rdev->desc->n_ramp_values, &sel);
+
+	if (ret) {
+		dev_warn(rdev_get_dev(rdev),
+			 "Can't set ramp-delay %u, setting %u\n", ramp_delay,
+			 rdev->desc->ramp_delay_table[sel]);
+	}
+
+	sel <<= ffs(rdev->desc->ramp_mask) - 1;
+
+	return regmap_update_bits(rdev->regmap, rdev->desc->ramp_reg,
+				  rdev->desc->ramp_mask, sel);
+}
+EXPORT_SYMBOL_GPL(regulator_set_ramp_delay_regmap);
diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h
index 39a540111645..597ed117086f 100644
--- a/include/linux/regulator/driver.h
+++ b/include/linux/regulator/driver.h
@@ -373,6 +373,10 @@ struct regulator_desc {
 	unsigned int pull_down_reg;
 	unsigned int pull_down_mask;
 	unsigned int pull_down_val_on;
+	unsigned int ramp_reg;
+	unsigned int ramp_mask;
+	const unsigned int *ramp_delay_table;
+	unsigned int n_ramp_values;
 
 	unsigned int enable_time;
 
@@ -535,6 +539,7 @@ int regulator_set_current_limit_regmap(struct regulator_dev *rdev,
 				       int min_uA, int max_uA);
 int regulator_get_current_limit_regmap(struct regulator_dev *rdev);
 void *regulator_get_init_drvdata(struct regulator_init_data *reg_init_data);
+int regulator_set_ramp_delay_regmap(struct regulator_dev *rdev, int ramp_delay);
 
 /*
  * Helper functions intended to be used by regulator drivers prior registering
-- 
cgit v1.2.3


From 4c4c0a89abd5c08e91df9bcce4ebcb3433bbb9bf Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Fri, 19 Feb 2021 08:18:12 +0200
Subject: net/mlx5: Pack mlx5_rl_entry structure

The mlx5_rl_entry structure is not properly packed, as shown below. Due to
this, an array of 9144 bytes is allocated, which is aligned to 16Kbytes.
Hence, pack the structure and avoid the wastage.

This offers 8Kbytes of saving per mlx5_core_dev struct.

pahole -C mlx5_rl_entry  drivers/net/ethernet/mellanox/mlx5/core/en_main.o

Existing layout:

struct mlx5_rl_entry {
        u8                         rl_raw[48];           /*     0    48 */
        u16                        index;                /*    48     2 */

        /* XXX 6 bytes hole, try to pack */

        u64                        refcount;             /*    56     8 */
        /* --- cacheline 1 boundary (64 bytes) --- */
        u16                        uid;                  /*    64     2 */
        u8                         dedicated:1;          /*    66: 0  1 */

        /* size: 72, cachelines: 2, members: 5 */
        /* sum members: 60, holes: 1, sum holes: 6 */
        /* sum bitfield members: 1 bits (0 bytes) */
        /* padding: 5 */
        /* bit_padding: 7 bits */
        /* last cacheline: 8 bytes */
};

After alignment:

struct mlx5_rl_entry {
        u8                         rl_raw[48];           /*     0    48 */
        u64                        refcount;             /*    48     8 */
        u16                        index;                /*    56     2 */
        u16                        uid;                  /*    58     2 */
        u8                         dedicated:1;          /*    60: 0  1 */

        /* size: 64, cachelines: 1, members: 5 */
        /* padding: 3 */
        /* bit_padding: 7 bits */
};

Signed-off-by: Parav Pandit <parav@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/driver.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 23bb01d7c9b9..a9bd7e3bd554 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -517,8 +517,8 @@ struct mlx5_rate_limit {
 
 struct mlx5_rl_entry {
 	u8 rl_raw[MLX5_ST_SZ_BYTES(set_pp_rate_limit_context)];
-	u16 index;
 	u64 refcount;
+	u16 index;
 	u16 uid;
 	u8 dedicated : 1;
 };
-- 
cgit v1.2.3


From 6b30b6d4d36c978e0ab0f22e85bf3c646732e98b Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Fri, 19 Feb 2021 12:06:54 +0200
Subject: net/mlx5: Allocate rate limit table when rate is configured

A device supports 128 rate limiters. A static table allocation consumes
8KB of memory even when rate is not configured.

Instead, allocate the table when at least one rate is configured.

Signed-off-by: Parav Pandit <parav@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/rl.c | 46 ++++++++++++++++++++++------
 include/linux/mlx5/driver.h                  |  1 +
 2 files changed, 38 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/rl.c b/drivers/net/ethernet/mellanox/mlx5/core/rl.c
index 08792fe701e3..0526e3798c09 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/rl.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/rl.c
@@ -117,6 +117,9 @@ static struct mlx5_rl_entry *find_rl_entry(struct mlx5_rl_table *table,
 	bool empty_found = false;
 	int i;
 
+	lockdep_assert_held(&table->rl_lock);
+	WARN_ON(!table->rl_entry);
+
 	for (i = 0; i < table->max_size; i++) {
 		if (dedicated) {
 			if (!table->rl_entry[i].refcount)
@@ -172,10 +175,17 @@ bool mlx5_rl_are_equal(struct mlx5_rate_limit *rl_0,
 }
 EXPORT_SYMBOL(mlx5_rl_are_equal);
 
-static int mlx5_rl_table_alloc(struct mlx5_rl_table *table)
+static int mlx5_rl_table_get(struct mlx5_rl_table *table)
 {
 	int i;
 
+	lockdep_assert_held(&table->rl_lock);
+
+	if (table->rl_entry) {
+		table->refcount++;
+		return 0;
+	}
+
 	table->rl_entry = kcalloc(table->max_size, sizeof(struct mlx5_rl_entry),
 				  GFP_KERNEL);
 	if (!table->rl_entry)
@@ -187,13 +197,27 @@ static int mlx5_rl_table_alloc(struct mlx5_rl_table *table)
 	for (i = 0; i < table->max_size; i++)
 		table->rl_entry[i].index = i + 1;
 
+	table->refcount++;
 	return 0;
 }
 
+static void mlx5_rl_table_put(struct mlx5_rl_table *table)
+{
+	lockdep_assert_held(&table->rl_lock);
+	if (--table->refcount)
+		return;
+
+	kfree(table->rl_entry);
+	table->rl_entry = NULL;
+}
+
 static void mlx5_rl_table_free(struct mlx5_core_dev *dev, struct mlx5_rl_table *table)
 {
 	int i;
 
+	if (!table->rl_entry)
+		return;
+
 	/* Clear all configured rates */
 	for (i = 0; i < table->max_size; i++)
 		if (table->rl_entry[i].refcount)
@@ -219,8 +243,8 @@ int mlx5_rl_add_rate_raw(struct mlx5_core_dev *dev, void *rl_in, u16 uid,
 {
 	struct mlx5_rl_table *table = &dev->priv.rl_table;
 	struct mlx5_rl_entry *entry;
-	int err = 0;
 	u32 rate;
+	int err;
 
 	if (!table->max_size)
 		return -EOPNOTSUPP;
@@ -233,13 +257,16 @@ int mlx5_rl_add_rate_raw(struct mlx5_core_dev *dev, void *rl_in, u16 uid,
 	}
 
 	mutex_lock(&table->rl_lock);
+	err = mlx5_rl_table_get(table);
+	if (err)
+		goto out;
 
 	entry = find_rl_entry(table, rl_in, uid, dedicated_entry);
 	if (!entry) {
 		mlx5_core_err(dev, "Max number of %u rates reached\n",
 			      table->max_size);
 		err = -ENOSPC;
-		goto out;
+		goto rl_err;
 	}
 	if (!entry->refcount) {
 		/* new rate limit */
@@ -255,14 +282,18 @@ int mlx5_rl_add_rate_raw(struct mlx5_core_dev *dev, void *rl_in, u16 uid,
 					 burst_upper_bound),
 				MLX5_GET(set_pp_rate_limit_context, rl_in,
 					 typical_packet_size));
-			goto out;
+			goto rl_err;
 		}
 
 		entry->dedicated = dedicated_entry;
 	}
 	mlx5_rl_entry_get(entry);
 	*index = entry->index;
+	mutex_unlock(&table->rl_lock);
+	return 0;
 
+rl_err:
+	mlx5_rl_table_put(table);
 out:
 	mutex_unlock(&table->rl_lock);
 	return err;
@@ -277,6 +308,7 @@ void mlx5_rl_remove_rate_raw(struct mlx5_core_dev *dev, u16 index)
 	mutex_lock(&table->rl_lock);
 	entry = &table->rl_entry[index - 1];
 	mlx5_rl_entry_put(dev, entry);
+	mlx5_rl_table_put(table);
 	mutex_unlock(&table->rl_lock);
 }
 EXPORT_SYMBOL(mlx5_rl_remove_rate_raw);
@@ -325,6 +357,7 @@ void mlx5_rl_remove_rate(struct mlx5_core_dev *dev, struct mlx5_rate_limit *rl)
 		goto out;
 	}
 	mlx5_rl_entry_put(dev, entry);
+	mlx5_rl_table_put(table);
 out:
 	mutex_unlock(&table->rl_lock);
 }
@@ -333,7 +366,6 @@ EXPORT_SYMBOL(mlx5_rl_remove_rate);
 int mlx5_init_rl_table(struct mlx5_core_dev *dev)
 {
 	struct mlx5_rl_table *table = &dev->priv.rl_table;
-	int err;
 
 	mutex_init(&table->rl_lock);
 	if (!MLX5_CAP_GEN(dev, qos) || !MLX5_CAP_QOS(dev, packet_pacing)) {
@@ -346,10 +378,6 @@ int mlx5_init_rl_table(struct mlx5_core_dev *dev)
 	table->max_rate = MLX5_CAP_QOS(dev, packet_pacing_max_rate);
 	table->min_rate = MLX5_CAP_QOS(dev, packet_pacing_min_rate);
 
-	err = mlx5_rl_table_alloc(table);
-	if (err)
-		return err;
-
 	mlx5_core_info(dev, "Rate limit: %u rates are supported, range: %uMbps to %uMbps\n",
 		       table->max_size,
 		       table->min_rate >> 10,
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index a9bd7e3bd554..baf38b5a2a8c 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -530,6 +530,7 @@ struct mlx5_rl_table {
 	u32                     max_rate;
 	u32                     min_rate;
 	struct mlx5_rl_entry   *rl_entry;
+	u64 refcount;
 };
 
 struct mlx5_core_roce {
-- 
cgit v1.2.3


From 2daae89666ad253281bb3d6a027c00a702c02eff Mon Sep 17 00:00:00 2001
From: Wan Jiabing <wanjiabing@vivo.com>
Date: Thu, 1 Apr 2021 14:46:37 +0800
Subject: bpf, cgroup: Delete repeated struct bpf_prog declaration

struct bpf_prog is declared twice. There is one declaration at line 18
which does not depend on the macro, so the one below is not needed.
Remove the duplicate.

Signed-off-by: Wan Jiabing <wanjiabing@vivo.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/bpf/20210401064637.993327-1-wanjiabing@vivo.com
---
 include/linux/bpf-cgroup.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 6a29fe11485d..8b77d08d4b47 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -458,7 +458,6 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
 			  union bpf_attr __user *uattr);
 #else
 
-struct bpf_prog;
 struct cgroup_bpf {};
 static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
 static inline void cgroup_bpf_offline(struct cgroup *cgrp) {}
-- 
cgit v1.2.3


From 6ac4c6f887f5a8efb6a6952798c09a2562022966 Mon Sep 17 00:00:00 2001
From: Wan Jiabing <wanjiabing@vivo.com>
Date: Thu, 1 Apr 2021 15:20:37 +0800
Subject: bpf: Remove repeated struct btf_type declaration

struct btf_type is declared twice. It is already declared at line 35, so
the one below is not needed; remove the duplicate.

Signed-off-by: Wan Jiabing <wanjiabing@vivo.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/bpf/20210401072037.995849-1-wanjiabing@vivo.com
---
 include/linux/bpf.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9fdd839b418c..ff8cd68c01b3 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -928,7 +928,6 @@ struct bpf_link_primer {
 };
 
 struct bpf_struct_ops_value;
-struct btf_type;
 struct btf_member;
 
 #define BPF_STRUCT_OPS_MAX_NR_MEMBERS 64
-- 
cgit v1.2.3


From fb05121fd6a20f0830ff2a4420c51af6ca4ac6e7 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Fri, 19 Mar 2021 11:06:50 +0000
Subject: signal: Add unsafe_get_compat_sigset()

In the same way as commit 14026b94ccfe ("signal: Add
unsafe_put_compat_sigset()"), this time add
unsafe_get_compat_sigset() macro which is the 'unsafe'
version of get_compat_sigset()

For big-endian, use unsafe_get_user() directly
to avoid an intermediate copy through the stack.

For little-endian, use a straight unsafe_copy_from_user().

This commit adds the generic fallback for unsafe_copy_from_user().
Architectures wanting to use unsafe_get_compat_sigset() have to
make sure they have their own unsafe_copy_from_user().

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/b05bf434ee13c76bc9df5f02653a10db5e7b54e5.1616151715.git.christophe.leroy@csgroup.eu
---
 include/linux/compat.h  | 35 +++++++++++++++++++++++++++++++++++
 include/linux/uaccess.h |  1 +
 2 files changed, 36 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/compat.h b/include/linux/compat.h
index 6e65be753603..5112c3e35782 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -465,6 +465,34 @@ put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set,
 		unsafe_put_user(__s->sig[0], &__c->sig[0], label);	\
 	}								\
 } while (0)
+
+#define unsafe_get_compat_sigset(set, compat, label) do {		\
+	const compat_sigset_t __user *__c = compat;			\
+	compat_sigset_word hi, lo;					\
+	sigset_t *__s = set;						\
+									\
+	switch (_NSIG_WORDS) {						\
+	case 4:								\
+		unsafe_get_user(lo, &__c->sig[7], label);		\
+		unsafe_get_user(hi, &__c->sig[6], label);		\
+		__s->sig[3] = hi | (((long)lo) << 32);			\
+		fallthrough;						\
+	case 3:								\
+		unsafe_get_user(lo, &__c->sig[5], label);		\
+		unsafe_get_user(hi, &__c->sig[4], label);		\
+		__s->sig[2] = hi | (((long)lo) << 32);			\
+		fallthrough;						\
+	case 2:								\
+		unsafe_get_user(lo, &__c->sig[3], label);		\
+		unsafe_get_user(hi, &__c->sig[2], label);		\
+		__s->sig[1] = hi | (((long)lo) << 32);			\
+		fallthrough;						\
+	case 1:								\
+		unsafe_get_user(lo, &__c->sig[1], label);		\
+		unsafe_get_user(hi, &__c->sig[0], label);		\
+		__s->sig[0] = hi | (((long)lo) << 32);			\
+	}								\
+} while (0)
 #else
 #define unsafe_put_compat_sigset(compat, set, label) do {		\
 	compat_sigset_t __user *__c = compat;				\
@@ -472,6 +500,13 @@ put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set,
 									\
 	unsafe_copy_to_user(__c, __s, sizeof(*__c), label);		\
 } while (0)
+
+#define unsafe_get_compat_sigset(set, compat, label) do {		\
+	const compat_sigset_t __user *__c = compat;			\
+	sigset_t *__s = set;						\
+									\
+	unsafe_copy_from_user(__s, __c, sizeof(*__c), label);		\
+} while (0)
 #endif
 
 extern int compat_ptrace_request(struct task_struct *child,
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index c7c6e8b8344d..c05e903cef02 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -397,6 +397,7 @@ long strnlen_user_nofault(const void __user *unsafe_addr, long count);
 #define unsafe_get_user(x,p,e) unsafe_op_wrap(__get_user(x,p),e)
 #define unsafe_put_user(x,p,e) unsafe_op_wrap(__put_user(x,p),e)
 #define unsafe_copy_to_user(d,s,l,e) unsafe_op_wrap(__copy_to_user(d,s,l),e)
+#define unsafe_copy_from_user(d,s,l,e) unsafe_op_wrap(__copy_from_user(d,s,l),e)
 static inline unsigned long user_access_save(void) { return 0UL; }
 static inline void user_access_restore(unsigned long flags) { }
 #endif
-- 
cgit v1.2.3


From c3d5c2d96d69f2578d6fbf66e39cc2cf840d9812 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Sun, 4 Apr 2021 10:22:18 +0300
Subject: PCI/IOV: Add sysfs MSI-X vector assignment interface

A typical cloud provider SR-IOV use case is to create many VFs for use by
guest VMs. The VFs may not be assigned to a VM until a customer requests a
VM of a certain size, e.g., number of CPUs. A VF may need MSI-X vectors
proportional to the number of CPUs in the VM, but there is no standard way
to change the number of MSI-X vectors supported by a VF.

Some Mellanox ConnectX devices support dynamic assignment of MSI-X vectors
to SR-IOV VFs. This can be done by the PF driver after VFs are enabled,
and it can be done without affecting VFs that are already in use. The
hardware supports a limited pool of MSI-X vectors that can be assigned to
the PF or to individual VFs.  This is device-specific behavior that
requires support in the PF driver.

Add a read-only "sriov_vf_total_msix" sysfs file for the PF and a writable
"sriov_vf_msix_count" file for each VF. Management software may use these
to learn how many MSI-X vectors are available and to dynamically assign
them to VFs before the VFs are passed through to a VM.

If the PF driver implements the ->sriov_get_vf_total_msix() callback,
"sriov_vf_total_msix" contains the total number of MSI-X vectors available
for distribution among VFs.

If no driver is bound to the VF, writing "N" to "sriov_vf_msix_count" uses
the PF driver ->sriov_set_msix_vec_count() callback to assign "N" MSI-X
vectors to the VF.  When a VF driver subsequently reads the MSI-X Message
Control register, it will see the new Table Size "N".

Link: https://lore.kernel.org/linux-pci/20210314124256.70253-2-leon@kernel.org
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 Documentation/ABI/testing/sysfs-bus-pci |  29 +++++++++
 drivers/pci/iov.c                       | 102 ++++++++++++++++++++++++++++++--
 drivers/pci/pci-sysfs.c                 |   3 +-
 drivers/pci/pci.h                       |   3 +-
 include/linux/pci.h                     |   8 +++
 5 files changed, 137 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
index 25c9c39770c6..e5cfd170b491 100644
--- a/Documentation/ABI/testing/sysfs-bus-pci
+++ b/Documentation/ABI/testing/sysfs-bus-pci
@@ -375,3 +375,32 @@ Description:
 		The value comes from the PCI kernel device state and can be one
 		of: "unknown", "error", "D0", D1", "D2", "D3hot", "D3cold".
 		The file is read only.
+
+What:		/sys/bus/pci/devices/.../sriov_vf_total_msix
+Date:		January 2021
+Contact:	Leon Romanovsky <leonro@nvidia.com>
+Description:
+		This file is associated with a SR-IOV physical function (PF).
+		It contains the total number of MSI-X vectors available for
+		assignment to all virtual functions (VFs) associated with PF.
+		The value will be zero if the device doesn't support this
+		functionality. For supported devices, the value will be
+		constant and won't be changed after MSI-X vectors assignment.
+
+What:		/sys/bus/pci/devices/.../sriov_vf_msix_count
+Date:		January 2021
+Contact:	Leon Romanovsky <leonro@nvidia.com>
+Description:
+		This file is associated with a SR-IOV virtual function (VF).
+		It allows configuration of the number of MSI-X vectors for
+		the VF. This allows devices that have a global pool of MSI-X
+		vectors to optimally divide them between VFs based on VF usage.
+
+		The values accepted are:
+		 * > 0 - this number will be reported as the Table Size in the
+			 VF's MSI-X capability
+		 * < 0 - not valid
+		 * = 0 - will reset to the device default value
+
+		The file is writable if the PF is bound to a driver that
+		implements ->sriov_set_msix_vec_count().
diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index 4afd4ee4f7f0..afc06e6ce115 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -31,6 +31,7 @@ int pci_iov_virtfn_devfn(struct pci_dev *dev, int vf_id)
 	return (dev->devfn + dev->sriov->offset +
 		dev->sriov->stride * vf_id) & 0xff;
 }
+EXPORT_SYMBOL_GPL(pci_iov_virtfn_devfn);
 
 /*
  * Per SR-IOV spec sec 3.3.10 and 3.3.11, First VF Offset and VF Stride may
@@ -157,6 +158,92 @@ failed:
 	return rc;
 }
 
+#ifdef CONFIG_PCI_MSI
+static ssize_t sriov_vf_total_msix_show(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	u32 vf_total_msix = 0;
+
+	device_lock(dev);
+	if (!pdev->driver || !pdev->driver->sriov_get_vf_total_msix)
+		goto unlock;
+
+	vf_total_msix = pdev->driver->sriov_get_vf_total_msix(pdev);
+unlock:
+	device_unlock(dev);
+	return sysfs_emit(buf, "%u\n", vf_total_msix);
+}
+static DEVICE_ATTR_RO(sriov_vf_total_msix);
+
+static ssize_t sriov_vf_msix_count_store(struct device *dev,
+					 struct device_attribute *attr,
+					 const char *buf, size_t count)
+{
+	struct pci_dev *vf_dev = to_pci_dev(dev);
+	struct pci_dev *pdev = pci_physfn(vf_dev);
+	int val, ret;
+
+	ret = kstrtoint(buf, 0, &val);
+	if (ret)
+		return ret;
+
+	if (val < 0)
+		return -EINVAL;
+
+	device_lock(&pdev->dev);
+	if (!pdev->driver || !pdev->driver->sriov_set_msix_vec_count) {
+		ret = -EOPNOTSUPP;
+		goto err_pdev;
+	}
+
+	device_lock(&vf_dev->dev);
+	if (vf_dev->driver) {
+		/*
+		 * A driver is already attached to this VF and has configured
+		 * itself based on the current MSI-X vector count. Changing
+		 * the vector size could mess up the driver, so block it.
+		 */
+		ret = -EBUSY;
+		goto err_dev;
+	}
+
+	ret = pdev->driver->sriov_set_msix_vec_count(vf_dev, val);
+
+err_dev:
+	device_unlock(&vf_dev->dev);
+err_pdev:
+	device_unlock(&pdev->dev);
+	return ret ? : count;
+}
+static DEVICE_ATTR_WO(sriov_vf_msix_count);
+#endif
+
+static struct attribute *sriov_vf_dev_attrs[] = {
+#ifdef CONFIG_PCI_MSI
+	&dev_attr_sriov_vf_msix_count.attr,
+#endif
+	NULL,
+};
+
+static umode_t sriov_vf_attrs_are_visible(struct kobject *kobj,
+					  struct attribute *a, int n)
+{
+	struct device *dev = kobj_to_dev(kobj);
+	struct pci_dev *pdev = to_pci_dev(dev);
+
+	if (!pdev->is_virtfn)
+		return 0;
+
+	return a->mode;
+}
+
+const struct attribute_group sriov_vf_dev_attr_group = {
+	.attrs = sriov_vf_dev_attrs,
+	.is_visible = sriov_vf_attrs_are_visible,
+};
+
 int pci_iov_add_virtfn(struct pci_dev *dev, int id)
 {
 	int i;
@@ -400,18 +487,21 @@ static DEVICE_ATTR_RO(sriov_stride);
 static DEVICE_ATTR_RO(sriov_vf_device);
 static DEVICE_ATTR_RW(sriov_drivers_autoprobe);
 
-static struct attribute *sriov_dev_attrs[] = {
+static struct attribute *sriov_pf_dev_attrs[] = {
 	&dev_attr_sriov_totalvfs.attr,
 	&dev_attr_sriov_numvfs.attr,
 	&dev_attr_sriov_offset.attr,
 	&dev_attr_sriov_stride.attr,
 	&dev_attr_sriov_vf_device.attr,
 	&dev_attr_sriov_drivers_autoprobe.attr,
+#ifdef CONFIG_PCI_MSI
+	&dev_attr_sriov_vf_total_msix.attr,
+#endif
 	NULL,
 };
 
-static umode_t sriov_attrs_are_visible(struct kobject *kobj,
-				       struct attribute *a, int n)
+static umode_t sriov_pf_attrs_are_visible(struct kobject *kobj,
+					  struct attribute *a, int n)
 {
 	struct device *dev = kobj_to_dev(kobj);
 
@@ -421,9 +511,9 @@ static umode_t sriov_attrs_are_visible(struct kobject *kobj,
 	return a->mode;
 }
 
-const struct attribute_group sriov_dev_attr_group = {
-	.attrs = sriov_dev_attrs,
-	.is_visible = sriov_attrs_are_visible,
+const struct attribute_group sriov_pf_dev_attr_group = {
+	.attrs = sriov_pf_dev_attrs,
+	.is_visible = sriov_pf_attrs_are_visible,
 };
 
 int __weak pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index f8afd54ca3e1..a6b8fbbba6d2 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -1567,7 +1567,8 @@ static const struct attribute_group *pci_dev_attr_groups[] = {
 	&pci_dev_attr_group,
 	&pci_dev_hp_attr_group,
 #ifdef CONFIG_PCI_IOV
-	&sriov_dev_attr_group,
+	&sriov_pf_dev_attr_group,
+	&sriov_vf_dev_attr_group,
 #endif
 	&pci_bridge_attr_group,
 	&pcie_dev_attr_group,
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index ef7c4661314f..afb87b917f07 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -501,7 +501,8 @@ void pci_iov_update_resource(struct pci_dev *dev, int resno);
 resource_size_t pci_sriov_resource_alignment(struct pci_dev *dev, int resno);
 void pci_restore_iov_state(struct pci_dev *dev);
 int pci_iov_bus_range(struct pci_bus *bus);
-extern const struct attribute_group sriov_dev_attr_group;
+extern const struct attribute_group sriov_pf_dev_attr_group;
+extern const struct attribute_group sriov_vf_dev_attr_group;
 #else
 static inline int pci_iov_init(struct pci_dev *dev)
 {
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 86c799c97b77..9b575a676888 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -856,6 +856,12 @@ struct module;
  *		e.g. drivers/net/e100.c.
  * @sriov_configure: Optional driver callback to allow configuration of
  *		number of VFs to enable via sysfs "sriov_numvfs" file.
+ * @sriov_set_msix_vec_count: PF Driver callback to change number of MSI-X
+ *              vectors on a VF. Triggered via sysfs "sriov_vf_msix_count".
+ *              This will change MSI-X Table Size in the VF Message Control
+ *              registers.
+ * @sriov_get_vf_total_msix: PF driver callback to get the total number of
+ *              MSI-X vectors available for distribution to the VFs.
  * @err_handler: See Documentation/PCI/pci-error-recovery.rst
  * @groups:	Sysfs attribute groups.
  * @driver:	Driver model structure.
@@ -871,6 +877,8 @@ struct pci_driver {
 	int  (*resume)(struct pci_dev *dev);	/* Device woken up */
 	void (*shutdown)(struct pci_dev *dev);
 	int  (*sriov_configure)(struct pci_dev *dev, int num_vfs); /* On PF */
+	int  (*sriov_set_msix_vec_count)(struct pci_dev *vf, int msix_vec_count); /* On PF */
+	u32  (*sriov_get_vf_total_msix)(struct pci_dev *pf);
 	const struct pci_error_handlers *err_handler;
 	const struct attribute_group **groups;
 	struct device_driver	driver;
-- 
cgit v1.2.3


From 0b989c1e37053196676b2238f82195bd5a339d58 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Sun, 14 Mar 2021 14:42:54 +0200
Subject: net/mlx5: Add dynamic MSI-X capabilities bits

These new fields declare the number of MSI-X vectors that can be
allocated on the VF through PF configuration.

The value must be in the range defined by min_dynamic_vf_msix_table_size
and max_dynamic_vf_msix_table_size.

The driver should continue to query its MSI-X table through the PCI
configuration header.

Link: https://lore.kernel.org/linux-pci/20210314124256.70253-3-leon@kernel.org
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 3ee7a86f39e4..432290b58a0b 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1683,7 +1683,16 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8	   reserved_at_6e0[0x10];
 	u8	   sf_base_id[0x10];
 
-	u8	   reserved_at_700[0x80];
+	u8	   reserved_at_700[0x8];
+	u8	   num_total_dynamic_vf_msix[0x18];
+	u8	   reserved_at_720[0x14];
+	u8	   dynamic_msix_table_size[0xc];
+	u8	   reserved_at_740[0xc];
+	u8	   min_dynamic_vf_msix_table_size[0x4];
+	u8	   reserved_at_750[0x4];
+	u8	   max_dynamic_vf_msix_table_size[0xc];
+
+	u8	   reserved_at_760[0x20];
 	u8	   vhca_tunnel_commands[0x40];
 	u8	   reserved_at_7c0[0x40];
 };
-- 
cgit v1.2.3


From a72232eabdfcfe365a05a3eb392288b78d25a5ca Mon Sep 17 00:00:00 2001
From: Vipin Sharma <vipinsh@google.com>
Date: Mon, 29 Mar 2021 21:42:04 -0700
Subject: cgroup: Add misc cgroup controller

The Miscellaneous cgroup provides the resource limiting and tracking
mechanism for the scalar resources which cannot be abstracted like the
other cgroup resources. The controller is enabled by the
CONFIG_CGROUP_MISC config option.

A resource can be added to the controller via enum misc_res_type{} in
the include/linux/misc_cgroup.h file and the corresponding name via
misc_res_name[] in the kernel/cgroup/misc.c file. The provider of the
resource must set its capacity prior to using the resource by calling
misc_cg_set_capacity().

Once a capacity is set then the resource usage can be updated using
charge and uncharge APIs. All of the APIs to interact with misc
controller are in include/linux/misc_cgroup.h.

Miscellaneous controller provides 3 interface files. If two misc
resources (res_a and res_b) are registered then:

misc.capacity
A read-only flat-keyed file shown only in the root cgroup.  It shows
miscellaneous scalar resources available on the platform along with
their quantities::

    $ cat misc.capacity
    res_a 50
    res_b 10

misc.current
A read-only flat-keyed file shown in the non-root cgroups.  It shows
the current usage of the resources in the cgroup and its children::

    $ cat misc.current
    res_a 3
    res_b 0

misc.max
A read-write flat-keyed file shown in the non-root cgroups. It holds the
allowed maximum usage of the resources in the cgroup and its children::

    $ cat misc.max
    res_a max
    res_b 4

Limit can be set by::

    # echo res_a 1 > misc.max

Limit can be set to max by::

    # echo res_a max > misc.max

Limits can be set higher than the capacity value in the misc.capacity
file.

Signed-off-by: Vipin Sharma <vipinsh@google.com>
Reviewed-by: David Rientjes <rientjes@google.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup_subsys.h |   4 +
 include/linux/misc_cgroup.h   | 126 +++++++++++++
 init/Kconfig                  |  14 ++
 kernel/cgroup/Makefile        |   1 +
 kernel/cgroup/misc.c          | 401 ++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 546 insertions(+)
 create mode 100644 include/linux/misc_cgroup.h
 create mode 100644 kernel/cgroup/misc.c

(limited to 'include/linux')

diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index acb77dcff3b4..445235487230 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -61,6 +61,10 @@ SUBSYS(pids)
 SUBSYS(rdma)
 #endif
 
+#if IS_ENABLED(CONFIG_CGROUP_MISC)
+SUBSYS(misc)
+#endif
+
 /*
  * The following subsystems are not supported on the default hierarchy.
  */
diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h
new file mode 100644
index 000000000000..1195d36558b4
--- /dev/null
+++ b/include/linux/misc_cgroup.h
@@ -0,0 +1,126 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Miscellaneous cgroup controller.
+ *
+ * Copyright 2020 Google LLC
+ * Author: Vipin Sharma <vipinsh@google.com>
+ */
+#ifndef _MISC_CGROUP_H_
+#define _MISC_CGROUP_H_
+
+/**
+ * Types of misc cgroup entries supported by the host.
+ */
+enum misc_res_type {
+	MISC_CG_RES_TYPES
+};
+
+struct misc_cg;
+
+#ifdef CONFIG_CGROUP_MISC
+
+#include <linux/cgroup.h>
+
+/**
+ * struct misc_res: Per cgroup per misc type resource
+ * @max: Maximum limit on the resource.
+ * @usage: Current usage of the resource.
+ * @failed: True if charged failed for the resource in a cgroup.
+ */
+struct misc_res {
+	unsigned long max;
+	atomic_long_t usage;
+	bool failed;
+};
+
+/**
+ * struct misc_cg - Miscellaneous controller's cgroup structure.
+ * @css: cgroup subsys state object.
+ * @res: Array of misc resources usage in the cgroup.
+ */
+struct misc_cg {
+	struct cgroup_subsys_state css;
+	struct misc_res res[MISC_CG_RES_TYPES];
+};
+
+unsigned long misc_cg_res_total_usage(enum misc_res_type type);
+int misc_cg_set_capacity(enum misc_res_type type, unsigned long capacity);
+int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
+		       unsigned long amount);
+void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg,
+		      unsigned long amount);
+
+/**
+ * css_misc() - Get misc cgroup from the css.
+ * @css: cgroup subsys state object.
+ *
+ * Context: Any context.
+ * Return:
+ * * %NULL - If @css is null.
+ * * struct misc_cg* - misc cgroup pointer of the passed css.
+ */
+static inline struct misc_cg *css_misc(struct cgroup_subsys_state *css)
+{
+	return css ? container_of(css, struct misc_cg, css) : NULL;
+}
+
+/*
+ * get_current_misc_cg() - Find and get the misc cgroup of the current task.
+ *
+ * Returned cgroup has its ref count increased by 1. Caller must call
+ * put_misc_cg() to return the reference.
+ *
+ * Return: Misc cgroup to which the current task belongs to.
+ */
+static inline struct misc_cg *get_current_misc_cg(void)
+{
+	return css_misc(task_get_css(current, misc_cgrp_id));
+}
+
+/*
+ * put_misc_cg() - Put the misc cgroup and reduce its ref count.
+ * @cg - cgroup to put.
+ */
+static inline void put_misc_cg(struct misc_cg *cg)
+{
+	if (cg)
+		css_put(&cg->css);
+}
+
+#else /* !CONFIG_CGROUP_MISC */
+
+unsigned long misc_cg_res_total_usage(enum misc_res_type type)
+{
+	return 0;
+}
+
+static inline int misc_cg_set_capacity(enum misc_res_type type,
+				       unsigned long capacity)
+{
+	return 0;
+}
+
+static inline int misc_cg_try_charge(enum misc_res_type type,
+				     struct misc_cg *cg,
+				     unsigned long amount)
+{
+	return 0;
+}
+
+static inline void misc_cg_uncharge(enum misc_res_type type,
+				    struct misc_cg *cg,
+				    unsigned long amount)
+{
+}
+
+static inline struct misc_cg *get_current_misc_cg(void)
+{
+	return NULL;
+}
+
+static inline void put_misc_cg(struct misc_cg *cg)
+{
+}
+
+#endif /* CONFIG_CGROUP_MISC */
+#endif /* _MISC_CGROUP_H_ */
diff --git a/init/Kconfig b/init/Kconfig
index 5f5c776ef192..18ece598a297 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1110,6 +1110,20 @@ config CGROUP_BPF
 	  BPF_CGROUP_INET_INGRESS will be executed on the ingress path of
 	  inet sockets.
 
+config CGROUP_MISC
+	bool "Misc resource controller"
+	default n
+	help
+	  Provides a controller for miscellaneous resources on a host.
+
+	  Miscellaneous scalar resources are the resources on the host system
+	  which cannot be abstracted like the other cgroups. This controller
+	  tracks and limits the miscellaneous resources used by a process
+	  attached to a cgroup hierarchy.
+
+	  For more information, please check misc cgroup section in
+	  /Documentation/admin-guide/cgroup-v2.rst.
+
 config CGROUP_DEBUG
 	bool "Debug controller"
 	default n
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index 5d7a76bfbbb7..12f8457ad1f9 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -5,4 +5,5 @@ obj-$(CONFIG_CGROUP_FREEZER) += legacy_freezer.o
 obj-$(CONFIG_CGROUP_PIDS) += pids.o
 obj-$(CONFIG_CGROUP_RDMA) += rdma.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
+obj-$(CONFIG_CGROUP_MISC) += misc.o
 obj-$(CONFIG_CGROUP_DEBUG) += debug.o
diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c
new file mode 100644
index 000000000000..4352bc4a3bd5
--- /dev/null
+++ b/kernel/cgroup/misc.c
@@ -0,0 +1,401 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Miscellaneous cgroup controller
+ *
+ * Copyright 2020 Google LLC
+ * Author: Vipin Sharma <vipinsh@google.com>
+ */
+
+#include <linux/limits.h>
+#include <linux/cgroup.h>
+#include <linux/errno.h>
+#include <linux/atomic.h>
+#include <linux/slab.h>
+#include <linux/misc_cgroup.h>
+
+#define MAX_STR "max"
+#define MAX_NUM ULONG_MAX
+
+/* Miscellaneous res name, keep it in sync with enum misc_res_type */
+static const char *const misc_res_name[] = {
+};
+
+/* Root misc cgroup */
+static struct misc_cg root_cg;
+
+/*
+ * Miscellaneous resources capacity for the entire machine. 0 capacity means
+ * resource is not initialized or not present in the host.
+ *
+ * root_cg.max and capacity are independent of each other. root_cg.max can be
+ * more than the actual capacity. We are using Limits resource distribution
+ * model of cgroup for miscellaneous controller.
+ */
+static unsigned long misc_res_capacity[MISC_CG_RES_TYPES];
+
+/**
+ * parent_misc() - Get the parent of the passed misc cgroup.
+ * @cgroup: cgroup whose parent needs to be fetched.
+ *
+ * Context: Any context.
+ * Return:
+ * * struct misc_cg* - Parent of the @cgroup.
+ * * %NULL - If @cgroup is null or the passed cgroup does not have a parent.
+ */
+static struct misc_cg *parent_misc(struct misc_cg *cgroup)
+{
+	return cgroup ? css_misc(cgroup->css.parent) : NULL;
+}
+
+/**
+ * valid_type() - Check if @type is valid or not.
+ * @type: misc res type.
+ *
+ * Context: Any context.
+ * Return:
+ * * true - If valid type.
+ * * false - If not valid type.
+ */
+static inline bool valid_type(enum misc_res_type type)
+{
+	return type >= 0 && type < MISC_CG_RES_TYPES;
+}
+
+/**
+ * misc_cg_res_total_usage() - Get the current total usage of the resource.
+ * @type: misc res type.
+ *
+ * Context: Any context.
+ * Return: Current total usage of the resource.
+ */
+unsigned long misc_cg_res_total_usage(enum misc_res_type type)
+{
+	if (valid_type(type))
+		return atomic_long_read(&root_cg.res[type].usage);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(misc_cg_res_total_usage);
+
+/**
+ * misc_cg_set_capacity() - Set the capacity of the misc cgroup res.
+ * @type: Type of the misc res.
+ * @capacity: Supported capacity of the misc res on the host.
+ *
+ * If capacity is 0 then the charging a misc cgroup fails for that type.
+ *
+ * Context: Any context.
+ * Return:
+ * * %0 - Successfully registered the capacity.
+ * * %-EINVAL - If @type is invalid.
+ */
+int misc_cg_set_capacity(enum misc_res_type type, unsigned long capacity)
+{
+	if (!valid_type(type))
+		return -EINVAL;
+
+	WRITE_ONCE(misc_res_capacity[type], capacity);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(misc_cg_set_capacity);
+
+/**
+ * misc_cg_cancel_charge() - Cancel the charge from the misc cgroup.
+ * @type: Misc res type in misc cg to cancel the charge from.
+ * @cg: Misc cgroup to cancel charge from.
+ * @amount: Amount to cancel.
+ *
+ * Context: Any context.
+ */
+static void misc_cg_cancel_charge(enum misc_res_type type, struct misc_cg *cg,
+				  unsigned long amount)
+{
+	WARN_ONCE(atomic_long_add_negative(-amount, &cg->res[type].usage),
+		  "misc cgroup resource %s became less than 0",
+		  misc_res_name[type]);
+}
+
+/**
+ * misc_cg_try_charge() - Try charging the misc cgroup.
+ * @type: Misc res type to charge.
+ * @cg: Misc cgroup which will be charged.
+ * @amount: Amount to charge.
+ *
+ * Charge @amount to the misc cgroup. Caller must use the same cgroup during
+ * the uncharge call.
+ *
+ * Context: Any context.
+ * Return:
+ * * %0 - If successfully charged.
+ * * -EINVAL - If @type is invalid or misc res has 0 capacity.
+ * * -EBUSY - If max limit will be crossed or total usage will be more than the
+ *	      capacity.
+ */
+int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
+		       unsigned long amount)
+{
+	struct misc_cg *i, *j;
+	int ret;
+	struct misc_res *res;
+	int new_usage;
+
+	if (!(valid_type(type) && cg && READ_ONCE(misc_res_capacity[type])))
+		return -EINVAL;
+
+	if (!amount)
+		return 0;
+
+	for (i = cg; i; i = parent_misc(i)) {
+		res = &i->res[type];
+
+		new_usage = atomic_long_add_return(amount, &res->usage);
+		if (new_usage > READ_ONCE(res->max) ||
+		    new_usage > READ_ONCE(misc_res_capacity[type])) {
+			if (!res->failed) {
+				pr_info("cgroup: charge rejected by the misc controller for %s resource in ",
+					misc_res_name[type]);
+				pr_cont_cgroup_path(i->css.cgroup);
+				pr_cont("\n");
+				res->failed = true;
+			}
+			ret = -EBUSY;
+			goto err_charge;
+		}
+	}
+	return 0;
+
+err_charge:
+	for (j = cg; j != i; j = parent_misc(j))
+		misc_cg_cancel_charge(type, j, amount);
+	misc_cg_cancel_charge(type, i, amount);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(misc_cg_try_charge);
+
+/**
+ * misc_cg_uncharge() - Uncharge the misc cgroup.
+ * @type: Misc res type which was charged.
+ * @cg: Misc cgroup which will be uncharged.
+ * @amount: Charged amount.
+ *
+ * Context: Any context.
+ */
+void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg,
+		      unsigned long amount)
+{
+	struct misc_cg *i;
+
+	if (!(amount && valid_type(type) && cg))
+		return;
+
+	for (i = cg; i; i = parent_misc(i))
+		misc_cg_cancel_charge(type, i, amount);
+}
+EXPORT_SYMBOL_GPL(misc_cg_uncharge);
+
+/**
+ * misc_cg_max_show() - Show the misc cgroup max limit.
+ * @sf: Interface file
+ * @v: Arguments passed
+ *
+ * Context: Any context.
+ * Return: 0 to denote successful print.
+ */
+static int misc_cg_max_show(struct seq_file *sf, void *v)
+{
+	int i;
+	struct misc_cg *cg = css_misc(seq_css(sf));
+	unsigned long max;
+
+	for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+		if (READ_ONCE(misc_res_capacity[i])) {
+			max = READ_ONCE(cg->res[i].max);
+			if (max == MAX_NUM)
+				seq_printf(sf, "%s max\n", misc_res_name[i]);
+			else
+				seq_printf(sf, "%s %lu\n", misc_res_name[i],
+					   max);
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * misc_cg_max_write() - Update the maximum limit of the cgroup.
+ * @of: Handler for the file.
+ * @buf: Data from the user. It should be either "max", 0, or a positive
+ *	 integer.
+ * @nbytes: Number of bytes of the data.
+ * @off: Offset in the file.
+ *
+ * User can pass data like:
+ * echo sev 23 > misc.max, OR
+ * echo sev max > misc.max
+ *
+ * Context: Any context.
+ * Return:
+ * * >= 0 - Number of bytes processed in the input.
+ * * -EINVAL - If buf is not valid.
+ * * -ERANGE - If number is bigger than the unsigned long capacity.
+ */
+static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf,
+				 size_t nbytes, loff_t off)
+{
+	struct misc_cg *cg;
+	unsigned long max;
+	int ret = 0, i;
+	enum misc_res_type type = MISC_CG_RES_TYPES;
+	char *token;
+
+	buf = strstrip(buf);
+	token = strsep(&buf, " ");
+
+	if (!token || !buf)
+		return -EINVAL;
+
+	for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+		if (!strcmp(misc_res_name[i], token)) {
+			type = i;
+			break;
+		}
+	}
+
+	if (type == MISC_CG_RES_TYPES)
+		return -EINVAL;
+
+	if (!strcmp(MAX_STR, buf)) {
+		max = MAX_NUM;
+	} else {
+		ret = kstrtoul(buf, 0, &max);
+		if (ret)
+			return ret;
+	}
+
+	cg = css_misc(of_css(of));
+
+	if (READ_ONCE(misc_res_capacity[type]))
+		WRITE_ONCE(cg->res[type].max, max);
+	else
+		ret = -EINVAL;
+
+	return ret ? ret : nbytes;
+}
+
+/**
+ * misc_cg_current_show() - Show the current usage of the misc cgroup.
+ * @sf: Interface file
+ * @v: Arguments passed
+ *
+ * Context: Any context.
+ * Return: 0 to denote successful print.
+ */
+static int misc_cg_current_show(struct seq_file *sf, void *v)
+{
+	int i;
+	unsigned long usage;
+	struct misc_cg *cg = css_misc(seq_css(sf));
+
+	for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+		usage = atomic_long_read(&cg->res[i].usage);
+		if (READ_ONCE(misc_res_capacity[i]) || usage)
+			seq_printf(sf, "%s %lu\n", misc_res_name[i], usage);
+	}
+
+	return 0;
+}
+
+/**
+ * misc_cg_capacity_show() - Show the total capacity of misc res on the host.
+ * @sf: Interface file
+ * @v: Arguments passed
+ *
+ * Only present in the root cgroup directory.
+ *
+ * Context: Any context.
+ * Return: 0 to denote successful print.
+ */
+static int misc_cg_capacity_show(struct seq_file *sf, void *v)
+{
+	int i;
+	unsigned long cap;
+
+	for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+		cap = READ_ONCE(misc_res_capacity[i]);
+		if (cap)
+			seq_printf(sf, "%s %lu\n", misc_res_name[i], cap);
+	}
+
+	return 0;
+}
+
+/* Misc cgroup interface files */
+static struct cftype misc_cg_files[] = {
+	{
+		.name = "max",
+		.write = misc_cg_max_write,
+		.seq_show = misc_cg_max_show,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	{
+		.name = "current",
+		.seq_show = misc_cg_current_show,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	{
+		.name = "capacity",
+		.seq_show = misc_cg_capacity_show,
+		.flags = CFTYPE_ONLY_ON_ROOT,
+	},
+	{}
+};
+
+/**
+ * misc_cg_alloc() - Allocate misc cgroup.
+ * @parent_css: Parent cgroup.
+ *
+ * Context: Process context.
+ * Return:
+ * * struct cgroup_subsys_state* - css of the allocated cgroup.
+ * * ERR_PTR(-ENOMEM) - No memory available to allocate.
+ */
+static struct cgroup_subsys_state *
+misc_cg_alloc(struct cgroup_subsys_state *parent_css)
+{
+	enum misc_res_type i;
+	struct misc_cg *cg;
+
+	if (!parent_css) {
+		cg = &root_cg;
+	} else {
+		cg = kzalloc(sizeof(*cg), GFP_KERNEL);
+		if (!cg)
+			return ERR_PTR(-ENOMEM);
+	}
+
+	for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+		WRITE_ONCE(cg->res[i].max, MAX_NUM);
+		atomic_long_set(&cg->res[i].usage, 0);
+	}
+
+	return &cg->css;
+}
+
+/**
+ * misc_cg_free() - Free the misc cgroup.
+ * @css: cgroup subsys object.
+ *
+ * Context: Any context.
+ */
+static void misc_cg_free(struct cgroup_subsys_state *css)
+{
+	kfree(css_misc(css));
+}
+
+/* Cgroup controller callbacks */
+struct cgroup_subsys misc_cgrp_subsys = {
+	.css_alloc = misc_cg_alloc,
+	.css_free = misc_cg_free,
+	.legacy_cftypes = misc_cg_files,
+	.dfl_cftypes = misc_cg_files,
+};
-- 
cgit v1.2.3


From 7aef27f0b2a8a58c28578d3e0caf3f27e1a1c39c Mon Sep 17 00:00:00 2001
From: Vipin Sharma <vipinsh@google.com>
Date: Mon, 29 Mar 2021 21:42:06 -0700
Subject: svm/sev: Register SEV and SEV-ES ASIDs to the misc controller

Secure Encrypted Virtualization (SEV) and Secure Encrypted
Virtualization - Encrypted State (SEV-ES) ASIDs are used to encrypt KVMs
on AMD platforms. These ASIDs are available in limited quantities on
a host.

Register their capacity and usage to the misc controller for tracking
via cgroups.

Signed-off-by: Vipin Sharma <vipinsh@google.com>
Reviewed-by: David Rientjes <rientjes@google.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 arch/x86/kvm/svm/sev.c      | 70 ++++++++++++++++++++++++++++++++++++++-------
 arch/x86/kvm/svm/svm.h      |  1 +
 include/linux/misc_cgroup.h |  6 ++++
 kernel/cgroup/misc.c        |  6 ++++
 4 files changed, 73 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 874ea309279f..214eefb20414 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -14,6 +14,7 @@
 #include <linux/psp-sev.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
+#include <linux/misc_cgroup.h>
 #include <linux/processor.h>
 #include <linux/trace_events.h>
 #include <asm/fpu/internal.h>
@@ -28,6 +29,21 @@
 
 #define __ex(x) __kvm_handle_fault_on_reboot(x)
 
+#ifndef CONFIG_KVM_AMD_SEV
+/*
+ * When this config is not defined, SEV feature is not supported and APIs in
+ * this file are not used but this file still gets compiled into the KVM AMD
+ * module.
+ *
+ * We will not have MISC_CG_RES_SEV and MISC_CG_RES_SEV_ES entries in the enum
+ * misc_res_type {} defined in linux/misc_cgroup.h.
+ *
+ * Below macros allow compilation to succeed.
+ */
+#define MISC_CG_RES_SEV MISC_CG_RES_TYPES
+#define MISC_CG_RES_SEV_ES MISC_CG_RES_TYPES
+#endif
+
 static u8 sev_enc_bit;
 static int sev_flush_asids(void);
 static DECLARE_RWSEM(sev_deactivate_lock);
@@ -89,8 +105,19 @@ static bool __sev_recycle_asids(int min_asid, int max_asid)
 
 static int sev_asid_new(struct kvm_sev_info *sev)
 {
-	int pos, min_asid, max_asid;
+	int pos, min_asid, max_asid, ret;
 	bool retry = true;
+	enum misc_res_type type;
+
+	type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
+	WARN_ON(sev->misc_cg);
+	sev->misc_cg = get_current_misc_cg();
+	ret = misc_cg_try_charge(type, sev->misc_cg, 1);
+	if (ret) {
+		put_misc_cg(sev->misc_cg);
+		sev->misc_cg = NULL;
+		return ret;
+	}
 
 	mutex_lock(&sev_bitmap_lock);
 
@@ -108,7 +135,8 @@ again:
 			goto again;
 		}
 		mutex_unlock(&sev_bitmap_lock);
-		return -EBUSY;
+		ret = -EBUSY;
+		goto e_uncharge;
 	}
 
 	__set_bit(pos, sev_asid_bitmap);
@@ -116,6 +144,11 @@ again:
 	mutex_unlock(&sev_bitmap_lock);
 
 	return pos + 1;
+e_uncharge:
+	misc_cg_uncharge(type, sev->misc_cg, 1);
+	put_misc_cg(sev->misc_cg);
+	sev->misc_cg = NULL;
+	return ret;
 }
 
 static int sev_get_asid(struct kvm *kvm)
@@ -125,14 +158,15 @@ static int sev_get_asid(struct kvm *kvm)
 	return sev->asid;
 }
 
-static void sev_asid_free(int asid)
+static void sev_asid_free(struct kvm_sev_info *sev)
 {
 	struct svm_cpu_data *sd;
 	int cpu, pos;
+	enum misc_res_type type;
 
 	mutex_lock(&sev_bitmap_lock);
 
-	pos = asid - 1;
+	pos = sev->asid - 1;
 	__set_bit(pos, sev_reclaim_asid_bitmap);
 
 	for_each_possible_cpu(cpu) {
@@ -141,6 +175,11 @@ static void sev_asid_free(int asid)
 	}
 
 	mutex_unlock(&sev_bitmap_lock);
+
+	type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
+	misc_cg_uncharge(type, sev->misc_cg, 1);
+	put_misc_cg(sev->misc_cg);
+	sev->misc_cg = NULL;
 }
 
 static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
@@ -188,19 +227,20 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
 	asid = sev_asid_new(sev);
 	if (asid < 0)
 		return ret;
+	sev->asid = asid;
 
 	ret = sev_platform_init(&argp->error);
 	if (ret)
 		goto e_free;
 
 	sev->active = true;
-	sev->asid = asid;
 	INIT_LIST_HEAD(&sev->regions_list);
 
 	return 0;
 
 e_free:
-	sev_asid_free(asid);
+	sev_asid_free(sev);
+	sev->asid = 0;
 	return ret;
 }
 
@@ -1315,12 +1355,12 @@ void sev_vm_destroy(struct kvm *kvm)
 	mutex_unlock(&kvm->lock);
 
 	sev_unbind_asid(kvm, sev->handle);
-	sev_asid_free(sev->asid);
+	sev_asid_free(sev);
 }
 
 void __init sev_hardware_setup(void)
 {
-	unsigned int eax, ebx, ecx, edx;
+	unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count;
 	bool sev_es_supported = false;
 	bool sev_supported = false;
 
@@ -1352,7 +1392,11 @@ void __init sev_hardware_setup(void)
 	if (!sev_reclaim_asid_bitmap)
 		goto out;
 
-	pr_info("SEV supported: %u ASIDs\n", max_sev_asid - min_sev_asid + 1);
+	sev_asid_count = max_sev_asid - min_sev_asid + 1;
+	if (misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count))
+		goto out;
+
+	pr_info("SEV supported: %u ASIDs\n", sev_asid_count);
 	sev_supported = true;
 
 	/* SEV-ES support requested? */
@@ -1367,7 +1411,11 @@ void __init sev_hardware_setup(void)
 	if (min_sev_asid == 1)
 		goto out;
 
-	pr_info("SEV-ES supported: %u ASIDs\n", min_sev_asid - 1);
+	sev_es_asid_count = min_sev_asid - 1;
+	if (misc_cg_set_capacity(MISC_CG_RES_SEV_ES, sev_es_asid_count))
+		goto out;
+
+	pr_info("SEV-ES supported: %u ASIDs\n", sev_es_asid_count);
 	sev_es_supported = true;
 
 out:
@@ -1382,6 +1430,8 @@ void sev_hardware_teardown(void)
 
 	bitmap_free(sev_asid_bitmap);
 	bitmap_free(sev_reclaim_asid_bitmap);
+	misc_cg_set_capacity(MISC_CG_RES_SEV, 0);
+	misc_cg_set_capacity(MISC_CG_RES_SEV_ES, 0);
 
 	sev_flush_asids();
 }
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 39e071fdab0c..9806aaebc37f 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -65,6 +65,7 @@ struct kvm_sev_info {
 	unsigned long pages_locked; /* Number of pages locked */
 	struct list_head regions_list;  /* List of registered regions */
 	u64 ap_jump_table;	/* SEV-ES AP Jump Table address */
+	struct misc_cg *misc_cg; /* For misc cgroup accounting */
 };
 
 struct kvm_svm {
diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h
index 1195d36558b4..c5af592481c0 100644
--- a/include/linux/misc_cgroup.h
+++ b/include/linux/misc_cgroup.h
@@ -12,6 +12,12 @@
  * Types of misc cgroup entries supported by the host.
  */
 enum misc_res_type {
+#ifdef CONFIG_KVM_AMD_SEV
+	/* AMD SEV ASIDs resource */
+	MISC_CG_RES_SEV,
+	/* AMD SEV-ES ASIDs resource */
+	MISC_CG_RES_SEV_ES,
+#endif
 	MISC_CG_RES_TYPES
 };
 
diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c
index 4352bc4a3bd5..ec02d963cad1 100644
--- a/kernel/cgroup/misc.c
+++ b/kernel/cgroup/misc.c
@@ -18,6 +18,12 @@
 
 /* Miscellaneous res name, keep it in sync with enum misc_res_type */
 static const char *const misc_res_name[] = {
+#ifdef CONFIG_KVM_AMD_SEV
+	/* AMD SEV ASIDs resource */
+	"sev",
+	/* AMD SEV-ES ASIDs resource */
+	"sev_es",
+#endif
 };
 
 /* Root misc cgroup */
-- 
cgit v1.2.3


From dd3f4e4972f146a685930ccfed95e4e1d13d952a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 4 Apr 2021 18:29:37 -0400
Subject: cgroup: misc: mark dummy misc_cg_res_total_usage() static inline

The dummy implementation was missing "static inline", triggering the
following compile warning on llvm.

   In file included from arch/x86/kvm/svm/sev.c:17:
>> include/linux/misc_cgroup.h:98:15: warning: no previous prototype for function 'misc_cg_res_total_usage' [-Wmissing-prototypes]
   unsigned long misc_cg_res_total_usage(enum misc_res_type type)
                 ^
   include/linux/misc_cgroup.h:98:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
   unsigned long misc_cg_res_total_usage(enum misc_res_type type)
   ^
   static
   1 warning generated.

Add it.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: kernel test robot <lkp@intel.com>
---
 include/linux/misc_cgroup.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h
index c5af592481c0..da2367e2ac1e 100644
--- a/include/linux/misc_cgroup.h
+++ b/include/linux/misc_cgroup.h
@@ -95,7 +95,7 @@ static inline void put_misc_cg(struct misc_cg *cg)
 
 #else /* !CONFIG_CGROUP_MISC */
 
-unsigned long misc_cg_res_total_usage(enum misc_res_type type)
+static inline unsigned long misc_cg_res_total_usage(enum misc_res_type type)
 {
 	return 0;
 }
-- 
cgit v1.2.3


From 25faa935f9e0bd5aba34a820defb982d43bb4a77 Mon Sep 17 00:00:00 2001
From: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Date: Tue, 9 Feb 2021 17:58:43 +0800
Subject: power: supply: core: Use true and false for bool variable

Fix the following coccicheck warning:

./include/linux/power_supply.h:507:9-10: WARNING: return of 0/1 in
function 'power_supply_is_watt_property' with return type bool.

./include/linux/power_supply.h:479:9-10: WARNING: return of 0/1 in
function 'power_supply_is_amp_property' with return type bool.

Reported-by: Abaci Robot<abaci@linux.alibaba.com>
Signed-off-by: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 include/linux/power_supply.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index 6e776be5bfa0..174a534fd907 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -482,12 +482,12 @@ static inline bool power_supply_is_amp_property(enum power_supply_property psp)
 	case POWER_SUPPLY_PROP_CURRENT_NOW:
 	case POWER_SUPPLY_PROP_CURRENT_AVG:
 	case POWER_SUPPLY_PROP_CURRENT_BOOT:
-		return 1;
+		return true;
 	default:
 		break;
 	}
 
-	return 0;
+	return false;
 }
 
 static inline bool power_supply_is_watt_property(enum power_supply_property psp)
@@ -510,12 +510,12 @@ static inline bool power_supply_is_watt_property(enum power_supply_property psp)
 	case POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE:
 	case POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE_MAX:
 	case POWER_SUPPLY_PROP_POWER_NOW:
-		return 1;
+		return true;
 	default:
 		break;
 	}
 
-	return 0;
+	return false;
 }
 
 #ifdef CONFIG_POWER_SUPPLY_HWMON
-- 
cgit v1.2.3


From e588fead04ec51ad9ede7010676de19dcaa50b71 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 29 Mar 2021 18:12:06 +0300
Subject: software node: Introduce SOFTWARE_NODE_REFERENCE() helper macro

This is useful for assigning a software node reference with arguments
in a common way. Moreover, we already have a couple of users that
may be converted. In fact, one of them is converted right here
to use the helper.

Tested-by: Daniel Scally <djrscally@gmail.com>
Reviewed-by: Daniel Scally <djrscally@gmail.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20210329151207.36619-5-andriy.shevchenko@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/test/property-entry-test.c | 11 ++---------
 include/linux/property.h                | 13 ++++++++-----
 2 files changed, 10 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/test/property-entry-test.c b/drivers/base/test/property-entry-test.c
index 3a4f755c483c..1106fedcceed 100644
--- a/drivers/base/test/property-entry-test.c
+++ b/drivers/base/test/property-entry-test.c
@@ -412,15 +412,8 @@ static void pe_test_reference(struct kunit *test)
 	};
 
 	static const struct software_node_ref_args refs[] = {
-		{
-			.node = &nodes[0],
-			.nargs = 0,
-		},
-		{
-			.node = &nodes[1],
-			.nargs = 2,
-			.args = { 3, 4 },
-		},
+		SOFTWARE_NODE_REFERENCE(&nodes[0]),
+		SOFTWARE_NODE_REFERENCE(&nodes[1], 3, 4),
 	};
 
 	const struct property_entry entries[] = {
diff --git a/include/linux/property.h b/include/linux/property.h
index dd4687b56239..0d876316e61d 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -254,6 +254,13 @@ struct software_node_ref_args {
 	u64 args[NR_FWNODE_REFERENCE_ARGS];
 };
 
+#define SOFTWARE_NODE_REFERENCE(_ref_, ...)			\
+(const struct software_node_ref_args) {				\
+	.node = _ref_,						\
+	.nargs = ARRAY_SIZE(((u64[]){ 0, ##__VA_ARGS__ })) - 1,	\
+	.args = { __VA_ARGS__ },				\
+}
+
 /**
  * struct property_entry - "Built-in" device property representation.
  * @name: Name of the property.
@@ -362,11 +369,7 @@ struct property_entry {
 	.name = _name_,							\
 	.length = sizeof(struct software_node_ref_args),		\
 	.type = DEV_PROP_REF,						\
-	{ .pointer = &(const struct software_node_ref_args) {		\
-		.node = _ref_,						\
-		.nargs = ARRAY_SIZE(((u64[]){ 0, ##__VA_ARGS__ })) - 1,	\
-		.args = { __VA_ARGS__ },				\
-	} },								\
+	{ .pointer = &SOFTWARE_NODE_REFERENCE(_ref_, ##__VA_ARGS__), },	\
 }
 
 struct property_entry *
-- 
cgit v1.2.3


From d737e5d418706abf32f6de68c3e09958516d422f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Tue, 9 Feb 2021 16:04:15 -0500
Subject: SUNRPC: Set TCP_CORK until the transmit queue is empty

When we have multiple RPC requests queued up, it makes sense to set the
TCP_CORK option while the transmit queue is non-empty.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xprt.h | 1 +
 net/sunrpc/xprt.c           | 2 ++
 net/sunrpc/xprtsock.c       | 5 ++++-
 3 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index d2e97ee802af..d81fe8b364d0 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -247,6 +247,7 @@ struct rpc_xprt {
 	struct rpc_task *	snd_task;	/* Task blocked in send */
 
 	struct list_head	xmit_queue;	/* Send queue */
+	atomic_long_t		xmit_queuelen;
 
 	struct svc_xprt		*bc_xprt;	/* NFSv4.1 backchannel */
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 691ccf8049a4..a853f75d4968 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1352,6 +1352,7 @@ xprt_request_enqueue_transmit(struct rpc_task *task)
 		list_add_tail(&req->rq_xmit, &xprt->xmit_queue);
 		INIT_LIST_HEAD(&req->rq_xmit2);
 out:
+		atomic_long_inc(&xprt->xmit_queuelen);
 		set_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate);
 		spin_unlock(&xprt->queue_lock);
 	}
@@ -1381,6 +1382,7 @@ xprt_request_dequeue_transmit_locked(struct rpc_task *task)
 		}
 	} else
 		list_del(&req->rq_xmit2);
+	atomic_long_dec(&req->rq_xprt->xmit_queuelen);
 }
 
 /**
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index e35760f238a4..a64f5ed1edb4 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1018,6 +1018,7 @@ static int xs_tcp_send_request(struct rpc_rqst *req)
 	 * to cope with writespace callbacks arriving _after_ we have
 	 * called sendmsg(). */
 	req->rq_xtime = ktime_get();
+	tcp_sock_set_cork(transport->inet, true);
 	while (1) {
 		status = xprt_sock_sendmsg(transport->sock, &msg, xdr,
 					   transport->xmit.offset, rm, &sent);
@@ -1032,6 +1033,8 @@ static int xs_tcp_send_request(struct rpc_rqst *req)
 		if (likely(req->rq_bytes_sent >= msglen)) {
 			req->rq_xmit_bytes_sent += transport->xmit.offset;
 			transport->xmit.offset = 0;
+			if (atomic_long_read(&xprt->xmit_queuelen) == 1)
+				tcp_sock_set_cork(transport->inet, false);
 			return 0;
 		}
 
@@ -2163,6 +2166,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 		}
 
 		xs_tcp_set_socket_timeouts(xprt, sock);
+		tcp_sock_set_nodelay(sk);
 
 		write_lock_bh(&sk->sk_callback_lock);
 
@@ -2177,7 +2181,6 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 
 		/* socket options */
 		sock_reset_flag(sk, SOCK_LINGER);
-		tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF;
 
 		xprt_clear_connected(xprt);
 
-- 
cgit v1.2.3


From aca01415e076aa96cca0f801f4420ee5c10c660d Mon Sep 17 00:00:00 2001
From: Bence Csókás <bence98@sch.bme.hu>
Date: Wed, 31 Mar 2021 19:19:20 +0000
Subject: i2c: Add I2C_AQ_NO_REP_START adapter quirk
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This quirk signifies that the adapter cannot do a repeated
START; it always issues a STOP condition after transfers.

Suggested-by: Wolfram Sang <wsa@kernel.org>
Signed-off-by: Bence Csókás <bence98@sch.bme.hu>
Signed-off-by: Wolfram Sang <wsa@kernel.org>
---
 include/linux/i2c.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 56622658b215..a670ae129f4b 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -687,6 +687,8 @@ struct i2c_adapter_quirks {
 #define I2C_AQ_NO_ZERO_LEN_READ		BIT(5)
 #define I2C_AQ_NO_ZERO_LEN_WRITE	BIT(6)
 #define I2C_AQ_NO_ZERO_LEN		(I2C_AQ_NO_ZERO_LEN_READ | I2C_AQ_NO_ZERO_LEN_WRITE)
+/* adapter cannot do repeated START */
+#define I2C_AQ_NO_REP_START		BIT(7)
 
 /*
  * i2c_adapter is the structure used to identify a physical i2c bus along
-- 
cgit v1.2.3


From eeb85a14ee3494febb85ccfbee0772eda0823b13 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@gmail.com>
Date: Mon, 5 Apr 2021 00:12:23 -0700
Subject: net: Allow to specify ifindex when device is moved to another
 namespace

Currently, we can specify ifindex on link creation. This change allows
specifying ifindex when a device is moved to another network namespace.

Even now, a device ifindex can be changed if there is another device
with the same ifindex in the target namespace. So this change doesn't
introduce completely new behavior, it adds more control to the process.

CRIU users want to restore containers with pre-created network devices.
A user will provide network devices and instructions where they have to
be restored, then CRIU will restore network namespaces and move devices
into them. The problem is that devices have to be restored with the same
indexes that they had before checkpoint/restore (C/R).

Cc: Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>
Suggested-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Andrei Vagin <avagin@gmail.com>
Reviewed-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/netvsc_drv.c |  2 +-
 include/linux/netdevice.h       |  3 ++-
 net/core/dev.c                  | 24 +++++++++++++++++-------
 net/core/rtnetlink.c            | 19 +++++++++++++++----
 net/ieee802154/core.c           |  4 ++--
 net/wireless/core.c             |  4 ++--
 6 files changed, 39 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 7349a70af083..8c0c70e1da77 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -2354,7 +2354,7 @@ static int netvsc_register_vf(struct net_device *vf_netdev)
 	 */
 	if (!net_eq(dev_net(ndev), dev_net(vf_netdev))) {
 		ret = dev_change_net_namespace(vf_netdev,
-					       dev_net(ndev), "eth%d");
+					       dev_net(ndev), "eth%d", 0);
 		if (ret)
 			netdev_err(vf_netdev,
 				   "could not move to same namespace as %s: %d\n",
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f57b70fc251f..b482236c0e99 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4026,7 +4026,8 @@ void __dev_notify_flags(struct net_device *, unsigned int old_flags,
 int dev_change_name(struct net_device *, const char *);
 int dev_set_alias(struct net_device *, const char *, size_t);
 int dev_get_alias(const struct net_device *, char *, size_t);
-int dev_change_net_namespace(struct net_device *, struct net *, const char *);
+int dev_change_net_namespace(struct net_device *dev, struct net *net,
+			     const char *pat, int new_ifindex);
 int __dev_set_mtu(struct net_device *, int);
 int dev_validate_mtu(struct net_device *dev, int mtu,
 		     struct netlink_ext_ack *extack);
diff --git a/net/core/dev.c b/net/core/dev.c
index b4c67a5be606..9d1a8fac793f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -11067,6 +11067,8 @@ EXPORT_SYMBOL(unregister_netdev);
  *	@net: network namespace
  *	@pat: If not NULL name pattern to try if the current device name
  *	      is already taken in the destination network namespace.
+ *	@new_ifindex: If not zero, specifies device index in the target
+ *	              namespace.
  *
  *	This function shuts down a device interface and moves it
  *	to a new network namespace. On success 0 is returned, on
@@ -11075,10 +11077,11 @@ EXPORT_SYMBOL(unregister_netdev);
  *	Callers must hold the rtnl semaphore.
  */
 
-int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
+int dev_change_net_namespace(struct net_device *dev, struct net *net,
+			     const char *pat, int new_ifindex)
 {
 	struct net *net_old = dev_net(dev);
-	int err, new_nsid, new_ifindex;
+	int err, new_nsid;
 
 	ASSERT_RTNL();
 
@@ -11109,6 +11112,11 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 			goto out;
 	}
 
+	/* Check that new_ifindex isn't used yet. */
+	err = -EBUSY;
+	if (new_ifindex && __dev_get_by_index(net, new_ifindex))
+		goto out;
+
 	/*
 	 * And now a mini version of register_netdevice unregister_netdevice.
 	 */
@@ -11136,10 +11144,12 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 
 	new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
 	/* If there is an ifindex conflict assign a new one */
-	if (__dev_get_by_index(net, dev->ifindex))
-		new_ifindex = dev_new_index(net);
-	else
-		new_ifindex = dev->ifindex;
+	if (!new_ifindex) {
+		if (__dev_get_by_index(net, dev->ifindex))
+			new_ifindex = dev_new_index(net);
+		else
+			new_ifindex = dev->ifindex;
+	}
 
 	rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
 			    new_ifindex);
@@ -11448,7 +11458,7 @@ static void __net_exit default_device_exit(struct net *net)
 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
 		if (__dev_get_by_name(&init_net, fb_name))
 			snprintf(fb_name, IFNAMSIZ, "dev%%d");
-		err = dev_change_net_namespace(dev, &init_net, fb_name);
+		err = dev_change_net_namespace(dev, &init_net, fb_name, 0);
 		if (err) {
 			pr_emerg("%s: failed to move %s to init_net: %d\n",
 				 __func__, dev->name, err);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 1bdcb33fb561..d51252afde0a 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2266,6 +2266,9 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[])
 			return -EINVAL;
 	}
 
+	if (tb[IFLA_NEW_IFINDEX] && nla_get_s32(tb[IFLA_NEW_IFINDEX]) <= 0)
+		return -EINVAL;
+
 	if (tb[IFLA_AF_SPEC]) {
 		struct nlattr *af;
 		int rem, err;
@@ -2603,14 +2606,22 @@ static int do_setlink(const struct sk_buff *skb,
 		return err;
 
 	if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_TARGET_NETNSID]) {
-		struct net *net = rtnl_link_get_net_capable(skb, dev_net(dev),
-							    tb, CAP_NET_ADMIN);
+		struct net *net;
+		int new_ifindex;
+
+		net = rtnl_link_get_net_capable(skb, dev_net(dev),
+						tb, CAP_NET_ADMIN);
 		if (IS_ERR(net)) {
 			err = PTR_ERR(net);
 			goto errout;
 		}
 
-		err = dev_change_net_namespace(dev, net, ifname);
+		if (tb[IFLA_NEW_IFINDEX])
+			new_ifindex = nla_get_s32(tb[IFLA_NEW_IFINDEX]);
+		else
+			new_ifindex = 0;
+
+		err = dev_change_net_namespace(dev, net, ifname, new_ifindex);
 		put_net(net);
 		if (err)
 			goto errout;
@@ -3452,7 +3463,7 @@ replay:
 	if (err < 0)
 		goto out_unregister;
 	if (link_net) {
-		err = dev_change_net_namespace(dev, dest_net, ifname);
+		err = dev_change_net_namespace(dev, dest_net, ifname, 0);
 		if (err < 0)
 			goto out_unregister;
 	}
diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c
index de259b5170ab..ec3068937fc3 100644
--- a/net/ieee802154/core.c
+++ b/net/ieee802154/core.c
@@ -205,7 +205,7 @@ int cfg802154_switch_netns(struct cfg802154_registered_device *rdev,
 		if (!wpan_dev->netdev)
 			continue;
 		wpan_dev->netdev->features &= ~NETIF_F_NETNS_LOCAL;
-		err = dev_change_net_namespace(wpan_dev->netdev, net, "wpan%d");
+		err = dev_change_net_namespace(wpan_dev->netdev, net, "wpan%d", 0);
 		if (err)
 			break;
 		wpan_dev->netdev->features |= NETIF_F_NETNS_LOCAL;
@@ -222,7 +222,7 @@ int cfg802154_switch_netns(struct cfg802154_registered_device *rdev,
 				continue;
 			wpan_dev->netdev->features &= ~NETIF_F_NETNS_LOCAL;
 			err = dev_change_net_namespace(wpan_dev->netdev, net,
-						       "wpan%d");
+						       "wpan%d", 0);
 			WARN_ON(err);
 			wpan_dev->netdev->features |= NETIF_F_NETNS_LOCAL;
 		}
diff --git a/net/wireless/core.c b/net/wireless/core.c
index a2785379df6e..fabb677b7d58 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -165,7 +165,7 @@ int cfg80211_switch_netns(struct cfg80211_registered_device *rdev,
 		if (!wdev->netdev)
 			continue;
 		wdev->netdev->features &= ~NETIF_F_NETNS_LOCAL;
-		err = dev_change_net_namespace(wdev->netdev, net, "wlan%d");
+		err = dev_change_net_namespace(wdev->netdev, net, "wlan%d", 0);
 		if (err)
 			break;
 		wdev->netdev->features |= NETIF_F_NETNS_LOCAL;
@@ -182,7 +182,7 @@ int cfg80211_switch_netns(struct cfg80211_registered_device *rdev,
 				continue;
 			wdev->netdev->features &= ~NETIF_F_NETNS_LOCAL;
 			err = dev_change_net_namespace(wdev->netdev, net,
-							"wlan%d");
+							"wlan%d", 0);
 			WARN_ON(err);
 			wdev->netdev->features |= NETIF_F_NETNS_LOCAL;
 		}
-- 
cgit v1.2.3


From 237c609f8744a8d5415f40a7ee731957934b0eef Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 1 Apr 2021 16:11:04 +0200
Subject: netfilter: nfnetlink: add and use nfnetlink_broadcast

This removes the only reference to net->nfnl outside of the nfnetlink
module.  This allows moving net->nfnl to the net_generic infra.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nfnetlink.h | 2 ++
 net/netfilter/nfnetlink.c           | 7 +++++++
 net/netfilter/nfnetlink_acct.c      | 3 +--
 3 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h
index 791d516e1e88..d4c14257db5d 100644
--- a/include/linux/netfilter/nfnetlink.h
+++ b/include/linux/netfilter/nfnetlink.h
@@ -51,6 +51,8 @@ int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid,
 		   unsigned int group, int echo, gfp_t flags);
 int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error);
 int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid);
+void nfnetlink_broadcast(struct net *net, struct sk_buff *skb, __u32 portid,
+			 __u32 group, gfp_t allocation);
 
 static inline u16 nfnl_msg_type(u8 subsys, u8 msg_type)
 {
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index d3df66a39b5e..06e106b3ed85 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -178,6 +178,13 @@ int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid)
 }
 EXPORT_SYMBOL_GPL(nfnetlink_unicast);
 
+void nfnetlink_broadcast(struct net *net, struct sk_buff *skb, __u32 portid,
+			 __u32 group, gfp_t allocation)
+{
+	netlink_broadcast(net->nfnl, skb, portid, group, allocation);
+}
+EXPORT_SYMBOL_GPL(nfnetlink_broadcast);
+
 /* Process one complete nfnetlink message. */
 static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
 			     struct netlink_ext_ack *extack)
diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c
index bb930f3b06c7..6895f31c5fbb 100644
--- a/net/netfilter/nfnetlink_acct.c
+++ b/net/netfilter/nfnetlink_acct.c
@@ -469,8 +469,7 @@ static void nfnl_overquota_report(struct net *net, struct nf_acct *nfacct)
 		kfree_skb(skb);
 		return;
 	}
-	netlink_broadcast(net->nfnl, skb, 0, NFNLGRP_ACCT_QUOTA,
-			  GFP_ATOMIC);
+	nfnetlink_broadcast(net, skb, 0, NFNLGRP_ACCT_QUOTA, GFP_ATOMIC);
 }
 
 int nfnl_acct_overquota(struct net *net, struct nf_acct *nfacct)
-- 
cgit v1.2.3


From d556435156b7970b8ce61b355df558a5168927cc Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 22 Mar 2021 11:21:38 +0100
Subject: jbd2: avoid -Wempty-body warnings

Building with 'make W=1' shows a harmless -Wempty-body warning:

fs/jbd2/recovery.c: In function 'fc_do_one_pass':
fs/jbd2/recovery.c:267:75: error: suggest braces around empty body in an 'if' statement [-Werror=empty-body]
  267 |                 jbd_debug(3, "Fast commit replay failed, err = %d\n", err);
      |                                                                           ^

Change the empty jbd_debug() macro to no_printk(), which avoids this
warning and adds format string checking.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20210322102152.95684-1-arnd@kernel.org
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 include/linux/jbd2.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index b9aa85081a40..db0e1920cb12 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -61,7 +61,7 @@ void __jbd2_debug(int level, const char *file, const char *func,
 #define jbd_debug(n, fmt, a...) \
 	__jbd2_debug((n), __FILE__, __func__, __LINE__, (fmt), ##a)
 #else
-#define jbd_debug(n, fmt, a...)    /**/
+#define jbd_debug(n, fmt, a...)  no_printk(fmt, ##a)
 #endif
 
 extern void *jbd2_alloc(size_t size, gfp_t flags);
-- 
cgit v1.2.3


From f4022062e5417ab7228e95aec1a8687059a19db7 Mon Sep 17 00:00:00 2001
From: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Date: Mon, 15 Mar 2021 16:56:46 +0000
Subject: soundwire: add static port mapping support

Some of the SoundWire device ports are statically mapped to Controller
ports during design; however, there is no way to expose this information
to the controller. Controllers like the Qualcomm ones use this info to
set up static bandwidth parameters for those ports.

A generic port allocation is not possible in these cases!
So this patch adds a new member m_port_map to struct sdw_slave to expose
this static map.

Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Link: https://lore.kernel.org/r/20210315165650.13392-2-srinivas.kandagatla@linaro.org
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/soundwire/sdw.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index 5ff9a8f37e91..ced07f8fde87 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -642,6 +642,7 @@ struct sdw_slave_ops {
  * @debugfs: Slave debugfs
  * @node: node for bus list
  * @port_ready: Port ready completion flag for each Slave port
+ * @m_port_map: static Master port map for each Slave port
  * @dev_num: Current Device Number, values can be 0 or dev_num_sticky
  * @dev_num_sticky: one-time static Device Number assigned by Bus
  * @probed: boolean tracking driver state
@@ -673,6 +674,7 @@ struct sdw_slave {
 #endif
 	struct list_head node;
 	struct completion port_ready[SDW_MAX_PORTS];
+	unsigned int m_port_map[SDW_MAX_PORTS];
 	enum sdw_clk_stop_mode curr_clk_stop_mode;
 	u16 dev_num;
 	u16 dev_num_sticky;
-- 
cgit v1.2.3


From 5befc7c26e5a98cd49789fb1beb52c62bd472dba Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Wed, 24 Mar 2021 16:18:05 -0700
Subject: nvme: implement non-mdts command limits

Commands that access LBA contents without a data transfer between the
host and the controller historically have not had a spec-defined upper
limit. The driver set the queue constraints for such commands to the max
data transfer size just to be safe, but this artificial constraint
frequently limits devices below their capabilities.

The NVMe Workgroup-ratified TP4040 defines how a controller may
advertise its non-MDTS limits. Use these if provided and default to
the current constraints if not. Since the Dataset Management command
limits are defined in logical blocks, but there is no namespace to tell
us the logical block size, the code defaults to the safe 512-byte size.

Signed-off-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/core.c | 106 ++++++++++++++++++++++++++++++++---------------
 drivers/nvme/host/nvme.h |   3 ++
 include/linux/nvme.h     |  10 +++++
 3 files changed, 85 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 3f3b985c9fa6..e37e2ecd574c 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1948,7 +1948,7 @@ static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns)
 	struct request_queue *queue = disk->queue;
 	u32 size = queue_logical_block_size(queue);
 
-	if (!(ctrl->oncs & NVME_CTRL_ONCS_DSM)) {
+	if (ctrl->max_discard_sectors == 0) {
 		blk_queue_flag_clear(QUEUE_FLAG_DISCARD, queue);
 		return;
 	}
@@ -1966,39 +1966,13 @@ static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns)
 	if (blk_queue_flag_test_and_set(QUEUE_FLAG_DISCARD, queue))
 		return;
 
-	blk_queue_max_discard_sectors(queue, UINT_MAX);
-	blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES);
+	blk_queue_max_discard_sectors(queue, ctrl->max_discard_sectors);
+	blk_queue_max_discard_segments(queue, ctrl->max_discard_segments);
 
 	if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
 		blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
 }
 
-static void nvme_config_write_zeroes(struct gendisk *disk, struct nvme_ns *ns)
-{
-	u64 max_blocks;
-
-	if (!(ns->ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) ||
-	    (ns->ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES))
-		return;
-	/*
-	 * Even though NVMe spec explicitly states that MDTS is not
-	 * applicable to the write-zeroes:- "The restriction does not apply to
-	 * commands that do not transfer data between the host and the
-	 * controller (e.g., Write Uncorrectable ro Write Zeroes command).".
-	 * In order to be more cautious use controller's max_hw_sectors value
-	 * to configure the maximum sectors for the write-zeroes which is
-	 * configured based on the controller's MDTS field in the
-	 * nvme_init_ctrl_finish() if available.
-	 */
-	if (ns->ctrl->max_hw_sectors == UINT_MAX)
-		max_blocks = (u64)USHRT_MAX + 1;
-	else
-		max_blocks = ns->ctrl->max_hw_sectors + 1;
-
-	blk_queue_max_write_zeroes_sectors(disk->queue,
-					   nvme_lba_to_sect(ns, max_blocks));
-}
-
 static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids)
 {
 	return !uuid_is_null(&ids->uuid) ||
@@ -2168,7 +2142,8 @@ static void nvme_update_disk_info(struct gendisk *disk,
 	set_capacity_and_notify(disk, capacity);
 
 	nvme_config_discard(disk, ns);
-	nvme_config_write_zeroes(disk, ns);
+	blk_queue_max_write_zeroes_sectors(disk->queue,
+					   ns->ctrl->max_zeroes_sectors);
 
 	set_disk_ro(disk, (id->nsattr & NVME_NS_ATTR_RO) ||
 		test_bit(NVME_NS_FORCE_RO, &ns->flags));
@@ -3072,14 +3047,72 @@ out:
 	return 0;
 }
 
+static inline u32 nvme_mps_to_sectors(struct nvme_ctrl *ctrl, u32 units)
+{
+	u32 page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12;
+
+	return 1 << (units + page_shift - 9);
+}
+
+static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl)
+{
+	struct nvme_command c = { };
+	struct nvme_id_ctrl_nvm *id;
+	int ret;
+
+	if (ctrl->oncs & NVME_CTRL_ONCS_DSM) {
+		ctrl->max_discard_sectors = UINT_MAX;
+		ctrl->max_discard_segments = NVME_DSM_MAX_RANGES;
+	} else {
+		ctrl->max_discard_sectors = 0;
+		ctrl->max_discard_segments = 0;
+	}
+
+	/*
+	 * Even though NVMe spec explicitly states that MDTS is not applicable
+	 * to the write-zeroes, we are cautious and limit the size to the
+	 * controllers max_hw_sectors value, which is based on the MDTS field
+	 * and possibly other limiting factors.
+	 */
+	if ((ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) &&
+	    !(ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES))
+		ctrl->max_zeroes_sectors = ctrl->max_hw_sectors;
+	else
+		ctrl->max_zeroes_sectors = 0;
+
+	if (nvme_ctrl_limited_cns(ctrl))
+		return 0;
+
+	id = kzalloc(sizeof(*id), GFP_KERNEL);
+	if (!id)
+		return 0;
+
+	c.identify.opcode = nvme_admin_identify;
+	c.identify.cns = NVME_ID_CNS_CS_CTRL;
+	c.identify.csi = NVME_CSI_NVM;
+
+	ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
+	if (ret)
+		goto free_data;
+
+	if (id->dmrl)
+		ctrl->max_discard_segments = id->dmrl;
+	if (id->dmrsl)
+		ctrl->max_discard_sectors = le32_to_cpu(id->dmrsl);
+	if (id->wzsl)
+		ctrl->max_zeroes_sectors = nvme_mps_to_sectors(ctrl, id->wzsl);
+
+free_data:
+	kfree(id);
+	return ret;
+}
+
 static int nvme_init_identify(struct nvme_ctrl *ctrl)
 {
 	struct nvme_id_ctrl *id;
-	int ret, page_shift;
 	u32 max_hw_sectors;
 	bool prev_apst_enabled;
-
-	page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12;
+	int ret;
 
 	ret = nvme_identify_ctrl(ctrl, &id);
 	if (ret) {
@@ -3136,7 +3169,7 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl)
 	atomic_set(&ctrl->abort_limit, id->acl + 1);
 	ctrl->vwc = id->vwc;
 	if (id->mdts)
-		max_hw_sectors = 1 << (id->mdts + page_shift - 9);
+		max_hw_sectors = nvme_mps_to_sectors(ctrl, id->mdts);
 	else
 		max_hw_sectors = UINT_MAX;
 	ctrl->max_hw_sectors =
@@ -3247,6 +3280,10 @@ int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl)
 	if (ret)
 		return ret;
 
+	ret = nvme_init_non_mdts_limits(ctrl);
+	if (ret < 0)
+		return ret;
+
 	ret = nvme_configure_apst(ctrl);
 	if (ret < 0)
 		return ret;
@@ -4808,6 +4845,7 @@ static inline void _nvme_check_size(void)
 	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
 	BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE);
 	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE);
+	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_nvm) != NVME_IDENTIFY_DATA_SIZE);
 	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
 	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
 	BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index b0863c59fac4..815c032a190e 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -276,6 +276,9 @@ struct nvme_ctrl {
 	u32 max_hw_sectors;
 	u32 max_segments;
 	u32 max_integrity_segments;
+	u32 max_discard_sectors;
+	u32 max_discard_segments;
+	u32 max_zeroes_sectors;
 #ifdef CONFIG_BLK_DEV_ZONED
 	u32 max_zone_append;
 #endif
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index b08787cd0881..edcbd60b88b9 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -405,6 +405,16 @@ struct nvme_id_ctrl_zns {
 	__u8	rsvd1[4095];
 };
 
+struct nvme_id_ctrl_nvm {
+	__u8	vsl;
+	__u8	wzsl;
+	__u8	wusl;
+	__u8	dmrl;
+	__le32	dmrsl;
+	__le64	dmsl;
+	__u8	rsvd16[4080];
+};
+
 enum {
 	NVME_ID_CNS_NS			= 0x00,
 	NVME_ID_CNS_CTRL		= 0x01,
-- 
cgit v1.2.3


From 6dd6954668035aa62b8c73d219d8c0adf6dc866a Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Tue, 6 Apr 2021 14:45:03 +0200
Subject: dt-bindings:iio:dac: update microchip,mcp4725.yaml reference

Changeset 6ced946a4bba ("dt-bindings:iio:dac:microchip,mcp4725 yaml conversion")
renamed: Documentation/devicetree/bindings/iio/dac/mcp4725.txt
to: Documentation/devicetree/bindings/iio/dac/microchip,mcp4725.yaml.

Update its cross-reference accordingly.

Fixes: 6ced946a4bba ("dt-bindings:iio:dac:microchip,mcp4725 yaml conversion")
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
 include/linux/iio/dac/mcp4725.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/iio/dac/mcp4725.h b/include/linux/iio/dac/mcp4725.h
index e9801c8d49c0..1f7e53c506b6 100644
--- a/include/linux/iio/dac/mcp4725.h
+++ b/include/linux/iio/dac/mcp4725.h
@@ -15,7 +15,7 @@
  * @vref_buffered: Controls buffering of the external reference voltage.
  *
  * Vref related settings are available only on MCP4756. See
- * Documentation/devicetree/bindings/iio/dac/mcp4725.txt for more information.
+ * Documentation/devicetree/bindings/iio/dac/microchip,mcp4725.yaml for more information.
  */
 struct mcp4725_platform_data {
 	bool use_vref;
-- 
cgit v1.2.3


From ce288e0535688cc3475a3c3d4d96624514c3550c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 31 Mar 2021 09:29:59 +0200
Subject: block: remove BLK_BOUNCE_ISA support

Remove the BLK_BOUNCE_ISA support now that all users are gone.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Link: https://lore.kernel.org/r/20210331073001.46776-7-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio-integrity.c     |   3 +-
 block/blk-map.c           |   4 +-
 block/blk-settings.c      |  11 ----
 block/blk.h               |   5 --
 block/bounce.c            | 124 ++++++++++------------------------------------
 block/scsi_ioctl.c        |   2 +-
 drivers/ata/libata-scsi.c |   3 +-
 include/linux/blkdev.h    |   7 ---
 mm/Kconfig                |   9 ++--
 9 files changed, 35 insertions(+), 133 deletions(-)

(limited to 'include/linux')

diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index dfa652122a2d..4b4eb8964a6f 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -204,7 +204,6 @@ bool bio_integrity_prep(struct bio *bio)
 {
 	struct bio_integrity_payload *bip;
 	struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
-	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
 	void *buf;
 	unsigned long start, end;
 	unsigned int len, nr_pages;
@@ -238,7 +237,7 @@ bool bio_integrity_prep(struct bio *bio)
 
 	/* Allocate kernel buffer for protection data */
 	len = intervals * bi->tuple_size;
-	buf = kmalloc(len, GFP_NOIO | q->bounce_gfp);
+	buf = kmalloc(len, GFP_NOIO);
 	status = BLK_STS_RESOURCE;
 	if (unlikely(buf == NULL)) {
 		printk(KERN_ERR "could not allocate integrity buffer\n");
diff --git a/block/blk-map.c b/block/blk-map.c
index 1ffef782fcf2..b62b52dcb61d 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -181,7 +181,7 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data,
 
 			i++;
 		} else {
-			page = alloc_page(rq->q->bounce_gfp | gfp_mask);
+			page = alloc_page(GFP_NOIO | gfp_mask);
 			if (!page) {
 				ret = -ENOMEM;
 				goto cleanup;
@@ -486,7 +486,7 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data,
 		if (bytes > len)
 			bytes = len;
 
-		page = alloc_page(q->bounce_gfp | gfp_mask);
+		page = alloc_page(GFP_NOIO | gfp_mask);
 		if (!page)
 			goto cleanup;
 
diff --git a/block/blk-settings.c b/block/blk-settings.c
index b4aa2f37fab6..f9937dd2810e 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -103,28 +103,17 @@ EXPORT_SYMBOL(blk_set_stacking_limits);
 void blk_queue_bounce_limit(struct request_queue *q, u64 max_addr)
 {
 	unsigned long b_pfn = max_addr >> PAGE_SHIFT;
-	int dma = 0;
 
-	q->bounce_gfp = GFP_NOIO;
 #if BITS_PER_LONG == 64
 	/*
 	 * Assume anything <= 4GB can be handled by IOMMU.  Actually
 	 * some IOMMUs can handle everything, but I don't know of a
 	 * way to test this here.
 	 */
-	if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT))
-		dma = 1;
 	q->limits.bounce_pfn = max(max_low_pfn, b_pfn);
 #else
-	if (b_pfn < blk_max_low_pfn)
-		dma = 1;
 	q->limits.bounce_pfn = b_pfn;
 #endif
-	if (dma) {
-		init_emergency_isa_pool();
-		q->bounce_gfp = GFP_NOIO | GFP_DMA;
-		q->limits.bounce_pfn = b_pfn;
-	}
 }
 EXPORT_SYMBOL(blk_queue_bounce_limit);
 
diff --git a/block/blk.h b/block/blk.h
index 3b53e44b967e..895c9f4a5182 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -312,13 +312,8 @@ static inline void blk_throtl_stat_add(struct request *rq, u64 time) { }
 #endif
 
 #ifdef CONFIG_BOUNCE
-extern int init_emergency_isa_pool(void);
 extern void blk_queue_bounce(struct request_queue *q, struct bio **bio);
 #else
-static inline int init_emergency_isa_pool(void)
-{
-	return 0;
-}
 static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio)
 {
 }
diff --git a/block/bounce.c b/block/bounce.c
index 6c441f4f1cd4..debd5b0bd318 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -29,7 +29,7 @@
 #define ISA_POOL_SIZE	16
 
 static struct bio_set bounce_bio_set, bounce_bio_split;
-static mempool_t page_pool, isa_page_pool;
+static mempool_t page_pool;
 
 static void init_bounce_bioset(void)
 {
@@ -89,41 +89,6 @@ static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
 
 #endif /* CONFIG_HIGHMEM */
 
-/*
- * allocate pages in the DMA region for the ISA pool
- */
-static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data)
-{
-	return mempool_alloc_pages(gfp_mask | GFP_DMA, data);
-}
-
-static DEFINE_MUTEX(isa_mutex);
-
-/*
- * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA
- * as the max address, so check if the pool has already been created.
- */
-int init_emergency_isa_pool(void)
-{
-	int ret;
-
-	mutex_lock(&isa_mutex);
-
-	if (mempool_initialized(&isa_page_pool)) {
-		mutex_unlock(&isa_mutex);
-		return 0;
-	}
-
-	ret = mempool_init(&isa_page_pool, ISA_POOL_SIZE, mempool_alloc_pages_isa,
-			   mempool_free_pages, (void *) 0);
-	BUG_ON(ret);
-
-	pr_info("isa pool size: %d pages\n", ISA_POOL_SIZE);
-	init_bounce_bioset();
-	mutex_unlock(&isa_mutex);
-	return 0;
-}
-
 /*
  * Simple bounce buffer support for highmem pages. Depending on the
  * queue gfp mask set, *to may or may not be a highmem page. kmap it
@@ -159,7 +124,7 @@ static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
 	}
 }
 
-static void bounce_end_io(struct bio *bio, mempool_t *pool)
+static void bounce_end_io(struct bio *bio)
 {
 	struct bio *bio_orig = bio->bi_private;
 	struct bio_vec *bvec, orig_vec;
@@ -173,7 +138,7 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool)
 		orig_vec = bio_iter_iovec(bio_orig, orig_iter);
 		if (bvec->bv_page != orig_vec.bv_page) {
 			dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
-			mempool_free(bvec->bv_page, pool);
+			mempool_free(bvec->bv_page, &page_pool);
 		}
 		bio_advance_iter(bio_orig, &orig_iter, orig_vec.bv_len);
 	}
@@ -185,33 +150,17 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool)
 
 static void bounce_end_io_write(struct bio *bio)
 {
-	bounce_end_io(bio, &page_pool);
-}
-
-static void bounce_end_io_write_isa(struct bio *bio)
-{
-
-	bounce_end_io(bio, &isa_page_pool);
+	bounce_end_io(bio);
 }
 
-static void __bounce_end_io_read(struct bio *bio, mempool_t *pool)
+static void bounce_end_io_read(struct bio *bio)
 {
 	struct bio *bio_orig = bio->bi_private;
 
 	if (!bio->bi_status)
 		copy_to_high_bio_irq(bio_orig, bio);
 
-	bounce_end_io(bio, pool);
-}
-
-static void bounce_end_io_read(struct bio *bio)
-{
-	__bounce_end_io_read(bio, &page_pool);
-}
-
-static void bounce_end_io_read_isa(struct bio *bio)
-{
-	__bounce_end_io_read(bio, &isa_page_pool);
+	bounce_end_io(bio);
 }
 
 static struct bio *bounce_clone_bio(struct bio *bio_src)
@@ -287,8 +236,8 @@ err_put:
 	return NULL;
 }
 
-static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
-			       mempool_t *pool)
+
+void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
 {
 	struct bio *bio;
 	int rw = bio_data_dir(*bio_orig);
@@ -298,6 +247,20 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
 	bool bounce = false;
 	int sectors = 0;
 
+	/*
+	 * Data-less bio, nothing to bounce
+	 */
+	if (!bio_has_data(*bio_orig))
+		return;
+
+	/*
+	 * Just check if the bounce pfn is equal to or bigger than the highest
+	 * pfn in the system -- in that case, don't waste time iterating over
+	 * bio segments
+	 */
+	if (q->limits.bounce_pfn >= blk_max_pfn)
+		return;
+
 	bio_for_each_segment(from, *bio_orig, iter) {
 		if (i++ < BIO_MAX_VECS)
 			sectors += from.bv_len >> 9;
@@ -327,7 +290,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
 		if (page_to_pfn(page) <= q->limits.bounce_pfn)
 			continue;
 
-		to->bv_page = mempool_alloc(pool, q->bounce_gfp);
+		to->bv_page = mempool_alloc(&page_pool, GFP_NOIO);
 		inc_zone_page_state(to->bv_page, NR_BOUNCE);
 
 		if (rw == WRITE) {
@@ -346,46 +309,11 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
 
 	bio->bi_flags |= (1 << BIO_BOUNCED);
 
-	if (pool == &page_pool) {
+	if (rw == READ)
+		bio->bi_end_io = bounce_end_io_read;
+	else
 		bio->bi_end_io = bounce_end_io_write;
-		if (rw == READ)
-			bio->bi_end_io = bounce_end_io_read;
-	} else {
-		bio->bi_end_io = bounce_end_io_write_isa;
-		if (rw == READ)
-			bio->bi_end_io = bounce_end_io_read_isa;
-	}
 
 	bio->bi_private = *bio_orig;
 	*bio_orig = bio;
 }
-
-void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
-{
-	mempool_t *pool;
-
-	/*
-	 * Data-less bio, nothing to bounce
-	 */
-	if (!bio_has_data(*bio_orig))
-		return;
-
-	/*
-	 * for non-isa bounce case, just check if the bounce pfn is equal
-	 * to or bigger than the highest pfn in the system -- in that case,
-	 * don't waste time iterating over bio segments
-	 */
-	if (!(q->bounce_gfp & GFP_DMA)) {
-		if (q->limits.bounce_pfn >= blk_max_pfn)
-			return;
-		pool = &page_pool;
-	} else {
-		BUG_ON(!mempool_initialized(&isa_page_pool));
-		pool = &isa_page_pool;
-	}
-
-	/*
-	 * slow path
-	 */
-	__blk_queue_bounce(q, bio_orig, pool);
-}
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 6599bac0a78c..1048b0925567 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -431,7 +431,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
 
 	bytes = max(in_len, out_len);
 	if (bytes) {
-		buffer = kzalloc(bytes, q->bounce_gfp | GFP_USER| __GFP_NOWARN);
+		buffer = kzalloc(bytes, GFP_NOIO | GFP_USER | __GFP_NOWARN);
 		if (!buffer)
 			return -ENOMEM;
 
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 48b8934970f3..fd8b6febbf70 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -1043,8 +1043,7 @@ int ata_scsi_dev_config(struct scsi_device *sdev, struct ata_device *dev)
 		blk_queue_max_segments(q, queue_max_segments(q) - 1);
 
 		sdev->dma_drain_len = ATAPI_MAX_DRAIN;
-		sdev->dma_drain_buf = kmalloc(sdev->dma_drain_len,
-				q->bounce_gfp | GFP_KERNEL);
+		sdev->dma_drain_buf = kmalloc(sdev->dma_drain_len, GFP_NOIO);
 		if (!sdev->dma_drain_buf) {
 			ata_dev_err(dev, "drain buffer allocation failed\n");
 			return -ENOMEM;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index bc6bc8383b43..0dbb72ea3735 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -436,11 +436,6 @@ struct request_queue {
 	 */
 	int			id;
 
-	/*
-	 * queue needs bounce pages for pages above this limit
-	 */
-	gfp_t			bounce_gfp;
-
 	spinlock_t		queue_lock;
 
 	/*
@@ -847,7 +842,6 @@ extern unsigned long blk_max_low_pfn, blk_max_pfn;
  *
  * BLK_BOUNCE_HIGH	: bounce all highmem pages
  * BLK_BOUNCE_ANY	: don't bounce anything
- * BLK_BOUNCE_ISA	: bounce pages above ISA DMA boundary
  */
 
 #if BITS_PER_LONG == 32
@@ -856,7 +850,6 @@ extern unsigned long blk_max_low_pfn, blk_max_pfn;
 #define BLK_BOUNCE_HIGH		-1ULL
 #endif
 #define BLK_BOUNCE_ANY		(-1ULL)
-#define BLK_BOUNCE_ISA		(DMA_BIT_MASK(24))
 
 /*
  * default timeout for SG_IO if none specified
diff --git a/mm/Kconfig b/mm/Kconfig
index 24c045b24b95..d0808a23e54b 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -283,12 +283,11 @@ config PHYS_ADDR_T_64BIT
 config BOUNCE
 	bool "Enable bounce buffers"
 	default y
-	depends on BLOCK && MMU && (ZONE_DMA || HIGHMEM)
+	depends on BLOCK && MMU && HIGHMEM
 	help
-	  Enable bounce buffers for devices that cannot access
-	  the full range of memory available to the CPU. Enabled
-	  by default when ZONE_DMA or HIGHMEM is selected, but you
-	  may say n to override this.
+	  Enable bounce buffers for devices that cannot access the full range of
+	  memory available to the CPU. Enabled by default when HIGHMEM is
+	  selected, but you may say n to override this.
 
 config VIRT_TO_BUS
 	bool
-- 
cgit v1.2.3


From 9bb33f24abbd0fa2fadad01ec75438d7cc239189 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 31 Mar 2021 09:30:00 +0200
Subject: block: refactor the bounce buffering code

Get rid of all the PFN arithmetic and just use an enum for the two
remaining options, and use PageHighMem for the actual bounce decision.

Add a fast path to entirely avoid the call for the common case of a queue
not using the legacy bouncing code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Link: https://lore.kernel.org/r/20210331073001.46776-8-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c       |  6 ++----
 block/blk-settings.c   | 42 ++++++++----------------------------------
 block/blk.h            | 16 ++++++++++++----
 block/bounce.c         | 35 +++++------------------------------
 include/linux/blkdev.h | 29 +++++++++++------------------
 5 files changed, 38 insertions(+), 90 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index fc60ff208497..9bcdae93f6d4 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1161,10 +1161,8 @@ static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q,
 	}
 
 	/*
-	 * queue's settings related to segment counting like q->bounce_pfn
-	 * may differ from that of other stacking queues.
-	 * Recalculate it to check the request correctly on this queue's
-	 * limitation.
+	 * The queue settings related to segment counting may differ from the
+	 * original queue.
 	 */
 	rq->nr_phys_segments = blk_recalc_rq_segments(rq);
 	if (rq->nr_phys_segments > queue_max_segments(q)) {
diff --git a/block/blk-settings.c b/block/blk-settings.c
index f9937dd2810e..9c009090c4b5 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -7,7 +7,6 @@
 #include <linux/init.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
-#include <linux/memblock.h>	/* for max_pfn/max_low_pfn */
 #include <linux/gcd.h>
 #include <linux/lcm.h>
 #include <linux/jiffies.h>
@@ -17,11 +16,6 @@
 #include "blk.h"
 #include "blk-wbt.h"
 
-unsigned long blk_max_low_pfn;
-EXPORT_SYMBOL(blk_max_low_pfn);
-
-unsigned long blk_max_pfn;
-
 void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout)
 {
 	q->rq_timeout = timeout;
@@ -55,7 +49,7 @@ void blk_set_default_limits(struct queue_limits *lim)
 	lim->discard_alignment = 0;
 	lim->discard_misaligned = 0;
 	lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
-	lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
+	lim->bounce = BLK_BOUNCE_NONE;
 	lim->alignment_offset = 0;
 	lim->io_opt = 0;
 	lim->misaligned = 0;
@@ -92,28 +86,16 @@ EXPORT_SYMBOL(blk_set_stacking_limits);
 /**
  * blk_queue_bounce_limit - set bounce buffer limit for queue
  * @q: the request queue for the device
- * @max_addr: the maximum address the device can handle
+ * @bounce: bounce limit to enforce
  *
  * Description:
- *    Different hardware can have different requirements as to what pages
- *    it can do I/O directly to. A low level driver can call
- *    blk_queue_bounce_limit to have lower memory pages allocated as bounce
- *    buffers for doing I/O to pages residing above @max_addr.
+ *    Force bouncing for ISA DMA ranges or highmem.
+ *
+ *    DEPRECATED, don't use in new code.
  **/
-void blk_queue_bounce_limit(struct request_queue *q, u64 max_addr)
+void blk_queue_bounce_limit(struct request_queue *q, enum blk_bounce bounce)
 {
-	unsigned long b_pfn = max_addr >> PAGE_SHIFT;
-
-#if BITS_PER_LONG == 64
-	/*
-	 * Assume anything <= 4GB can be handled by IOMMU.  Actually
-	 * some IOMMUs can handle everything, but I don't know of a
-	 * way to test this here.
-	 */
-	q->limits.bounce_pfn = max(max_low_pfn, b_pfn);
-#else
-	q->limits.bounce_pfn = b_pfn;
-#endif
+	q->limits.bounce = bounce;
 }
 EXPORT_SYMBOL(blk_queue_bounce_limit);
 
@@ -536,7 +518,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 					b->max_write_zeroes_sectors);
 	t->max_zone_append_sectors = min(t->max_zone_append_sectors,
 					b->max_zone_append_sectors);
-	t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn);
+	t->bounce = max(t->bounce, b->bounce);
 
 	t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
 					    b->seg_boundary_mask);
@@ -916,11 +898,3 @@ void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model)
 	}
 }
 EXPORT_SYMBOL_GPL(blk_queue_set_zoned);
-
-static int __init blk_settings_init(void)
-{
-	blk_max_low_pfn = max_low_pfn - 1;
-	blk_max_pfn = max_pfn - 1;
-	return 0;
-}
-subsys_initcall(blk_settings_init);
diff --git a/block/blk.h b/block/blk.h
index 895c9f4a5182..8f4337c5a9e6 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -6,6 +6,7 @@
 #include <linux/blk-mq.h>
 #include <linux/part_stat.h>
 #include <linux/blk-crypto.h>
+#include <linux/memblock.h>	/* for max_pfn/max_low_pfn */
 #include <xen/xen.h>
 #include "blk-crypto-internal.h"
 #include "blk-mq.h"
@@ -311,13 +312,20 @@ static inline void blk_throtl_bio_endio(struct bio *bio) { }
 static inline void blk_throtl_stat_add(struct request *rq, u64 time) { }
 #endif
 
-#ifdef CONFIG_BOUNCE
-extern void blk_queue_bounce(struct request_queue *q, struct bio **bio);
-#else
+void __blk_queue_bounce(struct request_queue *q, struct bio **bio);
+
+static inline bool blk_queue_may_bounce(struct request_queue *q)
+{
+	return IS_ENABLED(CONFIG_BOUNCE) &&
+		q->limits.bounce == BLK_BOUNCE_HIGH &&
+		max_low_pfn >= max_pfn;
+}
+
 static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio)
 {
+	if (unlikely(blk_queue_may_bounce(q) && bio_has_data(*bio)))
+		__blk_queue_bounce(q, bio);	
 }
-#endif /* CONFIG_BOUNCE */
 
 #ifdef CONFIG_BLK_CGROUP_IOLATENCY
 extern int blk_iolatency_init(struct request_queue *q);
diff --git a/block/bounce.c b/block/bounce.c
index debd5b0bd318..6bafc0d1f867 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -18,7 +18,6 @@
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/highmem.h>
-#include <linux/memblock.h>
 #include <linux/printk.h>
 #include <asm/tlbflush.h>
 
@@ -49,11 +48,11 @@ static void init_bounce_bioset(void)
 	bounce_bs_setup = true;
 }
 
-#if defined(CONFIG_HIGHMEM)
 static __init int init_emergency_pool(void)
 {
 	int ret;
-#if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG)
+
+#ifndef CONFIG_MEMORY_HOTPLUG
 	if (max_pfn <= max_low_pfn)
 		return 0;
 #endif
@@ -67,9 +66,7 @@ static __init int init_emergency_pool(void)
 }
 
 __initcall(init_emergency_pool);
-#endif
 
-#ifdef CONFIG_HIGHMEM
 /*
  * highmem version, map in to vec
  */
@@ -82,13 +79,6 @@ static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
 	kunmap_atomic(vto);
 }
 
-#else /* CONFIG_HIGHMEM */
-
-#define bounce_copy_vec(to, vfrom)	\
-	memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len)
-
-#endif /* CONFIG_HIGHMEM */
-
 /*
  * Simple bounce buffer support for highmem pages. Depending on the
  * queue gfp mask set, *to may or may not be a highmem page. kmap it
@@ -236,8 +226,7 @@ err_put:
 	return NULL;
 }
 
-
-void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
+void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
 {
 	struct bio *bio;
 	int rw = bio_data_dir(*bio_orig);
@@ -247,24 +236,10 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
 	bool bounce = false;
 	int sectors = 0;
 
-	/*
-	 * Data-less bio, nothing to bounce
-	 */
-	if (!bio_has_data(*bio_orig))
-		return;
-
-	/*
-	 * Just check if the bounce pfn is equal to or bigger than the highest
-	 * pfn in the system -- in that case, don't waste time iterating over
-	 * bio segments
-	 */
-	if (q->limits.bounce_pfn >= blk_max_pfn)
-		return;
-
 	bio_for_each_segment(from, *bio_orig, iter) {
 		if (i++ < BIO_MAX_VECS)
 			sectors += from.bv_len >> 9;
-		if (page_to_pfn(from.bv_page) > q->limits.bounce_pfn)
+		if (PageHighMem(from.bv_page))
 			bounce = true;
 	}
 	if (!bounce)
@@ -287,7 +262,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
 	for (i = 0, to = bio->bi_io_vec; i < bio->bi_vcnt; to++, i++) {
 		struct page *page = to->bv_page;
 
-		if (page_to_pfn(page) <= q->limits.bounce_pfn)
+		if (!PageHighMem(page))
 			continue;
 
 		to->bv_page = mempool_alloc(&page_pool, GFP_NOIO);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0dbb72ea3735..55cc8b96c844 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -313,8 +313,17 @@ enum blk_zoned_model {
 	BLK_ZONED_HM,		/* Host-managed zoned block device */
 };
 
+/*
+ * BLK_BOUNCE_NONE:	never bounce (default)
+ * BLK_BOUNCE_HIGH:	bounce all highmem pages
+ */
+enum blk_bounce {
+	BLK_BOUNCE_NONE,
+	BLK_BOUNCE_HIGH,
+};
+
 struct queue_limits {
-	unsigned long		bounce_pfn;
+	enum blk_bounce		bounce;
 	unsigned long		seg_boundary_mask;
 	unsigned long		virt_boundary_mask;
 
@@ -835,22 +844,6 @@ static inline unsigned int blk_queue_depth(struct request_queue *q)
 	return q->nr_requests;
 }
 
-extern unsigned long blk_max_low_pfn, blk_max_pfn;
-
-/*
- * standard bounce addresses:
- *
- * BLK_BOUNCE_HIGH	: bounce all highmem pages
- * BLK_BOUNCE_ANY	: don't bounce anything
- */
-
-#if BITS_PER_LONG == 32
-#define BLK_BOUNCE_HIGH		((u64)blk_max_low_pfn << PAGE_SHIFT)
-#else
-#define BLK_BOUNCE_HIGH		-1ULL
-#endif
-#define BLK_BOUNCE_ANY		(-1ULL)
-
 /*
  * default timeout for SG_IO if none specified
  */
@@ -1134,7 +1127,7 @@ extern void blk_abort_request(struct request *);
  * Access functions for manipulating queue properties
  */
 extern void blk_cleanup_queue(struct request_queue *);
-extern void blk_queue_bounce_limit(struct request_queue *, u64);
+void blk_queue_bounce_limit(struct request_queue *q, enum blk_bounce limit);
 extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int);
 extern void blk_queue_chunk_sectors(struct request_queue *, unsigned int);
 extern void blk_queue_max_segments(struct request_queue *, unsigned short);
-- 
cgit v1.2.3


From 393bb12e00580aaa23356504eed38d8f5571153a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 31 Mar 2021 09:30:01 +0200
Subject: block: stop calling blk_queue_bounce for passthrough requests

Instead of overloading the passthrough fast path with the deprecated
block layer bounce buffering, let the users that combine an old
undermaintained driver with a highmem system pay the price by always
falling back to copies in that case.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Link: https://lore.kernel.org/r/20210331073001.46776-9-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-map.c                    | 116 ++++++++++---------------------------
 block/bounce.c                     |  11 +---
 drivers/nvme/host/lightnvm.c       |   2 +-
 drivers/target/target_core_pscsi.c |   4 +-
 include/linux/blkdev.h             |   2 +-
 5 files changed, 36 insertions(+), 99 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-map.c b/block/blk-map.c
index b62b52dcb61d..dac78376acc8 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -123,7 +123,6 @@ static int bio_uncopy_user(struct bio *bio)
 			bio_free_pages(bio);
 	}
 	kfree(bmd);
-	bio_put(bio);
 	return ret;
 }
 
@@ -132,7 +131,7 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data,
 {
 	struct bio_map_data *bmd;
 	struct page *page;
-	struct bio *bio, *bounce_bio;
+	struct bio *bio;
 	int i = 0, ret;
 	int nr_pages;
 	unsigned int len = iter->count;
@@ -218,16 +217,9 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data,
 
 	bio->bi_private = bmd;
 
-	bounce_bio = bio;
-	ret = blk_rq_append_bio(rq, &bounce_bio);
+	ret = blk_rq_append_bio(rq, bio);
 	if (ret)
 		goto cleanup;
-
-	/*
-	 * We link the bounce buffer in and could have to traverse it later, so
-	 * we have to get a ref to prevent it from being freed
-	 */
-	bio_get(bounce_bio);
 	return 0;
 cleanup:
 	if (!map_data)
@@ -242,7 +234,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
 		gfp_t gfp_mask)
 {
 	unsigned int max_sectors = queue_max_hw_sectors(rq->q);
-	struct bio *bio, *bounce_bio;
+	struct bio *bio;
 	int ret;
 	int j;
 
@@ -304,49 +296,17 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
 			break;
 	}
 
-	/*
-	 * Subtle: if we end up needing to bounce a bio, it would normally
-	 * disappear when its bi_end_io is run.  However, we need the original
-	 * bio for the unmap, so grab an extra reference to it
-	 */
-	bio_get(bio);
-
-	bounce_bio = bio;
-	ret = blk_rq_append_bio(rq, &bounce_bio);
+	ret = blk_rq_append_bio(rq, bio);
 	if (ret)
-		goto out_put_orig;
-
-	/*
-	 * We link the bounce buffer in and could have to traverse it
-	 * later, so we have to get a ref to prevent it from being freed
-	 */
-	bio_get(bounce_bio);
+		goto out_unmap;
 	return 0;
 
- out_put_orig:
-	bio_put(bio);
  out_unmap:
 	bio_release_pages(bio, false);
 	bio_put(bio);
 	return ret;
 }
 
-/**
- *	bio_unmap_user	-	unmap a bio
- *	@bio:		the bio being unmapped
- *
- *	Unmap a bio previously mapped by bio_map_user_iov(). Must be called from
- *	process context.
- *
- *	bio_unmap_user() may sleep.
- */
-static void bio_unmap_user(struct bio *bio)
-{
-	bio_release_pages(bio, bio_data_dir(bio) == READ);
-	bio_put(bio);
-	bio_put(bio);
-}
-
 static void bio_invalidate_vmalloc_pages(struct bio *bio)
 {
 #ifdef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
@@ -519,33 +479,27 @@ cleanup:
  * Append a bio to a passthrough request.  Only works if the bio can be merged
  * into the request based on the driver constraints.
  */
-int blk_rq_append_bio(struct request *rq, struct bio **bio)
+int blk_rq_append_bio(struct request *rq, struct bio *bio)
 {
-	struct bio *orig_bio = *bio;
 	struct bvec_iter iter;
 	struct bio_vec bv;
 	unsigned int nr_segs = 0;
 
-	blk_queue_bounce(rq->q, bio);
+	if (WARN_ON_ONCE(rq->q->limits.bounce != BLK_BOUNCE_NONE))
+		return -EINVAL;
 
-	bio_for_each_bvec(bv, *bio, iter)
+	bio_for_each_bvec(bv, bio, iter)
 		nr_segs++;
 
 	if (!rq->bio) {
-		blk_rq_bio_prep(rq, *bio, nr_segs);
+		blk_rq_bio_prep(rq, bio, nr_segs);
 	} else {
-		if (!ll_back_merge_fn(rq, *bio, nr_segs)) {
-			if (orig_bio != *bio) {
-				bio_put(*bio);
-				*bio = orig_bio;
-			}
+		if (!ll_back_merge_fn(rq, bio, nr_segs))
 			return -EINVAL;
-		}
-
-		rq->biotail->bi_next = *bio;
-		rq->biotail = *bio;
-		rq->__data_len += (*bio)->bi_iter.bi_size;
-		bio_crypt_free_ctx(*bio);
+		rq->biotail->bi_next = bio;
+		rq->biotail = bio;
+		rq->__data_len += (bio)->bi_iter.bi_size;
+		bio_crypt_free_ctx(bio);
 	}
 
 	return 0;
@@ -566,12 +520,6 @@ EXPORT_SYMBOL(blk_rq_append_bio);
  *
  *    A matching blk_rq_unmap_user() must be issued at the end of I/O, while
  *    still in process context.
- *
- *    Note: The mapped bio may need to be bounced through blk_queue_bounce()
- *    before being submitted to the device, as pages mapped may be out of
- *    reach. It's the callers responsibility to make sure this happens. The
- *    original bio must be passed back in to blk_rq_unmap_user() for proper
- *    unmapping.
  */
 int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
 			struct rq_map_data *map_data,
@@ -588,6 +536,8 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
 
 	if (map_data)
 		copy = true;
+	else if (blk_queue_may_bounce(q))
+		copy = true;
 	else if (iov_iter_alignment(iter) & align)
 		copy = true;
 	else if (queue_virt_boundary(q))
@@ -641,25 +591,21 @@ EXPORT_SYMBOL(blk_rq_map_user);
  */
 int blk_rq_unmap_user(struct bio *bio)
 {
-	struct bio *mapped_bio;
+	struct bio *next_bio;
 	int ret = 0, ret2;
 
 	while (bio) {
-		mapped_bio = bio;
-		if (unlikely(bio_flagged(bio, BIO_BOUNCED)))
-			mapped_bio = bio->bi_private;
-
 		if (bio->bi_private) {
-			ret2 = bio_uncopy_user(mapped_bio);
+			ret2 = bio_uncopy_user(bio);
 			if (ret2 && !ret)
 				ret = ret2;
 		} else {
-			bio_unmap_user(mapped_bio);
+			bio_release_pages(bio, bio_data_dir(bio) == READ);
 		}
 
-		mapped_bio = bio;
+		next_bio = bio;
 		bio = bio->bi_next;
-		bio_put(mapped_bio);
+		bio_put(next_bio);
 	}
 
 	return ret;
@@ -684,7 +630,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
 {
 	int reading = rq_data_dir(rq) == READ;
 	unsigned long addr = (unsigned long) kbuf;
-	struct bio *bio, *orig_bio;
+	struct bio *bio;
 	int ret;
 
 	if (len > (queue_max_hw_sectors(q) << 9))
@@ -692,7 +638,8 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
 	if (!len || !kbuf)
 		return -EINVAL;
 
-	if (!blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf))
+	if (!blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf) ||
+	    blk_queue_may_bounce(q))
 		bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading);
 	else
 		bio = bio_map_kern(q, kbuf, len, gfp_mask);
@@ -703,14 +650,9 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
 	bio->bi_opf &= ~REQ_OP_MASK;
 	bio->bi_opf |= req_op(rq);
 
-	orig_bio = bio;
-	ret = blk_rq_append_bio(rq, &bio);
-	if (unlikely(ret)) {
-		/* request is too big */
-		bio_put(orig_bio);
-		return ret;
-	}
-
-	return 0;
+	ret = blk_rq_append_bio(rq, bio);
+	if (unlikely(ret))
+		bio_put(bio);
+	return ret;
 }
 EXPORT_SYMBOL(blk_rq_map_kern);
diff --git a/block/bounce.c b/block/bounce.c
index 6bafc0d1f867..94081e013c58 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -180,12 +180,8 @@ static struct bio *bounce_clone_bio(struct bio *bio_src)
 	 *    asking for trouble and would force extra work on
 	 *    __bio_clone_fast() anyways.
 	 */
-	if (bio_is_passthrough(bio_src))
-		bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL,
-				  bio_segments(bio_src));
-	else
-		bio = bio_alloc_bioset(GFP_NOIO, bio_segments(bio_src),
-				       &bounce_bio_set);
+	bio = bio_alloc_bioset(GFP_NOIO, bio_segments(bio_src),
+			       &bounce_bio_set);
 	bio->bi_bdev		= bio_src->bi_bdev;
 	if (bio_flagged(bio_src, BIO_REMAPPED))
 		bio_set_flag(bio, BIO_REMAPPED);
@@ -245,8 +241,7 @@ void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
 	if (!bounce)
 		return;
 
-	if (!bio_is_passthrough(*bio_orig) &&
-	    sectors < bio_sectors(*bio_orig)) {
+	if (sectors < bio_sectors(*bio_orig)) {
 		bio = bio_split(*bio_orig, sectors, GFP_NOIO, &bounce_bio_split);
 		bio_chain(bio, *bio_orig);
 		submit_bio_noacct(*bio_orig);
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index b705988629f2..f6ca2fbb711e 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -660,7 +660,7 @@ static struct request *nvme_nvm_alloc_request(struct request_queue *q,
 	rq->cmd_flags &= ~REQ_FAILFAST_DRIVER;
 
 	if (rqd->bio)
-		blk_rq_append_bio(rq, &rqd->bio);
+		blk_rq_append_bio(rq, rqd->bio);
 	else
 		rq->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);
 
diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c
index 3cbc074992bc..7df4a9c9c7ff 100644
--- a/drivers/target/target_core_pscsi.c
+++ b/drivers/target/target_core_pscsi.c
@@ -911,7 +911,7 @@ new_bio:
 					" %d i: %d bio: %p, allocating another"
 					" bio\n", bio->bi_vcnt, i, bio);
 
-				rc = blk_rq_append_bio(req, &bio);
+				rc = blk_rq_append_bio(req, bio);
 				if (rc) {
 					pr_err("pSCSI: failed to append bio\n");
 					goto fail;
@@ -930,7 +930,7 @@ new_bio:
 	}
 
 	if (bio) {
-		rc = blk_rq_append_bio(req, &bio);
+		rc = blk_rq_append_bio(req, bio);
 		if (rc) {
 			pr_err("pSCSI: failed to append bio\n");
 			goto fail;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 55cc8b96c844..d5d320da51f8 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -909,7 +909,7 @@ extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
 extern void blk_rq_unprep_clone(struct request *rq);
 extern blk_status_t blk_insert_cloned_request(struct request_queue *q,
 				     struct request *rq);
-extern int blk_rq_append_bio(struct request *rq, struct bio **bio);
+int blk_rq_append_bio(struct request *rq, struct bio *bio);
 extern void blk_queue_split(struct bio **);
 extern int scsi_verify_blk_ioctl(struct block_device *, unsigned int);
 extern int scsi_cmd_blk_ioctl(struct block_device *, fmode_t,
-- 
cgit v1.2.3


From 0bfc6a4ea63c2adac71a824397ef48f28dbc5e47 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Tue, 30 Mar 2021 09:53:05 -0600
Subject: vfio: Split creation of a vfio_device into init and register ops

This makes the struct vfio_device part of the public interface so it
can be used with container_of and so forth, as is typical for a Linux
subsystem.

This is the first step to bring some type-safety to the vfio interface by
allowing the replacement of 'void *' and 'struct device *' inputs with a
simple and clear 'struct vfio_device *'.

For now the self-allocating vfio_add_group_dev() interface is kept so each
user can be updated as a separate patch.

The expected usage pattern is

  driver core probe() function:
     my_device = kzalloc(sizeof(*my_device));
     vfio_init_group_dev(&my_device->vdev, dev, ops, my_device);
     /* other driver specific prep */
     vfio_register_group_dev(&my_device->vdev);
     dev_set_drvdata(dev, my_device);

  driver core remove() function:
     my_device = dev_get_drvdata(dev);
     vfio_unregister_group_dev(&my_device->vdev);
     /* other driver specific tear down */
     kfree(my_device);

Allowing the driver to be able to use the drvdata and vfio_device to go
to/from its own data.

The pattern also makes it clear that vfio_register_group_dev() must be
last in the sequence, as once it is called the core code can immediately
start calling ops. The init/register gap is provided to allow for the
driver to do setup before ops can be called and thus avoid races.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Liu Yi L <yi.l.liu@intel.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Message-Id: <3-v3-225de1400dfc+4e074-vfio1_jgg@nvidia.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 Documentation/driver-api/vfio.rst |  31 ++++++----
 drivers/vfio/vfio.c               | 125 ++++++++++++++++++++------------------
 include/linux/vfio.h              |  16 +++++
 3 files changed, 99 insertions(+), 73 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/driver-api/vfio.rst b/Documentation/driver-api/vfio.rst
index f1a4d3c3ba0b..d3a02300913a 100644
--- a/Documentation/driver-api/vfio.rst
+++ b/Documentation/driver-api/vfio.rst
@@ -249,18 +249,23 @@ VFIO bus driver API
 
 VFIO bus drivers, such as vfio-pci make use of only a few interfaces
 into VFIO core.  When devices are bound and unbound to the driver,
-the driver should call vfio_add_group_dev() and vfio_del_group_dev()
-respectively::
-
-	extern int vfio_add_group_dev(struct device *dev,
-				      const struct vfio_device_ops *ops,
-				      void *device_data);
-
-	extern void *vfio_del_group_dev(struct device *dev);
-
-vfio_add_group_dev() indicates to the core to begin tracking the
-iommu_group of the specified dev and register the dev as owned by
-a VFIO bus driver.  The driver provides an ops structure for callbacks
+the driver should call vfio_register_group_dev() and
+vfio_unregister_group_dev() respectively::
+
+	void vfio_init_group_dev(struct vfio_device *device,
+				struct device *dev,
+				const struct vfio_device_ops *ops,
+				void *device_data);
+	int vfio_register_group_dev(struct vfio_device *device);
+	void vfio_unregister_group_dev(struct vfio_device *device);
+
+The driver should embed the vfio_device in its own structure and call
+vfio_init_group_dev() to pre-configure it before going to registration.
+vfio_register_group_dev() indicates to the core to begin tracking the
+iommu_group of the specified dev and register the dev as owned by a VFIO bus
+driver. Once vfio_register_group_dev() returns it is possible for userspace to
+start accessing the driver, thus the driver should ensure it is completely
+ready before calling it. The driver provides an ops structure for callbacks
 similar to a file operations structure::
 
 	struct vfio_device_ops {
@@ -276,7 +281,7 @@ similar to a file operations structure::
 	};
 
 Each function is passed the device_data that was originally registered
-in the vfio_add_group_dev() call above.  This allows the bus driver
+in the vfio_register_group_dev() call above.  This allows the bus driver
 an easy place to store its opaque, private data.  The open/release
 callbacks are issued when a new file descriptor is created for a
 device (via VFIO_GROUP_GET_DEVICE_FD).  The ioctl interface provides
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 32660e8a69ae..2ea430de505b 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -89,16 +89,6 @@ struct vfio_group {
 	struct blocking_notifier_head	notifier;
 };
 
-struct vfio_device {
-	refcount_t			refcount;
-	struct completion		comp;
-	struct device			*dev;
-	const struct vfio_device_ops	*ops;
-	struct vfio_group		*group;
-	struct list_head		group_next;
-	void				*device_data;
-};
-
 #ifdef CONFIG_VFIO_NOIOMMU
 static bool noiommu __read_mostly;
 module_param_named(enable_unsafe_noiommu_mode,
@@ -532,35 +522,6 @@ static struct vfio_group *vfio_group_get_from_dev(struct device *dev)
 /**
  * Device objects - create, release, get, put, search
  */
-static
-struct vfio_device *vfio_group_create_device(struct vfio_group *group,
-					     struct device *dev,
-					     const struct vfio_device_ops *ops,
-					     void *device_data)
-{
-	struct vfio_device *device;
-
-	device = kzalloc(sizeof(*device), GFP_KERNEL);
-	if (!device)
-		return ERR_PTR(-ENOMEM);
-
-	refcount_set(&device->refcount, 1);
-	init_completion(&device->comp);
-	device->dev = dev;
-	/* Our reference on group is moved to the device */
-	device->group = group;
-	device->ops = ops;
-	device->device_data = device_data;
-	dev_set_drvdata(dev, device);
-
-	mutex_lock(&group->device_lock);
-	list_add(&device->group_next, &group->device_list);
-	group->dev_counter++;
-	mutex_unlock(&group->device_lock);
-
-	return device;
-}
-
 /* Device reference always implies a group reference */
 void vfio_device_put(struct vfio_device *device)
 {
@@ -779,14 +740,23 @@ static int vfio_iommu_group_notifier(struct notifier_block *nb,
 /**
  * VFIO driver API
  */
-int vfio_add_group_dev(struct device *dev,
-		       const struct vfio_device_ops *ops, void *device_data)
+void vfio_init_group_dev(struct vfio_device *device, struct device *dev,
+			 const struct vfio_device_ops *ops, void *device_data)
+{
+	init_completion(&device->comp);
+	device->dev = dev;
+	device->ops = ops;
+	device->device_data = device_data;
+}
+EXPORT_SYMBOL_GPL(vfio_init_group_dev);
+
+int vfio_register_group_dev(struct vfio_device *device)
 {
+	struct vfio_device *existing_device;
 	struct iommu_group *iommu_group;
 	struct vfio_group *group;
-	struct vfio_device *device;
 
-	iommu_group = iommu_group_get(dev);
+	iommu_group = iommu_group_get(device->dev);
 	if (!iommu_group)
 		return -EINVAL;
 
@@ -805,21 +775,50 @@ int vfio_add_group_dev(struct device *dev,
 		iommu_group_put(iommu_group);
 	}
 
-	device = vfio_group_get_device(group, dev);
-	if (device) {
-		dev_WARN(dev, "Device already exists on group %d\n",
+	existing_device = vfio_group_get_device(group, device->dev);
+	if (existing_device) {
+		dev_WARN(device->dev, "Device already exists on group %d\n",
 			 iommu_group_id(iommu_group));
-		vfio_device_put(device);
+		vfio_device_put(existing_device);
 		vfio_group_put(group);
 		return -EBUSY;
 	}
 
-	device = vfio_group_create_device(group, dev, ops, device_data);
-	if (IS_ERR(device)) {
-		vfio_group_put(group);
-		return PTR_ERR(device);
-	}
+	/* Our reference on group is moved to the device */
+	device->group = group;
+
+	/* Refcounting can't start until the driver calls register */
+	refcount_set(&device->refcount, 1);
+
+	mutex_lock(&group->device_lock);
+	list_add(&device->group_next, &group->device_list);
+	group->dev_counter++;
+	mutex_unlock(&group->device_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(vfio_register_group_dev);
+
+int vfio_add_group_dev(struct device *dev, const struct vfio_device_ops *ops,
+		       void *device_data)
+{
+	struct vfio_device *device;
+	int ret;
+
+	device = kzalloc(sizeof(*device), GFP_KERNEL);
+	if (!device)
+		return -ENOMEM;
+
+	vfio_init_group_dev(device, dev, ops, device_data);
+	ret = vfio_register_group_dev(device);
+	if (ret)
+		goto err_kfree;
+	dev_set_drvdata(dev, device);
 	return 0;
+
+err_kfree:
+	kfree(device);
+	return ret;
 }
 EXPORT_SYMBOL_GPL(vfio_add_group_dev);
 
@@ -887,11 +886,9 @@ EXPORT_SYMBOL_GPL(vfio_device_data);
 /*
  * Decrement the device reference count and wait for the device to be
  * removed.  Open file descriptors for the device... */
-void *vfio_del_group_dev(struct device *dev)
+void vfio_unregister_group_dev(struct vfio_device *device)
 {
-	struct vfio_device *device = dev_get_drvdata(dev);
 	struct vfio_group *group = device->group;
-	void *device_data = device->device_data;
 	struct vfio_unbound_dev *unbound;
 	unsigned int i = 0;
 	bool interrupted = false;
@@ -908,7 +905,7 @@ void *vfio_del_group_dev(struct device *dev)
 	 */
 	unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
 	if (unbound) {
-		unbound->dev = dev;
+		unbound->dev = device->dev;
 		mutex_lock(&group->unbound_lock);
 		list_add(&unbound->unbound_next, &group->unbound_list);
 		mutex_unlock(&group->unbound_lock);
@@ -919,7 +916,7 @@ void *vfio_del_group_dev(struct device *dev)
 	rc = try_wait_for_completion(&device->comp);
 	while (rc <= 0) {
 		if (device->ops->request)
-			device->ops->request(device_data, i++);
+			device->ops->request(device->device_data, i++);
 
 		if (interrupted) {
 			rc = wait_for_completion_timeout(&device->comp,
@@ -929,7 +926,7 @@ void *vfio_del_group_dev(struct device *dev)
 				&device->comp, HZ * 10);
 			if (rc < 0) {
 				interrupted = true;
-				dev_warn(dev,
+				dev_warn(device->dev,
 					 "Device is currently in use, task"
 					 " \"%s\" (%d) "
 					 "blocked until device is released",
@@ -960,11 +957,19 @@ void *vfio_del_group_dev(struct device *dev)
 	if (list_empty(&group->device_list))
 		wait_event(group->container_q, !group->container);
 
-	/* Matches the get in vfio_group_create_device() */
+	/* Matches the get in vfio_register_group_dev() */
 	vfio_group_put(group);
+}
+EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
+
+void *vfio_del_group_dev(struct device *dev)
+{
+	struct vfio_device *device = dev_get_drvdata(dev);
+	void *device_data = device->device_data;
+
+	vfio_unregister_group_dev(device);
 	dev_set_drvdata(dev, NULL);
 	kfree(device);
-
 	return device_data;
 }
 EXPORT_SYMBOL_GPL(vfio_del_group_dev);
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index b7e18bde5aa8..ad8b579d67d3 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -15,6 +15,18 @@
 #include <linux/poll.h>
 #include <uapi/linux/vfio.h>
 
+struct vfio_device {
+	struct device *dev;
+	const struct vfio_device_ops *ops;
+	struct vfio_group *group;
+
+	/* Members below here are private, not for driver use */
+	refcount_t refcount;
+	struct completion comp;
+	struct list_head group_next;
+	void *device_data;
+};
+
 /**
  * struct vfio_device_ops - VFIO bus driver device callbacks
  *
@@ -48,11 +60,15 @@ struct vfio_device_ops {
 extern struct iommu_group *vfio_iommu_group_get(struct device *dev);
 extern void vfio_iommu_group_put(struct iommu_group *group, struct device *dev);
 
+void vfio_init_group_dev(struct vfio_device *device, struct device *dev,
+			 const struct vfio_device_ops *ops, void *device_data);
+int vfio_register_group_dev(struct vfio_device *device);
 extern int vfio_add_group_dev(struct device *dev,
 			      const struct vfio_device_ops *ops,
 			      void *device_data);
 
 extern void *vfio_del_group_dev(struct device *dev);
+void vfio_unregister_group_dev(struct vfio_device *device);
 extern struct vfio_device *vfio_device_get_from_dev(struct device *dev);
 extern void vfio_device_put(struct vfio_device *device);
 extern void *vfio_device_data(struct vfio_device *device);
-- 
cgit v1.2.3


From 1ae1b20f6f2c67659c963e5fe58f9b4a47df9f12 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Tue, 30 Mar 2021 09:53:07 -0600
Subject: vfio/mdev: Use vfio_init/register/unregister_group_dev

mdev gets little benefit because it doesn't actually do anything; however,
it is the last user, so move the vfio_init/register/unregister_group_dev()
code here for now.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Liu Yi L <yi.l.liu@intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Message-Id: <10-v3-225de1400dfc+4e074-vfio1_jgg@nvidia.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/mdev/vfio_mdev.c | 20 ++++++++++++++++++--
 drivers/vfio/vfio.c           | 39 ++-------------------------------------
 include/linux/vfio.h          |  5 -----
 3 files changed, 20 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/vfio/mdev/vfio_mdev.c b/drivers/vfio/mdev/vfio_mdev.c
index b52eea128549..4043cc91f952 100644
--- a/drivers/vfio/mdev/vfio_mdev.c
+++ b/drivers/vfio/mdev/vfio_mdev.c
@@ -124,13 +124,29 @@ static const struct vfio_device_ops vfio_mdev_dev_ops = {
 static int vfio_mdev_probe(struct device *dev)
 {
 	struct mdev_device *mdev = to_mdev_device(dev);
+	struct vfio_device *vdev;
+	int ret;
 
-	return vfio_add_group_dev(dev, &vfio_mdev_dev_ops, mdev);
+	vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
+	if (!vdev)
+		return -ENOMEM;
+
+	vfio_init_group_dev(vdev, &mdev->dev, &vfio_mdev_dev_ops, mdev);
+	ret = vfio_register_group_dev(vdev);
+	if (ret) {
+		kfree(vdev);
+		return ret;
+	}
+	dev_set_drvdata(&mdev->dev, vdev);
+	return 0;
 }
 
 static void vfio_mdev_remove(struct device *dev)
 {
-	vfio_del_group_dev(dev);
+	struct vfio_device *vdev = dev_get_drvdata(dev);
+
+	vfio_unregister_group_dev(vdev);
+	kfree(vdev);
 }
 
 static struct mdev_driver vfio_mdev_driver = {
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 2ea430de505b..180b4ab02d11 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -99,8 +99,8 @@ MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  Thi
 /*
  * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
  * and remove functions, any use cases other than acquiring the first
- * reference for the purpose of calling vfio_add_group_dev() or removing
- * that symmetric reference after vfio_del_group_dev() should use the raw
+ * reference for the purpose of calling vfio_register_group_dev() or removing
+ * that symmetric reference after vfio_unregister_group_dev() should use the raw
  * iommu_group_{get,put} functions.  In particular, vfio_iommu_group_put()
  * removes the device from the dummy group and cannot be nested.
  */
@@ -799,29 +799,6 @@ int vfio_register_group_dev(struct vfio_device *device)
 }
 EXPORT_SYMBOL_GPL(vfio_register_group_dev);
 
-int vfio_add_group_dev(struct device *dev, const struct vfio_device_ops *ops,
-		       void *device_data)
-{
-	struct vfio_device *device;
-	int ret;
-
-	device = kzalloc(sizeof(*device), GFP_KERNEL);
-	if (!device)
-		return -ENOMEM;
-
-	vfio_init_group_dev(device, dev, ops, device_data);
-	ret = vfio_register_group_dev(device);
-	if (ret)
-		goto err_kfree;
-	dev_set_drvdata(dev, device);
-	return 0;
-
-err_kfree:
-	kfree(device);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(vfio_add_group_dev);
-
 /**
  * Get a reference to the vfio_device for a device.  Even if the
  * caller thinks they own the device, they could be racing with a
@@ -962,18 +939,6 @@ void vfio_unregister_group_dev(struct vfio_device *device)
 }
 EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
 
-void *vfio_del_group_dev(struct device *dev)
-{
-	struct vfio_device *device = dev_get_drvdata(dev);
-	void *device_data = device->device_data;
-
-	vfio_unregister_group_dev(device);
-	dev_set_drvdata(dev, NULL);
-	kfree(device);
-	return device_data;
-}
-EXPORT_SYMBOL_GPL(vfio_del_group_dev);
-
 /**
  * VFIO base fd, /dev/vfio/vfio
  */
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index ad8b579d67d3..4995faf51efe 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -63,11 +63,6 @@ extern void vfio_iommu_group_put(struct iommu_group *group, struct device *dev);
 void vfio_init_group_dev(struct vfio_device *device, struct device *dev,
 			 const struct vfio_device_ops *ops, void *device_data);
 int vfio_register_group_dev(struct vfio_device *device);
-extern int vfio_add_group_dev(struct device *dev,
-			      const struct vfio_device_ops *ops,
-			      void *device_data);
-
-extern void *vfio_del_group_dev(struct device *dev);
 void vfio_unregister_group_dev(struct vfio_device *device);
 extern struct vfio_device *vfio_device_get_from_dev(struct device *dev);
 extern void vfio_device_put(struct vfio_device *device);
-- 
cgit v1.2.3


From 6df62c5b05f4ad6876815ea8b8775905a090224a Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Tue, 30 Mar 2021 09:53:08 -0600
Subject: vfio: Make vfio_device_ops pass a 'struct vfio_device *' instead of
 'void *'

This is the standard kernel pattern: the ops associated with a struct are
passed the struct pointer for type safety. The expected design is to use
container_of to cleanly go from the subsystem level type to the driver
level type without having any type erasure in a void *.

Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Message-Id: <12-v3-225de1400dfc+4e074-vfio1_jgg@nvidia.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 Documentation/driver-api/vfio.rst            | 18 ++++++-----
 drivers/vfio/fsl-mc/vfio_fsl_mc.c            | 36 ++++++++++++---------
 drivers/vfio/mdev/vfio_mdev.c                | 33 ++++++++++---------
 drivers/vfio/pci/vfio_pci.c                  | 47 +++++++++++++++++-----------
 drivers/vfio/platform/vfio_platform_common.c | 33 +++++++++++--------
 drivers/vfio/vfio.c                          | 20 ++++++------
 include/linux/vfio.h                         | 16 +++++-----
 7 files changed, 117 insertions(+), 86 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/driver-api/vfio.rst b/Documentation/driver-api/vfio.rst
index d3a02300913a..3337f337293a 100644
--- a/Documentation/driver-api/vfio.rst
+++ b/Documentation/driver-api/vfio.rst
@@ -269,20 +269,22 @@ ready before calling it. The driver provides an ops structure for callbacks
 similar to a file operations structure::
 
 	struct vfio_device_ops {
-		int	(*open)(void *device_data);
-		void	(*release)(void *device_data);
-		ssize_t	(*read)(void *device_data, char __user *buf,
+		int	(*open)(struct vfio_device *vdev);
+		void	(*release)(struct vfio_device *vdev);
+		ssize_t	(*read)(struct vfio_device *vdev, char __user *buf,
 				size_t count, loff_t *ppos);
-		ssize_t	(*write)(void *device_data, const char __user *buf,
+		ssize_t	(*write)(struct vfio_device *vdev,
+				 const char __user *buf,
 				 size_t size, loff_t *ppos);
-		long	(*ioctl)(void *device_data, unsigned int cmd,
+		long	(*ioctl)(struct vfio_device *vdev, unsigned int cmd,
 				 unsigned long arg);
-		int	(*mmap)(void *device_data, struct vm_area_struct *vma);
+		int	(*mmap)(struct vfio_device *vdev,
+				struct vm_area_struct *vma);
 	};
 
-Each function is passed the device_data that was originally registered
+Each function is passed the vdev that was originally registered
 in the vfio_register_group_dev() call above.  This allows the bus driver
-an easy place to store its opaque, private data.  The open/release
+to obtain its private data using container_of().  The open/release
 callbacks are issued when a new file descriptor is created for a
 device (via VFIO_GROUP_GET_DEVICE_FD).  The ioctl interface provides
 a direct pass through for VFIO_DEVICE_* ioctls.  The read/write/mmap
diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc.c b/drivers/vfio/fsl-mc/vfio_fsl_mc.c
index ad6c6feeeb4b..45f397c04a89 100644
--- a/drivers/vfio/fsl-mc/vfio_fsl_mc.c
+++ b/drivers/vfio/fsl-mc/vfio_fsl_mc.c
@@ -135,9 +135,10 @@ static void vfio_fsl_mc_regions_cleanup(struct vfio_fsl_mc_device *vdev)
 	kfree(vdev->regions);
 }
 
-static int vfio_fsl_mc_open(void *device_data)
+static int vfio_fsl_mc_open(struct vfio_device *core_vdev)
 {
-	struct vfio_fsl_mc_device *vdev = device_data;
+	struct vfio_fsl_mc_device *vdev =
+		container_of(core_vdev, struct vfio_fsl_mc_device, vdev);
 	int ret;
 
 	if (!try_module_get(THIS_MODULE))
@@ -161,9 +162,10 @@ err_reg_init:
 	return ret;
 }
 
-static void vfio_fsl_mc_release(void *device_data)
+static void vfio_fsl_mc_release(struct vfio_device *core_vdev)
 {
-	struct vfio_fsl_mc_device *vdev = device_data;
+	struct vfio_fsl_mc_device *vdev =
+		container_of(core_vdev, struct vfio_fsl_mc_device, vdev);
 	int ret;
 
 	mutex_lock(&vdev->reflck->lock);
@@ -197,11 +199,12 @@ static void vfio_fsl_mc_release(void *device_data)
 	module_put(THIS_MODULE);
 }
 
-static long vfio_fsl_mc_ioctl(void *device_data, unsigned int cmd,
-			      unsigned long arg)
+static long vfio_fsl_mc_ioctl(struct vfio_device *core_vdev,
+			      unsigned int cmd, unsigned long arg)
 {
 	unsigned long minsz;
-	struct vfio_fsl_mc_device *vdev = device_data;
+	struct vfio_fsl_mc_device *vdev =
+		container_of(core_vdev, struct vfio_fsl_mc_device, vdev);
 	struct fsl_mc_device *mc_dev = vdev->mc_dev;
 
 	switch (cmd) {
@@ -327,10 +330,11 @@ static long vfio_fsl_mc_ioctl(void *device_data, unsigned int cmd,
 	}
 }
 
-static ssize_t vfio_fsl_mc_read(void *device_data, char __user *buf,
+static ssize_t vfio_fsl_mc_read(struct vfio_device *core_vdev, char __user *buf,
 				size_t count, loff_t *ppos)
 {
-	struct vfio_fsl_mc_device *vdev = device_data;
+	struct vfio_fsl_mc_device *vdev =
+		container_of(core_vdev, struct vfio_fsl_mc_device, vdev);
 	unsigned int index = VFIO_FSL_MC_OFFSET_TO_INDEX(*ppos);
 	loff_t off = *ppos & VFIO_FSL_MC_OFFSET_MASK;
 	struct fsl_mc_device *mc_dev = vdev->mc_dev;
@@ -404,10 +408,12 @@ static int vfio_fsl_mc_send_command(void __iomem *ioaddr, uint64_t *cmd_data)
 	return 0;
 }
 
-static ssize_t vfio_fsl_mc_write(void *device_data, const char __user *buf,
-				 size_t count, loff_t *ppos)
+static ssize_t vfio_fsl_mc_write(struct vfio_device *core_vdev,
+				 const char __user *buf, size_t count,
+				 loff_t *ppos)
 {
-	struct vfio_fsl_mc_device *vdev = device_data;
+	struct vfio_fsl_mc_device *vdev =
+		container_of(core_vdev, struct vfio_fsl_mc_device, vdev);
 	unsigned int index = VFIO_FSL_MC_OFFSET_TO_INDEX(*ppos);
 	loff_t off = *ppos & VFIO_FSL_MC_OFFSET_MASK;
 	struct fsl_mc_device *mc_dev = vdev->mc_dev;
@@ -468,9 +474,11 @@ static int vfio_fsl_mc_mmap_mmio(struct vfio_fsl_mc_region region,
 			       size, vma->vm_page_prot);
 }
 
-static int vfio_fsl_mc_mmap(void *device_data, struct vm_area_struct *vma)
+static int vfio_fsl_mc_mmap(struct vfio_device *core_vdev,
+			    struct vm_area_struct *vma)
 {
-	struct vfio_fsl_mc_device *vdev = device_data;
+	struct vfio_fsl_mc_device *vdev =
+		container_of(core_vdev, struct vfio_fsl_mc_device, vdev);
 	struct fsl_mc_device *mc_dev = vdev->mc_dev;
 	unsigned int index;
 
diff --git a/drivers/vfio/mdev/vfio_mdev.c b/drivers/vfio/mdev/vfio_mdev.c
index 4043cc91f952..11b3e15403ba 100644
--- a/drivers/vfio/mdev/vfio_mdev.c
+++ b/drivers/vfio/mdev/vfio_mdev.c
@@ -21,10 +21,11 @@
 #define DRIVER_AUTHOR   "NVIDIA Corporation"
 #define DRIVER_DESC     "VFIO based driver for Mediated device"
 
-static int vfio_mdev_open(void *device_data)
+static int vfio_mdev_open(struct vfio_device *core_vdev)
 {
-	struct mdev_device *mdev = device_data;
+	struct mdev_device *mdev = to_mdev_device(core_vdev->dev);
 	struct mdev_parent *parent = mdev->parent;
+
 	int ret;
 
 	if (unlikely(!parent->ops->open))
@@ -40,9 +41,9 @@ static int vfio_mdev_open(void *device_data)
 	return ret;
 }
 
-static void vfio_mdev_release(void *device_data)
+static void vfio_mdev_release(struct vfio_device *core_vdev)
 {
-	struct mdev_device *mdev = device_data;
+	struct mdev_device *mdev = to_mdev_device(core_vdev->dev);
 	struct mdev_parent *parent = mdev->parent;
 
 	if (likely(parent->ops->release))
@@ -51,10 +52,10 @@ static void vfio_mdev_release(void *device_data)
 	module_put(THIS_MODULE);
 }
 
-static long vfio_mdev_unlocked_ioctl(void *device_data,
+static long vfio_mdev_unlocked_ioctl(struct vfio_device *core_vdev,
 				     unsigned int cmd, unsigned long arg)
 {
-	struct mdev_device *mdev = device_data;
+	struct mdev_device *mdev = to_mdev_device(core_vdev->dev);
 	struct mdev_parent *parent = mdev->parent;
 
 	if (unlikely(!parent->ops->ioctl))
@@ -63,10 +64,10 @@ static long vfio_mdev_unlocked_ioctl(void *device_data,
 	return parent->ops->ioctl(mdev, cmd, arg);
 }
 
-static ssize_t vfio_mdev_read(void *device_data, char __user *buf,
+static ssize_t vfio_mdev_read(struct vfio_device *core_vdev, char __user *buf,
 			      size_t count, loff_t *ppos)
 {
-	struct mdev_device *mdev = device_data;
+	struct mdev_device *mdev = to_mdev_device(core_vdev->dev);
 	struct mdev_parent *parent = mdev->parent;
 
 	if (unlikely(!parent->ops->read))
@@ -75,10 +76,11 @@ static ssize_t vfio_mdev_read(void *device_data, char __user *buf,
 	return parent->ops->read(mdev, buf, count, ppos);
 }
 
-static ssize_t vfio_mdev_write(void *device_data, const char __user *buf,
-			       size_t count, loff_t *ppos)
+static ssize_t vfio_mdev_write(struct vfio_device *core_vdev,
+			       const char __user *buf, size_t count,
+			       loff_t *ppos)
 {
-	struct mdev_device *mdev = device_data;
+	struct mdev_device *mdev = to_mdev_device(core_vdev->dev);
 	struct mdev_parent *parent = mdev->parent;
 
 	if (unlikely(!parent->ops->write))
@@ -87,9 +89,10 @@ static ssize_t vfio_mdev_write(void *device_data, const char __user *buf,
 	return parent->ops->write(mdev, buf, count, ppos);
 }
 
-static int vfio_mdev_mmap(void *device_data, struct vm_area_struct *vma)
+static int vfio_mdev_mmap(struct vfio_device *core_vdev,
+			  struct vm_area_struct *vma)
 {
-	struct mdev_device *mdev = device_data;
+	struct mdev_device *mdev = to_mdev_device(core_vdev->dev);
 	struct mdev_parent *parent = mdev->parent;
 
 	if (unlikely(!parent->ops->mmap))
@@ -98,9 +101,9 @@ static int vfio_mdev_mmap(void *device_data, struct vm_area_struct *vma)
 	return parent->ops->mmap(mdev, vma);
 }
 
-static void vfio_mdev_request(void *device_data, unsigned int count)
+static void vfio_mdev_request(struct vfio_device *core_vdev, unsigned int count)
 {
-	struct mdev_device *mdev = device_data;
+	struct mdev_device *mdev = to_mdev_device(core_vdev->dev);
 	struct mdev_parent *parent = mdev->parent;
 
 	if (parent->ops->request)
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index a0ac20a499cf..5f1a782d1c65 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -553,9 +553,10 @@ static void vfio_pci_vf_token_user_add(struct vfio_pci_device *vdev, int val)
 	vfio_device_put(pf_dev);
 }
 
-static void vfio_pci_release(void *device_data)
+static void vfio_pci_release(struct vfio_device *core_vdev)
 {
-	struct vfio_pci_device *vdev = device_data;
+	struct vfio_pci_device *vdev =
+		container_of(core_vdev, struct vfio_pci_device, vdev);
 
 	mutex_lock(&vdev->reflck->lock);
 
@@ -581,9 +582,10 @@ static void vfio_pci_release(void *device_data)
 	module_put(THIS_MODULE);
 }
 
-static int vfio_pci_open(void *device_data)
+static int vfio_pci_open(struct vfio_device *core_vdev)
 {
-	struct vfio_pci_device *vdev = device_data;
+	struct vfio_pci_device *vdev =
+		container_of(core_vdev, struct vfio_pci_device, vdev);
 	int ret = 0;
 
 	if (!try_module_get(THIS_MODULE))
@@ -797,10 +799,11 @@ struct vfio_devices {
 	int max_index;
 };
 
-static long vfio_pci_ioctl(void *device_data,
+static long vfio_pci_ioctl(struct vfio_device *core_vdev,
 			   unsigned int cmd, unsigned long arg)
 {
-	struct vfio_pci_device *vdev = device_data;
+	struct vfio_pci_device *vdev =
+		container_of(core_vdev, struct vfio_pci_device, vdev);
 	unsigned long minsz;
 
 	if (cmd == VFIO_DEVICE_GET_INFO) {
@@ -1402,11 +1405,10 @@ hot_reset_release:
 	return -ENOTTY;
 }
 
-static ssize_t vfio_pci_rw(void *device_data, char __user *buf,
+static ssize_t vfio_pci_rw(struct vfio_pci_device *vdev, char __user *buf,
 			   size_t count, loff_t *ppos, bool iswrite)
 {
 	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
-	struct vfio_pci_device *vdev = device_data;
 
 	if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
 		return -EINVAL;
@@ -1434,22 +1436,28 @@ static ssize_t vfio_pci_rw(void *device_data, char __user *buf,
 	return -EINVAL;
 }
 
-static ssize_t vfio_pci_read(void *device_data, char __user *buf,
+static ssize_t vfio_pci_read(struct vfio_device *core_vdev, char __user *buf,
 			     size_t count, loff_t *ppos)
 {
+	struct vfio_pci_device *vdev =
+		container_of(core_vdev, struct vfio_pci_device, vdev);
+
 	if (!count)
 		return 0;
 
-	return vfio_pci_rw(device_data, buf, count, ppos, false);
+	return vfio_pci_rw(vdev, buf, count, ppos, false);
 }
 
-static ssize_t vfio_pci_write(void *device_data, const char __user *buf,
+static ssize_t vfio_pci_write(struct vfio_device *core_vdev, const char __user *buf,
 			      size_t count, loff_t *ppos)
 {
+	struct vfio_pci_device *vdev =
+		container_of(core_vdev, struct vfio_pci_device, vdev);
+
 	if (!count)
 		return 0;
 
-	return vfio_pci_rw(device_data, (char __user *)buf, count, ppos, true);
+	return vfio_pci_rw(vdev, (char __user *)buf, count, ppos, true);
 }
 
 /* Return 1 on zap and vma_lock acquired, 0 on contention (only with @try) */
@@ -1646,9 +1654,10 @@ static const struct vm_operations_struct vfio_pci_mmap_ops = {
 	.fault = vfio_pci_mmap_fault,
 };
 
-static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
+static int vfio_pci_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma)
 {
-	struct vfio_pci_device *vdev = device_data;
+	struct vfio_pci_device *vdev =
+		container_of(core_vdev, struct vfio_pci_device, vdev);
 	struct pci_dev *pdev = vdev->pdev;
 	unsigned int index;
 	u64 phys_len, req_len, pgoff, req_start;
@@ -1714,9 +1723,10 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
 	return 0;
 }
 
-static void vfio_pci_request(void *device_data, unsigned int count)
+static void vfio_pci_request(struct vfio_device *core_vdev, unsigned int count)
 {
-	struct vfio_pci_device *vdev = device_data;
+	struct vfio_pci_device *vdev =
+		container_of(core_vdev, struct vfio_pci_device, vdev);
 	struct pci_dev *pdev = vdev->pdev;
 
 	mutex_lock(&vdev->igate);
@@ -1830,9 +1840,10 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_device *vdev,
 
 #define VF_TOKEN_ARG "vf_token="
 
-static int vfio_pci_match(void *device_data, char *buf)
+static int vfio_pci_match(struct vfio_device *core_vdev, char *buf)
 {
-	struct vfio_pci_device *vdev = device_data;
+	struct vfio_pci_device *vdev =
+		container_of(core_vdev, struct vfio_pci_device, vdev);
 	bool vf_token = false;
 	uuid_t uuid;
 	int ret;
diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c
index 6eb749250ee4..f5f6b537084a 100644
--- a/drivers/vfio/platform/vfio_platform_common.c
+++ b/drivers/vfio/platform/vfio_platform_common.c
@@ -218,9 +218,10 @@ static int vfio_platform_call_reset(struct vfio_platform_device *vdev,
 	return -EINVAL;
 }
 
-static void vfio_platform_release(void *device_data)
+static void vfio_platform_release(struct vfio_device *core_vdev)
 {
-	struct vfio_platform_device *vdev = device_data;
+	struct vfio_platform_device *vdev =
+		container_of(core_vdev, struct vfio_platform_device, vdev);
 
 	mutex_lock(&driver_lock);
 
@@ -244,9 +245,10 @@ static void vfio_platform_release(void *device_data)
 	module_put(vdev->parent_module);
 }
 
-static int vfio_platform_open(void *device_data)
+static int vfio_platform_open(struct vfio_device *core_vdev)
 {
-	struct vfio_platform_device *vdev = device_data;
+	struct vfio_platform_device *vdev =
+		container_of(core_vdev, struct vfio_platform_device, vdev);
 	int ret;
 
 	if (!try_module_get(vdev->parent_module))
@@ -293,10 +295,12 @@ err_reg:
 	return ret;
 }
 
-static long vfio_platform_ioctl(void *device_data,
+static long vfio_platform_ioctl(struct vfio_device *core_vdev,
 				unsigned int cmd, unsigned long arg)
 {
-	struct vfio_platform_device *vdev = device_data;
+	struct vfio_platform_device *vdev =
+		container_of(core_vdev, struct vfio_platform_device, vdev);
+
 	unsigned long minsz;
 
 	if (cmd == VFIO_DEVICE_GET_INFO) {
@@ -455,10 +459,11 @@ err:
 	return -EFAULT;
 }
 
-static ssize_t vfio_platform_read(void *device_data, char __user *buf,
-				  size_t count, loff_t *ppos)
+static ssize_t vfio_platform_read(struct vfio_device *core_vdev,
+				  char __user *buf, size_t count, loff_t *ppos)
 {
-	struct vfio_platform_device *vdev = device_data;
+	struct vfio_platform_device *vdev =
+		container_of(core_vdev, struct vfio_platform_device, vdev);
 	unsigned int index = VFIO_PLATFORM_OFFSET_TO_INDEX(*ppos);
 	loff_t off = *ppos & VFIO_PLATFORM_OFFSET_MASK;
 
@@ -531,10 +536,11 @@ err:
 	return -EFAULT;
 }
 
-static ssize_t vfio_platform_write(void *device_data, const char __user *buf,
+static ssize_t vfio_platform_write(struct vfio_device *core_vdev, const char __user *buf,
 				   size_t count, loff_t *ppos)
 {
-	struct vfio_platform_device *vdev = device_data;
+	struct vfio_platform_device *vdev =
+		container_of(core_vdev, struct vfio_platform_device, vdev);
 	unsigned int index = VFIO_PLATFORM_OFFSET_TO_INDEX(*ppos);
 	loff_t off = *ppos & VFIO_PLATFORM_OFFSET_MASK;
 
@@ -573,9 +579,10 @@ static int vfio_platform_mmap_mmio(struct vfio_platform_region region,
 			       req_len, vma->vm_page_prot);
 }
 
-static int vfio_platform_mmap(void *device_data, struct vm_area_struct *vma)
+static int vfio_platform_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma)
 {
-	struct vfio_platform_device *vdev = device_data;
+	struct vfio_platform_device *vdev =
+		container_of(core_vdev, struct vfio_platform_device, vdev);
 	unsigned int index;
 
 	index = vma->vm_pgoff >> (VFIO_PLATFORM_OFFSET_SHIFT - PAGE_SHIFT);
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 180b4ab02d11..e6f5109fba48 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -832,7 +832,7 @@ static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
 		int ret;
 
 		if (it->ops->match) {
-			ret = it->ops->match(it->device_data, buf);
+			ret = it->ops->match(it, buf);
 			if (ret < 0) {
 				device = ERR_PTR(ret);
 				break;
@@ -893,7 +893,7 @@ void vfio_unregister_group_dev(struct vfio_device *device)
 	rc = try_wait_for_completion(&device->comp);
 	while (rc <= 0) {
 		if (device->ops->request)
-			device->ops->request(device->device_data, i++);
+			device->ops->request(device, i++);
 
 		if (interrupted) {
 			rc = wait_for_completion_timeout(&device->comp,
@@ -1379,7 +1379,7 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
 	if (IS_ERR(device))
 		return PTR_ERR(device);
 
-	ret = device->ops->open(device->device_data);
+	ret = device->ops->open(device);
 	if (ret) {
 		vfio_device_put(device);
 		return ret;
@@ -1391,7 +1391,7 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
 	 */
 	ret = get_unused_fd_flags(O_CLOEXEC);
 	if (ret < 0) {
-		device->ops->release(device->device_data);
+		device->ops->release(device);
 		vfio_device_put(device);
 		return ret;
 	}
@@ -1401,7 +1401,7 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
 	if (IS_ERR(filep)) {
 		put_unused_fd(ret);
 		ret = PTR_ERR(filep);
-		device->ops->release(device->device_data);
+		device->ops->release(device);
 		vfio_device_put(device);
 		return ret;
 	}
@@ -1558,7 +1558,7 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep)
 {
 	struct vfio_device *device = filep->private_data;
 
-	device->ops->release(device->device_data);
+	device->ops->release(device);
 
 	vfio_group_try_dissolve_container(device->group);
 
@@ -1575,7 +1575,7 @@ static long vfio_device_fops_unl_ioctl(struct file *filep,
 	if (unlikely(!device->ops->ioctl))
 		return -EINVAL;
 
-	return device->ops->ioctl(device->device_data, cmd, arg);
+	return device->ops->ioctl(device, cmd, arg);
 }
 
 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
@@ -1586,7 +1586,7 @@ static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
 	if (unlikely(!device->ops->read))
 		return -EINVAL;
 
-	return device->ops->read(device->device_data, buf, count, ppos);
+	return device->ops->read(device, buf, count, ppos);
 }
 
 static ssize_t vfio_device_fops_write(struct file *filep,
@@ -1598,7 +1598,7 @@ static ssize_t vfio_device_fops_write(struct file *filep,
 	if (unlikely(!device->ops->write))
 		return -EINVAL;
 
-	return device->ops->write(device->device_data, buf, count, ppos);
+	return device->ops->write(device, buf, count, ppos);
 }
 
 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
@@ -1608,7 +1608,7 @@ static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
 	if (unlikely(!device->ops->mmap))
 		return -EINVAL;
 
-	return device->ops->mmap(device->device_data, vma);
+	return device->ops->mmap(device, vma);
 }
 
 static const struct file_operations vfio_device_fops = {
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 4995faf51efe..784c34c0a287 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -44,17 +44,17 @@ struct vfio_device {
  */
 struct vfio_device_ops {
 	char	*name;
-	int	(*open)(void *device_data);
-	void	(*release)(void *device_data);
-	ssize_t	(*read)(void *device_data, char __user *buf,
+	int	(*open)(struct vfio_device *vdev);
+	void	(*release)(struct vfio_device *vdev);
+	ssize_t	(*read)(struct vfio_device *vdev, char __user *buf,
 			size_t count, loff_t *ppos);
-	ssize_t	(*write)(void *device_data, const char __user *buf,
+	ssize_t	(*write)(struct vfio_device *vdev, const char __user *buf,
 			 size_t count, loff_t *size);
-	long	(*ioctl)(void *device_data, unsigned int cmd,
+	long	(*ioctl)(struct vfio_device *vdev, unsigned int cmd,
 			 unsigned long arg);
-	int	(*mmap)(void *device_data, struct vm_area_struct *vma);
-	void	(*request)(void *device_data, unsigned int count);
-	int	(*match)(void *device_data, char *buf);
+	int	(*mmap)(struct vfio_device *vdev, struct vm_area_struct *vma);
+	void	(*request)(struct vfio_device *vdev, unsigned int count);
+	int	(*match)(struct vfio_device *vdev, char *buf);
 };
 
 extern struct iommu_group *vfio_iommu_group_get(struct device *dev);
-- 
cgit v1.2.3


From 1e04ec14204dec28131855d8dd160c3d55d12797 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Tue, 30 Mar 2021 09:53:08 -0600
Subject: vfio: Remove device_data from the vfio bus driver API

There are no longer any users, so it can go away. Everything is using
container_of now.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Message-Id: <14-v3-225de1400dfc+4e074-vfio1_jgg@nvidia.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 Documentation/driver-api/vfio.rst            |  3 +--
 drivers/vfio/fsl-mc/vfio_fsl_mc.c            |  5 +++--
 drivers/vfio/mdev/vfio_mdev.c                |  2 +-
 drivers/vfio/pci/vfio_pci.c                  |  2 +-
 drivers/vfio/platform/vfio_platform_common.c |  2 +-
 drivers/vfio/vfio.c                          | 12 +-----------
 include/linux/vfio.h                         |  4 +---
 7 files changed, 9 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/driver-api/vfio.rst b/Documentation/driver-api/vfio.rst
index 3337f337293a..decc68cb8114 100644
--- a/Documentation/driver-api/vfio.rst
+++ b/Documentation/driver-api/vfio.rst
@@ -254,8 +254,7 @@ vfio_unregister_group_dev() respectively::
 
 	void vfio_init_group_dev(struct vfio_device *device,
 				struct device *dev,
-				const struct vfio_device_ops *ops,
-				void *device_data);
+				const struct vfio_device_ops *ops);
 	int vfio_register_group_dev(struct vfio_device *device);
 	void vfio_unregister_group_dev(struct vfio_device *device);
 
diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc.c b/drivers/vfio/fsl-mc/vfio_fsl_mc.c
index 45f397c04a89..980e59551301 100644
--- a/drivers/vfio/fsl-mc/vfio_fsl_mc.c
+++ b/drivers/vfio/fsl-mc/vfio_fsl_mc.c
@@ -75,7 +75,8 @@ static int vfio_fsl_mc_reflck_attach(struct vfio_fsl_mc_device *vdev)
 			goto unlock;
 		}
 
-		cont_vdev = vfio_device_data(device);
+		cont_vdev =
+			container_of(device, struct vfio_fsl_mc_device, vdev);
 		if (!cont_vdev || !cont_vdev->reflck) {
 			vfio_device_put(device);
 			ret = -ENODEV;
@@ -630,7 +631,7 @@ static int vfio_fsl_mc_probe(struct fsl_mc_device *mc_dev)
 		goto out_group_put;
 	}
 
-	vfio_init_group_dev(&vdev->vdev, dev, &vfio_fsl_mc_ops, vdev);
+	vfio_init_group_dev(&vdev->vdev, dev, &vfio_fsl_mc_ops);
 	vdev->mc_dev = mc_dev;
 	mutex_init(&vdev->igate);
 
diff --git a/drivers/vfio/mdev/vfio_mdev.c b/drivers/vfio/mdev/vfio_mdev.c
index 11b3e15403ba..ae7e322fbe3c 100644
--- a/drivers/vfio/mdev/vfio_mdev.c
+++ b/drivers/vfio/mdev/vfio_mdev.c
@@ -134,7 +134,7 @@ static int vfio_mdev_probe(struct device *dev)
 	if (!vdev)
 		return -ENOMEM;
 
-	vfio_init_group_dev(vdev, &mdev->dev, &vfio_mdev_dev_ops, mdev);
+	vfio_init_group_dev(vdev, &mdev->dev, &vfio_mdev_dev_ops);
 	ret = vfio_register_group_dev(vdev);
 	if (ret) {
 		kfree(vdev);
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 1f70387c8afe..55ef27a15d4d 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -2022,7 +2022,7 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 		goto out_group_put;
 	}
 
-	vfio_init_group_dev(&vdev->vdev, &pdev->dev, &vfio_pci_ops, vdev);
+	vfio_init_group_dev(&vdev->vdev, &pdev->dev, &vfio_pci_ops);
 	vdev->pdev = pdev;
 	vdev->irq_type = VFIO_PCI_NUM_IRQS;
 	mutex_init(&vdev->igate);
diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c
index f5f6b537084a..361e5b57e369 100644
--- a/drivers/vfio/platform/vfio_platform_common.c
+++ b/drivers/vfio/platform/vfio_platform_common.c
@@ -666,7 +666,7 @@ int vfio_platform_probe_common(struct vfio_platform_device *vdev,
 	struct iommu_group *group;
 	int ret;
 
-	vfio_init_group_dev(&vdev->vdev, dev, &vfio_platform_ops, vdev);
+	vfio_init_group_dev(&vdev->vdev, dev, &vfio_platform_ops);
 
 	ret = vfio_platform_acpi_probe(vdev, dev);
 	if (ret)
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index e6f5109fba48..5e631c359ef2 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -741,12 +741,11 @@ static int vfio_iommu_group_notifier(struct notifier_block *nb,
  * VFIO driver API
  */
 void vfio_init_group_dev(struct vfio_device *device, struct device *dev,
-			 const struct vfio_device_ops *ops, void *device_data)
+			 const struct vfio_device_ops *ops)
 {
 	init_completion(&device->comp);
 	device->dev = dev;
 	device->ops = ops;
-	device->device_data = device_data;
 }
 EXPORT_SYMBOL_GPL(vfio_init_group_dev);
 
@@ -851,15 +850,6 @@ static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
 	return device;
 }
 
-/*
- * Caller must hold a reference to the vfio_device
- */
-void *vfio_device_data(struct vfio_device *device)
-{
-	return device->device_data;
-}
-EXPORT_SYMBOL_GPL(vfio_device_data);
-
 /*
  * Decrement the device reference count and wait for the device to be
  * removed.  Open file descriptors for the device... */
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 784c34c0a287..a2c5b30e1763 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -24,7 +24,6 @@ struct vfio_device {
 	refcount_t refcount;
 	struct completion comp;
 	struct list_head group_next;
-	void *device_data;
 };
 
 /**
@@ -61,12 +60,11 @@ extern struct iommu_group *vfio_iommu_group_get(struct device *dev);
 extern void vfio_iommu_group_put(struct iommu_group *group, struct device *dev);
 
 void vfio_init_group_dev(struct vfio_device *device, struct device *dev,
-			 const struct vfio_device_ops *ops, void *device_data);
+			 const struct vfio_device_ops *ops);
 int vfio_register_group_dev(struct vfio_device *device);
 void vfio_unregister_group_dev(struct vfio_device *device);
 extern struct vfio_device *vfio_device_get_from_dev(struct device *dev);
 extern void vfio_device_put(struct vfio_device *device);
-extern void *vfio_device_data(struct vfio_device *device);
 
 /* events for the backend driver notify callback */
 enum vfio_iommu_notify_type {
-- 
cgit v1.2.3


From dc67dac617e3586ec2db49f3c4fde1d6ac7afe14 Mon Sep 17 00:00:00 2001
From: Aditya Srivastava <yashsri421@gmail.com>
Date: Sun, 14 Mar 2021 12:58:28 +0530
Subject: soc: fsl: guts: fix comment syntax in file

The opening comment mark '/**' is used for kernel-doc comments.
There are certain comments in include/linux/fsl/guts.h which follow this
syntax, but the content inside does not comply with kernel-doc.

E.g., opening comment for "Freecale 85xx and 86xx Global Utilties
register set" follows kernel-doc syntax(i.e., '/**'), but the content
inside does not comply with any kernel-doc specification (function,
struct, etc).

This causes an unwelcome warning from kernel-doc:
"warning: expecting prototype for Freecale 85xx and 86xx Global Utilties register set(). Prototype was for __FSL_GUTS_H__() instead"

Replace all such comment occurrences with general comment format,
i.e. '/*', to prevent kernel-doc from parsing these.

Signed-off-by: Aditya Srivastava <yashsri421@gmail.com>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Li Yang <leoyang.li@nxp.com>
---
 include/linux/fsl/guts.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fsl/guts.h b/include/linux/fsl/guts.h
index 0ac27b233f12..fdb55ca47a4f 100644
--- a/include/linux/fsl/guts.h
+++ b/include/linux/fsl/guts.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/**
+/*
  * Freecale 85xx and 86xx Global Utilties register set
  *
  * Authors: Jeff Brown
@@ -14,7 +14,7 @@
 #include <linux/types.h>
 #include <linux/io.h>
 
-/**
+/*
  * Global Utility Registers.
  *
  * Not all registers defined in this structure are available on all chips, so
-- 
cgit v1.2.3


From 2cd87a7b293dedbbaea3b6739f95d428a2d9890d Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Mon, 5 Apr 2021 17:43:03 +0100
Subject: coresight: core: Add support for dedicated percpu sinks

Add support for dedicated sinks that are bound to individual CPUs. (e.g.,
TRBE). To allow quicker access to the sink for a given CPU bound source,
keep a percpu array of the sink devices. Also, add support for building
a path to the CPU local sink from the ETM.

This adds a new percpu sink type CORESIGHT_DEV_SUBTYPE_SINK_PERCPU_SYSMEM.
This new sink type is exclusively available and can only work with percpu
source type device CORESIGHT_DEV_SUBTYPE_SOURCE_PROC.

This defines a percpu structure that accommodates a single coresight_device
which can be used to store an initialized instance from a sink driver. As
these sinks are exclusively linked and dependent on corresponding percpu
source devices, they should also be the default sink device during a perf
session.

Outwards device connections are scanned while establishing paths between a
source and a sink device. But such connections are not present for certain
percpu source and sink devices which are exclusively linked and dependent.
Build the path directly and skip connection scanning for such devices.

Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Tested-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Reviewed-by: Mike Leach <mike.leach@linaro.org>
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
[Moved the set/get percpu sink APIs from TRBE patch to here
 Fixed build break on arm32]
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Link: https://lore.kernel.org/r/20210405164307.1720226-17-suzuki.poulose@arm.com
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
---
 drivers/hwtracing/coresight/coresight-core.c | 29 ++++++++++++++++++++++++++--
 drivers/hwtracing/coresight/coresight-priv.h |  3 +++
 include/linux/coresight.h                    | 13 +++++++++++++
 3 files changed, 43 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hwtracing/coresight/coresight-core.c b/drivers/hwtracing/coresight/coresight-core.c
index 0062c8935653..55c645616bf6 100644
--- a/drivers/hwtracing/coresight/coresight-core.c
+++ b/drivers/hwtracing/coresight/coresight-core.c
@@ -23,6 +23,7 @@
 #include "coresight-priv.h"
 
 static DEFINE_MUTEX(coresight_mutex);
+DEFINE_PER_CPU(struct coresight_device *, csdev_sink);
 
 /**
  * struct coresight_node - elements of a path, from source to sink
@@ -70,6 +71,18 @@ void coresight_remove_cti_ops(void)
 }
 EXPORT_SYMBOL_GPL(coresight_remove_cti_ops);
 
+void coresight_set_percpu_sink(int cpu, struct coresight_device *csdev)
+{
+	per_cpu(csdev_sink, cpu) = csdev;
+}
+EXPORT_SYMBOL_GPL(coresight_set_percpu_sink);
+
+struct coresight_device *coresight_get_percpu_sink(int cpu)
+{
+	return per_cpu(csdev_sink, cpu);
+}
+EXPORT_SYMBOL_GPL(coresight_get_percpu_sink);
+
 static int coresight_id_match(struct device *dev, void *data)
 {
 	int trace_id, i_trace_id;
@@ -784,6 +797,14 @@ static int _coresight_build_path(struct coresight_device *csdev,
 	if (csdev == sink)
 		goto out;
 
+	if (coresight_is_percpu_source(csdev) && coresight_is_percpu_sink(sink) &&
+	    sink == per_cpu(csdev_sink, source_ops(csdev)->cpu_id(csdev))) {
+		if (_coresight_build_path(sink, sink, path) == 0) {
+			found = true;
+			goto out;
+		}
+	}
+
 	/* Not a sink - recursively explore each port found on this element */
 	for (i = 0; i < csdev->pdata->nr_outport; i++) {
 		struct coresight_device *child_dev;
@@ -999,8 +1020,12 @@ coresight_find_default_sink(struct coresight_device *csdev)
 	int depth = 0;
 
 	/* look for a default sink if we have not found for this device */
-	if (!csdev->def_sink)
-		csdev->def_sink = coresight_find_sink(csdev, &depth);
+	if (!csdev->def_sink) {
+		if (coresight_is_percpu_source(csdev))
+			csdev->def_sink = per_cpu(csdev_sink, source_ops(csdev)->cpu_id(csdev));
+		if (!csdev->def_sink)
+			csdev->def_sink = coresight_find_sink(csdev, &depth);
+	}
 	return csdev->def_sink;
 }
 
diff --git a/drivers/hwtracing/coresight/coresight-priv.h b/drivers/hwtracing/coresight/coresight-priv.h
index f5f654ea2994..ff1dd2092ac5 100644
--- a/drivers/hwtracing/coresight/coresight-priv.h
+++ b/drivers/hwtracing/coresight/coresight-priv.h
@@ -232,4 +232,7 @@ coresight_find_csdev_by_fwnode(struct fwnode_handle *r_fwnode);
 void coresight_set_assoc_ectdev_mutex(struct coresight_device *csdev,
 				      struct coresight_device *ect_csdev);
 
+void coresight_set_percpu_sink(int cpu, struct coresight_device *csdev);
+struct coresight_device *coresight_get_percpu_sink(int cpu);
+
 #endif
diff --git a/include/linux/coresight.h b/include/linux/coresight.h
index 976ec2697610..85008a65e21f 100644
--- a/include/linux/coresight.h
+++ b/include/linux/coresight.h
@@ -50,6 +50,7 @@ enum coresight_dev_subtype_sink {
 	CORESIGHT_DEV_SUBTYPE_SINK_PORT,
 	CORESIGHT_DEV_SUBTYPE_SINK_BUFFER,
 	CORESIGHT_DEV_SUBTYPE_SINK_SYSMEM,
+	CORESIGHT_DEV_SUBTYPE_SINK_PERCPU_SYSMEM,
 };
 
 enum coresight_dev_subtype_link {
@@ -455,6 +456,18 @@ static inline void csdev_access_write64(struct csdev_access *csa, u64 val, u32 o
 }
 #endif	/* CONFIG_64BIT */
 
+static inline bool coresight_is_percpu_source(struct coresight_device *csdev)
+{
+	return csdev && (csdev->type == CORESIGHT_DEV_TYPE_SOURCE) &&
+	       (csdev->subtype.source_subtype == CORESIGHT_DEV_SUBTYPE_SOURCE_PROC);
+}
+
+static inline bool coresight_is_percpu_sink(struct coresight_device *csdev)
+{
+	return csdev && (csdev->type == CORESIGHT_DEV_TYPE_SINK) &&
+	       (csdev->subtype.sink_subtype == CORESIGHT_DEV_SUBTYPE_SINK_PERCPU_SYSMEM);
+}
+
 extern struct coresight_device *
 coresight_register(struct coresight_desc *desc);
 extern void coresight_unregister(struct coresight_device *csdev);
-- 
cgit v1.2.3


From 0f6925b3e8da0dbbb52447ca8a8b42b371aac7db Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 2 Apr 2021 06:26:02 -0700
Subject: virtio_net: Do not pull payload in skb->head

Xuan Zhuo reported that commit 3226b158e67c ("net: avoid 32 x truesize
under-estimation for tiny skbs") brought a ~10% performance drop.

The reason for the performance drop was that GRO was forced
to chain sk_buff (using skb_shinfo(skb)->frag_list), which
uses more memory but also causes packet consumers to incur
a lot of overhead handling all the tiny skbs.

It turns out that virtio_net page_to_skb() has a wrong strategy:
It allocates skbs with GOOD_COPY_LEN (128) bytes in skb->head, then
copies 128 bytes from the page, before feeding the packet to GRO stack.

This was suboptimal before commit 3226b158e67c ("net: avoid 32 x truesize
under-estimation for tiny skbs") because GRO was using 2 frags per MSS,
meaning we were not packing MSS with 100% efficiency.

Fix is to pull only the ethernet header in page_to_skb()

Then, we change virtio_net_hdr_to_skb() to pull the missing
headers, instead of assuming they were already pulled by callers.

This fixes the performance regression, but could also allow virtio_net
to accept packets with more than 128 bytes of headers.

Many thanks to Xuan Zhuo for his report, and his tests/help.

Fixes: 3226b158e67c ("net: avoid 32 x truesize under-estimation for tiny skbs")
Reported-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Link: https://www.spinics.net/lists/netdev/msg731397.html
Co-Developed-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: virtualization@lists.linux-foundation.org
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/virtio_net.c   | 10 +++++++---
 include/linux/virtio_net.h | 14 +++++++++-----
 2 files changed, 16 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 82e520d2cb12..0824e6999e49 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -406,9 +406,13 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 	offset += hdr_padded_len;
 	p += hdr_padded_len;
 
-	copy = len;
-	if (copy > skb_tailroom(skb))
-		copy = skb_tailroom(skb);
+	/* Copy all frame if it fits skb->head, otherwise
+	 * we let virtio_net_hdr_to_skb() and GRO pull headers as needed.
+	 */
+	if (len <= skb_tailroom(skb))
+		copy = len;
+	else
+		copy = ETH_HLEN + metasize;
 	skb_put_data(skb, p, copy);
 
 	if (metasize) {
diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index 98775d7fa696..b465f8f3e554 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -65,14 +65,18 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
 	skb_reset_mac_header(skb);
 
 	if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
-		u16 start = __virtio16_to_cpu(little_endian, hdr->csum_start);
-		u16 off = __virtio16_to_cpu(little_endian, hdr->csum_offset);
+		u32 start = __virtio16_to_cpu(little_endian, hdr->csum_start);
+		u32 off = __virtio16_to_cpu(little_endian, hdr->csum_offset);
+		u32 needed = start + max_t(u32, thlen, off + sizeof(__sum16));
+
+		if (!pskb_may_pull(skb, needed))
+			return -EINVAL;
 
 		if (!skb_partial_csum_set(skb, start, off))
 			return -EINVAL;
 
 		p_off = skb_transport_offset(skb) + thlen;
-		if (p_off > skb_headlen(skb))
+		if (!pskb_may_pull(skb, p_off))
 			return -EINVAL;
 	} else {
 		/* gso packets without NEEDS_CSUM do not set transport_offset.
@@ -102,14 +106,14 @@ retry:
 			}
 
 			p_off = keys.control.thoff + thlen;
-			if (p_off > skb_headlen(skb) ||
+			if (!pskb_may_pull(skb, p_off) ||
 			    keys.basic.ip_proto != ip_proto)
 				return -EINVAL;
 
 			skb_set_transport_header(skb, keys.control.thoff);
 		} else if (gso_type) {
 			p_off = thlen;
-			if (p_off > skb_headlen(skb))
+			if (!pskb_may_pull(skb, p_off))
 				return -EINVAL;
 		}
 	}
-- 
cgit v1.2.3


From 77651900cede54930cd8a039dcd4583bfa308807 Mon Sep 17 00:00:00 2001
From: Oliver Neukum <oneukum@suse.com>
Date: Mon, 5 Apr 2021 16:13:41 -0700
Subject: usbnet: add _mii suffix to usbnet_set/get_link_ksettings

The generic functions assumed devices provided an MDIO interface (accessed
via older mii code, not phylib). This is true only for genuine ethernet.

Devices with a higher level of abstraction or based on different
technologies do not have MDIO. To support this case, first rename
the existing functions with _mii suffix.

v2: rebased on changed upstream
v3: changed names to clearly say that this does NOT use phylib
v4: moved hunks to correct patch; reworded commit messages

Signed-off-by: Oliver Neukum <oneukum@suse.com>
Tested-by: Roland Dreier <roland@kernel.org>
Reviewed-by: Grant Grundler <grundler@chromium.org>
Tested-by: Grant Grundler <grundler@chromium.org>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/asix_devices.c | 12 ++++++------
 drivers/net/usb/cdc_ncm.c      |  4 ++--
 drivers/net/usb/dm9601.c       |  4 ++--
 drivers/net/usb/mcs7830.c      |  4 ++--
 drivers/net/usb/sierra_net.c   |  4 ++--
 drivers/net/usb/smsc75xx.c     |  4 ++--
 drivers/net/usb/sr9700.c       |  4 ++--
 drivers/net/usb/sr9800.c       |  4 ++--
 drivers/net/usb/usbnet.c       | 15 +++++++++------
 include/linux/usb/usbnet.h     |  4 ++--
 10 files changed, 31 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/usb/asix_devices.c b/drivers/net/usb/asix_devices.c
index 6e13d8165852..19a8fafb8f04 100644
--- a/drivers/net/usb/asix_devices.c
+++ b/drivers/net/usb/asix_devices.c
@@ -125,8 +125,8 @@ static const struct ethtool_ops ax88172_ethtool_ops = {
 	.get_eeprom		= asix_get_eeprom,
 	.set_eeprom		= asix_set_eeprom,
 	.nway_reset		= usbnet_nway_reset,
-	.get_link_ksettings	= usbnet_get_link_ksettings,
-	.set_link_ksettings	= usbnet_set_link_ksettings,
+	.get_link_ksettings	= usbnet_get_link_ksettings_mii,
+	.set_link_ksettings	= usbnet_set_link_ksettings_mii,
 };
 
 static void ax88172_set_multicast(struct net_device *net)
@@ -291,8 +291,8 @@ static const struct ethtool_ops ax88772_ethtool_ops = {
 	.get_eeprom		= asix_get_eeprom,
 	.set_eeprom		= asix_set_eeprom,
 	.nway_reset		= usbnet_nway_reset,
-	.get_link_ksettings	= usbnet_get_link_ksettings,
-	.set_link_ksettings	= usbnet_set_link_ksettings,
+	.get_link_ksettings	= usbnet_get_link_ksettings_mii,
+	.set_link_ksettings	= usbnet_set_link_ksettings_mii,
 };
 
 static int ax88772_link_reset(struct usbnet *dev)
@@ -782,8 +782,8 @@ static const struct ethtool_ops ax88178_ethtool_ops = {
 	.get_eeprom		= asix_get_eeprom,
 	.set_eeprom		= asix_set_eeprom,
 	.nway_reset		= usbnet_nway_reset,
-	.get_link_ksettings	= usbnet_get_link_ksettings,
-	.set_link_ksettings	= usbnet_set_link_ksettings,
+	.get_link_ksettings	= usbnet_get_link_ksettings_mii,
+	.set_link_ksettings	= usbnet_set_link_ksettings_mii,
 };
 
 static int marvell_phy_init(struct usbnet *dev)
diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c
index 8ae565a801b5..04f3851dd48b 100644
--- a/drivers/net/usb/cdc_ncm.c
+++ b/drivers/net/usb/cdc_ncm.c
@@ -142,8 +142,8 @@ static const struct ethtool_ops cdc_ncm_ethtool_ops = {
 	.get_sset_count    = cdc_ncm_get_sset_count,
 	.get_strings       = cdc_ncm_get_strings,
 	.get_ethtool_stats = cdc_ncm_get_ethtool_stats,
-	.get_link_ksettings      = usbnet_get_link_ksettings,
-	.set_link_ksettings      = usbnet_set_link_ksettings,
+	.get_link_ksettings      = usbnet_get_link_ksettings_mii,
+	.set_link_ksettings      = usbnet_set_link_ksettings_mii,
 };
 
 static u32 cdc_ncm_check_rx_max(struct usbnet *dev, u32 new_rx)
diff --git a/drivers/net/usb/dm9601.c b/drivers/net/usb/dm9601.c
index b5d2ac55a874..89cc61d7a675 100644
--- a/drivers/net/usb/dm9601.c
+++ b/drivers/net/usb/dm9601.c
@@ -282,8 +282,8 @@ static const struct ethtool_ops dm9601_ethtool_ops = {
 	.get_eeprom_len	= dm9601_get_eeprom_len,
 	.get_eeprom	= dm9601_get_eeprom,
 	.nway_reset	= usbnet_nway_reset,
-	.get_link_ksettings	= usbnet_get_link_ksettings,
-	.set_link_ksettings	= usbnet_set_link_ksettings,
+	.get_link_ksettings	= usbnet_get_link_ksettings_mii,
+	.set_link_ksettings	= usbnet_set_link_ksettings_mii,
 };
 
 static void dm9601_set_multicast(struct net_device *net)
diff --git a/drivers/net/usb/mcs7830.c b/drivers/net/usb/mcs7830.c
index fc512b780d15..9f9352a4522f 100644
--- a/drivers/net/usb/mcs7830.c
+++ b/drivers/net/usb/mcs7830.c
@@ -452,8 +452,8 @@ static const struct ethtool_ops mcs7830_ethtool_ops = {
 	.get_msglevel		= usbnet_get_msglevel,
 	.set_msglevel		= usbnet_set_msglevel,
 	.nway_reset		= usbnet_nway_reset,
-	.get_link_ksettings	= usbnet_get_link_ksettings,
-	.set_link_ksettings	= usbnet_set_link_ksettings,
+	.get_link_ksettings	= usbnet_get_link_ksettings_mii,
+	.set_link_ksettings	= usbnet_set_link_ksettings_mii,
 };
 
 static const struct net_device_ops mcs7830_netdev_ops = {
diff --git a/drivers/net/usb/sierra_net.c b/drivers/net/usb/sierra_net.c
index 55a244eca5ca..55025202dc4f 100644
--- a/drivers/net/usb/sierra_net.c
+++ b/drivers/net/usb/sierra_net.c
@@ -629,8 +629,8 @@ static const struct ethtool_ops sierra_net_ethtool_ops = {
 	.get_msglevel = usbnet_get_msglevel,
 	.set_msglevel = usbnet_set_msglevel,
 	.nway_reset = usbnet_nway_reset,
-	.get_link_ksettings = usbnet_get_link_ksettings,
-	.set_link_ksettings = usbnet_set_link_ksettings,
+	.get_link_ksettings = usbnet_get_link_ksettings_mii,
+	.set_link_ksettings = usbnet_set_link_ksettings_mii,
 };
 
 static int sierra_net_get_fw_attr(struct usbnet *dev, u16 *datap)
diff --git a/drivers/net/usb/smsc75xx.c b/drivers/net/usb/smsc75xx.c
index 4353b370249f..f8cdabb9ef5a 100644
--- a/drivers/net/usb/smsc75xx.c
+++ b/drivers/net/usb/smsc75xx.c
@@ -741,8 +741,8 @@ static const struct ethtool_ops smsc75xx_ethtool_ops = {
 	.set_eeprom	= smsc75xx_ethtool_set_eeprom,
 	.get_wol	= smsc75xx_ethtool_get_wol,
 	.set_wol	= smsc75xx_ethtool_set_wol,
-	.get_link_ksettings	= usbnet_get_link_ksettings,
-	.set_link_ksettings	= usbnet_set_link_ksettings,
+	.get_link_ksettings	= usbnet_get_link_ksettings_mii,
+	.set_link_ksettings	= usbnet_set_link_ksettings_mii,
 };
 
 static int smsc75xx_ioctl(struct net_device *netdev, struct ifreq *rq, int cmd)
diff --git a/drivers/net/usb/sr9700.c b/drivers/net/usb/sr9700.c
index 878557ad03ad..ce29261263cd 100644
--- a/drivers/net/usb/sr9700.c
+++ b/drivers/net/usb/sr9700.c
@@ -250,8 +250,8 @@ static const struct ethtool_ops sr9700_ethtool_ops = {
 	.get_eeprom_len	= sr9700_get_eeprom_len,
 	.get_eeprom	= sr9700_get_eeprom,
 	.nway_reset	= usbnet_nway_reset,
-	.get_link_ksettings	= usbnet_get_link_ksettings,
-	.set_link_ksettings	= usbnet_set_link_ksettings,
+	.get_link_ksettings	= usbnet_get_link_ksettings_mii,
+	.set_link_ksettings	= usbnet_set_link_ksettings_mii,
 };
 
 static void sr9700_set_multicast(struct net_device *netdev)
diff --git a/drivers/net/usb/sr9800.c b/drivers/net/usb/sr9800.c
index da56735d7755..a822d81310d5 100644
--- a/drivers/net/usb/sr9800.c
+++ b/drivers/net/usb/sr9800.c
@@ -527,8 +527,8 @@ static const struct ethtool_ops sr9800_ethtool_ops = {
 	.get_eeprom_len	= sr_get_eeprom_len,
 	.get_eeprom	= sr_get_eeprom,
 	.nway_reset	= usbnet_nway_reset,
-	.get_link_ksettings	= usbnet_get_link_ksettings,
-	.set_link_ksettings	= usbnet_set_link_ksettings,
+	.get_link_ksettings	= usbnet_get_link_ksettings_mii,
+	.set_link_ksettings	= usbnet_set_link_ksettings_mii,
 };
 
 static int sr9800_link_reset(struct usbnet *dev)
diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
index f4f37ecfed58..5b4629c80b4b 100644
--- a/drivers/net/usb/usbnet.c
+++ b/drivers/net/usb/usbnet.c
@@ -944,7 +944,10 @@ EXPORT_SYMBOL_GPL(usbnet_open);
  * they'll probably want to use this base set.
  */
 
-int usbnet_get_link_ksettings(struct net_device *net,
+/* These methods are written on the assumption that the device
+ * uses MII
+ */
+int usbnet_get_link_ksettings_mii(struct net_device *net,
 			      struct ethtool_link_ksettings *cmd)
 {
 	struct usbnet *dev = netdev_priv(net);
@@ -956,9 +959,9 @@ int usbnet_get_link_ksettings(struct net_device *net,
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(usbnet_get_link_ksettings);
+EXPORT_SYMBOL_GPL(usbnet_get_link_ksettings_mii);
 
-int usbnet_set_link_ksettings(struct net_device *net,
+int usbnet_set_link_ksettings_mii(struct net_device *net,
 			      const struct ethtool_link_ksettings *cmd)
 {
 	struct usbnet *dev = netdev_priv(net);
@@ -978,7 +981,7 @@ int usbnet_set_link_ksettings(struct net_device *net,
 
 	return retval;
 }
-EXPORT_SYMBOL_GPL(usbnet_set_link_ksettings);
+EXPORT_SYMBOL_GPL(usbnet_set_link_ksettings_mii);
 
 u32 usbnet_get_link (struct net_device *net)
 {
@@ -1043,8 +1046,8 @@ static const struct ethtool_ops usbnet_ethtool_ops = {
 	.get_msglevel		= usbnet_get_msglevel,
 	.set_msglevel		= usbnet_set_msglevel,
 	.get_ts_info		= ethtool_op_get_ts_info,
-	.get_link_ksettings	= usbnet_get_link_ksettings,
-	.set_link_ksettings	= usbnet_set_link_ksettings,
+	.get_link_ksettings	= usbnet_get_link_ksettings_mii,
+	.set_link_ksettings	= usbnet_set_link_ksettings_mii,
 };
 
 /*-------------------------------------------------------------------------*/
diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h
index cfbfd6fe01df..a89e1452107d 100644
--- a/include/linux/usb/usbnet.h
+++ b/include/linux/usb/usbnet.h
@@ -267,9 +267,9 @@ extern void usbnet_pause_rx(struct usbnet *);
 extern void usbnet_resume_rx(struct usbnet *);
 extern void usbnet_purge_paused_rxq(struct usbnet *);
 
-extern int usbnet_get_link_ksettings(struct net_device *net,
+extern int usbnet_get_link_ksettings_mii(struct net_device *net,
 				     struct ethtool_link_ksettings *cmd);
-extern int usbnet_set_link_ksettings(struct net_device *net,
+extern int usbnet_set_link_ksettings_mii(struct net_device *net,
 				     const struct ethtool_link_ksettings *cmd);
 extern u32 usbnet_get_link(struct net_device *net);
 extern u32 usbnet_get_msglevel(struct net_device *);
-- 
cgit v1.2.3


From 956baa99571bbaf88f3e91190dfb498c685b0e21 Mon Sep 17 00:00:00 2001
From: Oliver Neukum <oneukum@suse.com>
Date: Mon, 5 Apr 2021 16:13:42 -0700
Subject: usbnet: add method for reporting speed without MII

The old method for reporting link speed assumed a driver uses the
generic phy (mii) MDIO read/write functions. CDC devices don't
expose the phy.

Add a primitive internal version reporting back directly what
the CDC notification/status operations recorded.

v2: rebased on upstream
v3: changed names and made clear which units are used
v4: moved hunks to correct patch; rewrote commit messages

Signed-off-by: Oliver Neukum <oneukum@suse.com>
Tested-by: Roland Dreier <roland@kernel.org>
Reviewed-by: Grant Grundler <grundler@chromium.org>
Tested-by: Grant Grundler <grundler@chromium.org>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/usbnet.c   | 23 +++++++++++++++++++++++
 include/linux/usb/usbnet.h |  7 +++++--
 2 files changed, 28 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
index 5b4629c80b4b..ecf62849f4c1 100644
--- a/drivers/net/usb/usbnet.c
+++ b/drivers/net/usb/usbnet.c
@@ -961,6 +961,27 @@ int usbnet_get_link_ksettings_mii(struct net_device *net,
 }
 EXPORT_SYMBOL_GPL(usbnet_get_link_ksettings_mii);
 
+int usbnet_get_link_ksettings_internal(struct net_device *net,
+					struct ethtool_link_ksettings *cmd)
+{
+	struct usbnet *dev = netdev_priv(net);
+
+	/* the assumption that speed is equal on tx and rx
+	 * is deeply engrained into the networking layer.
+	 * For wireless stuff it is not true.
+	 * We assume that rx_speed matters more.
+	 */
+	if (dev->rx_speed != SPEED_UNSET)
+		cmd->base.speed = dev->rx_speed / 1000000;
+	else if (dev->tx_speed != SPEED_UNSET)
+		cmd->base.speed = dev->tx_speed / 1000000;
+	else
+		cmd->base.speed = SPEED_UNKNOWN;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(usbnet_get_link_ksettings_internal);
+
 int usbnet_set_link_ksettings_mii(struct net_device *net,
 			      const struct ethtool_link_ksettings *cmd)
 {
@@ -1664,6 +1685,8 @@ usbnet_probe (struct usb_interface *udev, const struct usb_device_id *prod)
 	dev->intf = udev;
 	dev->driver_info = info;
 	dev->driver_name = name;
+	dev->rx_speed = SPEED_UNSET;
+	dev->tx_speed = SPEED_UNSET;
 
 	net->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
 	if (!net->tstats)
diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h
index a89e1452107d..8336e86ce606 100644
--- a/include/linux/usb/usbnet.h
+++ b/include/linux/usb/usbnet.h
@@ -53,6 +53,9 @@ struct usbnet {
 	u32			hard_mtu;	/* count any extra framing */
 	size_t			rx_urb_size;	/* size for rx urbs */
 	struct mii_if_info	mii;
+	long			rx_speed;	/* If MII not used */
+	long			tx_speed;	/* If MII not used */
+#		define SPEED_UNSET	-1
 
 	/* various kinds of pending driver work */
 	struct sk_buff_head	rxq;
@@ -81,8 +84,6 @@ struct usbnet {
 #		define EVENT_LINK_CHANGE	11
 #		define EVENT_SET_RX_MODE	12
 #		define EVENT_NO_IP_ALIGN	13
-	u32			rx_speed;	/* in bps - NOT Mbps */
-	u32			tx_speed;	/* in bps - NOT Mbps */
 };
 
 static inline struct usb_driver *driver_of(struct usb_interface *intf)
@@ -271,6 +272,8 @@ extern int usbnet_get_link_ksettings_mii(struct net_device *net,
 				     struct ethtool_link_ksettings *cmd);
 extern int usbnet_set_link_ksettings_mii(struct net_device *net,
 				     const struct ethtool_link_ksettings *cmd);
+extern int usbnet_get_link_ksettings_internal(struct net_device *net,
+				     struct ethtool_link_ksettings *cmd);
 extern u32 usbnet_get_link(struct net_device *net);
 extern u32 usbnet_get_msglevel(struct net_device *);
 extern void usbnet_set_msglevel(struct net_device *, u32);
-- 
cgit v1.2.3


From 1c84b33101c82683dee8b06761ca1f69e78c8ee7 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Thu, 1 Apr 2021 15:00:19 -0700
Subject: bpf, sockmap: Fix sk->prot unhash op reset

In '4da6a196f93b1' we fixed a potential unhash loop caused when
a TLS socket in a sockmap was removed from the sockmap. This
happened because the unhash operation on the TLS ctx continued
to point at the sockmap implementation of unhash even though the
psock has already been removed. The sockmap unhash handler when a
psock is removed does the following,

 void sock_map_unhash(struct sock *sk)
 {
	void (*saved_unhash)(struct sock *sk);
	struct sk_psock *psock;

	rcu_read_lock();
	psock = sk_psock(sk);
	if (unlikely(!psock)) {
		rcu_read_unlock();
		if (sk->sk_prot->unhash)
			sk->sk_prot->unhash(sk);
		return;
	}
        [...]
 }

The unlikely() case is there to handle the case where psock is detached
but the proto ops have not been updated yet. But, in the above case
with TLS and removed psock we never fixed sk_prot->unhash() and unhash()
points back to sock_map_unhash resulting in a loop. To fix this we added
this bit of code,

 static inline void sk_psock_restore_proto(struct sock *sk,
                                          struct sk_psock *psock)
 {
       sk->sk_prot->unhash = psock->saved_unhash;

This will set the sk_prot->unhash back to its saved value. This is the
correct callback for a TLS socket that has been removed from the sock_map.
Unfortunately, this also overwrites the unhash pointer for all psocks.
We effectively break sockmap unhash handling for any future socks.
Omitting the unhash operation will leave stale entries in the map if
a socket transitions through unhash but does not do a close() op.

To fix this, set unhash correctly before calling into tls_update. This way
the TLS enabled socket will point to the saved unhash() handler.

Fixes: 4da6a196f93b1 ("bpf: Sockmap/tls, during free we may call tcp_bpf_unhash() in loop")
Reported-by: Cong Wang <xiyou.wangcong@gmail.com>
Reported-by: Lorenz Bauer <lmb@cloudflare.com>
Suggested-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/161731441904.68884.15593917809745631972.stgit@john-XPS-13-9370
---
 include/linux/skmsg.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 8edbbf5f2f93..822c048934e3 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -349,8 +349,13 @@ static inline void sk_psock_update_proto(struct sock *sk,
 static inline void sk_psock_restore_proto(struct sock *sk,
 					  struct sk_psock *psock)
 {
-	sk->sk_prot->unhash = psock->saved_unhash;
 	if (inet_csk_has_ulp(sk)) {
+		/* TLS does not have an unhash proto in SW cases, but we need
+		 * to ensure we stop using the sock_map unhash routine because
+		 * the associated psock is being removed. So use the original
+		 * unhash handler.
+		 */
+		WRITE_ONCE(sk->sk_prot->unhash, psock->saved_unhash);
 		tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space);
 	} else {
 		sk->sk_write_space = psock->saved_write_space;
-- 
cgit v1.2.3


From a14587dfc5ad2312dabdd42a610d80ecd0dc8bea Mon Sep 17 00:00:00 2001
From: Raed Salem <raeds@nvidia.com>
Date: Thu, 21 Jan 2021 16:01:37 +0200
Subject: net/mlx5: Fix placement of log_max_flow_counter

The cited commit wrongly placed the log_max_flow_counter field of
mlx5_ifc_flow_table_prop_layout_bits; align it to the placement intended
by the HW spec.

Fixes: 16f1c5bb3ed7 ("net/mlx5: Check device capability for maximum flow counters")
Signed-off-by: Raed Salem <raeds@nvidia.com>
Reviewed-by: Roi Dayan <roid@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index df5d91c8b2d4..1ccedb7816d0 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -437,11 +437,11 @@ struct mlx5_ifc_flow_table_prop_layout_bits {
 	u8         reserved_at_60[0x18];
 	u8         log_max_ft_num[0x8];
 
-	u8         reserved_at_80[0x18];
+	u8         reserved_at_80[0x10];
+	u8         log_max_flow_counter[0x8];
 	u8         log_max_destination[0x8];
 
-	u8         log_max_flow_counter[0x8];
-	u8         reserved_at_a8[0x10];
+	u8         reserved_at_a0[0x18];
 	u8         log_max_flow[0x8];
 
 	u8         reserved_at_c0[0x40];
-- 
cgit v1.2.3


From ce28f0fd670ddffcd564ce7119bdefbaf08f02d3 Mon Sep 17 00:00:00 2001
From: Aya Levin <ayal@nvidia.com>
Date: Sun, 4 Apr 2021 10:50:50 +0300
Subject: net/mlx5: Fix PPLM register mapping

Add reserved mapping to cover the whole register in order to avoid
setting arbitrary values to newer FW which implements the reserved
fields.

Fixes: a58837f52d43 ("net/mlx5e: Expose FEC feilds and related capability bit")
Signed-off-by: Aya Levin <ayal@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 1ccedb7816d0..9940070cda8f 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -8835,6 +8835,8 @@ struct mlx5_ifc_pplm_reg_bits {
 
 	u8         fec_override_admin_100g_2x[0x10];
 	u8         fec_override_admin_50g_1x[0x10];
+
+	u8         reserved_at_140[0x140];
 };
 
 struct mlx5_ifc_ppcnt_reg_bits {
-- 
cgit v1.2.3


From 534b1204ca4694db1093b15cf3e79a99fcb6a6da Mon Sep 17 00:00:00 2001
From: Aya Levin <ayal@nvidia.com>
Date: Sun, 4 Apr 2021 12:55:00 +0300
Subject: net/mlx5: Fix PBMC register mapping

Add reserved mapping to cover the whole register in order to avoid setting
arbitrary values to newer FW which implements the reserved fields.

Fixes: 50b4a3c23646 ("net/mlx5: PPTB and PBMC register firmware command support")
Signed-off-by: Aya Levin <ayal@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 9940070cda8f..9c68b2da14c6 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -10200,7 +10200,7 @@ struct mlx5_ifc_pbmc_reg_bits {
 
 	struct mlx5_ifc_bufferx_reg_bits buffer[10];
 
-	u8         reserved_at_2e0[0x40];
+	u8         reserved_at_2e0[0x80];
 };
 
 struct mlx5_ifc_qtct_reg_bits {
-- 
cgit v1.2.3


From a91d98a0a2b8e4c433b7341708f7d706e0cf1c8e Mon Sep 17 00:00:00 2001
From: Chris Mi <cmi@nvidia.com>
Date: Thu, 10 Sep 2020 15:28:02 +0800
Subject: net/mlx5: Map register values to restore objects

Currently, reg_c0's lower 16 bits and reg_b are used to store the chain
id that missed in the FDB and NIC tables, respectively. However, the
registers' values may index a restore object, rather than a single u32
value. Different object types can be used to restore mutually exclusive
contexts such as chain id and sample group id.

Use the mapping object to associate an index with a restore object
as a prestep for supporting additional restore types.

Signed-off-by: Chris Mi <cmi@nvidia.com>
Reviewed-by: Oz Shlomo <ozsh@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 .../net/ethernet/mellanox/mlx5/core/en/rep/tc.c    | 38 +++++++++++++---------
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c    |  9 +++--
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h  | 13 ++++++--
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 14 +++-----
 .../ethernet/mellanox/mlx5/core/lib/fs_chains.c    | 20 ++++++++----
 .../ethernet/mellanox/mlx5/core/lib/fs_chains.h    |  3 +-
 include/linux/mlx5/eswitch.h                       |  9 +++--
 7 files changed, 63 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c
index 11a44d30adc7..dde83cba85c3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c
@@ -618,9 +618,10 @@ bool mlx5e_rep_tc_update_skb(struct mlx5_cqe64 *cqe,
 			     struct mlx5e_tc_update_priv *tc_priv)
 {
 #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
-	u32 chain = 0, reg_c0, reg_c1, tunnel_id, zone_restore_id;
+	u32 reg_c0, reg_c1, tunnel_id, zone_restore_id;
 	struct mlx5_rep_uplink_priv *uplink_priv;
 	struct mlx5e_rep_priv *uplink_rpriv;
+	struct mlx5_mapped_obj mapped_obj;
 	struct tc_skb_ext *tc_skb_ext;
 	struct mlx5_eswitch *esw;
 	struct mlx5e_priv *priv;
@@ -640,30 +641,35 @@ bool mlx5e_rep_tc_update_skb(struct mlx5_cqe64 *cqe,
 	priv = netdev_priv(skb->dev);
 	esw = priv->mdev->priv.eswitch;
 
-	err = mlx5_get_chain_for_tag(esw_chains(esw), reg_c0, &chain);
+	err = mlx5_get_mapped_object(esw_chains(esw), reg_c0, &mapped_obj);
 	if (err) {
 		netdev_dbg(priv->netdev,
-			   "Couldn't find chain for chain tag: %d, err: %d\n",
+			   "Couldn't find mapped object for reg_c0: %d, err: %d\n",
 			   reg_c0, err);
 		return false;
 	}
 
-	if (chain) {
-		tc_skb_ext = skb_ext_add(skb, TC_SKB_EXT);
-		if (!tc_skb_ext) {
-			WARN_ON(1);
-			return false;
-		}
+	if (mapped_obj.type == MLX5_MAPPED_OBJ_CHAIN) {
+		if (mapped_obj.chain) {
+			tc_skb_ext = skb_ext_add(skb, TC_SKB_EXT);
+			if (!tc_skb_ext) {
+				WARN_ON(1);
+				return false;
+			}
 
-		tc_skb_ext->chain = chain;
+			tc_skb_ext->chain = mapped_obj.chain;
 
-		zone_restore_id = reg_c1 & ESW_ZONE_ID_MASK;
+			zone_restore_id = reg_c1 & ESW_ZONE_ID_MASK;
 
-		uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
-		uplink_priv = &uplink_rpriv->uplink_priv;
-		if (!mlx5e_tc_ct_restore_flow(uplink_priv->ct_priv, skb,
-					      zone_restore_id))
-			return false;
+			uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
+			uplink_priv = &uplink_rpriv->uplink_priv;
+			if (!mlx5e_tc_ct_restore_flow(uplink_priv->ct_priv, skb,
+						      zone_restore_id))
+				return false;
+		}
+	} else {
+		netdev_dbg(priv->netdev, "Invalid mapped object type: %d\n", mapped_obj.type);
+		return false;
 	}
 
 	tunnel_id = reg_c1 >> ESW_TUN_OFFSET;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index bb1e0d442b5c..9b5607ddb9a2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -4973,6 +4973,7 @@ bool mlx5e_tc_update_skb(struct mlx5_cqe64 *cqe,
 	u32 chain = 0, chain_tag, reg_b, zone_restore_id;
 	struct mlx5e_priv *priv = netdev_priv(skb->dev);
 	struct mlx5e_tc_table *tc = &priv->fs.tc;
+	struct mlx5_mapped_obj mapped_obj;
 	struct tc_skb_ext *tc_skb_ext;
 	int err;
 
@@ -4980,7 +4981,7 @@ bool mlx5e_tc_update_skb(struct mlx5_cqe64 *cqe,
 
 	chain_tag = reg_b & MLX5E_TC_TABLE_CHAIN_TAG_MASK;
 
-	err = mlx5_get_chain_for_tag(nic_chains(priv), chain_tag, &chain);
+	err = mlx5_get_mapped_object(nic_chains(priv), chain_tag, &mapped_obj);
 	if (err) {
 		netdev_dbg(priv->netdev,
 			   "Couldn't find chain for chain tag: %d, err: %d\n",
@@ -4988,7 +4989,8 @@ bool mlx5e_tc_update_skb(struct mlx5_cqe64 *cqe,
 		return false;
 	}
 
-	if (chain) {
+	if (mapped_obj.type == MLX5_MAPPED_OBJ_CHAIN) {
+		chain = mapped_obj.chain;
 		tc_skb_ext = skb_ext_add(skb, TC_SKB_EXT);
 		if (WARN_ON(!tc_skb_ext))
 			return false;
@@ -5001,6 +5003,9 @@ bool mlx5e_tc_update_skb(struct mlx5_cqe64 *cqe,
 		if (!mlx5e_tc_ct_restore_flow(tc->ct, skb,
 					      zone_restore_id))
 			return false;
+	} else {
+		netdev_dbg(priv->netdev, "Invalid mapped object type: %d\n", mapped_obj.type);
+		return false;
 	}
 #endif /* CONFIG_NET_TC_SKB_EXT */
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index e0415676821a..c5b35e7f8aed 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -47,6 +47,17 @@
 #include "sf/sf.h"
 #include "en/tc_ct.h"
 
+enum mlx5_mapped_obj_type {
+	MLX5_MAPPED_OBJ_CHAIN,
+};
+
+struct mlx5_mapped_obj {
+	enum mlx5_mapped_obj_type type;
+	union {
+		u32 chain;
+	};
+};
+
 #ifdef CONFIG_MLX5_ESWITCH
 
 #define ESW_OFFLOADS_DEFAULT_NUM_GROUPS 15
@@ -733,8 +744,6 @@ mlx5_esw_vporttbl_put(struct mlx5_eswitch *esw, struct mlx5_vport_tbl_attr *attr
 
 struct mlx5_flow_handle *
 esw_add_restore_rule(struct mlx5_eswitch *esw, u32 tag);
-u32
-esw_get_max_restore_tag(struct mlx5_eswitch *esw);
 
 int esw_offloads_load_rep(struct mlx5_eswitch *esw, u16 vport_num);
 void esw_offloads_unload_rep(struct mlx5_eswitch *esw, u16 vport_num);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 8ac4b60ea225..117d9fa93ff5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -1286,7 +1286,7 @@ esw_add_restore_rule(struct mlx5_eswitch *esw, u32 tag)
 	misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
 			    misc_parameters_2);
 	MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0,
-		 ESW_CHAIN_TAG_METADATA_MASK);
+		 ESW_REG_C0_USER_DATA_METADATA_MASK);
 	misc = MLX5_ADDR_OF(fte_match_param, spec->match_value,
 			    misc_parameters_2);
 	MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0, tag);
@@ -1312,12 +1312,6 @@ esw_add_restore_rule(struct mlx5_eswitch *esw, u32 tag)
 	return flow_rule;
 }
 
-u32
-esw_get_max_restore_tag(struct mlx5_eswitch *esw)
-{
-	return ESW_CHAIN_TAG_METADATA_MASK;
-}
-
 #define MAX_PF_SQ 256
 #define MAX_SQ_NVPORTS 32
 
@@ -1434,7 +1428,7 @@ esw_chains_create(struct mlx5_eswitch *esw, struct mlx5_flow_table *miss_fdb)
 	attr.max_ft_sz = fdb_max;
 	attr.max_grp_num = esw->params.large_group_num;
 	attr.default_ft = miss_fdb;
-	attr.max_restore_tag = esw_get_max_restore_tag(esw);
+	attr.max_restore_tag = ESW_REG_C0_USER_DATA_METADATA_MASK;
 
 	chains = mlx5_chains_create(dev, &attr);
 	if (IS_ERR(chains)) {
@@ -1928,7 +1922,7 @@ static int esw_create_restore_table(struct mlx5_eswitch *esw)
 		goto out_free;
 	}
 
-	ft_attr.max_fte = 1 << ESW_CHAIN_TAG_METADATA_BITS;
+	ft_attr.max_fte = 1 << ESW_REG_C0_USER_DATA_METADATA_BITS;
 	ft = mlx5_create_flow_table(ns, &ft_attr);
 	if (IS_ERR(ft)) {
 		err = PTR_ERR(ft);
@@ -1943,7 +1937,7 @@ static int esw_create_restore_table(struct mlx5_eswitch *esw)
 			    misc_parameters_2);
 
 	MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0,
-		 ESW_CHAIN_TAG_METADATA_MASK);
+		 ESW_REG_C0_USER_DATA_METADATA_MASK);
 	MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0);
 	MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index,
 		 ft_attr.max_fte - 1);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c
index 381325b4a863..00ff809dcfe8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c
@@ -832,8 +832,7 @@ mlx5_chains_init(struct mlx5_core_dev *dev, struct mlx5_chains_attr *attr)
 	if (err)
 		goto init_prios_ht_err;
 
-	mapping = mapping_create(sizeof(u32), attr->max_restore_tag,
-				 true);
+	mapping = mapping_create(sizeof(struct mlx5_mapped_obj), attr->max_restore_tag, true);
 	if (IS_ERR(mapping)) {
 		err = PTR_ERR(mapping);
 		goto mapping_err;
@@ -884,21 +883,28 @@ int
 mlx5_chains_get_chain_mapping(struct mlx5_fs_chains *chains, u32 chain,
 			      u32 *chain_mapping)
 {
-	return mapping_add(chains_mapping(chains), &chain, chain_mapping);
+	struct mapping_ctx *ctx = chains->chains_mapping;
+	struct mlx5_mapped_obj mapped_obj = {};
+
+	mapped_obj.type = MLX5_MAPPED_OBJ_CHAIN;
+	mapped_obj.chain = chain;
+	return mapping_add(ctx, &mapped_obj, chain_mapping);
 }
 
 int
 mlx5_chains_put_chain_mapping(struct mlx5_fs_chains *chains, u32 chain_mapping)
 {
-	return mapping_remove(chains_mapping(chains), chain_mapping);
+	struct mapping_ctx *ctx = chains->chains_mapping;
+
+	return mapping_remove(ctx, chain_mapping);
 }
 
-int mlx5_get_chain_for_tag(struct mlx5_fs_chains *chains, u32 tag,
-			   u32 *chain)
+int
+mlx5_get_mapped_object(struct mlx5_fs_chains *chains, u32 tag, struct mlx5_mapped_obj *obj)
 {
 	int err;
 
-	err = mapping_find(chains_mapping(chains), tag, chain);
+	err = mapping_find(chains->chains_mapping, tag, obj);
 	if (err) {
 		mlx5_core_warn(chains->dev, "Can't find chain for tag: %d\n", tag);
 		return -ENOENT;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.h
index 6d5be31b05dd..75a3bba12a78 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.h
@@ -7,6 +7,7 @@
 #include <linux/mlx5/fs.h>
 
 struct mlx5_fs_chains;
+struct mlx5_mapped_obj;
 
 enum mlx5_chains_flags {
 	MLX5_CHAINS_AND_PRIOS_SUPPORTED = BIT(0),
@@ -64,7 +65,7 @@ mlx5_chains_create(struct mlx5_core_dev *dev, struct mlx5_chains_attr *attr);
 void mlx5_chains_destroy(struct mlx5_fs_chains *chains);
 
 int
-mlx5_get_chain_for_tag(struct mlx5_fs_chains *chains, u32 tag, u32 *chain);
+mlx5_get_mapped_object(struct mlx5_fs_chains *chains, u32 tag, struct mlx5_mapped_obj *obj);
 
 void
 mlx5_chains_set_end_ft(struct mlx5_fs_chains *chains,
diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h
index 994c2c8cb4fd..125ae482383b 100644
--- a/include/linux/mlx5/eswitch.h
+++ b/include/linux/mlx5/eswitch.h
@@ -74,20 +74,19 @@ bool mlx5_eswitch_reg_c1_loopback_enabled(const struct mlx5_eswitch *esw);
 bool mlx5_eswitch_vport_match_metadata_enabled(const struct mlx5_eswitch *esw);
 
 /* Reg C0 usage:
- * Reg C0 = < ESW_PFNUM_BITS(4) | ESW_VPORT BITS(12) | ESW_CHAIN_TAG(16) >
+ * Reg C0 = < ESW_PFNUM_BITS(4) | ESW_VPORT BITS(12) | ESW_REG_C0_OBJ(16) >
  *
  * Highest 4 bits of the reg c0 is the PF_NUM (range 0-15), 12 bits of
  * unique non-zero vport id (range 1-4095). The rest (lowest 16 bits) is left
- * for tc chain tag restoration.
+ * for user data objects managed by a common mapping context.
  * PFNUM + VPORT comprise the SOURCE_PORT matching.
  */
 #define ESW_VPORT_BITS 12
 #define ESW_PFNUM_BITS 4
 #define ESW_SOURCE_PORT_METADATA_BITS (ESW_PFNUM_BITS + ESW_VPORT_BITS)
 #define ESW_SOURCE_PORT_METADATA_OFFSET (32 - ESW_SOURCE_PORT_METADATA_BITS)
-#define ESW_CHAIN_TAG_METADATA_BITS (32 - ESW_SOURCE_PORT_METADATA_BITS)
-#define ESW_CHAIN_TAG_METADATA_MASK GENMASK(ESW_CHAIN_TAG_METADATA_BITS - 1,\
-					    0)
+#define ESW_REG_C0_USER_DATA_METADATA_BITS (32 - ESW_SOURCE_PORT_METADATA_BITS)
+#define ESW_REG_C0_USER_DATA_METADATA_MASK GENMASK(ESW_REG_C0_USER_DATA_METADATA_BITS - 1, 0)
 
 static inline u32 mlx5_eswitch_get_vport_metadata_mask(void)
 {
-- 
cgit v1.2.3


From eee87072e2fb9000b12c5e752ebd4a05882da2e4 Mon Sep 17 00:00:00 2001
From: Bhaumik Bhatt <bbhatt@codeaurora.org>
Date: Thu, 1 Apr 2021 14:41:49 -0700
Subject: bus: mhi: core: Remove pre_init flag used for power purposes

Some controllers can choose to skip preparation for power up.
In that case, device context is initialized based on the pre_init
flag not being set during mhi_prepare_for_power_up(). There is no
reason MHI host driver should maintain and provide controllers
with two separate paths for preparing MHI.

Going forward, all controllers will be required to call the
mhi_prepare_for_power_up() API followed by their choice of sync
or async power up. This allows MHI host driver to get rid of the
pre_init flag and sets up a common way for all controllers to use
MHI. This also helps controllers fail early on during preparation
phase in some failure cases.

Signed-off-by: Bhaumik Bhatt <bbhatt@codeaurora.org>
Reviewed-by: Hemant Kumar <hemantk@codeaurora.org>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Link: https://lore.kernel.org/r/1617313309-24035-1-git-send-email-bbhatt@codeaurora.org
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
---
 drivers/bus/mhi/core/init.c |  3 ---
 drivers/bus/mhi/core/pm.c   | 20 --------------------
 include/linux/mhi.h         |  2 --
 3 files changed, 25 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/bus/mhi/core/init.c b/drivers/bus/mhi/core/init.c
index d1d9b0d9158e..1f6135213251 100644
--- a/drivers/bus/mhi/core/init.c
+++ b/drivers/bus/mhi/core/init.c
@@ -1080,8 +1080,6 @@ int mhi_prepare_for_power_up(struct mhi_controller *mhi_cntrl)
 			mhi_rddm_prepare(mhi_cntrl, mhi_cntrl->rddm_image);
 	}
 
-	mhi_cntrl->pre_init = true;
-
 	mutex_unlock(&mhi_cntrl->pm_mutex);
 
 	return 0;
@@ -1112,7 +1110,6 @@ void mhi_unprepare_after_power_down(struct mhi_controller *mhi_cntrl)
 	}
 
 	mhi_deinit_dev_ctxt(mhi_cntrl);
-	mhi_cntrl->pre_init = false;
 }
 EXPORT_SYMBOL_GPL(mhi_unprepare_after_power_down);
 
diff --git a/drivers/bus/mhi/core/pm.c b/drivers/bus/mhi/core/pm.c
index dafcd9847cc6..e2e59a341fef 100644
--- a/drivers/bus/mhi/core/pm.c
+++ b/drivers/bus/mhi/core/pm.c
@@ -1062,13 +1062,6 @@ int mhi_async_power_up(struct mhi_controller *mhi_cntrl)
 	mutex_lock(&mhi_cntrl->pm_mutex);
 	mhi_cntrl->pm_state = MHI_PM_DISABLE;
 
-	if (!mhi_cntrl->pre_init) {
-		/* Setup device context */
-		ret = mhi_init_dev_ctxt(mhi_cntrl);
-		if (ret)
-			goto error_dev_ctxt;
-	}
-
 	ret = mhi_init_irq_setup(mhi_cntrl);
 	if (ret)
 		goto error_setup_irq;
@@ -1150,10 +1143,6 @@ error_bhi_offset:
 	mhi_deinit_free_irq(mhi_cntrl);
 
 error_setup_irq:
-	if (!mhi_cntrl->pre_init)
-		mhi_deinit_dev_ctxt(mhi_cntrl);
-
-error_dev_ctxt:
 	mhi_cntrl->pm_state = MHI_PM_DISABLE;
 	mutex_unlock(&mhi_cntrl->pm_mutex);
 
@@ -1203,15 +1192,6 @@ void mhi_power_down(struct mhi_controller *mhi_cntrl, bool graceful)
 	flush_work(&mhi_cntrl->st_worker);
 
 	free_irq(mhi_cntrl->irq[0], mhi_cntrl);
-
-	if (!mhi_cntrl->pre_init) {
-		/* Free all allocated resources */
-		if (mhi_cntrl->fbc_image) {
-			mhi_free_bhie_table(mhi_cntrl, mhi_cntrl->fbc_image);
-			mhi_cntrl->fbc_image = NULL;
-		}
-		mhi_deinit_dev_ctxt(mhi_cntrl);
-	}
 }
 EXPORT_SYMBOL_GPL(mhi_power_down);
 
diff --git a/include/linux/mhi.h b/include/linux/mhi.h
index b16afd36b444..c9b36a34cc7f 100644
--- a/include/linux/mhi.h
+++ b/include/linux/mhi.h
@@ -354,7 +354,6 @@ struct mhi_controller_config {
  * @index: Index of the MHI controller instance
  * @bounce_buf: Use of bounce buffer
  * @fbc_download: MHI host needs to do complete image transfer (optional)
- * @pre_init: MHI host needs to do pre-initialization before power up
  * @wake_set: Device wakeup set flag
  * @irq_flags: irq flags passed to request_irq (optional)
  *
@@ -447,7 +446,6 @@ struct mhi_controller {
 	int index;
 	bool bounce_buf;
 	bool fbc_download;
-	bool pre_init;
 	bool wake_set;
 	unsigned long irq_flags;
 };
-- 
cgit v1.2.3


From 6731fefd95671575ceaba8e1c60881d529537352 Mon Sep 17 00:00:00 2001
From: Bhaumik Bhatt <bbhatt@codeaurora.org>
Date: Thu, 1 Apr 2021 14:16:18 -0700
Subject: bus: mhi: Improve documentation on channel transfer setup APIs

The mhi_prepare_for_transfer() and mhi_unprepare_from_transfer()
APIs could use better explanation. Add details on what MHI does
when these APIs are used.

Signed-off-by: Bhaumik Bhatt <bbhatt@codeaurora.org>
Reviewed-by: Hemant Kumar <hemantk@codeaurora.org>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Link: https://lore.kernel.org/r/1617311778-1254-10-git-send-email-bbhatt@codeaurora.org
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
---
 include/linux/mhi.h | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mhi.h b/include/linux/mhi.h
index c9b36a34cc7f..d095fba37d1e 100644
--- a/include/linux/mhi.h
+++ b/include/linux/mhi.h
@@ -712,13 +712,27 @@ int mhi_device_get_sync(struct mhi_device *mhi_dev);
 void mhi_device_put(struct mhi_device *mhi_dev);
 
 /**
- * mhi_prepare_for_transfer - Setup channel for data transfer
+ * mhi_prepare_for_transfer - Setup UL and DL channels for data transfer.
+ *                            Allocate and initialize the channel context and
+ *                            also issue the START channel command to both
+ *                            channels. Channels can be started only if both
+ *                            host and device execution environments match and
+ *                            channels are in a DISABLED state.
  * @mhi_dev: Device associated with the channels
  */
 int mhi_prepare_for_transfer(struct mhi_device *mhi_dev);
 
 /**
- * mhi_unprepare_from_transfer - Unprepare the channels
+ * mhi_unprepare_from_transfer - Reset UL and DL channels for data transfer.
+ *                               Issue the RESET channel command and let the
+ *                               device clean-up the context so no incoming
+ *                               transfers are seen on the host. Free memory
+ *                               associated with the context on host. If device
+ *                               is unresponsive, only perform a host side
+ *                               clean-up. Channels can be reset only if both
+ *                               host and device execution environments match
+ *                               and channels are in an ENABLED, STOPPED or
+ *                               SUSPENDED state.
  * @mhi_dev: Device associated with the channels
  */
 void mhi_unprepare_from_transfer(struct mhi_device *mhi_dev);
-- 
cgit v1.2.3


From 45b77828b01cdf2af655e4edbc63646fc7e07b48 Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Date: Sun, 14 Mar 2021 18:14:57 +0000
Subject: iio:event: Add timeout event info type

For adaptive threshold events, the current value is compared with a
(typically) low pass filtered version of the same signal that slowly
tracks large scale changes.  However, sometimes a step change can
result in a large lag before the low pass filtered version begins
to track the signal again.  Timeouts can be used to make an
instantaneous 'correction'.  Documentation of this attribute
is added in a later patch.

Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Alexandru Ardelean <alexandru.ardelean@analog.com>
Link: https://lore.kernel.org/r/20210314181511.531414-11-jic23@kernel.org
---
 drivers/iio/industrialio-event.c | 1 +
 include/linux/iio/types.h        | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/iio/industrialio-event.c b/drivers/iio/industrialio-event.c
index 1b3a15bc75fe..d0732eac0f0a 100644
--- a/drivers/iio/industrialio-event.c
+++ b/drivers/iio/industrialio-event.c
@@ -245,6 +245,7 @@ static const char * const iio_ev_info_text[] = {
 	[IIO_EV_INFO_PERIOD] = "period",
 	[IIO_EV_INFO_HIGH_PASS_FILTER_3DB] = "high_pass_filter_3db",
 	[IIO_EV_INFO_LOW_PASS_FILTER_3DB] = "low_pass_filter_3db",
+	[IIO_EV_INFO_TIMEOUT] = "timeout",
 };
 
 static enum iio_event_direction iio_ev_attr_dir(struct iio_dev_attr *attr)
diff --git a/include/linux/iio/types.h b/include/linux/iio/types.h
index 5aa7f66d4345..84b3f8175cc6 100644
--- a/include/linux/iio/types.h
+++ b/include/linux/iio/types.h
@@ -16,6 +16,7 @@ enum iio_event_info {
 	IIO_EV_INFO_PERIOD,
 	IIO_EV_INFO_HIGH_PASS_FILTER_3DB,
 	IIO_EV_INFO_LOW_PASS_FILTER_3DB,
+	IIO_EV_INFO_TIMEOUT,
 };
 
 #define IIO_VAL_INT 1
-- 
cgit v1.2.3


From 8cc110478cab83d71f217a9aafec034ed63a5f8e Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Thu, 1 Apr 2021 14:17:45 +0200
Subject: dt-bindings:iio:dac: update microchip,mcp4725.yaml reference

Changeset 6ced946a4bba ("dt-bindings:iio:dac:microchip,mcp4725 yaml conversion")
renamed: Documentation/devicetree/bindings/iio/dac/mcp4725.txt
to: Documentation/devicetree/bindings/iio/dac/microchip,mcp4725.yaml.

Update its cross-reference accordingly.

Fixes: 6ced946a4bba ("dt-bindings:iio:dac:microchip,mcp4725 yaml conversion")
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Link: https://lore.kernel.org/r/82fb54974e8a22be15e64343260a6de39a18edda.1617279356.git.mchehab+huawei@kernel.org
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/iio/dac/mcp4725.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/iio/dac/mcp4725.h b/include/linux/iio/dac/mcp4725.h
index e9801c8d49c0..1f7e53c506b6 100644
--- a/include/linux/iio/dac/mcp4725.h
+++ b/include/linux/iio/dac/mcp4725.h
@@ -15,7 +15,7 @@
  * @vref_buffered: Controls buffering of the external reference voltage.
  *
  * Vref related settings are available only on MCP4756. See
- * Documentation/devicetree/bindings/iio/dac/mcp4725.txt for more information.
+ * Documentation/devicetree/bindings/iio/dac/microchip,mcp4725.yaml for more information.
  */
 struct mcp4725_platform_data {
 	bool use_vref;
-- 
cgit v1.2.3


From 13b89768275d6ca9764bf91449e4cafe46ba706b Mon Sep 17 00:00:00 2001
From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Date: Fri, 2 Apr 2021 20:31:27 +0530
Subject: mtd: rawnand: Add support for secure regions in NAND memory

On a typical end product, a vendor may choose to secure some regions in
the NAND memory which are supposed to stay intact between FW upgrades.
The access to those regions will be blocked by a secure element like
Trustzone. So the normal world software like Linux kernel should not
touch these regions (including reading).

The regions are declared using a NAND chip DT property,
"secure-regions". So let's make use of this property in the raw NAND
core and skip access to the secure regions present in a system.

Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20210402150128.29128-4-manivannan.sadhasivam@linaro.org
---
 drivers/mtd/nand/raw/nand_base.c | 100 ++++++++++++++++++++++++++++++++++++++-
 include/linux/mtd/rawnand.h      |  14 ++++++
 2 files changed, 113 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c
index a984cda86e2d..fb072c444495 100644
--- a/drivers/mtd/nand/raw/nand_base.c
+++ b/drivers/mtd/nand/raw/nand_base.c
@@ -278,11 +278,48 @@ static int nand_block_bad(struct nand_chip *chip, loff_t ofs)
 	return 0;
 }
 
+/**
+ * nand_region_is_secured() - Check if the region is secured
+ * @chip: NAND chip object
+ * @offset: Offset of the region to check
+ * @size: Size of the region to check
+ *
+ * Checks if the region is secured by comparing the offset and size with the
+ * list of secure regions obtained from DT. Returns true if the region is
+ * secured else false.
+ */
+static bool nand_region_is_secured(struct nand_chip *chip, loff_t offset, u64 size)
+{
+	int i;
+
+	/* Skip touching the secure regions if present */
+	for (i = 0; i < chip->nr_secure_regions; i++) {
+		const struct nand_secure_region *region = &chip->secure_regions[i];
+
+		if (offset + size <= region->offset ||
+		    offset >= region->offset + region->size)
+			continue;
+
+		pr_debug("%s: Region 0x%llx - 0x%llx is secured!",
+			 __func__, offset, offset + size);
+
+		return true;
+	}
+
+	return false;
+}
+
 static int nand_isbad_bbm(struct nand_chip *chip, loff_t ofs)
 {
+	struct mtd_info *mtd = nand_to_mtd(chip);
+
 	if (chip->options & NAND_NO_BBM_QUIRK)
 		return 0;
 
+	/* Check if the region is secured */
+	if (nand_region_is_secured(chip, ofs, mtd->erasesize))
+		return -EIO;
+
 	if (chip->legacy.block_bad)
 		return chip->legacy.block_bad(chip, ofs);
 
@@ -397,6 +434,10 @@ static int nand_do_write_oob(struct nand_chip *chip, loff_t to,
 		return -EINVAL;
 	}
 
+	/* Check if the region is secured */
+	if (nand_region_is_secured(chip, to, ops->ooblen))
+		return -EIO;
+
 	chipnr = (int)(to >> chip->chip_shift);
 
 	/*
@@ -3128,6 +3169,10 @@ static int nand_do_read_ops(struct nand_chip *chip, loff_t from,
 	int retry_mode = 0;
 	bool ecc_fail = false;
 
+	/* Check if the region is secured */
+	if (nand_region_is_secured(chip, from, readlen))
+		return -EIO;
+
 	chipnr = (int)(from >> chip->chip_shift);
 	nand_select_target(chip, chipnr);
 
@@ -3459,6 +3504,10 @@ static int nand_do_read_oob(struct nand_chip *chip, loff_t from,
 	pr_debug("%s: from = 0x%08Lx, len = %i\n",
 			__func__, (unsigned long long)from, readlen);
 
+	/* Check if the region is secured */
+	if (nand_region_is_secured(chip, from, readlen))
+		return -EIO;
+
 	stats = mtd->ecc_stats;
 
 	len = mtd_oobavail(mtd, ops);
@@ -3980,6 +4029,10 @@ static int nand_do_write_ops(struct nand_chip *chip, loff_t to,
 		return -EINVAL;
 	}
 
+	/* Check if the region is secured */
+	if (nand_region_is_secured(chip, to, writelen))
+		return -EIO;
+
 	column = to & (mtd->writesize - 1);
 
 	chipnr = (int)(to >> chip->chip_shift);
@@ -4181,6 +4234,10 @@ int nand_erase_nand(struct nand_chip *chip, struct erase_info *instr,
 	if (check_offs_len(chip, instr->addr, instr->len))
 		return -EINVAL;
 
+	/* Check if the region is secured */
+	if (nand_region_is_secured(chip, instr->addr, instr->len))
+		return -EIO;
+
 	/* Grab the lock and see if the device is available */
 	ret = nand_get_device(chip);
 	if (ret)
@@ -4996,6 +5053,31 @@ static bool of_get_nand_on_flash_bbt(struct device_node *np)
 	return of_property_read_bool(np, "nand-on-flash-bbt");
 }
 
+static int of_get_nand_secure_regions(struct nand_chip *chip)
+{
+	struct device_node *dn = nand_get_flash_node(chip);
+	int nr_elem, i, j;
+
+	nr_elem = of_property_count_elems_of_size(dn, "secure-regions", sizeof(u64));
+	if (!nr_elem)
+		return 0;
+
+	chip->nr_secure_regions = nr_elem / 2;
+	chip->secure_regions = kcalloc(chip->nr_secure_regions, sizeof(*chip->secure_regions),
+				       GFP_KERNEL);
+	if (!chip->secure_regions)
+		return -ENOMEM;
+
+	for (i = 0, j = 0; i < chip->nr_secure_regions; i++, j += 2) {
+		of_property_read_u64_index(dn, "secure-regions", j,
+					   &chip->secure_regions[i].offset);
+		of_property_read_u64_index(dn, "secure-regions", j + 1,
+					   &chip->secure_regions[i].size);
+	}
+
+	return 0;
+}
+
 static int rawnand_dt_init(struct nand_chip *chip)
 {
 	struct nand_device *nand = mtd_to_nanddev(nand_to_mtd(chip));
@@ -5952,6 +6034,16 @@ static int nand_scan_tail(struct nand_chip *chip)
 			goto err_free_interface_config;
 	}
 
+	/*
+	 * Look for secure regions in the NAND chip. These regions are supposed
+	 * to be protected by a secure element like Trustzone. So the read/write
+	 * accesses to these regions will be blocked in the runtime by this
+	 * driver.
+	 */
+	ret = of_get_nand_secure_regions(chip);
+	if (ret)
+		goto err_free_interface_config;
+
 	/* Check, if we should skip the bad block table scan */
 	if (chip->options & NAND_SKIP_BBTSCAN)
 		return 0;
@@ -5959,10 +6051,13 @@ static int nand_scan_tail(struct nand_chip *chip)
 	/* Build bad block table */
 	ret = nand_create_bbt(chip);
 	if (ret)
-		goto err_free_interface_config;
+		goto err_free_secure_regions;
 
 	return 0;
 
+err_free_secure_regions:
+	kfree(chip->secure_regions);
+
 err_free_interface_config:
 	kfree(chip->best_interface_config);
 
@@ -6050,6 +6145,9 @@ void nand_cleanup(struct nand_chip *chip)
 
 	nanddev_cleanup(&chip->base);
 
+	/* Free secure regions data */
+	kfree(chip->secure_regions);
+
 	/* Free bad block table memory */
 	kfree(chip->bbt);
 	kfree(chip->data_buf);
diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index 93e8f72beba6..29df2f43dcb5 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -1035,6 +1035,16 @@ struct nand_manufacturer {
 	void *priv;
 };
 
+/**
+ * struct nand_secure_region - NAND secure region structure
+ * @offset: Offset of the start of the secure region
+ * @size: Size of the secure region
+ */
+struct nand_secure_region {
+	u64 offset;
+	u64 size;
+};
+
 /**
  * struct nand_chip - NAND Private Flash Chip Data
  * @base: Inherit from the generic NAND device
@@ -1085,6 +1095,8 @@ struct nand_manufacturer {
  *          NAND Controller drivers should not modify this value, but they're
  *          allowed to read it.
  * @read_retries: The number of read retry modes supported
+ * @secure_regions: Structure containing the secure regions info
+ * @nr_secure_regions: Number of secure regions
  * @controller: The hardware controller	structure which is shared among multiple
  *              independent devices
  * @ecc: The ECC controller structure
@@ -1134,6 +1146,8 @@ struct nand_chip {
 	unsigned int suspended : 1;
 	int cur_cs;
 	int read_retries;
+	struct nand_secure_region *secure_regions;
+	u8 nr_secure_regions;
 
 	/* Externals */
 	struct nand_controller *controller;
-- 
cgit v1.2.3


From 2e1a44c1c4acf209c0dd7bc04421d101b9e80d11 Mon Sep 17 00:00:00 2001
From: Lu Baolu <baolu.lu@linux.intel.com>
Date: Tue, 23 Mar 2021 09:05:57 +0800
Subject: iommu/vt-d: Remove svm_dev_ops

The svm_dev_ops has never been referenced in the tree, and there's no
plan to have anything to use it. Remove it to make the code neat.

Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20210323010600.678627-3-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/intel/svm.c   | 15 +--------------
 include/linux/intel-iommu.h |  3 ---
 include/linux/intel-svm.h   |  7 -------
 3 files changed, 1 insertion(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index 46776db47d16..933fe11797a1 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -462,7 +462,6 @@ static void load_pasid(struct mm_struct *mm, u32 pasid)
 /* Caller must hold pasid_mutex, mm reference */
 static int
 intel_svm_bind_mm(struct device *dev, unsigned int flags,
-		  struct svm_dev_ops *ops,
 		  struct mm_struct *mm, struct intel_svm_dev **sd)
 {
 	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
@@ -512,10 +511,6 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags,
 
 			/* Find the matching device in svm list */
 			for_each_svm_dev(sdev, svm, dev) {
-				if (sdev->ops != ops) {
-					ret = -EBUSY;
-					goto out;
-				}
 				sdev->users++;
 				goto success;
 			}
@@ -550,7 +545,6 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags,
 
 	/* Finish the setup now we know we're keeping it */
 	sdev->users = 1;
-	sdev->ops = ops;
 	init_rcu_head(&sdev->rcu);
 
 	if (!svm) {
@@ -1006,13 +1000,6 @@ invalid:
 		mmap_read_unlock(svm->mm);
 		mmput(svm->mm);
 bad_req:
-		WARN_ON(!sdev);
-		if (sdev && sdev->ops && sdev->ops->fault_cb) {
-			int rwxp = (req->rd_req << 3) | (req->wr_req << 2) |
-				(req->exe_req << 1) | (req->pm_req);
-			sdev->ops->fault_cb(sdev->dev, req->pasid, req->addr,
-					    req->priv_data, rwxp, result);
-		}
 		/* We get here in the error case where the PASID lookup failed,
 		   and these can be NULL. Do not use them below this point! */
 		sdev = NULL;
@@ -1087,7 +1074,7 @@ intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata)
 	if (drvdata)
 		flags = *(unsigned int *)drvdata;
 	mutex_lock(&pasid_mutex);
-	ret = intel_svm_bind_mm(dev, flags, NULL, mm, &sdev);
+	ret = intel_svm_bind_mm(dev, flags, mm, &sdev);
 	if (ret)
 		sva = ERR_PTR(ret);
 	else if (sdev)
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 1732298ce888..e0f8c2ade3e8 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -769,14 +769,11 @@ u32 intel_svm_get_pasid(struct iommu_sva *handle);
 int intel_svm_page_response(struct device *dev, struct iommu_fault_event *evt,
 			    struct iommu_page_response *msg);
 
-struct svm_dev_ops;
-
 struct intel_svm_dev {
 	struct list_head list;
 	struct rcu_head rcu;
 	struct device *dev;
 	struct intel_iommu *iommu;
-	struct svm_dev_ops *ops;
 	struct iommu_sva sva;
 	u32 pasid;
 	int users;
diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h
index 39d368a810b8..6c9d10c0fb1e 100644
--- a/include/linux/intel-svm.h
+++ b/include/linux/intel-svm.h
@@ -8,13 +8,6 @@
 #ifndef __INTEL_SVM_H__
 #define __INTEL_SVM_H__
 
-struct device;
-
-struct svm_dev_ops {
-	void (*fault_cb)(struct device *dev, u32 pasid, u64 address,
-			 void *private, int rwxp, int response);
-};
-
 /* Values for rxwp in fault_cb callback */
 #define SVM_REQ_READ	(1<<3)
 #define SVM_REQ_WRITE	(1<<2)
-- 
cgit v1.2.3


From 06905ea8319731036695cf1a4c53c12b0f9373cb Mon Sep 17 00:00:00 2001
From: Lu Baolu <baolu.lu@linux.intel.com>
Date: Tue, 23 Mar 2021 09:05:58 +0800
Subject: iommu/vt-d: Remove SVM_FLAG_PRIVATE_PASID

The SVM_FLAG_PRIVATE_PASID has never been referenced in the tree, and
there's no plan to have anything to use it. So clean it up.

Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20210323010600.678627-4-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/intel/svm.c | 40 ++++++++++++++++++----------------------
 include/linux/intel-svm.h | 16 +++-------------
 2 files changed, 21 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index 933fe11797a1..f77184a9defb 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -465,9 +465,9 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags,
 		  struct mm_struct *mm, struct intel_svm_dev **sd)
 {
 	struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
+	struct intel_svm *svm = NULL, *t;
 	struct device_domain_info *info;
 	struct intel_svm_dev *sdev;
-	struct intel_svm *svm = NULL;
 	unsigned long iflags;
 	int pasid_max;
 	int ret;
@@ -493,30 +493,26 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags,
 		}
 	}
 
-	if (!(flags & SVM_FLAG_PRIVATE_PASID)) {
-		struct intel_svm *t;
-
-		list_for_each_entry(t, &global_svm_list, list) {
-			if (t->mm != mm || (t->flags & SVM_FLAG_PRIVATE_PASID))
-				continue;
-
-			svm = t;
-			if (svm->pasid >= pasid_max) {
-				dev_warn(dev,
-					 "Limited PASID width. Cannot use existing PASID %d\n",
-					 svm->pasid);
-				ret = -ENOSPC;
-				goto out;
-			}
+	list_for_each_entry(t, &global_svm_list, list) {
+		if (t->mm != mm)
+			continue;
 
-			/* Find the matching device in svm list */
-			for_each_svm_dev(sdev, svm, dev) {
-				sdev->users++;
-				goto success;
-			}
+		svm = t;
+		if (svm->pasid >= pasid_max) {
+			dev_warn(dev,
+				 "Limited PASID width. Cannot use existing PASID %d\n",
+				 svm->pasid);
+			ret = -ENOSPC;
+			goto out;
+		}
 
-			break;
+		/* Find the matching device in svm list */
+		for_each_svm_dev(sdev, svm, dev) {
+			sdev->users++;
+			goto success;
 		}
+
+		break;
 	}
 
 	sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h
index 6c9d10c0fb1e..10fa80eef13a 100644
--- a/include/linux/intel-svm.h
+++ b/include/linux/intel-svm.h
@@ -14,16 +14,6 @@
 #define SVM_REQ_EXEC	(1<<1)
 #define SVM_REQ_PRIV	(1<<0)
 
-/*
- * The SVM_FLAG_PRIVATE_PASID flag requests a PASID which is *not* the "main"
- * PASID for the current process. Even if a PASID already exists, a new one
- * will be allocated. And the PASID allocated with SVM_FLAG_PRIVATE_PASID
- * will not be given to subsequent callers. This facility allows a driver to
- * disambiguate between multiple device contexts which access the same MM,
- * if there is no other way to do so. It should be used sparingly, if at all.
- */
-#define SVM_FLAG_PRIVATE_PASID		(1<<0)
-
 /*
  * The SVM_FLAG_SUPERVISOR_MODE flag requests a PASID which can be used only
  * for access to kernel addresses. No IOTLB flushes are automatically done
@@ -35,18 +25,18 @@
  * It is unlikely that we will ever hook into flush_tlb_kernel_range() to
  * do such IOTLB flushes automatically.
  */
-#define SVM_FLAG_SUPERVISOR_MODE	(1<<1)
+#define SVM_FLAG_SUPERVISOR_MODE	BIT(0)
 /*
  * The SVM_FLAG_GUEST_MODE flag is used when a PASID bind is for guest
  * processes. Compared to the host bind, the primary differences are:
  * 1. mm life cycle management
  * 2. fault reporting
  */
-#define SVM_FLAG_GUEST_MODE		(1<<2)
+#define SVM_FLAG_GUEST_MODE		BIT(1)
 /*
  * The SVM_FLAG_GUEST_PASID flag is used when a guest has its own PASID space,
  * which requires guest and host PASID translation at both directions.
  */
-#define SVM_FLAG_GUEST_PASID		(1<<3)
+#define SVM_FLAG_GUEST_PASID		BIT(2)
 
 #endif /* __INTEL_SVM_H__ */
-- 
cgit v1.2.3


From 3431c3f660a39f6ced954548a59dba6541ce3eb1 Mon Sep 17 00:00:00 2001
From: Xiang Chen <chenxiang66@hisilicon.com>
Date: Thu, 25 Mar 2021 11:38:24 +0800
Subject: iommu: Fix a boundary issue to avoid performance drop

After the change of patch ("iommu: Switch gather->end to the
inclusive end"), the performance drops from 1600+K IOPS to 1200K on our
Kunpeng ARM64 platform.
We find that the range [start1, end1) is actually contiguous with the
range [end1, end2), but it is considered disjoint after the change,
so more TLB syncs are needed and more time is spent on them.
So fix the boundary issue to avoid the performance drop.

Fixes: 862c3715de8f ("iommu: Switch gather->end to the inclusive end")
Signed-off-by: Xiang Chen <chenxiang66@hisilicon.com>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/1616643504-120688-1-git-send-email-chenxiang66@hisilicon.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/iommu.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 5e7fe519430a..9ca6e6b8084d 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -547,7 +547,7 @@ static inline void iommu_iotlb_gather_add_page(struct iommu_domain *domain,
 	 * structure can be rewritten.
 	 */
 	if (gather->pgsize != size ||
-	    end < gather->start || start > gather->end) {
+	    end + 1 < gather->start || start > gather->end + 1) {
 		if (gather->pgsize)
 			iommu_iotlb_sync(domain, gather);
 		gather->pgsize = size;
-- 
cgit v1.2.3


From f598a497bc7dfbec60270bca8b8408db3d23ac07 Mon Sep 17 00:00:00 2001
From: John Garry <john.garry@huawei.com>
Date: Thu, 25 Mar 2021 20:29:58 +0800
Subject: iova: Add CPU hotplug handler to flush rcaches

Like the Intel IOMMU driver already does, flush the per-IOVA domain
CPU rcache when a CPU goes offline - there's no point in keeping it.

Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: John Garry <john.garry@huawei.com>
Link: https://lore.kernel.org/r/1616675401-151997-2-git-send-email-john.garry@huawei.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iova.c       | 30 +++++++++++++++++++++++++++++-
 include/linux/cpuhotplug.h |  1 +
 include/linux/iova.h       |  1 +
 3 files changed, 31 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 471c48dd71e7..8542059ddef8 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -25,6 +25,17 @@ static void init_iova_rcaches(struct iova_domain *iovad);
 static void free_iova_rcaches(struct iova_domain *iovad);
 static void fq_destroy_all_entries(struct iova_domain *iovad);
 static void fq_flush_timeout(struct timer_list *t);
+
+static int iova_cpuhp_dead(unsigned int cpu, struct hlist_node *node)
+{
+	struct iova_domain *iovad;
+
+	iovad = hlist_entry_safe(node, struct iova_domain, cpuhp_dead);
+
+	free_cpu_cached_iovas(cpu, iovad);
+	return 0;
+}
+
 static void free_global_cached_iovas(struct iova_domain *iovad);
 
 static struct iova *to_iova(struct rb_node *node)
@@ -56,6 +67,7 @@ init_iova_domain(struct iova_domain *iovad, unsigned long granule,
 	iovad->anchor.pfn_lo = iovad->anchor.pfn_hi = IOVA_ANCHOR;
 	rb_link_node(&iovad->anchor.node, NULL, &iovad->rbroot.rb_node);
 	rb_insert_color(&iovad->anchor.node, &iovad->rbroot);
+	cpuhp_state_add_instance_nocalls(CPUHP_IOMMU_IOVA_DEAD, &iovad->cpuhp_dead);
 	init_iova_rcaches(iovad);
 }
 EXPORT_SYMBOL_GPL(init_iova_domain);
@@ -299,10 +311,21 @@ int iova_cache_get(void)
 {
 	mutex_lock(&iova_cache_mutex);
 	if (!iova_cache_users) {
+		int ret;
+
+		ret = cpuhp_setup_state_multi(CPUHP_IOMMU_IOVA_DEAD, "iommu/iova:dead", NULL,
+					iova_cpuhp_dead);
+		if (ret) {
+			mutex_unlock(&iova_cache_mutex);
+			pr_err("Couldn't register cpuhp handler\n");
+			return ret;
+		}
+
 		iova_cache = kmem_cache_create(
 			"iommu_iova", sizeof(struct iova), 0,
 			SLAB_HWCACHE_ALIGN, NULL);
 		if (!iova_cache) {
+			cpuhp_remove_multi_state(CPUHP_IOMMU_IOVA_DEAD);
 			mutex_unlock(&iova_cache_mutex);
 			pr_err("Couldn't create iova cache\n");
 			return -ENOMEM;
@@ -324,8 +347,10 @@ void iova_cache_put(void)
 		return;
 	}
 	iova_cache_users--;
-	if (!iova_cache_users)
+	if (!iova_cache_users) {
+		cpuhp_remove_multi_state(CPUHP_IOMMU_IOVA_DEAD);
 		kmem_cache_destroy(iova_cache);
+	}
 	mutex_unlock(&iova_cache_mutex);
 }
 EXPORT_SYMBOL_GPL(iova_cache_put);
@@ -648,6 +673,9 @@ void put_iova_domain(struct iova_domain *iovad)
 {
 	struct iova *iova, *tmp;
 
+	cpuhp_state_remove_instance_nocalls(CPUHP_IOMMU_IOVA_DEAD,
+					    &iovad->cpuhp_dead);
+
 	free_iova_flush_queue(iovad);
 	free_iova_rcaches(iovad);
 	rbtree_postorder_for_each_entry_safe(iova, tmp, &iovad->rbroot, node)
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index f14adb882338..cedac9986557 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -58,6 +58,7 @@ enum cpuhp_state {
 	CPUHP_NET_DEV_DEAD,
 	CPUHP_PCI_XGENE_DEAD,
 	CPUHP_IOMMU_INTEL_DEAD,
+	CPUHP_IOMMU_IOVA_DEAD,
 	CPUHP_LUSTRE_CFS_DEAD,
 	CPUHP_AP_ARM_CACHE_B15_RAC_DEAD,
 	CPUHP_PADATA_DEAD,
diff --git a/include/linux/iova.h b/include/linux/iova.h
index c834c01c0a5b..4be6c0ab4997 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -95,6 +95,7 @@ struct iova_domain {
 						   flush-queues */
 	atomic_t fq_timer_on;			/* 1 when timer is active, 0
 						   when not */
+	struct hlist_node	cpuhp_dead;
 };
 
 static inline unsigned long iova_size(struct iova *iova)
-- 
cgit v1.2.3


From 363f266eeff6e22a09483dc922dccd7cd0b9fe9c Mon Sep 17 00:00:00 2001
From: John Garry <john.garry@huawei.com>
Date: Thu, 25 Mar 2021 20:29:59 +0800
Subject: iommu/vt-d: Remove IOVA domain rcache flushing for CPU offlining

Now that the core code handles flushing per-IOVA domain CPU rcaches,
remove the handling here.

Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: John Garry <john.garry@huawei.com>
Link: https://lore.kernel.org/r/1616675401-151997-3-git-send-email-john.garry@huawei.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/intel/iommu.c | 31 -------------------------------
 include/linux/cpuhotplug.h  |  1 -
 2 files changed, 32 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 1c32522220bc..be5ea16ca26f 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -4064,35 +4064,6 @@ static struct notifier_block intel_iommu_memory_nb = {
 	.priority = 0
 };
 
-static void free_all_cpu_cached_iovas(unsigned int cpu)
-{
-	int i;
-
-	for (i = 0; i < g_num_of_iommus; i++) {
-		struct intel_iommu *iommu = g_iommus[i];
-		struct dmar_domain *domain;
-		int did;
-
-		if (!iommu)
-			continue;
-
-		for (did = 0; did < cap_ndoms(iommu->cap); did++) {
-			domain = get_iommu_domain(iommu, (u16)did);
-
-			if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
-				continue;
-
-			iommu_dma_free_cpu_cached_iovas(cpu, &domain->domain);
-		}
-	}
-}
-
-static int intel_iommu_cpu_dead(unsigned int cpu)
-{
-	free_all_cpu_cached_iovas(cpu);
-	return 0;
-}
-
 static void intel_disable_iommus(void)
 {
 	struct intel_iommu *iommu = NULL;
@@ -4387,8 +4358,6 @@ int __init intel_iommu_init(void)
 	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
 	if (si_domain && !hw_pass_through)
 		register_memory_notifier(&intel_iommu_memory_nb);
-	cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
-			  intel_iommu_cpu_dead);
 
 	down_read(&dmar_global_lock);
 	if (probe_acpi_namespace_devices())
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index cedac9986557..85996494bec1 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -57,7 +57,6 @@ enum cpuhp_state {
 	CPUHP_PAGE_ALLOC_DEAD,
 	CPUHP_NET_DEV_DEAD,
 	CPUHP_PCI_XGENE_DEAD,
-	CPUHP_IOMMU_INTEL_DEAD,
 	CPUHP_IOMMU_IOVA_DEAD,
 	CPUHP_LUSTRE_CFS_DEAD,
 	CPUHP_AP_ARM_CACHE_B15_RAC_DEAD,
-- 
cgit v1.2.3


From 149448b353e2517ecc6eced7d9f46e9f3e08b89e Mon Sep 17 00:00:00 2001
From: John Garry <john.garry@huawei.com>
Date: Thu, 25 Mar 2021 20:30:00 +0800
Subject: iommu: Delete iommu_dma_free_cpu_cached_iovas()

Function iommu_dma_free_cpu_cached_iovas() no longer has any caller, so
delete it.

With that, function free_cpu_cached_iovas() may be made static.

Signed-off-by: John Garry <john.garry@huawei.com>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/1616675401-151997-4-git-send-email-john.garry@huawei.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/dma-iommu.c | 9 ---------
 drivers/iommu/iova.c      | 3 ++-
 include/linux/dma-iommu.h | 5 -----
 include/linux/iova.h      | 5 -----
 4 files changed, 2 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index fdd095e1fa52..0af4048f9b3f 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -64,15 +64,6 @@ static int __init iommu_dma_forcedac_setup(char *str)
 }
 early_param("iommu.forcedac", iommu_dma_forcedac_setup);
 
-void iommu_dma_free_cpu_cached_iovas(unsigned int cpu,
-		struct iommu_domain *domain)
-{
-	struct iommu_dma_cookie *cookie = domain->iova_cookie;
-	struct iova_domain *iovad = &cookie->iovad;
-
-	free_cpu_cached_iovas(cpu, iovad);
-}
-
 static void iommu_dma_entry_dtor(unsigned long data)
 {
 	struct page *freelist = (struct page *)data;
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 8542059ddef8..c9726e1b9e11 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -22,6 +22,7 @@ static unsigned long iova_rcache_get(struct iova_domain *iovad,
 				     unsigned long size,
 				     unsigned long limit_pfn);
 static void init_iova_rcaches(struct iova_domain *iovad);
+static void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad);
 static void free_iova_rcaches(struct iova_domain *iovad);
 static void fq_destroy_all_entries(struct iova_domain *iovad);
 static void fq_flush_timeout(struct timer_list *t);
@@ -1040,7 +1041,7 @@ static void free_iova_rcaches(struct iova_domain *iovad)
 /*
  * free all the IOVA ranges cached by a cpu (used when cpu is unplugged)
  */
-void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad)
+static void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad)
 {
 	struct iova_cpu_rcache *cpu_rcache;
 	struct iova_rcache *rcache;
diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h
index 13d1f4c14d7b..6e75a2d689b4 100644
--- a/include/linux/dma-iommu.h
+++ b/include/linux/dma-iommu.h
@@ -83,10 +83,5 @@ static inline void iommu_dma_get_resv_regions(struct device *dev, struct list_he
 {
 }
 
-static inline void iommu_dma_free_cpu_cached_iovas(unsigned int cpu,
-		struct iommu_domain *domain)
-{
-}
-
 #endif	/* CONFIG_IOMMU_DMA */
 #endif	/* __DMA_IOMMU_H */
diff --git a/include/linux/iova.h b/include/linux/iova.h
index 4be6c0ab4997..71d8a2de6635 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -157,7 +157,6 @@ int init_iova_flush_queue(struct iova_domain *iovad,
 			  iova_flush_cb flush_cb, iova_entry_dtor entry_dtor);
 struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn);
 void put_iova_domain(struct iova_domain *iovad);
-void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad);
 #else
 static inline int iova_cache_get(void)
 {
@@ -234,10 +233,6 @@ static inline void put_iova_domain(struct iova_domain *iovad)
 {
 }
 
-static inline void free_cpu_cached_iovas(unsigned int cpu,
-					 struct iova_domain *iovad)
-{
-}
 #endif
 
 #endif
-- 
cgit v1.2.3


From 0d35309ab5e080095190965aa7cfc3ca8fb88af9 Mon Sep 17 00:00:00 2001
From: Jean-Philippe Brucker <jean-philippe@linaro.org>
Date: Thu, 1 Apr 2021 17:47:10 +0200
Subject: iommu: Fix comment for struct iommu_fwspec

Commit 986d5ecc5699 ("iommu: Move fwspec->iommu_priv to struct
dev_iommu") removed iommu_priv from fwspec and commit 5702ee24182f
("ACPI/IORT: Check ATS capability in root complex nodes") added @flags.
Update the struct doc.

Acked-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Acked-by: Will Deacon <will@kernel.org>
Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Link: https://lore.kernel.org/r/20210401154718.307519-2-jean-philippe@linaro.org
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/iommu.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 5e7fe519430a..1d422bf722a1 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -571,7 +571,7 @@ struct iommu_group *fsl_mc_device_group(struct device *dev);
  * struct iommu_fwspec - per-device IOMMU instance data
  * @ops: ops for this device's IOMMU
  * @iommu_fwnode: firmware handle for this device's IOMMU
- * @iommu_priv: IOMMU driver private data for this device
+ * @flags: IOMMU_FWSPEC_* flags
  * @num_pasid_bits: number of PASID bits supported by this device
  * @num_ids: number of associated device IDs
  * @ids: IDs which this device may present to the IOMMU
-- 
cgit v1.2.3


From 434b73e61cc65cdd26618af6fa4736c2ba1eb29b Mon Sep 17 00:00:00 2001
From: Jean-Philippe Brucker <jean-philippe@linaro.org>
Date: Thu, 1 Apr 2021 17:47:11 +0200
Subject: iommu/arm-smmu-v3: Use device properties for pasid-num-bits

The pasid-num-bits property shouldn't need a dedicated fwspec field,
it's a job for device properties. Add properties for IORT, and access
the number of PASID bits using device_property_read_u32().

Suggested-by: Robin Murphy <robin.murphy@arm.com>
Acked-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Acked-by: Will Deacon <will@kernel.org>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Acked-by: Hanjun Guo <guohanjun@huawei.com>
Link: https://lore.kernel.org/r/20210401154718.307519-3-jean-philippe@linaro.org
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/acpi/arm64/iort.c                   | 13 +++++++------
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c |  3 ++-
 drivers/iommu/of_iommu.c                    |  5 -----
 include/linux/iommu.h                       |  2 --
 4 files changed, 9 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index 2494138a6905..3912a1f6058e 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -968,15 +968,16 @@ static int iort_pci_iommu_init(struct pci_dev *pdev, u16 alias, void *data)
 static void iort_named_component_init(struct device *dev,
 				      struct acpi_iort_node *node)
 {
+	struct property_entry props[2] = {};
 	struct acpi_iort_named_component *nc;
-	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
-
-	if (!fwspec)
-		return;
 
 	nc = (struct acpi_iort_named_component *)node->node_data;
-	fwspec->num_pasid_bits = FIELD_GET(ACPI_IORT_NC_PASID_BITS,
-					   nc->node_flags);
+	props[0] = PROPERTY_ENTRY_U32("pasid-num-bits",
+				      FIELD_GET(ACPI_IORT_NC_PASID_BITS,
+						nc->node_flags));
+
+	if (device_add_properties(dev, props))
+		dev_warn(dev, "Could not add device properties\n");
 }
 
 static int iort_nc_iommu_map(struct device *dev, struct acpi_iort_node *node)
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 8594b4a83043..7edce914c45e 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -2392,7 +2392,8 @@ static struct iommu_device *arm_smmu_probe_device(struct device *dev)
 		}
 	}
 
-	master->ssid_bits = min(smmu->ssid_bits, fwspec->num_pasid_bits);
+	device_property_read_u32(dev, "pasid-num-bits", &master->ssid_bits);
+	master->ssid_bits = min(smmu->ssid_bits, master->ssid_bits);
 
 	/*
 	 * Note that PASID must be enabled before, and disabled after ATS:
diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c
index e505b9130a1c..a9d2df001149 100644
--- a/drivers/iommu/of_iommu.c
+++ b/drivers/iommu/of_iommu.c
@@ -210,11 +210,6 @@ const struct iommu_ops *of_iommu_configure(struct device *dev,
 					     of_pci_iommu_init, &info);
 	} else {
 		err = of_iommu_configure_device(master_np, dev, id);
-
-		fwspec = dev_iommu_fwspec_get(dev);
-		if (!err && fwspec)
-			of_property_read_u32(master_np, "pasid-num-bits",
-					     &fwspec->num_pasid_bits);
 	}
 
 	/*
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 1d422bf722a1..16ce75693d83 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -572,7 +572,6 @@ struct iommu_group *fsl_mc_device_group(struct device *dev);
  * @ops: ops for this device's IOMMU
  * @iommu_fwnode: firmware handle for this device's IOMMU
  * @flags: IOMMU_FWSPEC_* flags
- * @num_pasid_bits: number of PASID bits supported by this device
  * @num_ids: number of associated device IDs
  * @ids: IDs which this device may present to the IOMMU
  */
@@ -580,7 +579,6 @@ struct iommu_fwspec {
 	const struct iommu_ops	*ops;
 	struct fwnode_handle	*iommu_fwnode;
 	u32			flags;
-	u32			num_pasid_bits;
 	unsigned int		num_ids;
 	u32			ids[];
 };
-- 
cgit v1.2.3


From 34b48c704d194738eef0893aa06e412bdc8a972f Mon Sep 17 00:00:00 2001
From: Jean-Philippe Brucker <jean-philippe@linaro.org>
Date: Thu, 1 Apr 2021 17:47:12 +0200
Subject: iommu: Separate IOMMU_DEV_FEAT_IOPF from IOMMU_DEV_FEAT_SVA

Some devices manage I/O Page Faults (IOPF) themselves instead of relying
on PCIe PRI or Arm SMMU stall. Allow their drivers to enable SVA without
mandating IOMMU-managed IOPF. The other device drivers now need to first
enable IOMMU_DEV_FEAT_IOPF before enabling IOMMU_DEV_FEAT_SVA. Enabling
IOMMU_DEV_FEAT_IOPF on its own doesn't have any effect visible to the
device driver; it is used in combination with other features.

Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Link: https://lore.kernel.org/r/20210401154718.307519-4-jean-philippe@linaro.org
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 include/linux/iommu.h | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 16ce75693d83..45c4eb372f56 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -156,10 +156,24 @@ struct iommu_resv_region {
 	enum iommu_resv_type	type;
 };
 
-/* Per device IOMMU features */
+/**
+ * enum iommu_dev_features - Per device IOMMU features
+ * @IOMMU_DEV_FEAT_AUX: Auxiliary domain feature
+ * @IOMMU_DEV_FEAT_SVA: Shared Virtual Addresses
+ * @IOMMU_DEV_FEAT_IOPF: I/O Page Faults such as PRI or Stall. Generally
+ *			 enabling %IOMMU_DEV_FEAT_SVA requires
+ *			 %IOMMU_DEV_FEAT_IOPF, but some devices manage I/O Page
+ *			 Faults themselves instead of relying on the IOMMU. When
+ *			 supported, this feature must be enabled before and
+ *			 disabled after %IOMMU_DEV_FEAT_SVA.
+ *
+ * Device drivers query whether a feature is supported using
+ * iommu_dev_has_feature(), and enable it using iommu_dev_enable_feature().
+ */
 enum iommu_dev_features {
-	IOMMU_DEV_FEAT_AUX,	/* Aux-domain feature */
-	IOMMU_DEV_FEAT_SVA,	/* Shared Virtual Addresses */
+	IOMMU_DEV_FEAT_AUX,
+	IOMMU_DEV_FEAT_SVA,
+	IOMMU_DEV_FEAT_IOPF,
 };
 
 #define IOMMU_PASID_INVALID	(-1U)
-- 
cgit v1.2.3


From fc36479db74e957c4696b605a32c4afaa15fa6cb Mon Sep 17 00:00:00 2001
From: Jean-Philippe Brucker <jean-philippe@linaro.org>
Date: Thu, 1 Apr 2021 17:47:15 +0200
Subject: iommu: Add a page fault handler

Some systems allow devices to handle I/O Page Faults in the core mm. For
example systems implementing the PCIe PRI extension or Arm SMMU stall
model. Infrastructure for reporting these recoverable page faults was
added to the IOMMU core by commit 0c830e6b3282 ("iommu: Introduce device
fault report API"). Add a page fault handler for host SVA.

IOMMU drivers can now instantiate several fault workqueues and link them
to IOPF-capable devices. Drivers can choose between a single global
workqueue, one per IOMMU device, one per low-level fault queue, one per
domain, etc.

When it receives a fault event, most commonly in an IRQ handler, the
IOMMU driver reports the fault using iommu_report_device_fault(), which
calls the registered handler. The page fault handler then calls the mm
fault handler, and reports either success or failure with
iommu_page_response(). After the handler succeeds, the hardware retries
the access.

The iopf_param pointer could be embedded into iommu_fault_param. But
putting iopf_param into the iommu_param structure allows us not to care
about ordering between calls to iopf_queue_add_device() and
iommu_register_device_fault_handler().

Tested-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Link: https://lore.kernel.org/r/20210401154718.307519-7-jean-philippe@linaro.org
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/Makefile        |   1 +
 drivers/iommu/io-pgfault.c    | 461 ++++++++++++++++++++++++++++++++++++++++++
 drivers/iommu/iommu-sva-lib.h |  53 +++++
 include/linux/iommu.h         |   2 +
 4 files changed, 517 insertions(+)
 create mode 100644 drivers/iommu/io-pgfault.c

(limited to 'include/linux')

diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 61bd30cd8369..60fafc23dee6 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -28,3 +28,4 @@ obj-$(CONFIG_S390_IOMMU) += s390-iommu.o
 obj-$(CONFIG_HYPERV_IOMMU) += hyperv-iommu.o
 obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o
 obj-$(CONFIG_IOMMU_SVA_LIB) += iommu-sva-lib.o
+obj-$(CONFIG_IOMMU_SVA_LIB) += io-pgfault.o
diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c
new file mode 100644
index 000000000000..1df8c1dcae77
--- /dev/null
+++ b/drivers/iommu/io-pgfault.c
@@ -0,0 +1,461 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Handle device page faults
+ *
+ * Copyright (C) 2020 ARM Ltd.
+ */
+
+#include <linux/iommu.h>
+#include <linux/list.h>
+#include <linux/sched/mm.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+
+#include "iommu-sva-lib.h"
+
+/**
+ * struct iopf_queue - IO Page Fault queue
+ * @wq: the fault workqueue
+ * @devices: devices attached to this queue
+ * @lock: protects the device list
+ */
+struct iopf_queue {
+	struct workqueue_struct		*wq;
+	struct list_head		devices;
+	struct mutex			lock;
+};
+
+/**
+ * struct iopf_device_param - IO Page Fault data attached to a device
+ * @dev: the device that owns this param
+ * @queue: IOPF queue
+ * @queue_list: index into queue->devices
+ * @partial: faults that are part of a Page Request Group for which the last
+ *           request hasn't been submitted yet.
+ */
+struct iopf_device_param {
+	struct device			*dev;
+	struct iopf_queue		*queue;
+	struct list_head		queue_list;
+	struct list_head		partial;
+};
+
+struct iopf_fault {
+	struct iommu_fault		fault;
+	struct list_head		list;
+};
+
+struct iopf_group {
+	struct iopf_fault		last_fault;
+	struct list_head		faults;
+	struct work_struct		work;
+	struct device			*dev;
+};
+
+static int iopf_complete_group(struct device *dev, struct iopf_fault *iopf,
+			       enum iommu_page_response_code status)
+{
+	struct iommu_page_response resp = {
+		.version		= IOMMU_PAGE_RESP_VERSION_1,
+		.pasid			= iopf->fault.prm.pasid,
+		.grpid			= iopf->fault.prm.grpid,
+		.code			= status,
+	};
+
+	if ((iopf->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) &&
+	    (iopf->fault.prm.flags & IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID))
+		resp.flags = IOMMU_PAGE_RESP_PASID_VALID;
+
+	return iommu_page_response(dev, &resp);
+}
+
+static enum iommu_page_response_code
+iopf_handle_single(struct iopf_fault *iopf)
+{
+	vm_fault_t ret;
+	struct mm_struct *mm;
+	struct vm_area_struct *vma;
+	unsigned int access_flags = 0;
+	unsigned int fault_flags = FAULT_FLAG_REMOTE;
+	struct iommu_fault_page_request *prm = &iopf->fault.prm;
+	enum iommu_page_response_code status = IOMMU_PAGE_RESP_INVALID;
+
+	if (!(prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID))
+		return status;
+
+	mm = iommu_sva_find(prm->pasid);
+	if (IS_ERR_OR_NULL(mm))
+		return status;
+
+	mmap_read_lock(mm);
+
+	vma = find_extend_vma(mm, prm->addr);
+	if (!vma)
+		/* Unmapped area */
+		goto out_put_mm;
+
+	if (prm->perm & IOMMU_FAULT_PERM_READ)
+		access_flags |= VM_READ;
+
+	if (prm->perm & IOMMU_FAULT_PERM_WRITE) {
+		access_flags |= VM_WRITE;
+		fault_flags |= FAULT_FLAG_WRITE;
+	}
+
+	if (prm->perm & IOMMU_FAULT_PERM_EXEC) {
+		access_flags |= VM_EXEC;
+		fault_flags |= FAULT_FLAG_INSTRUCTION;
+	}
+
+	if (!(prm->perm & IOMMU_FAULT_PERM_PRIV))
+		fault_flags |= FAULT_FLAG_USER;
+
+	if (access_flags & ~vma->vm_flags)
+		/* Access fault */
+		goto out_put_mm;
+
+	ret = handle_mm_fault(vma, prm->addr, fault_flags, NULL);
+	status = ret & VM_FAULT_ERROR ? IOMMU_PAGE_RESP_INVALID :
+		IOMMU_PAGE_RESP_SUCCESS;
+
+out_put_mm:
+	mmap_read_unlock(mm);
+	mmput(mm);
+
+	return status;
+}
+
+static void iopf_handle_group(struct work_struct *work)
+{
+	struct iopf_group *group;
+	struct iopf_fault *iopf, *next;
+	enum iommu_page_response_code status = IOMMU_PAGE_RESP_SUCCESS;
+
+	group = container_of(work, struct iopf_group, work);
+
+	list_for_each_entry_safe(iopf, next, &group->faults, list) {
+		/*
+		 * For the moment, errors are sticky: don't handle subsequent
+		 * faults in the group if there is an error.
+		 */
+		if (status == IOMMU_PAGE_RESP_SUCCESS)
+			status = iopf_handle_single(iopf);
+
+		if (!(iopf->fault.prm.flags &
+		      IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE))
+			kfree(iopf);
+	}
+
+	iopf_complete_group(group->dev, &group->last_fault, status);
+	kfree(group);
+}
+
+/**
+ * iommu_queue_iopf - IO Page Fault handler
+ * @fault: fault event
+ * @cookie: struct device, passed to iommu_register_device_fault_handler.
+ *
+ * Add a fault to the device workqueue, to be handled by mm.
+ *
+ * This module doesn't handle PCI PASID Stop Markers; IOMMU drivers must discard
+ * them before reporting faults. A PASID Stop Marker (LRW = 0b100) doesn't
+ * expect a response. It may be generated when disabling a PASID (issuing a
+ * PASID stop request) by some PCI devices.
+ *
+ * The PASID stop request is issued by the device driver before unbind(). Once
+ * it completes, no page request is generated for this PASID anymore and
+ * outstanding ones have been pushed to the IOMMU (as per PCIe 4.0r1.0 - 6.20.1
+ * and 10.4.1.2 - Managing PASID TLP Prefix Usage). Some PCI devices will wait
+ * for all outstanding page requests to come back with a response before
+ * completing the PASID stop request. Others do not wait for page responses, and
+ * instead issue this Stop Marker that tells us when the PASID can be
+ * reallocated.
+ *
+ * It is safe to discard the Stop Marker because it is an optimization.
+ * a. Page requests, which are posted requests, have been flushed to the IOMMU
+ *    when the stop request completes.
+ * b. The IOMMU driver flushes all fault queues on unbind() before freeing the
+ *    PASID.
+ *
+ * So even though the Stop Marker might be issued by the device *after* the stop
+ * request completes, outstanding faults will have been dealt with by the time
+ * the PASID is freed.
+ *
+ * Return: 0 on success and <0 on error.
+ */
+int iommu_queue_iopf(struct iommu_fault *fault, void *cookie)
+{
+	int ret;
+	struct iopf_group *group;
+	struct iopf_fault *iopf, *next;
+	struct iopf_device_param *iopf_param;
+
+	struct device *dev = cookie;
+	struct dev_iommu *param = dev->iommu;
+
+	lockdep_assert_held(&param->lock);
+
+	if (fault->type != IOMMU_FAULT_PAGE_REQ)
+		/* Not a recoverable page fault */
+		return -EOPNOTSUPP;
+
+	/*
+	 * As long as we're holding param->lock, the queue can't be unlinked
+	 * from the device and therefore cannot disappear.
+	 */
+	iopf_param = param->iopf_param;
+	if (!iopf_param)
+		return -ENODEV;
+
+	if (!(fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) {
+		iopf = kzalloc(sizeof(*iopf), GFP_KERNEL);
+		if (!iopf)
+			return -ENOMEM;
+
+		iopf->fault = *fault;
+
+		/* Non-last request of a group. Postpone until the last one */
+		list_add(&iopf->list, &iopf_param->partial);
+
+		return 0;
+	}
+
+	group = kzalloc(sizeof(*group), GFP_KERNEL);
+	if (!group) {
+		/*
+		 * The caller will send a response to the hardware. But we do
+		 * need to clean up before leaving, otherwise partial faults
+		 * will be stuck.
+		 */
+		ret = -ENOMEM;
+		goto cleanup_partial;
+	}
+
+	group->dev = dev;
+	group->last_fault.fault = *fault;
+	INIT_LIST_HEAD(&group->faults);
+	list_add(&group->last_fault.list, &group->faults);
+	INIT_WORK(&group->work, iopf_handle_group);
+
+	/* See if we have partial faults for this group */
+	list_for_each_entry_safe(iopf, next, &iopf_param->partial, list) {
+		if (iopf->fault.prm.grpid == fault->prm.grpid)
+			/* Insert *before* the last fault */
+			list_move(&iopf->list, &group->faults);
+	}
+
+	queue_work(iopf_param->queue->wq, &group->work);
+	return 0;
+
+cleanup_partial:
+	list_for_each_entry_safe(iopf, next, &iopf_param->partial, list) {
+		if (iopf->fault.prm.grpid == fault->prm.grpid) {
+			list_del(&iopf->list);
+			kfree(iopf);
+		}
+	}
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_queue_iopf);
+
+/**
+ * iopf_queue_flush_dev - Ensure that all queued faults have been processed
+ * @dev: the endpoint whose faults need to be flushed.
+ *
+ * The IOMMU driver calls this before releasing a PASID, to ensure that all
+ * pending faults for this PASID have been handled, and won't hit the address
+ * space of the next process that uses this PASID. The driver must make sure
+ * that no new fault is added to the queue. In particular it must flush its
+ * low-level queue before calling this function.
+ *
+ * Return: 0 on success and <0 on error.
+ */
+int iopf_queue_flush_dev(struct device *dev)
+{
+	int ret = 0;
+	struct iopf_device_param *iopf_param;
+	struct dev_iommu *param = dev->iommu;
+
+	if (!param)
+		return -ENODEV;
+
+	mutex_lock(&param->lock);
+	iopf_param = param->iopf_param;
+	if (iopf_param)
+		flush_workqueue(iopf_param->queue->wq);
+	else
+		ret = -ENODEV;
+	mutex_unlock(&param->lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iopf_queue_flush_dev);
+
+/**
+ * iopf_queue_discard_partial - Remove all pending partial faults
+ * @queue: the queue whose partial faults need to be discarded
+ *
+ * When the hardware queue overflows, last page faults in a group may have been
+ * lost and the IOMMU driver calls this to discard all partial faults. The
+ * driver shouldn't be adding new faults to this queue concurrently.
+ *
+ * Return: 0 on success and <0 on error.
+ */
+int iopf_queue_discard_partial(struct iopf_queue *queue)
+{
+	struct iopf_fault *iopf, *next;
+	struct iopf_device_param *iopf_param;
+
+	if (!queue)
+		return -EINVAL;
+
+	mutex_lock(&queue->lock);
+	list_for_each_entry(iopf_param, &queue->devices, queue_list) {
+		list_for_each_entry_safe(iopf, next, &iopf_param->partial,
+					 list) {
+			list_del(&iopf->list);
+			kfree(iopf);
+		}
+	}
+	mutex_unlock(&queue->lock);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(iopf_queue_discard_partial);
+
+/**
+ * iopf_queue_add_device - Add producer to the fault queue
+ * @queue: IOPF queue
+ * @dev: device to add
+ *
+ * Return: 0 on success and <0 on error.
+ */
+int iopf_queue_add_device(struct iopf_queue *queue, struct device *dev)
+{
+	int ret = -EBUSY;
+	struct iopf_device_param *iopf_param;
+	struct dev_iommu *param = dev->iommu;
+
+	if (!param)
+		return -ENODEV;
+
+	iopf_param = kzalloc(sizeof(*iopf_param), GFP_KERNEL);
+	if (!iopf_param)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&iopf_param->partial);
+	iopf_param->queue = queue;
+	iopf_param->dev = dev;
+
+	mutex_lock(&queue->lock);
+	mutex_lock(&param->lock);
+	if (!param->iopf_param) {
+		list_add(&iopf_param->queue_list, &queue->devices);
+		param->iopf_param = iopf_param;
+		ret = 0;
+	}
+	mutex_unlock(&param->lock);
+	mutex_unlock(&queue->lock);
+
+	if (ret)
+		kfree(iopf_param);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iopf_queue_add_device);
+
+/**
+ * iopf_queue_remove_device - Remove producer from fault queue
+ * @queue: IOPF queue
+ * @dev: device to remove
+ *
+ * Caller makes sure that no more faults are reported for this device.
+ *
+ * Return: 0 on success and <0 on error.
+ */
+int iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev)
+{
+	int ret = -EINVAL;
+	struct iopf_fault *iopf, *next;
+	struct iopf_device_param *iopf_param;
+	struct dev_iommu *param = dev->iommu;
+
+	if (!param || !queue)
+		return -EINVAL;
+
+	mutex_lock(&queue->lock);
+	mutex_lock(&param->lock);
+	iopf_param = param->iopf_param;
+	if (iopf_param && iopf_param->queue == queue) {
+		list_del(&iopf_param->queue_list);
+		param->iopf_param = NULL;
+		ret = 0;
+	}
+	mutex_unlock(&param->lock);
+	mutex_unlock(&queue->lock);
+	if (ret)
+		return ret;
+
+	/* Just in case some faults are still stuck */
+	list_for_each_entry_safe(iopf, next, &iopf_param->partial, list)
+		kfree(iopf);
+
+	kfree(iopf_param);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(iopf_queue_remove_device);
+
+/**
+ * iopf_queue_alloc - Allocate and initialize a fault queue
+ * @name: a unique string identifying the queue (for workqueue)
+ *
+ * Return: the queue on success and NULL on error.
+ */
+struct iopf_queue *iopf_queue_alloc(const char *name)
+{
+	struct iopf_queue *queue;
+
+	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
+	if (!queue)
+		return NULL;
+
+	/*
+	 * The WQ is unordered because the low-level handler enqueues faults by
+	 * group. PRI requests within a group have to be ordered, but once
+	 * that's dealt with, the high-level function can handle groups out of
+	 * order.
+	 */
+	queue->wq = alloc_workqueue("iopf_queue/%s", WQ_UNBOUND, 0, name);
+	if (!queue->wq) {
+		kfree(queue);
+		return NULL;
+	}
+
+	INIT_LIST_HEAD(&queue->devices);
+	mutex_init(&queue->lock);
+
+	return queue;
+}
+EXPORT_SYMBOL_GPL(iopf_queue_alloc);
+
+/**
+ * iopf_queue_free - Free IOPF queue
+ * @queue: queue to free
+ *
+ * Counterpart to iopf_queue_alloc(). The driver must not be queuing faults or
+ * adding/removing devices on this queue anymore.
+ */
+void iopf_queue_free(struct iopf_queue *queue)
+{
+	struct iopf_device_param *iopf_param, *next;
+
+	if (!queue)
+		return;
+
+	list_for_each_entry_safe(iopf_param, next, &queue->devices, queue_list)
+		iopf_queue_remove_device(queue, iopf_param->dev);
+
+	destroy_workqueue(queue->wq);
+	kfree(queue);
+}
+EXPORT_SYMBOL_GPL(iopf_queue_free);
diff --git a/drivers/iommu/iommu-sva-lib.h b/drivers/iommu/iommu-sva-lib.h
index b40990aef3fd..031155010ca8 100644
--- a/drivers/iommu/iommu-sva-lib.h
+++ b/drivers/iommu/iommu-sva-lib.h
@@ -12,4 +12,57 @@ int iommu_sva_alloc_pasid(struct mm_struct *mm, ioasid_t min, ioasid_t max);
 void iommu_sva_free_pasid(struct mm_struct *mm);
 struct mm_struct *iommu_sva_find(ioasid_t pasid);
 
+/* I/O Page fault */
+struct device;
+struct iommu_fault;
+struct iopf_queue;
+
+#ifdef CONFIG_IOMMU_SVA_LIB
+int iommu_queue_iopf(struct iommu_fault *fault, void *cookie);
+
+int iopf_queue_add_device(struct iopf_queue *queue, struct device *dev);
+int iopf_queue_remove_device(struct iopf_queue *queue,
+			     struct device *dev);
+int iopf_queue_flush_dev(struct device *dev);
+struct iopf_queue *iopf_queue_alloc(const char *name);
+void iopf_queue_free(struct iopf_queue *queue);
+int iopf_queue_discard_partial(struct iopf_queue *queue);
+
+#else /* CONFIG_IOMMU_SVA_LIB */
+static inline int iommu_queue_iopf(struct iommu_fault *fault, void *cookie)
+{
+	return -ENODEV;
+}
+
+static inline int iopf_queue_add_device(struct iopf_queue *queue,
+					struct device *dev)
+{
+	return -ENODEV;
+}
+
+static inline int iopf_queue_remove_device(struct iopf_queue *queue,
+					   struct device *dev)
+{
+	return -ENODEV;
+}
+
+static inline int iopf_queue_flush_dev(struct device *dev)
+{
+	return -ENODEV;
+}
+
+static inline struct iopf_queue *iopf_queue_alloc(const char *name)
+{
+	return NULL;
+}
+
+static inline void iopf_queue_free(struct iopf_queue *queue)
+{
+}
+
+static inline int iopf_queue_discard_partial(struct iopf_queue *queue)
+{
+	return -ENODEV;
+}
+#endif /* CONFIG_IOMMU_SVA_LIB */
 #endif /* _IOMMU_SVA_LIB_H */
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 45c4eb372f56..86d688c4418f 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -367,6 +367,7 @@ struct iommu_fault_param {
  * struct dev_iommu - Collection of per-device IOMMU data
  *
  * @fault_param: IOMMU detected device fault reporting data
+ * @iopf_param:	 I/O Page Fault queue and data
  * @fwspec:	 IOMMU fwspec data
  * @iommu_dev:	 IOMMU device this device is linked to
  * @priv:	 IOMMU Driver private data
@@ -377,6 +378,7 @@ struct iommu_fault_param {
 struct dev_iommu {
 	struct mutex lock;
 	struct iommu_fault_param	*fault_param;
+	struct iopf_device_param	*iopf_param;
 	struct iommu_fwspec		*fwspec;
 	struct iommu_device		*iommu_dev;
 	void				*priv;
-- 
cgit v1.2.3


From 47685cb202d1aff6f70a2bb91e8271392fefea84 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 1 Apr 2021 17:52:37 +0200
Subject: iommu: remove the unused domain_window_disable method

domain_window_disable is wired up by fsl_pamu, but never actually called.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Will Deacon <will@kernel.org>
Acked-by: Li Yang <leoyang.li@nxp.com>
Link: https://lore.kernel.org/r/20210401155256.298656-2-hch@lst.de
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/fsl_pamu_domain.c | 48 -----------------------------------------
 include/linux/iommu.h           |  2 --
 2 files changed, 50 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/fsl_pamu_domain.c b/drivers/iommu/fsl_pamu_domain.c
index b2110767caf4..53380cf1fa45 100644
--- a/drivers/iommu/fsl_pamu_domain.c
+++ b/drivers/iommu/fsl_pamu_domain.c
@@ -473,53 +473,6 @@ static int update_domain_mapping(struct fsl_dma_domain *dma_domain, u32 wnd_nr)
 	return ret;
 }
 
-static int disable_domain_win(struct fsl_dma_domain *dma_domain, u32 wnd_nr)
-{
-	struct device_domain_info *info;
-	int ret = 0;
-
-	list_for_each_entry(info, &dma_domain->devices, link) {
-		if (dma_domain->win_cnt == 1 && dma_domain->enabled) {
-			ret = pamu_disable_liodn(info->liodn);
-			if (!ret)
-				dma_domain->enabled = 0;
-		} else {
-			ret = pamu_disable_spaace(info->liodn, wnd_nr);
-		}
-	}
-
-	return ret;
-}
-
-static void fsl_pamu_window_disable(struct iommu_domain *domain, u32 wnd_nr)
-{
-	struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
-	unsigned long flags;
-	int ret;
-
-	spin_lock_irqsave(&dma_domain->domain_lock, flags);
-	if (!dma_domain->win_arr) {
-		pr_debug("Number of windows not configured\n");
-		spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
-		return;
-	}
-
-	if (wnd_nr >= dma_domain->win_cnt) {
-		pr_debug("Invalid window index\n");
-		spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
-		return;
-	}
-
-	if (dma_domain->win_arr[wnd_nr].valid) {
-		ret = disable_domain_win(dma_domain, wnd_nr);
-		if (!ret) {
-			dma_domain->win_arr[wnd_nr].valid = 0;
-			dma_domain->mapped--;
-		}
-	}
-
-	spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
-}
 
 static int fsl_pamu_window_enable(struct iommu_domain *domain, u32 wnd_nr,
 				  phys_addr_t paddr, u64 size, int prot)
@@ -1032,7 +985,6 @@ static const struct iommu_ops fsl_pamu_ops = {
 	.attach_dev	= fsl_pamu_attach_device,
 	.detach_dev	= fsl_pamu_detach_device,
 	.domain_window_enable = fsl_pamu_window_enable,
-	.domain_window_disable = fsl_pamu_window_disable,
 	.iova_to_phys	= fsl_pamu_iova_to_phys,
 	.domain_set_attr = fsl_pamu_set_domain_attr,
 	.domain_get_attr = fsl_pamu_get_domain_attr,
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 86d688c4418f..565b8810354d 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -223,7 +223,6 @@ struct iommu_iotlb_gather {
  * @put_resv_regions: Free list of reserved regions for a device
  * @apply_resv_region: Temporary helper call-back for iova reserved ranges
  * @domain_window_enable: Configure and enable a particular window for a domain
- * @domain_window_disable: Disable a particular window for a domain
  * @of_xlate: add OF master IDs to iommu grouping
  * @is_attach_deferred: Check if domain attach should be deferred from iommu
  *                      driver init to device driver init (default no)
@@ -284,7 +283,6 @@ struct iommu_ops {
 	/* Window handling functions */
 	int (*domain_window_enable)(struct iommu_domain *domain, u32 wnd_nr,
 				    phys_addr_t paddr, u64 size, int prot);
-	void (*domain_window_disable)(struct iommu_domain *domain, u32 wnd_nr);
 
 	int (*of_xlate)(struct device *dev, struct of_phandle_args *args);
 	bool (*is_attach_deferred)(struct iommu_domain *domain, struct device *dev);
-- 
cgit v1.2.3


From 392825e0c76cf9aca33e5a3bf981cde2a2c87251 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 1 Apr 2021 17:52:38 +0200
Subject: iommu/fsl_pamu: remove fsl_pamu_get_domain_attr

None of the values returned by this function are ever queried.  Also
remove the DOMAIN_ATTR_FSL_PAMUV1 enum value that is not otherwise used.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Will Deacon <will@kernel.org>
Acked-by: Li Yang <leoyang.li@nxp.com>
Link: https://lore.kernel.org/r/20210401155256.298656-3-hch@lst.de
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/fsl_pamu_domain.c | 30 ------------------------------
 include/linux/iommu.h           |  4 ----
 2 files changed, 34 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/fsl_pamu_domain.c b/drivers/iommu/fsl_pamu_domain.c
index 53380cf1fa45..e587ec43f7e7 100644
--- a/drivers/iommu/fsl_pamu_domain.c
+++ b/drivers/iommu/fsl_pamu_domain.c
@@ -832,35 +832,6 @@ static int fsl_pamu_set_domain_attr(struct iommu_domain *domain,
 	return ret;
 }
 
-static int fsl_pamu_get_domain_attr(struct iommu_domain *domain,
-				    enum iommu_attr attr_type, void *data)
-{
-	struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
-	int ret = 0;
-
-	switch (attr_type) {
-	case DOMAIN_ATTR_FSL_PAMU_STASH:
-		memcpy(data, &dma_domain->dma_stash,
-		       sizeof(struct pamu_stash_attribute));
-		break;
-	case DOMAIN_ATTR_FSL_PAMU_ENABLE:
-		*(int *)data = dma_domain->enabled;
-		break;
-	case DOMAIN_ATTR_FSL_PAMUV1:
-		*(int *)data = DOMAIN_ATTR_FSL_PAMUV1;
-		break;
-	case DOMAIN_ATTR_WINDOWS:
-		*(u32 *)data = dma_domain->win_cnt;
-		break;
-	default:
-		pr_debug("Unsupported attribute type\n");
-		ret = -EINVAL;
-		break;
-	}
-
-	return ret;
-}
-
 static struct iommu_group *get_device_iommu_group(struct device *dev)
 {
 	struct iommu_group *group;
@@ -987,7 +958,6 @@ static const struct iommu_ops fsl_pamu_ops = {
 	.domain_window_enable = fsl_pamu_window_enable,
 	.iova_to_phys	= fsl_pamu_iova_to_phys,
 	.domain_set_attr = fsl_pamu_set_domain_attr,
-	.domain_get_attr = fsl_pamu_get_domain_attr,
 	.probe_device	= fsl_pamu_probe_device,
 	.release_device	= fsl_pamu_release_device,
 	.device_group   = fsl_pamu_device_group,
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 565b8810354d..95ce205347a9 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -104,9 +104,6 @@ enum iommu_cap {
  *  -the actual size of the mapped region of a window must be power
  *   of 2 starting with 4KB and physical address must be naturally
  *   aligned.
- * DOMAIN_ATTR_FSL_PAMUV1 corresponds to the above mentioned contraints.
- * The caller can invoke iommu_domain_get_attr to check if the underlying
- * iommu implementation supports these constraints.
  */
 
 enum iommu_attr {
@@ -115,7 +112,6 @@ enum iommu_attr {
 	DOMAIN_ATTR_WINDOWS,
 	DOMAIN_ATTR_FSL_PAMU_STASH,
 	DOMAIN_ATTR_FSL_PAMU_ENABLE,
-	DOMAIN_ATTR_FSL_PAMUV1,
 	DOMAIN_ATTR_NESTING,	/* two stages of translation */
 	DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE,
 	DOMAIN_ATTR_IO_PGTABLE_CFG,
-- 
cgit v1.2.3


From ba58d1216e2b2d2320b50591b767f50b13c623a8 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 1 Apr 2021 17:52:41 +0200
Subject: iommu/fsl_pamu: remove support for multiple windows

The only domains allocated force use of a single window.  Remove all
the code related to multiple window support, as well as the need for
qman_portal to force a single window.

Remove the now unused DOMAIN_ATTR_WINDOWS iommu_attr.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Will Deacon <will@kernel.org>
Acked-by: Li Yang <leoyang.li@nxp.com>
Link: https://lore.kernel.org/r/20210401155256.298656-6-hch@lst.de
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/fsl_pamu.c            | 264 ++--------------------------------
 drivers/iommu/fsl_pamu.h            |  10 +-
 drivers/iommu/fsl_pamu_domain.c     | 275 +++++++-----------------------------
 drivers/iommu/fsl_pamu_domain.h     |  16 +--
 drivers/soc/fsl/qbman/qman_portal.c |   7 -
 include/linux/iommu.h               |   1 -
 6 files changed, 60 insertions(+), 513 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/fsl_pamu.c b/drivers/iommu/fsl_pamu.c
index b9a974d97831..3e1647cd5ad4 100644
--- a/drivers/iommu/fsl_pamu.c
+++ b/drivers/iommu/fsl_pamu.c
@@ -63,19 +63,6 @@ static const struct of_device_id l3_device_ids[] = {
 /* maximum subwindows permitted per liodn */
 static u32 max_subwindow_count;
 
-/* Pool for fspi allocation */
-static struct gen_pool *spaace_pool;
-
-/**
- * pamu_get_max_subwin_cnt() - Return the maximum supported
- * subwindow count per liodn.
- *
- */
-u32 pamu_get_max_subwin_cnt(void)
-{
-	return max_subwindow_count;
-}
-
 /**
  * pamu_get_ppaace() - Return the primary PACCE
  * @liodn: liodn PAACT index for desired PAACE
@@ -155,13 +142,6 @@ static unsigned int map_addrspace_size_to_wse(phys_addr_t addrspace_size)
 	return fls64(addrspace_size) - 2;
 }
 
-/* Derive the PAACE window count encoding for the subwindow count */
-static unsigned int map_subwindow_cnt_to_wce(u32 subwindow_cnt)
-{
-	/* window count is 2^(WCE+1) bytes */
-	return __ffs(subwindow_cnt) - 1;
-}
-
 /*
  * Set the PAACE type as primary and set the coherency required domain
  * attribute
@@ -174,89 +154,11 @@ static void pamu_init_ppaace(struct paace *ppaace)
 	       PAACE_M_COHERENCE_REQ);
 }
 
-/*
- * Set the PAACE type as secondary and set the coherency required domain
- * attribute.
- */
-static void pamu_init_spaace(struct paace *spaace)
-{
-	set_bf(spaace->addr_bitfields, PAACE_AF_PT, PAACE_PT_SECONDARY);
-	set_bf(spaace->domain_attr.to_host.coherency_required, PAACE_DA_HOST_CR,
-	       PAACE_M_COHERENCE_REQ);
-}
-
-/*
- * Return the spaace (corresponding to the secondary window index)
- * for a particular ppaace.
- */
-static struct paace *pamu_get_spaace(struct paace *paace, u32 wnum)
-{
-	u32 subwin_cnt;
-	struct paace *spaace = NULL;
-
-	subwin_cnt = 1UL << (get_bf(paace->impl_attr, PAACE_IA_WCE) + 1);
-
-	if (wnum < subwin_cnt)
-		spaace = &spaact[paace->fspi + wnum];
-	else
-		pr_debug("secondary paace out of bounds\n");
-
-	return spaace;
-}
-
-/**
- * pamu_get_fspi_and_allocate() - Allocates fspi index and reserves subwindows
- *                                required for primary PAACE in the secondary
- *                                PAACE table.
- * @subwin_cnt: Number of subwindows to be reserved.
- *
- * A PPAACE entry may have a number of associated subwindows. A subwindow
- * corresponds to a SPAACE entry in the SPAACT table. Each PAACE entry stores
- * the index (fspi) of the first SPAACE entry in the SPAACT table. This
- * function returns the index of the first SPAACE entry. The remaining
- * SPAACE entries are reserved contiguously from that index.
- *
- * Returns a valid fspi index in the range of 0 - SPAACE_NUMBER_ENTRIES on success.
- * If no SPAACE entry is available or the allocator can not reserve the required
- * number of contiguous entries function returns ULONG_MAX indicating a failure.
- *
- */
-static unsigned long pamu_get_fspi_and_allocate(u32 subwin_cnt)
-{
-	unsigned long spaace_addr;
-
-	spaace_addr = gen_pool_alloc(spaace_pool, subwin_cnt * sizeof(struct paace));
-	if (!spaace_addr)
-		return ULONG_MAX;
-
-	return (spaace_addr - (unsigned long)spaact) / (sizeof(struct paace));
-}
-
-/* Release the subwindows reserved for a particular LIODN */
-void pamu_free_subwins(int liodn)
-{
-	struct paace *ppaace;
-	u32 subwin_cnt, size;
-
-	ppaace = pamu_get_ppaace(liodn);
-	if (!ppaace) {
-		pr_debug("Invalid liodn entry\n");
-		return;
-	}
-
-	if (get_bf(ppaace->addr_bitfields, PPAACE_AF_MW)) {
-		subwin_cnt = 1UL << (get_bf(ppaace->impl_attr, PAACE_IA_WCE) + 1);
-		size = (subwin_cnt - 1) * sizeof(struct paace);
-		gen_pool_free(spaace_pool, (unsigned long)&spaact[ppaace->fspi], size);
-		set_bf(ppaace->addr_bitfields, PPAACE_AF_MW, 0);
-	}
-}
-
 /*
  * Function used for updating stash destination for the coressponding
  * LIODN.
  */
-int  pamu_update_paace_stash(int liodn, u32 subwin, u32 value)
+int pamu_update_paace_stash(int liodn, u32 value)
 {
 	struct paace *paace;
 
@@ -265,11 +167,6 @@ int  pamu_update_paace_stash(int liodn, u32 subwin, u32 value)
 		pr_debug("Invalid liodn entry\n");
 		return -ENOENT;
 	}
-	if (subwin) {
-		paace = pamu_get_spaace(paace, subwin - 1);
-		if (!paace)
-			return -ENOENT;
-	}
 	set_bf(paace->impl_attr, PAACE_IA_CID, value);
 
 	mb();
@@ -277,31 +174,6 @@ int  pamu_update_paace_stash(int liodn, u32 subwin, u32 value)
 	return 0;
 }
 
-/* Disable a subwindow corresponding to the LIODN */
-int pamu_disable_spaace(int liodn, u32 subwin)
-{
-	struct paace *paace;
-
-	paace = pamu_get_ppaace(liodn);
-	if (!paace) {
-		pr_debug("Invalid liodn entry\n");
-		return -ENOENT;
-	}
-	if (subwin) {
-		paace = pamu_get_spaace(paace, subwin - 1);
-		if (!paace)
-			return -ENOENT;
-		set_bf(paace->addr_bitfields, PAACE_AF_V, PAACE_V_INVALID);
-	} else {
-		set_bf(paace->addr_bitfields, PAACE_AF_AP,
-		       PAACE_AP_PERMS_DENIED);
-	}
-
-	mb();
-
-	return 0;
-}
-
 /**
  * pamu_config_paace() - Sets up PPAACE entry for specified liodn
  *
@@ -314,17 +186,15 @@ int pamu_disable_spaace(int liodn, u32 subwin)
  *	     stashid not defined
  * @snoopid: snoop id for hardware coherency -- if ~snoopid == 0 then
  *	     snoopid not defined
- * @subwin_cnt: number of sub-windows
  * @prot: window permissions
  *
  * Returns 0 upon success else error code < 0 returned
  */
 int pamu_config_ppaace(int liodn, phys_addr_t win_addr, phys_addr_t win_size,
 		       u32 omi, unsigned long rpn, u32 snoopid, u32 stashid,
-		       u32 subwin_cnt, int prot)
+		       int prot)
 {
 	struct paace *ppaace;
-	unsigned long fspi;
 
 	if ((win_size & (win_size - 1)) || win_size < PAMU_PAGE_SIZE) {
 		pr_debug("window size too small or not a power of two %pa\n",
@@ -368,116 +238,12 @@ int pamu_config_ppaace(int liodn, phys_addr_t win_addr, phys_addr_t win_size,
 	if (~snoopid != 0)
 		ppaace->domain_attr.to_host.snpid = snoopid;
 
-	if (subwin_cnt) {
-		/* The first entry is in the primary PAACE instead */
-		fspi = pamu_get_fspi_and_allocate(subwin_cnt - 1);
-		if (fspi == ULONG_MAX) {
-			pr_debug("spaace indexes exhausted\n");
-			return -EINVAL;
-		}
-
-		/* window count is 2^(WCE+1) bytes */
-		set_bf(ppaace->impl_attr, PAACE_IA_WCE,
-		       map_subwindow_cnt_to_wce(subwin_cnt));
-		set_bf(ppaace->addr_bitfields, PPAACE_AF_MW, 0x1);
-		ppaace->fspi = fspi;
-	} else {
-		set_bf(ppaace->impl_attr, PAACE_IA_ATM, PAACE_ATM_WINDOW_XLATE);
-		ppaace->twbah = rpn >> 20;
-		set_bf(ppaace->win_bitfields, PAACE_WIN_TWBAL, rpn);
-		set_bf(ppaace->addr_bitfields, PAACE_AF_AP, prot);
-		set_bf(ppaace->impl_attr, PAACE_IA_WCE, 0);
-		set_bf(ppaace->addr_bitfields, PPAACE_AF_MW, 0);
-	}
-	mb();
-
-	return 0;
-}
-
-/**
- * pamu_config_spaace() - Sets up SPAACE entry for specified subwindow
- *
- * @liodn:  Logical IO device number
- * @subwin_cnt:  number of sub-windows associated with dma-window
- * @subwin: subwindow index
- * @subwin_size: size of subwindow
- * @omi: Operation mapping index
- * @rpn: real (true physical) page number
- * @snoopid: snoop id for hardware coherency -- if ~snoopid == 0 then
- *			  snoopid not defined
- * @stashid: cache stash id for associated cpu
- * @enable: enable/disable subwindow after reconfiguration
- * @prot: sub window permissions
- *
- * Returns 0 upon success else error code < 0 returned
- */
-int pamu_config_spaace(int liodn, u32 subwin_cnt, u32 subwin,
-		       phys_addr_t subwin_size, u32 omi, unsigned long rpn,
-		       u32 snoopid, u32 stashid, int enable, int prot)
-{
-	struct paace *paace;
-
-	/* setup sub-windows */
-	if (!subwin_cnt) {
-		pr_debug("Invalid subwindow count\n");
-		return -EINVAL;
-	}
-
-	paace = pamu_get_ppaace(liodn);
-	if (subwin > 0 && subwin < subwin_cnt && paace) {
-		paace = pamu_get_spaace(paace, subwin - 1);
-
-		if (paace && !(paace->addr_bitfields & PAACE_V_VALID)) {
-			pamu_init_spaace(paace);
-			set_bf(paace->addr_bitfields, SPAACE_AF_LIODN, liodn);
-		}
-	}
-
-	if (!paace) {
-		pr_debug("Invalid liodn entry\n");
-		return -ENOENT;
-	}
-
-	if ((subwin_size & (subwin_size - 1)) || subwin_size < PAMU_PAGE_SIZE) {
-		pr_debug("subwindow size out of range, or not a power of 2\n");
-		return -EINVAL;
-	}
-
-	if (rpn == ULONG_MAX) {
-		pr_debug("real page number out of range\n");
-		return -EINVAL;
-	}
-
-	/* window size is 2^(WSE+1) bytes */
-	set_bf(paace->win_bitfields, PAACE_WIN_SWSE,
-	       map_addrspace_size_to_wse(subwin_size));
-
-	set_bf(paace->impl_attr, PAACE_IA_ATM, PAACE_ATM_WINDOW_XLATE);
-	paace->twbah = rpn >> 20;
-	set_bf(paace->win_bitfields, PAACE_WIN_TWBAL, rpn);
-	set_bf(paace->addr_bitfields, PAACE_AF_AP, prot);
-
-	/* configure snoop id */
-	if (~snoopid != 0)
-		paace->domain_attr.to_host.snpid = snoopid;
-
-	/* set up operation mapping if it's configured */
-	if (omi < OME_NUMBER_ENTRIES) {
-		set_bf(paace->impl_attr, PAACE_IA_OTM, PAACE_OTM_INDEXED);
-		paace->op_encode.index_ot.omi = omi;
-	} else if (~omi != 0) {
-		pr_debug("bad operation mapping index: %d\n", omi);
-		return -EINVAL;
-	}
-
-	if (~stashid != 0)
-		set_bf(paace->impl_attr, PAACE_IA_CID, stashid);
-
-	smp_wmb();
-
-	if (enable)
-		set_bf(paace->addr_bitfields, PAACE_AF_V, PAACE_V_VALID);
-
+	set_bf(ppaace->impl_attr, PAACE_IA_ATM, PAACE_ATM_WINDOW_XLATE);
+	ppaace->twbah = rpn >> 20;
+	set_bf(ppaace->win_bitfields, PAACE_WIN_TWBAL, rpn);
+	set_bf(ppaace->addr_bitfields, PAACE_AF_AP, prot);
+	set_bf(ppaace->impl_attr, PAACE_IA_WCE, 0);
+	set_bf(ppaace->addr_bitfields, PPAACE_AF_MW, 0);
 	mb();
 
 	return 0;
@@ -1129,17 +895,6 @@ static int fsl_pamu_probe(struct platform_device *pdev)
 	spaact_phys = virt_to_phys(spaact);
 	omt_phys = virt_to_phys(omt);
 
-	spaace_pool = gen_pool_create(ilog2(sizeof(struct paace)), -1);
-	if (!spaace_pool) {
-		ret = -ENOMEM;
-		dev_err(dev, "Failed to allocate spaace gen pool\n");
-		goto error;
-	}
-
-	ret = gen_pool_add(spaace_pool, (unsigned long)spaact, SPAACT_SIZE, -1);
-	if (ret)
-		goto error_genpool;
-
 	pamubypenr = in_be32(&guts_regs->pamubypenr);
 
 	for (pamu_reg_off = 0, pamu_counter = 0x80000000; pamu_reg_off < size;
@@ -1167,9 +922,6 @@ static int fsl_pamu_probe(struct platform_device *pdev)
 
 	return 0;
 
-error_genpool:
-	gen_pool_destroy(spaace_pool);
-
 error:
 	if (irq != NO_IRQ)
 		free_irq(irq, data);
diff --git a/drivers/iommu/fsl_pamu.h b/drivers/iommu/fsl_pamu.h
index e1496ba96160..04fd843d718d 100644
--- a/drivers/iommu/fsl_pamu.h
+++ b/drivers/iommu/fsl_pamu.h
@@ -383,18 +383,12 @@ struct ome {
 int pamu_domain_init(void);
 int pamu_enable_liodn(int liodn);
 int pamu_disable_liodn(int liodn);
-void pamu_free_subwins(int liodn);
 int pamu_config_ppaace(int liodn, phys_addr_t win_addr, phys_addr_t win_size,
 		       u32 omi, unsigned long rpn, u32 snoopid, uint32_t stashid,
-		       u32 subwin_cnt, int prot);
-int pamu_config_spaace(int liodn, u32 subwin_cnt, u32 subwin_addr,
-		       phys_addr_t subwin_size, u32 omi, unsigned long rpn,
-		       uint32_t snoopid, u32 stashid, int enable, int prot);
+		       int prot);
 
 u32 get_stash_id(u32 stash_dest_hint, u32 vcpu);
 void get_ome_index(u32 *omi_index, struct device *dev);
-int  pamu_update_paace_stash(int liodn, u32 subwin, u32 value);
-int pamu_disable_spaace(int liodn, u32 subwin);
-u32 pamu_get_max_subwin_cnt(void);
+int  pamu_update_paace_stash(int liodn, u32 value);
 
 #endif  /* __FSL_PAMU_H */
diff --git a/drivers/iommu/fsl_pamu_domain.c b/drivers/iommu/fsl_pamu_domain.c
index a4da5597755d..e6bdd38fc184 100644
--- a/drivers/iommu/fsl_pamu_domain.c
+++ b/drivers/iommu/fsl_pamu_domain.c
@@ -56,65 +56,19 @@ static int __init iommu_init_mempool(void)
 
 static phys_addr_t get_phys_addr(struct fsl_dma_domain *dma_domain, dma_addr_t iova)
 {
-	u32 win_cnt = dma_domain->win_cnt;
 	struct dma_window *win_ptr = &dma_domain->win_arr[0];
 	struct iommu_domain_geometry *geom;
 
 	geom = &dma_domain->iommu_domain.geometry;
 
-	if (!win_cnt) {
-		pr_debug("Number of windows/geometry not configured for the domain\n");
-		return 0;
-	}
-
-	if (win_cnt > 1) {
-		u64 subwin_size;
-		dma_addr_t subwin_iova;
-		u32 wnd;
-
-		subwin_size = (geom->aperture_end + 1) >> ilog2(win_cnt);
-		subwin_iova = iova & ~(subwin_size - 1);
-		wnd = (subwin_iova - geom->aperture_start) >> ilog2(subwin_size);
-		win_ptr = &dma_domain->win_arr[wnd];
-	}
-
 	if (win_ptr->valid)
 		return win_ptr->paddr + (iova & (win_ptr->size - 1));
 
 	return 0;
 }
 
-static int map_subwins(int liodn, struct fsl_dma_domain *dma_domain)
-{
-	struct dma_window *sub_win_ptr = &dma_domain->win_arr[0];
-	int i, ret;
-	unsigned long rpn, flags;
-
-	for (i = 0; i < dma_domain->win_cnt; i++) {
-		if (sub_win_ptr[i].valid) {
-			rpn = sub_win_ptr[i].paddr >> PAMU_PAGE_SHIFT;
-			spin_lock_irqsave(&iommu_lock, flags);
-			ret = pamu_config_spaace(liodn, dma_domain->win_cnt, i,
-						 sub_win_ptr[i].size,
-						 ~(u32)0,
-						 rpn,
-						 dma_domain->snoop_id,
-						 dma_domain->stash_id,
-						 (i > 0) ? 1 : 0,
-						 sub_win_ptr[i].prot);
-			spin_unlock_irqrestore(&iommu_lock, flags);
-			if (ret) {
-				pr_debug("SPAACE configuration failed for liodn %d\n",
-					 liodn);
-				return ret;
-			}
-		}
-	}
-
-	return ret;
-}
-
-static int map_win(int liodn, struct fsl_dma_domain *dma_domain)
+/* Map the DMA window corresponding to the LIODN */
+static int map_liodn(int liodn, struct fsl_dma_domain *dma_domain)
 {
 	int ret;
 	struct dma_window *wnd = &dma_domain->win_arr[0];
@@ -127,7 +81,7 @@ static int map_win(int liodn, struct fsl_dma_domain *dma_domain)
 				 ~(u32)0,
 				 wnd->paddr >> PAMU_PAGE_SHIFT,
 				 dma_domain->snoop_id, dma_domain->stash_id,
-				 0, wnd->prot);
+				 wnd->prot);
 	spin_unlock_irqrestore(&iommu_lock, flags);
 	if (ret)
 		pr_debug("PAACE configuration failed for liodn %d\n", liodn);
@@ -135,50 +89,27 @@ static int map_win(int liodn, struct fsl_dma_domain *dma_domain)
 	return ret;
 }
 
-/* Map the DMA window corresponding to the LIODN */
-static int map_liodn(int liodn, struct fsl_dma_domain *dma_domain)
-{
-	if (dma_domain->win_cnt > 1)
-		return map_subwins(liodn, dma_domain);
-	else
-		return map_win(liodn, dma_domain);
-}
-
 /* Update window/subwindow mapping for the LIODN */
 static int update_liodn(int liodn, struct fsl_dma_domain *dma_domain, u32 wnd_nr)
 {
 	int ret;
 	struct dma_window *wnd = &dma_domain->win_arr[wnd_nr];
+	phys_addr_t wnd_addr;
 	unsigned long flags;
 
 	spin_lock_irqsave(&iommu_lock, flags);
-	if (dma_domain->win_cnt > 1) {
-		ret = pamu_config_spaace(liodn, dma_domain->win_cnt, wnd_nr,
-					 wnd->size,
-					 ~(u32)0,
-					 wnd->paddr >> PAMU_PAGE_SHIFT,
-					 dma_domain->snoop_id,
-					 dma_domain->stash_id,
-					 (wnd_nr > 0) ? 1 : 0,
-					 wnd->prot);
-		if (ret)
-			pr_debug("Subwindow reconfiguration failed for liodn %d\n",
-				 liodn);
-	} else {
-		phys_addr_t wnd_addr;
 
-		wnd_addr = dma_domain->iommu_domain.geometry.aperture_start;
+	wnd_addr = dma_domain->iommu_domain.geometry.aperture_start;
 
-		ret = pamu_config_ppaace(liodn, wnd_addr,
-					 wnd->size,
-					 ~(u32)0,
-					 wnd->paddr >> PAMU_PAGE_SHIFT,
-					 dma_domain->snoop_id, dma_domain->stash_id,
-					 0, wnd->prot);
-		if (ret)
-			pr_debug("Window reconfiguration failed for liodn %d\n",
-				 liodn);
-	}
+	ret = pamu_config_ppaace(liodn, wnd_addr,
+				 wnd->size,
+				 ~(u32)0,
+				 wnd->paddr >> PAMU_PAGE_SHIFT,
+				 dma_domain->snoop_id, dma_domain->stash_id,
+				 wnd->prot);
+	if (ret)
+		pr_debug("Window reconfiguration failed for liodn %d\n",
+			 liodn);
 
 	spin_unlock_irqrestore(&iommu_lock, flags);
 
@@ -192,21 +123,12 @@ static int update_liodn_stash(int liodn, struct fsl_dma_domain *dma_domain,
 	unsigned long flags;
 
 	spin_lock_irqsave(&iommu_lock, flags);
-	if (!dma_domain->win_arr) {
-		pr_debug("Windows not configured, stash destination update failed for liodn %d\n",
-			 liodn);
+	ret = pamu_update_paace_stash(liodn, val);
+	if (ret) {
+		pr_debug("Failed to update SPAACE %d field for liodn %d\n ",
+			 i, liodn);
 		spin_unlock_irqrestore(&iommu_lock, flags);
-		return -EINVAL;
-	}
-
-	for (i = 0; i < dma_domain->win_cnt; i++) {
-		ret = pamu_update_paace_stash(liodn, i, val);
-		if (ret) {
-			pr_debug("Failed to update SPAACE %d field for liodn %d\n ",
-				 i, liodn);
-			spin_unlock_irqrestore(&iommu_lock, flags);
-			return ret;
-		}
+		return ret;
 	}
 
 	spin_unlock_irqrestore(&iommu_lock, flags);
@@ -217,14 +139,12 @@ static int update_liodn_stash(int liodn, struct fsl_dma_domain *dma_domain,
 /* Set the geometry parameters for a LIODN */
 static int pamu_set_liodn(int liodn, struct device *dev,
 			  struct fsl_dma_domain *dma_domain,
-			  struct iommu_domain_geometry *geom_attr,
-			  u32 win_cnt)
+			  struct iommu_domain_geometry *geom_attr)
 {
 	phys_addr_t window_addr, window_size;
-	phys_addr_t subwin_size;
-	int ret = 0, i;
 	u32 omi_index = ~(u32)0;
 	unsigned long flags;
+	int ret;
 
 	/*
 	 * Configure the omi_index at the geometry setup time.
@@ -241,34 +161,14 @@ static int pamu_set_liodn(int liodn, struct device *dev,
 	if (!ret)
 		ret = pamu_config_ppaace(liodn, window_addr, window_size, omi_index,
 					 0, dma_domain->snoop_id,
-					 dma_domain->stash_id, win_cnt, 0);
+					 dma_domain->stash_id, 0);
 	spin_unlock_irqrestore(&iommu_lock, flags);
 	if (ret) {
-		pr_debug("PAACE configuration failed for liodn %d, win_cnt =%d\n",
-			 liodn, win_cnt);
+		pr_debug("PAACE configuration failed for liodn %d\n",
+			 liodn);
 		return ret;
 	}
 
-	if (win_cnt > 1) {
-		subwin_size = window_size >> ilog2(win_cnt);
-		for (i = 0; i < win_cnt; i++) {
-			spin_lock_irqsave(&iommu_lock, flags);
-			ret = pamu_disable_spaace(liodn, i);
-			if (!ret)
-				ret = pamu_config_spaace(liodn, win_cnt, i,
-							 subwin_size, omi_index,
-							 0, dma_domain->snoop_id,
-							 dma_domain->stash_id,
-							 0, 0);
-			spin_unlock_irqrestore(&iommu_lock, flags);
-			if (ret) {
-				pr_debug("SPAACE configuration failed for liodn %d\n",
-					 liodn);
-				return ret;
-			}
-		}
-	}
-
 	return ret;
 }
 
@@ -292,14 +192,12 @@ static int check_size(u64 size, dma_addr_t iova)
 	return 0;
 }
 
-static void remove_device_ref(struct device_domain_info *info, u32 win_cnt)
+static void remove_device_ref(struct device_domain_info *info)
 {
 	unsigned long flags;
 
 	list_del(&info->link);
 	spin_lock_irqsave(&iommu_lock, flags);
-	if (win_cnt > 1)
-		pamu_free_subwins(info->liodn);
 	pamu_disable_liodn(info->liodn);
 	spin_unlock_irqrestore(&iommu_lock, flags);
 	spin_lock_irqsave(&device_domain_lock, flags);
@@ -317,7 +215,7 @@ static void detach_device(struct device *dev, struct fsl_dma_domain *dma_domain)
 	/* Remove the device from the domain device list */
 	list_for_each_entry_safe(info, tmp, &dma_domain->devices, link) {
 		if (!dev || (info->dev == dev))
-			remove_device_ref(info, dma_domain->win_cnt);
+			remove_device_ref(info);
 	}
 	spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
 }
@@ -399,7 +297,6 @@ static struct iommu_domain *fsl_pamu_domain_alloc(unsigned type)
 
 	dma_domain->stash_id = ~(u32)0;
 	dma_domain->snoop_id = ~(u32)0;
-	dma_domain->win_cnt = pamu_get_max_subwin_cnt();
 	INIT_LIST_HEAD(&dma_domain->devices);
 	spin_lock_init(&dma_domain->domain_lock);
 
@@ -411,24 +308,6 @@ static struct iommu_domain *fsl_pamu_domain_alloc(unsigned type)
 	return &dma_domain->iommu_domain;
 }
 
-/* Configure geometry settings for all LIODNs associated with domain */
-static int pamu_set_domain_geometry(struct fsl_dma_domain *dma_domain,
-				    struct iommu_domain_geometry *geom_attr,
-				    u32 win_cnt)
-{
-	struct device_domain_info *info;
-	int ret = 0;
-
-	list_for_each_entry(info, &dma_domain->devices, link) {
-		ret = pamu_set_liodn(info->liodn, info->dev, dma_domain,
-				     geom_attr, win_cnt);
-		if (ret)
-			break;
-	}
-
-	return ret;
-}
-
 /* Update stash destination for all LIODNs associated with the domain */
 static int update_domain_stash(struct fsl_dma_domain *dma_domain, u32 val)
 {
@@ -475,39 +354,30 @@ static int fsl_pamu_window_enable(struct iommu_domain *domain, u32 wnd_nr,
 		pamu_prot |= PAACE_AP_PERMS_UPDATE;
 
 	spin_lock_irqsave(&dma_domain->domain_lock, flags);
-	if (!dma_domain->win_arr) {
-		pr_debug("Number of windows not configured\n");
-		spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
-		return -ENODEV;
-	}
-
-	if (wnd_nr >= dma_domain->win_cnt) {
+	if (wnd_nr > 0) {
 		pr_debug("Invalid window index\n");
 		spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
 		return -EINVAL;
 	}
 
-	win_size = (domain->geometry.aperture_end + 1) >>
-			ilog2(dma_domain->win_cnt);
+	win_size = (domain->geometry.aperture_end + 1) >> ilog2(1);
 	if (size > win_size) {
 		pr_debug("Invalid window size\n");
 		spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
 		return -EINVAL;
 	}
 
-	if (dma_domain->win_cnt == 1) {
-		if (dma_domain->enabled) {
-			pr_debug("Disable the window before updating the mapping\n");
-			spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
-			return -EBUSY;
-		}
+	if (dma_domain->enabled) {
+		pr_debug("Disable the window before updating the mapping\n");
+		spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+		return -EBUSY;
+	}
 
-		ret = check_size(size, domain->geometry.aperture_start);
-		if (ret) {
-			pr_debug("Aperture start not aligned to the size\n");
-			spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
-			return -EINVAL;
-		}
+	ret = check_size(size, domain->geometry.aperture_start);
+	if (ret) {
+		pr_debug("Aperture start not aligned to the size\n");
+		spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
+		return -EINVAL;
 	}
 
 	wnd = &dma_domain->win_arr[wnd_nr];
@@ -560,22 +430,18 @@ static int handle_attach_device(struct fsl_dma_domain *dma_domain,
 		 * for the domain. If yes, set the geometry for
 		 * the LIODN.
 		 */
-		if (dma_domain->win_arr) {
-			u32 win_cnt = dma_domain->win_cnt > 1 ? dma_domain->win_cnt : 0;
-
-			ret = pamu_set_liodn(liodn[i], dev, dma_domain,
-					     &domain->geometry, win_cnt);
+		ret = pamu_set_liodn(liodn[i], dev, dma_domain,
+				     &domain->geometry);
+		if (ret)
+			break;
+		if (dma_domain->mapped) {
+			/*
+			 * Create window/subwindow mapping for
+			 * the LIODN.
+			 */
+			ret = map_liodn(liodn[i], dma_domain);
 			if (ret)
 				break;
-			if (dma_domain->mapped) {
-				/*
-				 * Create window/subwindow mapping for
-				 * the LIODN.
-				 */
-				ret = map_liodn(liodn[i], dma_domain);
-				if (ret)
-					break;
-			}
 		}
 	}
 	spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
@@ -706,48 +572,6 @@ static int configure_domain_dma_state(struct fsl_dma_domain *dma_domain, bool en
 	return 0;
 }
 
-static int fsl_pamu_set_windows(struct iommu_domain *domain, u32 w_count)
-{
-	struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
-	unsigned long flags;
-	int ret;
-
-	spin_lock_irqsave(&dma_domain->domain_lock, flags);
-	/* Ensure domain is inactive i.e. DMA should be disabled for the domain */
-	if (dma_domain->enabled) {
-		pr_debug("Can't set geometry attributes as domain is active\n");
-		spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
-		return  -EBUSY;
-	}
-
-	/*
-	 * Ensure we have valid window count i.e. it should be less than
-	 * maximum permissible limit and should be a power of two.
-	 */
-	if (w_count > pamu_get_max_subwin_cnt() || !is_power_of_2(w_count)) {
-		pr_debug("Invalid window count\n");
-		spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
-		return -EINVAL;
-	}
-
-	ret = pamu_set_domain_geometry(dma_domain, &domain->geometry,
-				       w_count > 1 ? w_count : 0);
-	if (!ret) {
-		kfree(dma_domain->win_arr);
-		dma_domain->win_arr = kcalloc(w_count,
-					      sizeof(*dma_domain->win_arr),
-					      GFP_ATOMIC);
-		if (!dma_domain->win_arr) {
-			spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
-			return -ENOMEM;
-		}
-		dma_domain->win_cnt = w_count;
-	}
-	spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
-
-	return ret;
-}
-
 static int fsl_pamu_set_domain_attr(struct iommu_domain *domain,
 				    enum iommu_attr attr_type, void *data)
 {
@@ -761,9 +585,6 @@ static int fsl_pamu_set_domain_attr(struct iommu_domain *domain,
 	case DOMAIN_ATTR_FSL_PAMU_ENABLE:
 		ret = configure_domain_dma_state(dma_domain, *(int *)data);
 		break;
-	case DOMAIN_ATTR_WINDOWS:
-		ret = fsl_pamu_set_windows(domain, *(u32 *)data);
-		break;
 	default:
 		pr_debug("Unsupported attribute type\n");
 		ret = -EINVAL;
diff --git a/drivers/iommu/fsl_pamu_domain.h b/drivers/iommu/fsl_pamu_domain.h
index 53d359d66fe5..d3523ee9999d 100644
--- a/drivers/iommu/fsl_pamu_domain.h
+++ b/drivers/iommu/fsl_pamu_domain.h
@@ -17,23 +17,11 @@ struct dma_window {
 };
 
 struct fsl_dma_domain {
-	/*
-	 * Number of windows assocaited with this domain.
-	 * During domain initialization, it is set to the
-	 * the maximum number of subwindows allowed for a LIODN.
-	 * Minimum value for this is 1 indicating a single PAMU
-	 * window, without any sub windows. Value can be set/
-	 * queried by set_attr/get_attr API for DOMAIN_ATTR_WINDOWS.
-	 * Value can only be set once the geometry has been configured.
-	 */
-	u32				win_cnt;
 	/*
 	 * win_arr contains information of the configured
-	 * windows for a domain. This is allocated only
-	 * when the number of windows for the domain are
-	 * set.
+	 * windows for a domain.
 	 */
-	struct dma_window		*win_arr;
+	struct dma_window		win_arr[1];
 	/* list of devices associated with the domain */
 	struct list_head		devices;
 	/* dma_domain states:
diff --git a/drivers/soc/fsl/qbman/qman_portal.c b/drivers/soc/fsl/qbman/qman_portal.c
index c958e6310d30..3d56ec4b373b 100644
--- a/drivers/soc/fsl/qbman/qman_portal.c
+++ b/drivers/soc/fsl/qbman/qman_portal.c
@@ -55,13 +55,6 @@ static void portal_set_cpu(struct qm_portal_config *pcfg, int cpu)
 		dev_err(dev, "%s(): iommu_domain_alloc() failed", __func__);
 		goto no_iommu;
 	}
-	ret = iommu_domain_set_attr(pcfg->iommu_domain, DOMAIN_ATTR_WINDOWS,
-				    &window_count);
-	if (ret < 0) {
-		dev_err(dev, "%s(): iommu_domain_set_attr() = %d", __func__,
-			ret);
-		goto out_domain_free;
-	}
 	stash_attr.cpu = cpu;
 	stash_attr.cache = PAMU_ATTR_CACHE_L1;
 	ret = iommu_domain_set_attr(pcfg->iommu_domain,
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 95ce205347a9..761aa8b69936 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -109,7 +109,6 @@ enum iommu_cap {
 enum iommu_attr {
 	DOMAIN_ATTR_GEOMETRY,
 	DOMAIN_ATTR_PAGING,
-	DOMAIN_ATTR_WINDOWS,
 	DOMAIN_ATTR_FSL_PAMU_STASH,
 	DOMAIN_ATTR_FSL_PAMU_ENABLE,
 	DOMAIN_ATTR_NESTING,	/* two stages of translation */
-- 
cgit v1.2.3


From 376dfd2a2ff41596a6efc8ea56f8b0de172b04a6 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 1 Apr 2021 17:52:42 +0200
Subject: iommu/fsl_pamu: remove ->domain_window_enable

The only thing that fsl_pamu_window_enable does for the current caller
is to fill in the prot value in the only dma_window structure, and to
propagate a few values from the iommu_domain_geometry structure into the
dma_window.  Remove the dma_window entirely, hardcode the prot value and
otherwise use the iommu_domain_geometry structure instead.

Remove the now unused ->domain_window_enable iommu method.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Li Yang <leoyang.li@nxp.com>
Link: https://lore.kernel.org/r/20210401155256.298656-7-hch@lst.de
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/fsl_pamu_domain.c     | 182 +++---------------------------------
 drivers/iommu/fsl_pamu_domain.h     |  15 ---
 drivers/iommu/iommu.c               |  11 ---
 drivers/soc/fsl/qbman/qman_portal.c |   7 --
 include/linux/iommu.h               |  17 ----
 5 files changed, 14 insertions(+), 218 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/fsl_pamu_domain.c b/drivers/iommu/fsl_pamu_domain.c
index e6bdd38fc184..689035e9d409 100644
--- a/drivers/iommu/fsl_pamu_domain.c
+++ b/drivers/iommu/fsl_pamu_domain.c
@@ -54,34 +54,18 @@ static int __init iommu_init_mempool(void)
 	return 0;
 }
 
-static phys_addr_t get_phys_addr(struct fsl_dma_domain *dma_domain, dma_addr_t iova)
-{
-	struct dma_window *win_ptr = &dma_domain->win_arr[0];
-	struct iommu_domain_geometry *geom;
-
-	geom = &dma_domain->iommu_domain.geometry;
-
-	if (win_ptr->valid)
-		return win_ptr->paddr + (iova & (win_ptr->size - 1));
-
-	return 0;
-}
-
 /* Map the DMA window corresponding to the LIODN */
 static int map_liodn(int liodn, struct fsl_dma_domain *dma_domain)
 {
 	int ret;
-	struct dma_window *wnd = &dma_domain->win_arr[0];
-	phys_addr_t wnd_addr = dma_domain->iommu_domain.geometry.aperture_start;
+	struct iommu_domain_geometry *geom = &dma_domain->iommu_domain.geometry;
 	unsigned long flags;
 
 	spin_lock_irqsave(&iommu_lock, flags);
-	ret = pamu_config_ppaace(liodn, wnd_addr,
-				 wnd->size,
-				 ~(u32)0,
-				 wnd->paddr >> PAMU_PAGE_SHIFT,
-				 dma_domain->snoop_id, dma_domain->stash_id,
-				 wnd->prot);
+	ret = pamu_config_ppaace(liodn, geom->aperture_start,
+				 geom->aperture_end + 1, ~(u32)0,
+				 0, dma_domain->snoop_id, dma_domain->stash_id,
+				 PAACE_AP_PERMS_QUERY | PAACE_AP_PERMS_UPDATE);
 	spin_unlock_irqrestore(&iommu_lock, flags);
 	if (ret)
 		pr_debug("PAACE configuration failed for liodn %d\n", liodn);
@@ -89,33 +73,6 @@ static int map_liodn(int liodn, struct fsl_dma_domain *dma_domain)
 	return ret;
 }
 
-/* Update window/subwindow mapping for the LIODN */
-static int update_liodn(int liodn, struct fsl_dma_domain *dma_domain, u32 wnd_nr)
-{
-	int ret;
-	struct dma_window *wnd = &dma_domain->win_arr[wnd_nr];
-	phys_addr_t wnd_addr;
-	unsigned long flags;
-
-	spin_lock_irqsave(&iommu_lock, flags);
-
-	wnd_addr = dma_domain->iommu_domain.geometry.aperture_start;
-
-	ret = pamu_config_ppaace(liodn, wnd_addr,
-				 wnd->size,
-				 ~(u32)0,
-				 wnd->paddr >> PAMU_PAGE_SHIFT,
-				 dma_domain->snoop_id, dma_domain->stash_id,
-				 wnd->prot);
-	if (ret)
-		pr_debug("Window reconfiguration failed for liodn %d\n",
-			 liodn);
-
-	spin_unlock_irqrestore(&iommu_lock, flags);
-
-	return ret;
-}
-
 static int update_liodn_stash(int liodn, struct fsl_dma_domain *dma_domain,
 			      u32 val)
 {
@@ -172,26 +129,6 @@ static int pamu_set_liodn(int liodn, struct device *dev,
 	return ret;
 }
 
-static int check_size(u64 size, dma_addr_t iova)
-{
-	/*
-	 * Size must be a power of two and at least be equal
-	 * to PAMU page size.
-	 */
-	if ((size & (size - 1)) || size < PAMU_PAGE_SIZE) {
-		pr_debug("Size too small or not a power of two\n");
-		return -EINVAL;
-	}
-
-	/* iova must be page size aligned */
-	if (iova & (size - 1)) {
-		pr_debug("Address is not aligned with window size\n");
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
 static void remove_device_ref(struct device_domain_info *info)
 {
 	unsigned long flags;
@@ -257,13 +194,10 @@ static void attach_device(struct fsl_dma_domain *dma_domain, int liodn, struct d
 static phys_addr_t fsl_pamu_iova_to_phys(struct iommu_domain *domain,
 					 dma_addr_t iova)
 {
-	struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
-
 	if (iova < domain->geometry.aperture_start ||
 	    iova > domain->geometry.aperture_end)
 		return 0;
-
-	return get_phys_addr(dma_domain, iova);
+	return iova;
 }
 
 static bool fsl_pamu_capable(enum iommu_cap cap)
@@ -279,7 +213,6 @@ static void fsl_pamu_domain_free(struct iommu_domain *domain)
 	detach_device(NULL, dma_domain);
 
 	dma_domain->enabled = 0;
-	dma_domain->mapped = 0;
 
 	kmem_cache_free(fsl_pamu_domain_cache, dma_domain);
 }
@@ -323,84 +256,6 @@ static int update_domain_stash(struct fsl_dma_domain *dma_domain, u32 val)
 	return ret;
 }
 
-/* Update domain mappings for all LIODNs associated with the domain */
-static int update_domain_mapping(struct fsl_dma_domain *dma_domain, u32 wnd_nr)
-{
-	struct device_domain_info *info;
-	int ret = 0;
-
-	list_for_each_entry(info, &dma_domain->devices, link) {
-		ret = update_liodn(info->liodn, dma_domain, wnd_nr);
-		if (ret)
-			break;
-	}
-	return ret;
-}
-
-
-static int fsl_pamu_window_enable(struct iommu_domain *domain, u32 wnd_nr,
-				  phys_addr_t paddr, u64 size, int prot)
-{
-	struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
-	struct dma_window *wnd;
-	int pamu_prot = 0;
-	int ret;
-	unsigned long flags;
-	u64 win_size;
-
-	if (prot & IOMMU_READ)
-		pamu_prot |= PAACE_AP_PERMS_QUERY;
-	if (prot & IOMMU_WRITE)
-		pamu_prot |= PAACE_AP_PERMS_UPDATE;
-
-	spin_lock_irqsave(&dma_domain->domain_lock, flags);
-	if (wnd_nr > 0) {
-		pr_debug("Invalid window index\n");
-		spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
-		return -EINVAL;
-	}
-
-	win_size = (domain->geometry.aperture_end + 1) >> ilog2(1);
-	if (size > win_size) {
-		pr_debug("Invalid window size\n");
-		spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
-		return -EINVAL;
-	}
-
-	if (dma_domain->enabled) {
-		pr_debug("Disable the window before updating the mapping\n");
-		spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
-		return -EBUSY;
-	}
-
-	ret = check_size(size, domain->geometry.aperture_start);
-	if (ret) {
-		pr_debug("Aperture start not aligned to the size\n");
-		spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
-		return -EINVAL;
-	}
-
-	wnd = &dma_domain->win_arr[wnd_nr];
-	if (!wnd->valid) {
-		wnd->paddr = paddr;
-		wnd->size = size;
-		wnd->prot = pamu_prot;
-
-		ret = update_domain_mapping(dma_domain, wnd_nr);
-		if (!ret) {
-			wnd->valid = 1;
-			dma_domain->mapped++;
-		}
-	} else {
-		pr_debug("Disable the window before updating the mapping\n");
-		ret = -EBUSY;
-	}
-
-	spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
-
-	return ret;
-}
-
 /*
  * Attach the LIODN to the DMA domain and configure the geometry
  * and window mappings.
@@ -434,15 +289,14 @@ static int handle_attach_device(struct fsl_dma_domain *dma_domain,
 				     &domain->geometry);
 		if (ret)
 			break;
-		if (dma_domain->mapped) {
-			/*
-			 * Create window/subwindow mapping for
-			 * the LIODN.
-			 */
-			ret = map_liodn(liodn[i], dma_domain);
-			if (ret)
-				break;
-		}
+
+		/*
+		 * Create window/subwindow mapping for
+		 * the LIODN.
+		 */
+		ret = map_liodn(liodn[i], dma_domain);
+		if (ret)
+			break;
 	}
 	spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
 
@@ -552,13 +406,6 @@ static int configure_domain_dma_state(struct fsl_dma_domain *dma_domain, bool en
 	int ret;
 
 	spin_lock_irqsave(&dma_domain->domain_lock, flags);
-
-	if (enable && !dma_domain->mapped) {
-		pr_debug("Can't enable DMA domain without valid mapping\n");
-		spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
-		return -ENODEV;
-	}
-
 	dma_domain->enabled = enable;
 	list_for_each_entry(info, &dma_domain->devices, link) {
 		ret = (enable) ? pamu_enable_liodn(info->liodn) :
@@ -717,7 +564,6 @@ static const struct iommu_ops fsl_pamu_ops = {
 	.domain_free    = fsl_pamu_domain_free,
 	.attach_dev	= fsl_pamu_attach_device,
 	.detach_dev	= fsl_pamu_detach_device,
-	.domain_window_enable = fsl_pamu_window_enable,
 	.iova_to_phys	= fsl_pamu_iova_to_phys,
 	.domain_set_attr = fsl_pamu_set_domain_attr,
 	.probe_device	= fsl_pamu_probe_device,
diff --git a/drivers/iommu/fsl_pamu_domain.h b/drivers/iommu/fsl_pamu_domain.h
index d3523ee9999d..13ee06e0ef01 100644
--- a/drivers/iommu/fsl_pamu_domain.h
+++ b/drivers/iommu/fsl_pamu_domain.h
@@ -9,24 +9,10 @@
 
 #include "fsl_pamu.h"
 
-struct dma_window {
-	phys_addr_t paddr;
-	u64 size;
-	int valid;
-	int prot;
-};
-
 struct fsl_dma_domain {
-	/*
-	 * win_arr contains information of the configured
-	 * windows for a domain.
-	 */
-	struct dma_window		win_arr[1];
 	/* list of devices associated with the domain */
 	struct list_head		devices;
 	/* dma_domain states:
-	 * mapped - A particular mapping has been created
-	 * within the configured geometry.
 	 * enabled - DMA has been enabled for the given
 	 * domain. This translates to setting of the
 	 * valid bit for the primary PAACE in the PAMU
@@ -35,7 +21,6 @@ struct fsl_dma_domain {
 	 * enabled for it.
 	 *
 	 */
-	int				mapped;
 	int				enabled;
 	/* stash_id obtained from the stash attribute details */
 	u32				stash_id;
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index e10cfa99057c..33bc0f8f0a74 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2610,17 +2610,6 @@ size_t iommu_map_sg_atomic(struct iommu_domain *domain, unsigned long iova,
 	return __iommu_map_sg(domain, iova, sg, nents, prot, GFP_ATOMIC);
 }
 
-int iommu_domain_window_enable(struct iommu_domain *domain, u32 wnd_nr,
-			       phys_addr_t paddr, u64 size, int prot)
-{
-	if (unlikely(domain->ops->domain_window_enable == NULL))
-		return -ENODEV;
-
-	return domain->ops->domain_window_enable(domain, wnd_nr, paddr, size,
-						 prot);
-}
-EXPORT_SYMBOL_GPL(iommu_domain_window_enable);
-
 /**
  * report_iommu_fault() - report about an IOMMU fault to the IOMMU framework
  * @domain: the iommu domain where the fault has happened
diff --git a/drivers/soc/fsl/qbman/qman_portal.c b/drivers/soc/fsl/qbman/qman_portal.c
index 3d56ec4b373b..9ee1663f422c 100644
--- a/drivers/soc/fsl/qbman/qman_portal.c
+++ b/drivers/soc/fsl/qbman/qman_portal.c
@@ -65,13 +65,6 @@ static void portal_set_cpu(struct qm_portal_config *pcfg, int cpu)
 			__func__, ret);
 		goto out_domain_free;
 	}
-	ret = iommu_domain_window_enable(pcfg->iommu_domain, 0, 0, 1ULL << 36,
-					 IOMMU_READ | IOMMU_WRITE);
-	if (ret < 0) {
-		dev_err(dev, "%s(): iommu_domain_window_enable() = %d",
-			__func__, ret);
-		goto out_domain_free;
-	}
 	ret = iommu_attach_device(pcfg->iommu_domain, dev);
 	if (ret < 0) {
 		dev_err(dev, "%s(): iommu_device_attach() = %d", __func__,
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 761aa8b69936..69537e83aae4 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -217,7 +217,6 @@ struct iommu_iotlb_gather {
  * @get_resv_regions: Request list of reserved regions for a device
  * @put_resv_regions: Free list of reserved regions for a device
  * @apply_resv_region: Temporary helper call-back for iova reserved ranges
- * @domain_window_enable: Configure and enable a particular window for a domain
  * @of_xlate: add OF master IDs to iommu grouping
  * @is_attach_deferred: Check if domain attach should be deferred from iommu
  *                      driver init to device driver init (default no)
@@ -275,10 +274,6 @@ struct iommu_ops {
 				  struct iommu_domain *domain,
 				  struct iommu_resv_region *region);
 
-	/* Window handling functions */
-	int (*domain_window_enable)(struct iommu_domain *domain, u32 wnd_nr,
-				    phys_addr_t paddr, u64 size, int prot);
-
 	int (*of_xlate)(struct device *dev, struct of_phandle_args *args);
 	bool (*is_attach_deferred)(struct iommu_domain *domain, struct device *dev);
 
@@ -521,11 +516,6 @@ extern int iommu_domain_get_attr(struct iommu_domain *domain, enum iommu_attr,
 extern int iommu_domain_set_attr(struct iommu_domain *domain, enum iommu_attr,
 				 void *data);
 
-/* Window handling function prototypes */
-extern int iommu_domain_window_enable(struct iommu_domain *domain, u32 wnd_nr,
-				      phys_addr_t offset, u64 size,
-				      int prot);
-
 extern int report_iommu_fault(struct iommu_domain *domain, struct device *dev,
 			      unsigned long iova, int flags);
 
@@ -749,13 +739,6 @@ static inline void iommu_iotlb_sync(struct iommu_domain *domain,
 {
 }
 
-static inline int iommu_domain_window_enable(struct iommu_domain *domain,
-					     u32 wnd_nr, phys_addr_t paddr,
-					     u64 size, int prot)
-{
-	return -ENODEV;
-}
-
 static inline phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova)
 {
 	return 0;
-- 
cgit v1.2.3


From 4eeb96f6efac10e66fd10e718d2adeece3879121 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 1 Apr 2021 17:52:43 +0200
Subject: iommu/fsl_pamu: replace DOMAIN_ATTR_FSL_PAMU_STASH with a direct call

Add a fsl_pamu_configure_l1_stash API that qman_portal can call directly
instead of indirecting through the iommu attr API.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Will Deacon <will@kernel.org>
Acked-by: Li Yang <leoyang.li@nxp.com>
Link: https://lore.kernel.org/r/20210401155256.298656-8-hch@lst.de
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 arch/powerpc/include/asm/fsl_pamu_stash.h | 12 +++---------
 drivers/iommu/fsl_pamu_domain.c           | 16 +++-------------
 drivers/iommu/fsl_pamu_domain.h           |  2 --
 drivers/soc/fsl/qbman/qman_portal.c       | 18 +++---------------
 include/linux/iommu.h                     |  1 -
 5 files changed, 9 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/include/asm/fsl_pamu_stash.h b/arch/powerpc/include/asm/fsl_pamu_stash.h
index 30a31ad2123d..c0fbadb70b5d 100644
--- a/arch/powerpc/include/asm/fsl_pamu_stash.h
+++ b/arch/powerpc/include/asm/fsl_pamu_stash.h
@@ -7,6 +7,8 @@
 #ifndef __FSL_PAMU_STASH_H
 #define __FSL_PAMU_STASH_H
 
+struct iommu_domain;
+
 /* cache stash targets */
 enum pamu_stash_target {
 	PAMU_ATTR_CACHE_L1 = 1,
@@ -14,14 +16,6 @@ enum pamu_stash_target {
 	PAMU_ATTR_CACHE_L3,
 };
 
-/*
- * This attribute allows configuring stashig specific parameters
- * in the PAMU hardware.
- */
-
-struct pamu_stash_attribute {
-	u32	cpu;	/* cpu number */
-	u32	cache;	/* cache to stash to: L1,L2,L3 */
-};
+int fsl_pamu_configure_l1_stash(struct iommu_domain *domain, u32 cpu);
 
 #endif  /* __FSL_PAMU_STASH_H */
diff --git a/drivers/iommu/fsl_pamu_domain.c b/drivers/iommu/fsl_pamu_domain.c
index 689035e9d409..557a152c1d2c 100644
--- a/drivers/iommu/fsl_pamu_domain.c
+++ b/drivers/iommu/fsl_pamu_domain.c
@@ -372,27 +372,20 @@ static void fsl_pamu_detach_device(struct iommu_domain *domain,
 }
 
 /* Set the domain stash attribute */
-static int configure_domain_stash(struct fsl_dma_domain *dma_domain, void *data)
+int fsl_pamu_configure_l1_stash(struct iommu_domain *domain, u32 cpu)
 {
-	struct pamu_stash_attribute *stash_attr = data;
+	struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
 	unsigned long flags;
 	int ret;
 
 	spin_lock_irqsave(&dma_domain->domain_lock, flags);
-
-	memcpy(&dma_domain->dma_stash, stash_attr,
-	       sizeof(struct pamu_stash_attribute));
-
-	dma_domain->stash_id = get_stash_id(stash_attr->cache,
-					    stash_attr->cpu);
+	dma_domain->stash_id = get_stash_id(PAMU_ATTR_CACHE_L1, cpu);
 	if (dma_domain->stash_id == ~(u32)0) {
 		pr_debug("Invalid stash attributes\n");
 		spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
 		return -EINVAL;
 	}
-
 	ret = update_domain_stash(dma_domain, dma_domain->stash_id);
-
 	spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
 
 	return ret;
@@ -426,9 +419,6 @@ static int fsl_pamu_set_domain_attr(struct iommu_domain *domain,
 	int ret = 0;
 
 	switch (attr_type) {
-	case DOMAIN_ATTR_FSL_PAMU_STASH:
-		ret = configure_domain_stash(dma_domain, data);
-		break;
 	case DOMAIN_ATTR_FSL_PAMU_ENABLE:
 		ret = configure_domain_dma_state(dma_domain, *(int *)data);
 		break;
diff --git a/drivers/iommu/fsl_pamu_domain.h b/drivers/iommu/fsl_pamu_domain.h
index 13ee06e0ef01..cd488004acd1 100644
--- a/drivers/iommu/fsl_pamu_domain.h
+++ b/drivers/iommu/fsl_pamu_domain.h
@@ -22,9 +22,7 @@ struct fsl_dma_domain {
 	 *
 	 */
 	int				enabled;
-	/* stash_id obtained from the stash attribute details */
 	u32				stash_id;
-	struct pamu_stash_attribute	dma_stash;
 	u32				snoop_id;
 	struct iommu_domain		iommu_domain;
 	spinlock_t			domain_lock;
diff --git a/drivers/soc/fsl/qbman/qman_portal.c b/drivers/soc/fsl/qbman/qman_portal.c
index 9ee1663f422c..798b3a1ffd0b 100644
--- a/drivers/soc/fsl/qbman/qman_portal.c
+++ b/drivers/soc/fsl/qbman/qman_portal.c
@@ -47,7 +47,6 @@ static void portal_set_cpu(struct qm_portal_config *pcfg, int cpu)
 #ifdef CONFIG_FSL_PAMU
 	struct device *dev = pcfg->dev;
 	int window_count = 1;
-	struct pamu_stash_attribute stash_attr;
 	int ret;
 
 	pcfg->iommu_domain = iommu_domain_alloc(&platform_bus_type);
@@ -55,13 +54,9 @@ static void portal_set_cpu(struct qm_portal_config *pcfg, int cpu)
 		dev_err(dev, "%s(): iommu_domain_alloc() failed", __func__);
 		goto no_iommu;
 	}
-	stash_attr.cpu = cpu;
-	stash_attr.cache = PAMU_ATTR_CACHE_L1;
-	ret = iommu_domain_set_attr(pcfg->iommu_domain,
-				    DOMAIN_ATTR_FSL_PAMU_STASH,
-				    &stash_attr);
+	ret = fsl_pamu_configure_l1_stash(pcfg->iommu_domain, cpu);
 	if (ret < 0) {
-		dev_err(dev, "%s(): iommu_domain_set_attr() = %d",
+		dev_err(dev, "%s(): fsl_pamu_configure_l1_stash() = %d",
 			__func__, ret);
 		goto out_domain_free;
 	}
@@ -143,15 +138,8 @@ static void qman_portal_update_sdest(const struct qm_portal_config *pcfg,
 							unsigned int cpu)
 {
 #ifdef CONFIG_FSL_PAMU /* TODO */
-	struct pamu_stash_attribute stash_attr;
-	int ret;
-
 	if (pcfg->iommu_domain) {
-		stash_attr.cpu = cpu;
-		stash_attr.cache = PAMU_ATTR_CACHE_L1;
-		ret = iommu_domain_set_attr(pcfg->iommu_domain,
-				DOMAIN_ATTR_FSL_PAMU_STASH, &stash_attr);
-		if (ret < 0) {
+		if (fsl_pamu_configure_l1_stash(pcfg->iommu_domain, cpu) < 0) {
 			dev_err(pcfg->dev,
 				"Failed to update pamu stash setting\n");
 			return;
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 69537e83aae4..a3968122aa69 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -109,7 +109,6 @@ enum iommu_cap {
 enum iommu_attr {
 	DOMAIN_ATTR_GEOMETRY,
 	DOMAIN_ATTR_PAGING,
-	DOMAIN_ATTR_FSL_PAMU_STASH,
 	DOMAIN_ATTR_FSL_PAMU_ENABLE,
 	DOMAIN_ATTR_NESTING,	/* two stages of translation */
 	DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE,
-- 
cgit v1.2.3


From 7d61cb6ff0122a017ae907aed62478a4db9c5991 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 1 Apr 2021 17:52:46 +0200
Subject: iommu/fsl_pamu: enable the liodn when attaching a device

Instead of a separate call to enable all devices from the list, just
enable the liodn once the device is attached to the iommu domain.

This also removes the DOMAIN_ATTR_FSL_PAMU_ENABLE iommu_attr.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Will Deacon <will@kernel.org>
Acked-by: Li Yang <leoyang.li@nxp.com>
Link: https://lore.kernel.org/r/20210401155256.298656-11-hch@lst.de
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/fsl_pamu_domain.c     | 47 +++----------------------------------
 drivers/iommu/fsl_pamu_domain.h     | 10 --------
 drivers/soc/fsl/qbman/qman_portal.c | 11 ---------
 include/linux/iommu.h               |  1 -
 4 files changed, 3 insertions(+), 66 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/fsl_pamu_domain.c b/drivers/iommu/fsl_pamu_domain.c
index 41927c3c4177..c2e7e17570e7 100644
--- a/drivers/iommu/fsl_pamu_domain.c
+++ b/drivers/iommu/fsl_pamu_domain.c
@@ -195,9 +195,6 @@ static void fsl_pamu_domain_free(struct iommu_domain *domain)
 
 	/* remove all the devices from the device list */
 	detach_device(NULL, dma_domain);
-
-	dma_domain->enabled = 0;
-
 	kmem_cache_free(fsl_pamu_domain_cache, dma_domain);
 }
 
@@ -285,6 +282,9 @@ static int fsl_pamu_attach_device(struct iommu_domain *domain,
 		ret = pamu_set_liodn(dma_domain, dev, liodn[i]);
 		if (ret)
 			break;
+		ret = pamu_enable_liodn(liodn[i]);
+		if (ret)
+			break;
 	}
 	spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
 	return ret;
@@ -341,46 +341,6 @@ int fsl_pamu_configure_l1_stash(struct iommu_domain *domain, u32 cpu)
 	return ret;
 }
 
-/* Configure domain dma state i.e. enable/disable DMA */
-static int configure_domain_dma_state(struct fsl_dma_domain *dma_domain, bool enable)
-{
-	struct device_domain_info *info;
-	unsigned long flags;
-	int ret;
-
-	spin_lock_irqsave(&dma_domain->domain_lock, flags);
-	dma_domain->enabled = enable;
-	list_for_each_entry(info, &dma_domain->devices, link) {
-		ret = (enable) ? pamu_enable_liodn(info->liodn) :
-			pamu_disable_liodn(info->liodn);
-		if (ret)
-			pr_debug("Unable to set dma state for liodn %d",
-				 info->liodn);
-	}
-	spin_unlock_irqrestore(&dma_domain->domain_lock, flags);
-
-	return 0;
-}
-
-static int fsl_pamu_set_domain_attr(struct iommu_domain *domain,
-				    enum iommu_attr attr_type, void *data)
-{
-	struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain);
-	int ret = 0;
-
-	switch (attr_type) {
-	case DOMAIN_ATTR_FSL_PAMU_ENABLE:
-		ret = configure_domain_dma_state(dma_domain, *(int *)data);
-		break;
-	default:
-		pr_debug("Unsupported attribute type\n");
-		ret = -EINVAL;
-		break;
-	}
-
-	return ret;
-}
-
 static struct iommu_group *get_device_iommu_group(struct device *dev)
 {
 	struct iommu_group *group;
@@ -505,7 +465,6 @@ static const struct iommu_ops fsl_pamu_ops = {
 	.attach_dev	= fsl_pamu_attach_device,
 	.detach_dev	= fsl_pamu_detach_device,
 	.iova_to_phys	= fsl_pamu_iova_to_phys,
-	.domain_set_attr = fsl_pamu_set_domain_attr,
 	.probe_device	= fsl_pamu_probe_device,
 	.release_device	= fsl_pamu_release_device,
 	.device_group   = fsl_pamu_device_group,
diff --git a/drivers/iommu/fsl_pamu_domain.h b/drivers/iommu/fsl_pamu_domain.h
index cd488004acd1..5f4ed253f61b 100644
--- a/drivers/iommu/fsl_pamu_domain.h
+++ b/drivers/iommu/fsl_pamu_domain.h
@@ -12,16 +12,6 @@
 struct fsl_dma_domain {
 	/* list of devices associated with the domain */
 	struct list_head		devices;
-	/* dma_domain states:
-	 * enabled - DMA has been enabled for the given
-	 * domain. This translates to setting of the
-	 * valid bit for the primary PAACE in the PAMU
-	 * PAACT table. Domain geometry should be set and
-	 * it must have a valid mapping before DMA can be
-	 * enabled for it.
-	 *
-	 */
-	int				enabled;
 	u32				stash_id;
 	u32				snoop_id;
 	struct iommu_domain		iommu_domain;
diff --git a/drivers/soc/fsl/qbman/qman_portal.c b/drivers/soc/fsl/qbman/qman_portal.c
index 798b3a1ffd0b..bf38eb0042ed 100644
--- a/drivers/soc/fsl/qbman/qman_portal.c
+++ b/drivers/soc/fsl/qbman/qman_portal.c
@@ -46,7 +46,6 @@ static void portal_set_cpu(struct qm_portal_config *pcfg, int cpu)
 {
 #ifdef CONFIG_FSL_PAMU
 	struct device *dev = pcfg->dev;
-	int window_count = 1;
 	int ret;
 
 	pcfg->iommu_domain = iommu_domain_alloc(&platform_bus_type);
@@ -66,14 +65,6 @@ static void portal_set_cpu(struct qm_portal_config *pcfg, int cpu)
 			ret);
 		goto out_domain_free;
 	}
-	ret = iommu_domain_set_attr(pcfg->iommu_domain,
-				    DOMAIN_ATTR_FSL_PAMU_ENABLE,
-				    &window_count);
-	if (ret < 0) {
-		dev_err(dev, "%s(): iommu_domain_set_attr() = %d", __func__,
-			ret);
-		goto out_detach_device;
-	}
 
 no_iommu:
 #endif
@@ -82,8 +73,6 @@ no_iommu:
 	return;
 
 #ifdef CONFIG_FSL_PAMU
-out_detach_device:
-	iommu_detach_device(pcfg->iommu_domain, NULL);
 out_domain_free:
 	iommu_domain_free(pcfg->iommu_domain);
 	pcfg->iommu_domain = NULL;
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index a3968122aa69..f5caaa8d39be 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -109,7 +109,6 @@ enum iommu_cap {
 enum iommu_attr {
 	DOMAIN_ATTR_GEOMETRY,
 	DOMAIN_ATTR_PAGING,
-	DOMAIN_ATTR_FSL_PAMU_ENABLE,
 	DOMAIN_ATTR_NESTING,	/* two stages of translation */
 	DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE,
 	DOMAIN_ATTR_IO_PGTABLE_CFG,
-- 
cgit v1.2.3


From 9fb5fad562fa0a41c84691714d99c23f54168a9e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 1 Apr 2021 17:52:50 +0200
Subject: iommu: remove DOMAIN_ATTR_PAGING

DOMAIN_ATTR_PAGING is never used.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Will Deacon <will@kernel.org>
Acked-by: Li Yang <leoyang.li@nxp.com>
Link: https://lore.kernel.org/r/20210401155256.298656-15-hch@lst.de
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iommu.c | 5 -----
 include/linux/iommu.h | 1 -
 2 files changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 33bc0f8f0a74..93b3c2048c21 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2668,7 +2668,6 @@ int iommu_domain_get_attr(struct iommu_domain *domain,
 			  enum iommu_attr attr, void *data)
 {
 	struct iommu_domain_geometry *geometry;
-	bool *paging;
 	int ret = 0;
 
 	switch (attr) {
@@ -2676,10 +2675,6 @@ int iommu_domain_get_attr(struct iommu_domain *domain,
 		geometry  = data;
 		*geometry = domain->geometry;
 
-		break;
-	case DOMAIN_ATTR_PAGING:
-		paging  = data;
-		*paging = (domain->pgsize_bitmap != 0UL);
 		break;
 	default:
 		if (!domain->ops->domain_get_attr)
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index f5caaa8d39be..85b51084b820 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -108,7 +108,6 @@ enum iommu_cap {
 
 enum iommu_attr {
 	DOMAIN_ATTR_GEOMETRY,
-	DOMAIN_ATTR_PAGING,
 	DOMAIN_ATTR_NESTING,	/* two stages of translation */
 	DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE,
 	DOMAIN_ATTR_IO_PGTABLE_CFG,
-- 
cgit v1.2.3


From bc9a05eef113e75cfa792fdf24dae011bc3d5294 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 1 Apr 2021 17:52:51 +0200
Subject: iommu: remove DOMAIN_ATTR_GEOMETRY

The geometry information can be trivially queried from the iommu_domain
structure.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Will Deacon <will@kernel.org>
Acked-by: Li Yang <leoyang.li@nxp.com>
Link: https://lore.kernel.org/r/20210401155256.298656-16-hch@lst.de
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iommu.c           | 20 +++-----------------
 drivers/vfio/vfio_iommu_type1.c | 26 ++++++++++++--------------
 drivers/vhost/vdpa.c            | 10 +++-------
 include/linux/iommu.h           |  1 -
 4 files changed, 18 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 93b3c2048c21..703e67d8cb04 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2667,23 +2667,9 @@ core_initcall(iommu_init);
 int iommu_domain_get_attr(struct iommu_domain *domain,
 			  enum iommu_attr attr, void *data)
 {
-	struct iommu_domain_geometry *geometry;
-	int ret = 0;
-
-	switch (attr) {
-	case DOMAIN_ATTR_GEOMETRY:
-		geometry  = data;
-		*geometry = domain->geometry;
-
-		break;
-	default:
-		if (!domain->ops->domain_get_attr)
-			return -EINVAL;
-
-		ret = domain->ops->domain_get_attr(domain, attr, data);
-	}
-
-	return ret;
+	if (!domain->ops->domain_get_attr)
+		return -EINVAL;
+	return domain->ops->domain_get_attr(domain, attr, data);
 }
 EXPORT_SYMBOL_GPL(iommu_domain_get_attr);
 
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 4bb162c1d649..c8e57f22f421 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -2252,7 +2252,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 	int ret;
 	bool resv_msi, msi_remap;
 	phys_addr_t resv_msi_base = 0;
-	struct iommu_domain_geometry geo;
+	struct iommu_domain_geometry *geo;
 	LIST_HEAD(iova_copy);
 	LIST_HEAD(group_resv_regions);
 
@@ -2333,10 +2333,9 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 		goto out_domain;
 
 	/* Get aperture info */
-	iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY, &geo);
-
-	if (vfio_iommu_aper_conflict(iommu, geo.aperture_start,
-				     geo.aperture_end)) {
+	geo = &domain->domain->geometry;
+	if (vfio_iommu_aper_conflict(iommu, geo->aperture_start,
+				     geo->aperture_end)) {
 		ret = -EINVAL;
 		goto out_detach;
 	}
@@ -2359,8 +2358,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 	if (ret)
 		goto out_detach;
 
-	ret = vfio_iommu_aper_resize(&iova_copy, geo.aperture_start,
-				     geo.aperture_end);
+	ret = vfio_iommu_aper_resize(&iova_copy, geo->aperture_start,
+				     geo->aperture_end);
 	if (ret)
 		goto out_detach;
 
@@ -2493,7 +2492,6 @@ static void vfio_iommu_aper_expand(struct vfio_iommu *iommu,
 				   struct list_head *iova_copy)
 {
 	struct vfio_domain *domain;
-	struct iommu_domain_geometry geo;
 	struct vfio_iova *node;
 	dma_addr_t start = 0;
 	dma_addr_t end = (dma_addr_t)~0;
@@ -2502,12 +2500,12 @@ static void vfio_iommu_aper_expand(struct vfio_iommu *iommu,
 		return;
 
 	list_for_each_entry(domain, &iommu->domain_list, next) {
-		iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY,
-				      &geo);
-		if (geo.aperture_start > start)
-			start = geo.aperture_start;
-		if (geo.aperture_end < end)
-			end = geo.aperture_end;
+		struct iommu_domain_geometry *geo = &domain->domain->geometry;
+
+		if (geo->aperture_start > start)
+			start = geo->aperture_start;
+		if (geo->aperture_end < end)
+			end = geo->aperture_end;
 	}
 
 	/* Modify aperture limits. The new aper is either same or bigger */
diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index ef688c8c0e0e..25824fab433d 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -826,18 +826,14 @@ static void vhost_vdpa_free_domain(struct vhost_vdpa *v)
 static void vhost_vdpa_set_iova_range(struct vhost_vdpa *v)
 {
 	struct vdpa_iova_range *range = &v->range;
-	struct iommu_domain_geometry geo;
 	struct vdpa_device *vdpa = v->vdpa;
 	const struct vdpa_config_ops *ops = vdpa->config;
 
 	if (ops->get_iova_range) {
 		*range = ops->get_iova_range(vdpa);
-	} else if (v->domain &&
-		   !iommu_domain_get_attr(v->domain,
-		   DOMAIN_ATTR_GEOMETRY, &geo) &&
-		   geo.force_aperture) {
-		range->first = geo.aperture_start;
-		range->last = geo.aperture_end;
+	} else if (v->domain && v->domain->geometry.force_aperture) {
+		range->first = v->domain->geometry.aperture_start;
+		range->last = v->domain->geometry.aperture_end;
 	} else {
 		range->first = 0;
 		range->last = ULLONG_MAX;
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 85b51084b820..207446cc6d3f 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -107,7 +107,6 @@ enum iommu_cap {
  */
 
 enum iommu_attr {
-	DOMAIN_ATTR_GEOMETRY,
 	DOMAIN_ATTR_NESTING,	/* two stages of translation */
 	DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE,
 	DOMAIN_ATTR_IO_PGTABLE_CFG,
-- 
cgit v1.2.3


From 7e147547783a9035df816864b6a45ffbb254d700 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 1 Apr 2021 17:52:52 +0200
Subject: iommu: remove DOMAIN_ATTR_NESTING

Use an explicit enable_nesting method instead.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Will Deacon <will@kernel.org>
Acked-by: Li Yang <leoyang.li@nxp.com>
Link: https://lore.kernel.org/r/20210401155256.298656-17-hch@lst.de
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 43 +++++++++++------------------
 drivers/iommu/arm/arm-smmu/arm-smmu.c       | 30 ++++++++++----------
 drivers/iommu/intel/iommu.c                 | 31 ++++++---------------
 drivers/iommu/iommu.c                       | 10 +++++++
 drivers/vfio/vfio_iommu_type1.c             |  5 +---
 include/linux/iommu.h                       |  4 ++-
 6 files changed, 55 insertions(+), 68 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 1c7721b89fec..7f3a122e6b5f 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -2555,15 +2555,6 @@ static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
 	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
 
 	switch (domain->type) {
-	case IOMMU_DOMAIN_UNMANAGED:
-		switch (attr) {
-		case DOMAIN_ATTR_NESTING:
-			*(int *)data = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED);
-			return 0;
-		default:
-			return -ENODEV;
-		}
-		break;
 	case IOMMU_DOMAIN_DMA:
 		switch (attr) {
 		case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE:
@@ -2587,23 +2578,6 @@ static int arm_smmu_domain_set_attr(struct iommu_domain *domain,
 	mutex_lock(&smmu_domain->init_mutex);
 
 	switch (domain->type) {
-	case IOMMU_DOMAIN_UNMANAGED:
-		switch (attr) {
-		case DOMAIN_ATTR_NESTING:
-			if (smmu_domain->smmu) {
-				ret = -EPERM;
-				goto out_unlock;
-			}
-
-			if (*(int *)data)
-				smmu_domain->stage = ARM_SMMU_DOMAIN_NESTED;
-			else
-				smmu_domain->stage = ARM_SMMU_DOMAIN_S1;
-			break;
-		default:
-			ret = -ENODEV;
-		}
-		break;
 	case IOMMU_DOMAIN_DMA:
 		switch(attr) {
 		case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE:
@@ -2617,11 +2591,25 @@ static int arm_smmu_domain_set_attr(struct iommu_domain *domain,
 		ret = -EINVAL;
 	}
 
-out_unlock:
 	mutex_unlock(&smmu_domain->init_mutex);
 	return ret;
 }
 
+static int arm_smmu_enable_nesting(struct iommu_domain *domain)
+{
+	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+	int ret = 0;
+
+	mutex_lock(&smmu_domain->init_mutex);
+	if (smmu_domain->smmu)
+		ret = -EPERM;
+	else
+		smmu_domain->stage = ARM_SMMU_DOMAIN_NESTED;
+	mutex_unlock(&smmu_domain->init_mutex);
+
+	return ret;
+}
+
 static int arm_smmu_of_xlate(struct device *dev, struct of_phandle_args *args)
 {
 	return iommu_fwspec_add_ids(dev, args->args, 1);
@@ -2721,6 +2709,7 @@ static struct iommu_ops arm_smmu_ops = {
 	.device_group		= arm_smmu_device_group,
 	.domain_get_attr	= arm_smmu_domain_get_attr,
 	.domain_set_attr	= arm_smmu_domain_set_attr,
+	.enable_nesting		= arm_smmu_enable_nesting,
 	.of_xlate		= arm_smmu_of_xlate,
 	.get_resv_regions	= arm_smmu_get_resv_regions,
 	.put_resv_regions	= generic_iommu_put_resv_regions,
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index d8c6bfde6a61..0aa6d6672749 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -1489,9 +1489,6 @@ static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
 	switch(domain->type) {
 	case IOMMU_DOMAIN_UNMANAGED:
 		switch (attr) {
-		case DOMAIN_ATTR_NESTING:
-			*(int *)data = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED);
-			return 0;
 		case DOMAIN_ATTR_IO_PGTABLE_CFG: {
 			struct io_pgtable_domain_attr *pgtbl_cfg = data;
 			*pgtbl_cfg = smmu_domain->pgtbl_cfg;
@@ -1519,6 +1516,21 @@ static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
 	}
 }
 
+static int arm_smmu_enable_nesting(struct iommu_domain *domain)
+{
+	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+	int ret = 0;
+
+	mutex_lock(&smmu_domain->init_mutex);
+	if (smmu_domain->smmu)
+		ret = -EPERM;
+	else
+		smmu_domain->stage = ARM_SMMU_DOMAIN_NESTED;
+	mutex_unlock(&smmu_domain->init_mutex);
+
+	return ret;
+}
+
 static int arm_smmu_domain_set_attr(struct iommu_domain *domain,
 				    enum iommu_attr attr, void *data)
 {
@@ -1530,17 +1542,6 @@ static int arm_smmu_domain_set_attr(struct iommu_domain *domain,
 	switch(domain->type) {
 	case IOMMU_DOMAIN_UNMANAGED:
 		switch (attr) {
-		case DOMAIN_ATTR_NESTING:
-			if (smmu_domain->smmu) {
-				ret = -EPERM;
-				goto out_unlock;
-			}
-
-			if (*(int *)data)
-				smmu_domain->stage = ARM_SMMU_DOMAIN_NESTED;
-			else
-				smmu_domain->stage = ARM_SMMU_DOMAIN_S1;
-			break;
 		case DOMAIN_ATTR_IO_PGTABLE_CFG: {
 			struct io_pgtable_domain_attr *pgtbl_cfg = data;
 
@@ -1633,6 +1634,7 @@ static struct iommu_ops arm_smmu_ops = {
 	.device_group		= arm_smmu_device_group,
 	.domain_get_attr	= arm_smmu_domain_get_attr,
 	.domain_set_attr	= arm_smmu_domain_set_attr,
+	.enable_nesting		= arm_smmu_enable_nesting,
 	.of_xlate		= arm_smmu_of_xlate,
 	.get_resv_regions	= arm_smmu_get_resv_regions,
 	.put_resv_regions	= generic_iommu_put_resv_regions,
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 1ea0f0bd6973..761feac91767 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -5396,32 +5396,19 @@ static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
 }
 
 static int
-intel_iommu_domain_set_attr(struct iommu_domain *domain,
-			    enum iommu_attr attr, void *data)
+intel_iommu_enable_nesting(struct iommu_domain *domain)
 {
 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
 	unsigned long flags;
-	int ret = 0;
-
-	if (domain->type != IOMMU_DOMAIN_UNMANAGED)
-		return -EINVAL;
+	int ret = -ENODEV;
 
-	switch (attr) {
-	case DOMAIN_ATTR_NESTING:
-		spin_lock_irqsave(&device_domain_lock, flags);
-		if (nested_mode_support() &&
-		    list_empty(&dmar_domain->devices)) {
-			dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
-			dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
-		} else {
-			ret = -ENODEV;
-		}
-		spin_unlock_irqrestore(&device_domain_lock, flags);
-		break;
-	default:
-		ret = -EINVAL;
-		break;
+	spin_lock_irqsave(&device_domain_lock, flags);
+	if (nested_mode_support() && list_empty(&dmar_domain->devices)) {
+		dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
+		dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
+		ret = 0;
 	}
+	spin_unlock_irqrestore(&device_domain_lock, flags);
 
 	return ret;
 }
@@ -5550,7 +5537,7 @@ const struct iommu_ops intel_iommu_ops = {
 	.domain_alloc		= intel_iommu_domain_alloc,
 	.domain_free		= intel_iommu_domain_free,
 	.domain_get_attr        = intel_iommu_domain_get_attr,
-	.domain_set_attr	= intel_iommu_domain_set_attr,
+	.enable_nesting		= intel_iommu_enable_nesting,
 	.attach_dev		= intel_iommu_attach_device,
 	.detach_dev		= intel_iommu_detach_device,
 	.aux_attach_dev		= intel_iommu_aux_attach_device,
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 703e67d8cb04..4738bc024240 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2690,6 +2690,16 @@ int iommu_domain_set_attr(struct iommu_domain *domain,
 }
 EXPORT_SYMBOL_GPL(iommu_domain_set_attr);
 
+int iommu_enable_nesting(struct iommu_domain *domain)
+{
+	if (domain->type != IOMMU_DOMAIN_UNMANAGED)
+		return -EINVAL;
+	if (!domain->ops->enable_nesting)
+		return -EINVAL;
+	return domain->ops->enable_nesting(domain);
+}
+EXPORT_SYMBOL_GPL(iommu_enable_nesting);
+
 void iommu_get_resv_regions(struct device *dev, struct list_head *list)
 {
 	const struct iommu_ops *ops = dev->bus->iommu_ops;
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index c8e57f22f421..1201482e6eec 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -2320,10 +2320,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 	}
 
 	if (iommu->nesting) {
-		int attr = 1;
-
-		ret = iommu_domain_set_attr(domain->domain, DOMAIN_ATTR_NESTING,
-					    &attr);
+		ret = iommu_enable_nesting(domain->domain);
 		if (ret)
 			goto out_domain;
 	}
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 207446cc6d3f..d7d76b5e1192 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -107,7 +107,6 @@ enum iommu_cap {
  */
 
 enum iommu_attr {
-	DOMAIN_ATTR_NESTING,	/* two stages of translation */
 	DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE,
 	DOMAIN_ATTR_IO_PGTABLE_CFG,
 	DOMAIN_ATTR_MAX,
@@ -210,6 +209,7 @@ struct iommu_iotlb_gather {
  * @device_group: find iommu group for a particular device
  * @domain_get_attr: Query domain attributes
  * @domain_set_attr: Change domain attributes
+ * @enable_nesting: Enable nesting
  * @get_resv_regions: Request list of reserved regions for a device
  * @put_resv_regions: Free list of reserved regions for a device
  * @apply_resv_region: Temporary helper call-back for iova reserved ranges
@@ -262,6 +262,7 @@ struct iommu_ops {
 			       enum iommu_attr attr, void *data);
 	int (*domain_set_attr)(struct iommu_domain *domain,
 			       enum iommu_attr attr, void *data);
+	int (*enable_nesting)(struct iommu_domain *domain);
 
 	/* Request/Free a list of reserved regions for a device */
 	void (*get_resv_regions)(struct device *dev, struct list_head *list);
@@ -511,6 +512,7 @@ extern int iommu_domain_get_attr(struct iommu_domain *domain, enum iommu_attr,
 				 void *data);
 extern int iommu_domain_set_attr(struct iommu_domain *domain, enum iommu_attr,
 				 void *data);
+int iommu_enable_nesting(struct iommu_domain *domain);
 
 extern int report_iommu_fault(struct iommu_domain *domain, struct device *dev,
 			      unsigned long iova, int flags);
-- 
cgit v1.2.3


From a250c23f15c21c556becd4986f453255e545807c Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Thu, 1 Apr 2021 17:52:54 +0200
Subject: iommu: remove DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE

Instead make the global iommu_dma_strict parameter in iommu.c canonical by
exporting helpers to get and set it and use those directly in the drivers.

This makes sure that the iommu.strict parameter also works for the AMD and
Intel IOMMU drivers on x86.  As those default to lazy flushing a new
IOMMU_CMD_LINE_STRICT is used to turn the value into a tristate to
represent the default if not overridden by an explicit parameter.

[ported on top of the other iommu_attr changes and added a few small
 missing bits]

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20210401155256.298656-19-hch@lst.de
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/amd/iommu.c                   | 23 +----------
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 50 +---------------------
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h |  1 -
 drivers/iommu/arm/arm-smmu/arm-smmu.c       | 27 ++----------
 drivers/iommu/dma-iommu.c                   |  9 +---
 drivers/iommu/intel/iommu.c                 | 64 ++++++-----------------------
 drivers/iommu/iommu.c                       | 30 ++++++++++----
 include/linux/iommu.h                       |  4 +-
 8 files changed, 43 insertions(+), 165 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index a69a8b573e40..ce6393d2224d 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -1771,26 +1771,6 @@ static struct iommu_group *amd_iommu_device_group(struct device *dev)
 	return acpihid_device_group(dev);
 }
 
-static int amd_iommu_domain_get_attr(struct iommu_domain *domain,
-		enum iommu_attr attr, void *data)
-{
-	switch (domain->type) {
-	case IOMMU_DOMAIN_UNMANAGED:
-		return -ENODEV;
-	case IOMMU_DOMAIN_DMA:
-		switch (attr) {
-		case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE:
-			*(int *)data = !amd_iommu_unmap_flush;
-			return 0;
-		default:
-			return -ENODEV;
-		}
-		break;
-	default:
-		return -EINVAL;
-	}
-}
-
 /*****************************************************************************
  *
  * The next functions belong to the dma_ops mapping/unmapping code.
@@ -1855,7 +1835,7 @@ int __init amd_iommu_init_dma_ops(void)
 		pr_info("IO/TLB flush on unmap enabled\n");
 	else
 		pr_info("Lazy IO/TLB flushing enabled\n");
-
+	iommu_set_dma_strict(amd_iommu_unmap_flush);
 	return 0;
 
 }
@@ -2257,7 +2237,6 @@ const struct iommu_ops amd_iommu_ops = {
 	.release_device = amd_iommu_release_device,
 	.probe_finalize = amd_iommu_probe_finalize,
 	.device_group = amd_iommu_device_group,
-	.domain_get_attr = amd_iommu_domain_get_attr,
 	.get_resv_regions = amd_iommu_get_resv_regions,
 	.put_resv_regions = generic_iommu_put_resv_regions,
 	.is_attach_deferred = amd_iommu_is_attach_deferred,
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 7f3a122e6b5f..7fe45838298c 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -2040,7 +2040,7 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain,
 		.iommu_dev	= smmu->dev,
 	};
 
-	if (smmu_domain->non_strict)
+	if (!iommu_get_dma_strict(domain))
 		pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT;
 
 	pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain);
@@ -2549,52 +2549,6 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev)
 	return group;
 }
 
-static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
-				    enum iommu_attr attr, void *data)
-{
-	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
-
-	switch (domain->type) {
-	case IOMMU_DOMAIN_DMA:
-		switch (attr) {
-		case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE:
-			*(int *)data = smmu_domain->non_strict;
-			return 0;
-		default:
-			return -ENODEV;
-		}
-		break;
-	default:
-		return -EINVAL;
-	}
-}
-
-static int arm_smmu_domain_set_attr(struct iommu_domain *domain,
-				    enum iommu_attr attr, void *data)
-{
-	int ret = 0;
-	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
-
-	mutex_lock(&smmu_domain->init_mutex);
-
-	switch (domain->type) {
-	case IOMMU_DOMAIN_DMA:
-		switch(attr) {
-		case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE:
-			smmu_domain->non_strict = *(int *)data;
-			break;
-		default:
-			ret = -ENODEV;
-		}
-		break;
-	default:
-		ret = -EINVAL;
-	}
-
-	mutex_unlock(&smmu_domain->init_mutex);
-	return ret;
-}
-
 static int arm_smmu_enable_nesting(struct iommu_domain *domain)
 {
 	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
@@ -2707,8 +2661,6 @@ static struct iommu_ops arm_smmu_ops = {
 	.probe_device		= arm_smmu_probe_device,
 	.release_device		= arm_smmu_release_device,
 	.device_group		= arm_smmu_device_group,
-	.domain_get_attr	= arm_smmu_domain_get_attr,
-	.domain_set_attr	= arm_smmu_domain_set_attr,
 	.enable_nesting		= arm_smmu_enable_nesting,
 	.of_xlate		= arm_smmu_of_xlate,
 	.get_resv_regions	= arm_smmu_get_resv_regions,
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 7b15b7580c6e..05cf2bf42d33 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -677,7 +677,6 @@ struct arm_smmu_domain {
 	struct mutex			init_mutex; /* Protects smmu pointer */
 
 	struct io_pgtable_ops		*pgtbl_ops;
-	bool				non_strict;
 	atomic_t			nr_ats_masters;
 
 	enum arm_smmu_domain_stage	stage;
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index 0aa6d6672749..f17c54a76ef6 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -761,6 +761,9 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain,
 		.iommu_dev	= smmu->dev,
 	};
 
+	if (!iommu_get_dma_strict(domain))
+		pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT;
+
 	if (smmu->impl && smmu->impl->init_context) {
 		ret = smmu->impl->init_context(smmu_domain, &pgtbl_cfg, dev);
 		if (ret)
@@ -1499,18 +1502,6 @@ static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
 			return -ENODEV;
 		}
 		break;
-	case IOMMU_DOMAIN_DMA:
-		switch (attr) {
-		case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE: {
-			bool non_strict = smmu_domain->pgtbl_cfg.quirks &
-					  IO_PGTABLE_QUIRK_NON_STRICT;
-			*(int *)data = non_strict;
-			return 0;
-		}
-		default:
-			return -ENODEV;
-		}
-		break;
 	default:
 		return -EINVAL;
 	}
@@ -1557,18 +1548,6 @@ static int arm_smmu_domain_set_attr(struct iommu_domain *domain,
 			ret = -ENODEV;
 		}
 		break;
-	case IOMMU_DOMAIN_DMA:
-		switch (attr) {
-		case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE:
-			if (*(int *)data)
-				smmu_domain->pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT;
-			else
-				smmu_domain->pgtbl_cfg.quirks &= ~IO_PGTABLE_QUIRK_NON_STRICT;
-			break;
-		default:
-			ret = -ENODEV;
-		}
-		break;
 	default:
 		ret = -EINVAL;
 	}
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 0af4048f9b3f..efaf5eab40a1 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -306,10 +306,7 @@ static void iommu_dma_flush_iotlb_all(struct iova_domain *iovad)
 
 	cookie = container_of(iovad, struct iommu_dma_cookie, iovad);
 	domain = cookie->fq_domain;
-	/*
-	 * The IOMMU driver supporting DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE
-	 * implies that ops->flush_iotlb_all must be non-NULL.
-	 */
+
 	domain->ops->flush_iotlb_all(domain);
 }
 
@@ -336,7 +333,6 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base,
 	struct iommu_dma_cookie *cookie = domain->iova_cookie;
 	unsigned long order, base_pfn;
 	struct iova_domain *iovad;
-	int attr;
 
 	if (!cookie || cookie->type != IOMMU_DMA_IOVA_COOKIE)
 		return -EINVAL;
@@ -373,8 +369,7 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base,
 	init_iova_domain(iovad, 1UL << order, base_pfn);
 
 	if (!cookie->fq_domain && (!dev || !dev_is_untrusted(dev)) &&
-	    !iommu_domain_get_attr(domain, DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE, &attr) &&
-	    attr) {
+	    domain->ops->flush_iotlb_all && !iommu_get_dma_strict(domain)) {
 		if (init_iova_flush_queue(iovad, iommu_dma_flush_iotlb_all,
 					  iommu_dma_entry_dtor))
 			pr_warn("iova flush queue initialization failed\n");
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 761feac91767..1db4d8934526 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -4347,6 +4347,17 @@ int __init intel_iommu_init(void)
 
 	down_read(&dmar_global_lock);
 	for_each_active_iommu(iommu, drhd) {
+		/*
+		 * The flush queue implementation does not perform
+		 * page-selective invalidations that are required for efficient
+		 * TLB flushes in virtual environments.  The benefit of batching
+		 * is likely to be much lower than the overhead of synchronizing
+		 * the virtual and physical IOMMU page-tables.
+		 */
+		if (!intel_iommu_strict && cap_caching_mode(iommu->cap)) {
+			pr_warn("IOMMU batching is disabled due to virtualization");
+			intel_iommu_strict = 1;
+		}
 		iommu_device_sysfs_add(&iommu->iommu, NULL,
 				       intel_iommu_groups,
 				       "%s", iommu->name);
@@ -4355,6 +4366,7 @@ int __init intel_iommu_init(void)
 	}
 	up_read(&dmar_global_lock);
 
+	iommu_set_dma_strict(intel_iommu_strict);
 	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
 	if (si_domain && !hw_pass_through)
 		register_memory_notifier(&intel_iommu_memory_nb);
@@ -5413,57 +5425,6 @@ intel_iommu_enable_nesting(struct iommu_domain *domain)
 	return ret;
 }
 
-static bool domain_use_flush_queue(void)
-{
-	struct dmar_drhd_unit *drhd;
-	struct intel_iommu *iommu;
-	bool r = true;
-
-	if (intel_iommu_strict)
-		return false;
-
-	/*
-	 * The flush queue implementation does not perform page-selective
-	 * invalidations that are required for efficient TLB flushes in virtual
-	 * environments. The benefit of batching is likely to be much lower than
-	 * the overhead of synchronizing the virtual and physical IOMMU
-	 * page-tables.
-	 */
-	rcu_read_lock();
-	for_each_active_iommu(iommu, drhd) {
-		if (!cap_caching_mode(iommu->cap))
-			continue;
-
-		pr_warn_once("IOMMU batching is disabled due to virtualization");
-		r = false;
-		break;
-	}
-	rcu_read_unlock();
-
-	return r;
-}
-
-static int
-intel_iommu_domain_get_attr(struct iommu_domain *domain,
-			    enum iommu_attr attr, void *data)
-{
-	switch (domain->type) {
-	case IOMMU_DOMAIN_UNMANAGED:
-		return -ENODEV;
-	case IOMMU_DOMAIN_DMA:
-		switch (attr) {
-		case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE:
-			*(int *)data = domain_use_flush_queue();
-			return 0;
-		default:
-			return -ENODEV;
-		}
-		break;
-	default:
-		return -EINVAL;
-	}
-}
-
 /*
  * Check that the device does not live on an external facing PCI port that is
  * marked as untrusted. Such devices should not be able to apply quirks and
@@ -5536,7 +5497,6 @@ const struct iommu_ops intel_iommu_ops = {
 	.capable		= intel_iommu_capable,
 	.domain_alloc		= intel_iommu_domain_alloc,
 	.domain_free		= intel_iommu_domain_free,
-	.domain_get_attr        = intel_iommu_domain_get_attr,
 	.enable_nesting		= intel_iommu_enable_nesting,
 	.attach_dev		= intel_iommu_attach_device,
 	.detach_dev		= intel_iommu_detach_device,
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 7d1f50ac3709..b09d3f07f3df 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -69,6 +69,7 @@ static const char * const iommu_group_resv_type_string[] = {
 };
 
 #define IOMMU_CMD_LINE_DMA_API		BIT(0)
+#define IOMMU_CMD_LINE_STRICT		BIT(1)
 
 static int iommu_alloc_default_domain(struct iommu_group *group,
 				      struct device *dev);
@@ -318,10 +319,29 @@ early_param("iommu.passthrough", iommu_set_def_domain_type);
 
 static int __init iommu_dma_setup(char *str)
 {
-	return kstrtobool(str, &iommu_dma_strict);
+	int ret = kstrtobool(str, &iommu_dma_strict);
+
+	if (!ret)
+		iommu_cmd_line |= IOMMU_CMD_LINE_STRICT;
+	return ret;
 }
 early_param("iommu.strict", iommu_dma_setup);
 
+void iommu_set_dma_strict(bool strict)
+{
+	if (strict || !(iommu_cmd_line & IOMMU_CMD_LINE_STRICT))
+		iommu_dma_strict = strict;
+}
+
+bool iommu_get_dma_strict(struct iommu_domain *domain)
+{
+	/* only allow lazy flushing for DMA domains */
+	if (domain->type == IOMMU_DOMAIN_DMA)
+		return iommu_dma_strict;
+	return true;
+}
+EXPORT_SYMBOL_GPL(iommu_get_dma_strict);
+
 static ssize_t iommu_group_attr_show(struct kobject *kobj,
 				     struct attribute *__attr, char *buf)
 {
@@ -1500,14 +1520,6 @@ static int iommu_group_alloc_default_domain(struct bus_type *bus,
 	group->default_domain = dom;
 	if (!group->domain)
 		group->domain = dom;
-
-	if (!iommu_dma_strict) {
-		int attr = 1;
-		iommu_domain_set_attr(dom,
-				      DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE,
-				      &attr);
-	}
-
 	return 0;
 }
 
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index d7d76b5e1192..9349bdd62e91 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -107,7 +107,6 @@ enum iommu_cap {
  */
 
 enum iommu_attr {
-	DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE,
 	DOMAIN_ATTR_IO_PGTABLE_CFG,
 	DOMAIN_ATTR_MAX,
 };
@@ -514,6 +513,9 @@ extern int iommu_domain_set_attr(struct iommu_domain *domain, enum iommu_attr,
 				 void *data);
 int iommu_enable_nesting(struct iommu_domain *domain);
 
+void iommu_set_dma_strict(bool val);
+bool iommu_get_dma_strict(struct iommu_domain *domain);
+
 extern int report_iommu_fault(struct iommu_domain *domain, struct device *dev,
 			      unsigned long iova, int flags);
 
-- 
cgit v1.2.3


From 4fc52b81e87be583efb834df5b58245cb9ddd3e7 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 1 Apr 2021 17:52:55 +0200
Subject: iommu: remove DOMAIN_ATTR_IO_PGTABLE_CFG

Use an explicit set_pgtable_quirks method instead that just passes
the actual quirk bitmask instead.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Will Deacon <will@kernel.org>
Acked-by: Li Yang <leoyang.li@nxp.com>
Link: https://lore.kernel.org/r/20210401155256.298656-20-hch@lst.de
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/gpu/drm/msm/adreno/adreno_gpu.c |  5 +--
 drivers/iommu/arm/arm-smmu/arm-smmu.c   | 64 ++++++---------------------------
 drivers/iommu/arm/arm-smmu/arm-smmu.h   |  2 +-
 drivers/iommu/iommu.c                   | 11 ++++++
 include/linux/io-pgtable.h              |  4 ---
 include/linux/iommu.h                   | 12 ++++++-
 6 files changed, 35 insertions(+), 63 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
index 0f184c3dd9d9..4a0b14dad93e 100644
--- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
@@ -188,10 +188,7 @@ int adreno_zap_shader_load(struct msm_gpu *gpu, u32 pasid)
 
 void adreno_set_llc_attributes(struct iommu_domain *iommu)
 {
-	struct io_pgtable_domain_attr pgtbl_cfg;
-
-	pgtbl_cfg.quirks = IO_PGTABLE_QUIRK_ARM_OUTER_WBWA;
-	iommu_domain_set_attr(iommu, DOMAIN_ATTR_IO_PGTABLE_CFG, &pgtbl_cfg);
+	iommu_set_pgtable_quirks(iommu, IO_PGTABLE_QUIRK_ARM_OUTER_WBWA);
 }
 
 struct msm_gem_address_space *
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index f17c54a76ef6..3c6adcdb201b 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -770,8 +770,8 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain,
 			goto out_clear_smmu;
 	}
 
-	if (smmu_domain->pgtbl_cfg.quirks)
-		pgtbl_cfg.quirks |= smmu_domain->pgtbl_cfg.quirks;
+	if (smmu_domain->pgtbl_quirks)
+		pgtbl_cfg.quirks |= smmu_domain->pgtbl_quirks;
 
 	pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain);
 	if (!pgtbl_ops) {
@@ -1484,29 +1484,6 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev)
 	return group;
 }
 
-static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
-				    enum iommu_attr attr, void *data)
-{
-	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
-
-	switch(domain->type) {
-	case IOMMU_DOMAIN_UNMANAGED:
-		switch (attr) {
-		case DOMAIN_ATTR_IO_PGTABLE_CFG: {
-			struct io_pgtable_domain_attr *pgtbl_cfg = data;
-			*pgtbl_cfg = smmu_domain->pgtbl_cfg;
-
-			return 0;
-		}
-		default:
-			return -ENODEV;
-		}
-		break;
-	default:
-		return -EINVAL;
-	}
-}
-
 static int arm_smmu_enable_nesting(struct iommu_domain *domain)
 {
 	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
@@ -1522,37 +1499,19 @@ static int arm_smmu_enable_nesting(struct iommu_domain *domain)
 	return ret;
 }
 
-static int arm_smmu_domain_set_attr(struct iommu_domain *domain,
-				    enum iommu_attr attr, void *data)
+static int arm_smmu_set_pgtable_quirks(struct iommu_domain *domain,
+		unsigned long quirks)
 {
-	int ret = 0;
 	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+	int ret = 0;
 
 	mutex_lock(&smmu_domain->init_mutex);
-
-	switch(domain->type) {
-	case IOMMU_DOMAIN_UNMANAGED:
-		switch (attr) {
-		case DOMAIN_ATTR_IO_PGTABLE_CFG: {
-			struct io_pgtable_domain_attr *pgtbl_cfg = data;
-
-			if (smmu_domain->smmu) {
-				ret = -EPERM;
-				goto out_unlock;
-			}
-
-			smmu_domain->pgtbl_cfg = *pgtbl_cfg;
-			break;
-		}
-		default:
-			ret = -ENODEV;
-		}
-		break;
-	default:
-		ret = -EINVAL;
-	}
-out_unlock:
+	if (smmu_domain->smmu)
+		ret = -EPERM;
+	else
+		smmu_domain->pgtbl_quirks = quirks;
 	mutex_unlock(&smmu_domain->init_mutex);
+
 	return ret;
 }
 
@@ -1611,9 +1570,8 @@ static struct iommu_ops arm_smmu_ops = {
 	.probe_device		= arm_smmu_probe_device,
 	.release_device		= arm_smmu_release_device,
 	.device_group		= arm_smmu_device_group,
-	.domain_get_attr	= arm_smmu_domain_get_attr,
-	.domain_set_attr	= arm_smmu_domain_set_attr,
 	.enable_nesting		= arm_smmu_enable_nesting,
+	.set_pgtable_quirks	= arm_smmu_set_pgtable_quirks,
 	.of_xlate		= arm_smmu_of_xlate,
 	.get_resv_regions	= arm_smmu_get_resv_regions,
 	.put_resv_regions	= generic_iommu_put_resv_regions,
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h b/drivers/iommu/arm/arm-smmu/arm-smmu.h
index d2a2d1bc58ba..c31a59d35c64 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.h
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h
@@ -364,7 +364,7 @@ enum arm_smmu_domain_stage {
 struct arm_smmu_domain {
 	struct arm_smmu_device		*smmu;
 	struct io_pgtable_ops		*pgtbl_ops;
-	struct io_pgtable_domain_attr	pgtbl_cfg;
+	unsigned long			pgtbl_quirks;
 	const struct iommu_flush_ops	*flush_ops;
 	struct arm_smmu_cfg		cfg;
 	enum arm_smmu_domain_stage	stage;
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index b09d3f07f3df..7ea5135b1431 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2701,6 +2701,17 @@ int iommu_enable_nesting(struct iommu_domain *domain)
 }
 EXPORT_SYMBOL_GPL(iommu_enable_nesting);
 
+int iommu_set_pgtable_quirks(struct iommu_domain *domain,
+		unsigned long quirk)
+{
+	if (domain->type != IOMMU_DOMAIN_UNMANAGED)
+		return -EINVAL;
+	if (!domain->ops->set_pgtable_quirks)
+		return -EINVAL;
+	return domain->ops->set_pgtable_quirks(domain, quirk);
+}
+EXPORT_SYMBOL_GPL(iommu_set_pgtable_quirks);
+
 void iommu_get_resv_regions(struct device *dev, struct list_head *list)
 {
 	const struct iommu_ops *ops = dev->bus->iommu_ops;
diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index a4c9ca2c31f1..4d40dfa75b55 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -204,10 +204,6 @@ struct io_pgtable {
 
 #define io_pgtable_ops_to_pgtable(x) container_of((x), struct io_pgtable, ops)
 
-struct io_pgtable_domain_attr {
-	unsigned long quirks;
-};
-
 static inline void io_pgtable_tlb_flush_all(struct io_pgtable *iop)
 {
 	if (iop->cfg.tlb && iop->cfg.tlb->tlb_flush_all)
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 9349bdd62e91..fbac49fe0880 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -107,7 +107,6 @@ enum iommu_cap {
  */
 
 enum iommu_attr {
-	DOMAIN_ATTR_IO_PGTABLE_CFG,
 	DOMAIN_ATTR_MAX,
 };
 
@@ -209,6 +208,7 @@ struct iommu_iotlb_gather {
  * @domain_get_attr: Query domain attributes
  * @domain_set_attr: Change domain attributes
  * @enable_nesting: Enable nesting
+ * @set_pgtable_quirks: Set io page table quirks (IO_PGTABLE_QUIRK_*)
  * @get_resv_regions: Request list of reserved regions for a device
  * @put_resv_regions: Free list of reserved regions for a device
  * @apply_resv_region: Temporary helper call-back for iova reserved ranges
@@ -262,6 +262,8 @@ struct iommu_ops {
 	int (*domain_set_attr)(struct iommu_domain *domain,
 			       enum iommu_attr attr, void *data);
 	int (*enable_nesting)(struct iommu_domain *domain);
+	int (*set_pgtable_quirks)(struct iommu_domain *domain,
+				  unsigned long quirks);
 
 	/* Request/Free a list of reserved regions for a device */
 	void (*get_resv_regions)(struct device *dev, struct list_head *list);
@@ -512,6 +514,8 @@ extern int iommu_domain_get_attr(struct iommu_domain *domain, enum iommu_attr,
 extern int iommu_domain_set_attr(struct iommu_domain *domain, enum iommu_attr,
 				 void *data);
 int iommu_enable_nesting(struct iommu_domain *domain);
+int iommu_set_pgtable_quirks(struct iommu_domain *domain,
+		unsigned long quirks);
 
 void iommu_set_dma_strict(bool val);
 bool iommu_get_dma_strict(struct iommu_domain *domain);
@@ -891,6 +895,12 @@ static inline int iommu_domain_set_attr(struct iommu_domain *domain,
 	return -EINVAL;
 }
 
+static inline int iommu_set_pgtable_quirks(struct iommu_domain *domain,
+		unsigned long quirks)
+{
+	return 0;
+}
+
 static inline int  iommu_device_register(struct iommu_device *iommu)
 {
 	return -ENODEV;
-- 
cgit v1.2.3


From 7876a83ffe8c23c7049a63c747a7b96cafaf10a4 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 1 Apr 2021 17:52:56 +0200
Subject: iommu: remove iommu_domain_{get,set}_attr

Remove the now unused iommu attr infrastructure.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20210401155256.298656-21-hch@lst.de
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iommu.c | 26 --------------------------
 include/linux/iommu.h | 36 ------------------------------------
 2 files changed, 62 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 7ea5135b1431..c4ad9c644802 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2665,32 +2665,6 @@ static int __init iommu_init(void)
 }
 core_initcall(iommu_init);
 
-int iommu_domain_get_attr(struct iommu_domain *domain,
-			  enum iommu_attr attr, void *data)
-{
-	if (!domain->ops->domain_get_attr)
-		return -EINVAL;
-	return domain->ops->domain_get_attr(domain, attr, data);
-}
-EXPORT_SYMBOL_GPL(iommu_domain_get_attr);
-
-int iommu_domain_set_attr(struct iommu_domain *domain,
-			  enum iommu_attr attr, void *data)
-{
-	int ret = 0;
-
-	switch (attr) {
-	default:
-		if (domain->ops->domain_set_attr == NULL)
-			return -EINVAL;
-
-		ret = domain->ops->domain_set_attr(domain, attr, data);
-	}
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(iommu_domain_set_attr);
-
 int iommu_enable_nesting(struct iommu_domain *domain)
 {
 	if (domain->type != IOMMU_DOMAIN_UNMANAGED)
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index fbac49fe0880..a5b3af54fbb8 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -96,20 +96,6 @@ enum iommu_cap {
 	IOMMU_CAP_NOEXEC,		/* IOMMU_NOEXEC flag */
 };
 
-/*
- * Following constraints are specifc to FSL_PAMUV1:
- *  -aperture must be power of 2, and naturally aligned
- *  -number of windows must be power of 2, and address space size
- *   of each window is determined by aperture size / # of windows
- *  -the actual size of the mapped region of a window must be power
- *   of 2 starting with 4KB and physical address must be naturally
- *   aligned.
- */
-
-enum iommu_attr {
-	DOMAIN_ATTR_MAX,
-};
-
 /* These are the possible reserved region types */
 enum iommu_resv_type {
 	/* Memory regions which must be mapped 1:1 at all times */
@@ -205,8 +191,6 @@ struct iommu_iotlb_gather {
  * @probe_finalize: Do final setup work after the device is added to an IOMMU
  *                  group and attached to the groups domain
  * @device_group: find iommu group for a particular device
- * @domain_get_attr: Query domain attributes
- * @domain_set_attr: Change domain attributes
  * @enable_nesting: Enable nesting
  * @set_pgtable_quirks: Set io page table quirks (IO_PGTABLE_QUIRK_*)
  * @get_resv_regions: Request list of reserved regions for a device
@@ -257,10 +241,6 @@ struct iommu_ops {
 	void (*release_device)(struct device *dev);
 	void (*probe_finalize)(struct device *dev);
 	struct iommu_group *(*device_group)(struct device *dev);
-	int (*domain_get_attr)(struct iommu_domain *domain,
-			       enum iommu_attr attr, void *data);
-	int (*domain_set_attr)(struct iommu_domain *domain,
-			       enum iommu_attr attr, void *data);
 	int (*enable_nesting)(struct iommu_domain *domain);
 	int (*set_pgtable_quirks)(struct iommu_domain *domain,
 				  unsigned long quirks);
@@ -509,10 +489,6 @@ extern int iommu_page_response(struct device *dev,
 extern int iommu_group_id(struct iommu_group *group);
 extern struct iommu_domain *iommu_group_default_domain(struct iommu_group *);
 
-extern int iommu_domain_get_attr(struct iommu_domain *domain, enum iommu_attr,
-				 void *data);
-extern int iommu_domain_set_attr(struct iommu_domain *domain, enum iommu_attr,
-				 void *data);
 int iommu_enable_nesting(struct iommu_domain *domain);
 int iommu_set_pgtable_quirks(struct iommu_domain *domain,
 		unsigned long quirks);
@@ -883,18 +859,6 @@ static inline int iommu_group_id(struct iommu_group *group)
 	return -ENODEV;
 }
 
-static inline int iommu_domain_get_attr(struct iommu_domain *domain,
-					enum iommu_attr attr, void *data)
-{
-	return -EINVAL;
-}
-
-static inline int iommu_domain_set_attr(struct iommu_domain *domain,
-					enum iommu_attr attr, void *data)
-{
-	return -EINVAL;
-}
-
 static inline int iommu_set_pgtable_quirks(struct iommu_domain *domain,
 		unsigned long quirks)
 {
-- 
cgit v1.2.3


From d151c85c52a314c6ecb91ab35b3f696a6778b509 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 2 Apr 2021 16:33:09 +0200
Subject: iommu/amd: Remove the unused device errata code

The device errata mechanism is entirely unused, so remove it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20210402143312.372386-2-hch@lst.de
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/amd/amd_iommu_types.h |  1 -
 drivers/iommu/amd/iommu.c           | 56 ++-----------------------------------
 include/linux/amd-iommu.h           | 18 ------------
 3 files changed, 3 insertions(+), 72 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
index 6937e3674a16..94c1a7a9876d 100644
--- a/drivers/iommu/amd/amd_iommu_types.h
+++ b/drivers/iommu/amd/amd_iommu_types.h
@@ -693,7 +693,6 @@ struct iommu_dev_data {
 	} ats;				  /* ATS state */
 	bool pri_tlp;			  /* PASID TLB required for
 					     PPR completions */
-	u32 errata;			  /* Bitmap for errata to apply */
 	bool use_vapic;			  /* Enable device to use vapic mode */
 	bool defer_attach;
 
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 70814ab0b374..9b1a01602bb3 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -290,15 +290,6 @@ static bool pci_iommuv2_capable(struct pci_dev *pdev)
 	return true;
 }
 
-static bool pdev_pri_erratum(struct pci_dev *pdev, u32 erratum)
-{
-	struct iommu_dev_data *dev_data;
-
-	dev_data = dev_iommu_priv_get(&pdev->dev);
-
-	return dev_data->errata & (1 << erratum) ? true : false;
-}
-
 /*
  * This function checks if the driver got a valid device from the caller to
  * avoid dereferencing invalid pointers.
@@ -1531,33 +1522,9 @@ static void pdev_iommuv2_disable(struct pci_dev *pdev)
 	pci_disable_pasid(pdev);
 }
 
-/* FIXME: Change generic reset-function to do the same */
-static int pri_reset_while_enabled(struct pci_dev *pdev)
-{
-	u16 control;
-	int pos;
-
-	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
-	if (!pos)
-		return -EINVAL;
-
-	pci_read_config_word(pdev, pos + PCI_PRI_CTRL, &control);
-	control |= PCI_PRI_CTRL_RESET;
-	pci_write_config_word(pdev, pos + PCI_PRI_CTRL, control);
-
-	return 0;
-}
-
 static int pdev_iommuv2_enable(struct pci_dev *pdev)
 {
-	bool reset_enable;
-	int reqs, ret;
-
-	/* FIXME: Hardcode number of outstanding requests for now */
-	reqs = 32;
-	if (pdev_pri_erratum(pdev, AMD_PRI_DEV_ERRATUM_LIMIT_REQ_ONE))
-		reqs = 1;
-	reset_enable = pdev_pri_erratum(pdev, AMD_PRI_DEV_ERRATUM_ENABLE_RESET);
+	int ret;
 
 	/* Only allow access to user-accessible pages */
 	ret = pci_enable_pasid(pdev, 0);
@@ -1570,16 +1537,11 @@ static int pdev_iommuv2_enable(struct pci_dev *pdev)
 		goto out_err;
 
 	/* Enable PRI */
-	ret = pci_enable_pri(pdev, reqs);
+	/* FIXME: Hardcode number of outstanding requests for now */
+	ret = pci_enable_pri(pdev, 32);
 	if (ret)
 		goto out_err;
 
-	if (reset_enable) {
-		ret = pri_reset_while_enabled(pdev);
-		if (ret)
-			goto out_err;
-	}
-
 	ret = pci_enable_ats(pdev, PAGE_SHIFT);
 	if (ret)
 		goto out_err;
@@ -2594,18 +2556,6 @@ struct iommu_domain *amd_iommu_get_v2_domain(struct pci_dev *pdev)
 }
 EXPORT_SYMBOL(amd_iommu_get_v2_domain);
 
-void amd_iommu_enable_device_erratum(struct pci_dev *pdev, u32 erratum)
-{
-	struct iommu_dev_data *dev_data;
-
-	if (!amd_iommu_v2_supported())
-		return;
-
-	dev_data = dev_iommu_priv_get(&pdev->dev);
-	dev_data->errata |= (1 << erratum);
-}
-EXPORT_SYMBOL(amd_iommu_enable_device_erratum);
-
 int amd_iommu_device_info(struct pci_dev *pdev,
                           struct amd_iommu_device_info *info)
 {
diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h
index 450717299928..474065ed88a4 100644
--- a/include/linux/amd-iommu.h
+++ b/include/linux/amd-iommu.h
@@ -32,24 +32,6 @@ struct pci_dev;
 extern int amd_iommu_detect(void);
 extern int amd_iommu_init_hardware(void);
 
-/**
- * amd_iommu_enable_device_erratum() - Enable erratum workaround for device
- *				       in the IOMMUv2 driver
- * @pdev: The PCI device the workaround is necessary for
- * @erratum: The erratum workaround to enable
- *
- * The function needs to be called before amd_iommu_init_device().
- * Possible values for the erratum number are for now:
- * - AMD_PRI_DEV_ERRATUM_ENABLE_RESET - Reset PRI capability when PRI
- *					is enabled
- * - AMD_PRI_DEV_ERRATUM_LIMIT_REQ_ONE - Limit number of outstanding PRI
- *					 requests to one
- */
-#define AMD_PRI_DEV_ERRATUM_ENABLE_RESET		0
-#define AMD_PRI_DEV_ERRATUM_LIMIT_REQ_ONE		1
-
-extern void amd_iommu_enable_device_erratum(struct pci_dev *pdev, u32 erratum);
-
 /**
  * amd_iommu_init_device() - Init device for use with IOMMUv2 driver
  * @pdev: The PCI device to initialize
-- 
cgit v1.2.3


From fc1b6620501f1a4b88f583549c63666180bea177 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 2 Apr 2021 16:33:12 +0200
Subject: iommu/amd: Move a few prototypes to include/linux/amd-iommu.h

A few functions that were intended for the perf events support are
currently declared in arch/x86/events/amd/iommu.h, which means they are
not in scope for the actual function definition.  Also amdkfd has started
using a few of them using externs in a .c file.  End that misery by
moving the prototypes to the proper header.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20210402143312.372386-5-hch@lst.de
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 arch/x86/events/amd/iommu.c            |  1 +
 arch/x86/events/amd/iommu.h            | 19 -------------------
 drivers/gpu/drm/amd/amdkfd/kfd_iommu.c |  4 ----
 drivers/iommu/amd/amd_iommu.h          |  1 -
 include/linux/amd-iommu.h              | 12 ++++++++++++
 5 files changed, 13 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c
index be50ef8572cc..a573fac1ece8 100644
--- a/arch/x86/events/amd/iommu.c
+++ b/arch/x86/events/amd/iommu.c
@@ -14,6 +14,7 @@
 #include <linux/init.h>
 #include <linux/cpumask.h>
 #include <linux/slab.h>
+#include <linux/amd-iommu.h>
 
 #include "../perf_event.h"
 #include "iommu.h"
diff --git a/arch/x86/events/amd/iommu.h b/arch/x86/events/amd/iommu.h
index 0e5c036fd7be..095505c6213b 100644
--- a/arch/x86/events/amd/iommu.h
+++ b/arch/x86/events/amd/iommu.h
@@ -21,23 +21,4 @@
 #define PC_MAX_SPEC_BNKS			64
 #define PC_MAX_SPEC_CNTRS			16
 
-struct amd_iommu;
-
-/* amd_iommu_init.c external support functions */
-extern int amd_iommu_get_num_iommus(void);
-
-extern bool amd_iommu_pc_supported(void);
-
-extern u8 amd_iommu_pc_get_max_banks(unsigned int idx);
-
-extern u8 amd_iommu_pc_get_max_counters(unsigned int idx);
-
-extern int amd_iommu_pc_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr,
-				u8 fxn, u64 *value);
-
-extern int amd_iommu_pc_get_reg(struct amd_iommu *iommu, u8 bank, u8 cntr,
-				u8 fxn, u64 *value);
-
-extern struct amd_iommu *get_amd_iommu(int idx);
-
 #endif /*_PERF_EVENT_AMD_IOMMU_H_*/
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c b/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c
index 66bbca61e3ef..3c56333bbb9c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c
@@ -329,10 +329,6 @@ int kfd_iommu_resume(struct kfd_dev *kfd)
 	return 0;
 }
 
-extern bool amd_iommu_pc_supported(void);
-extern u8 amd_iommu_pc_get_max_banks(u16 devid);
-extern u8 amd_iommu_pc_get_max_counters(u16 devid);
-
 /** kfd_iommu_add_perf_counters - Add IOMMU performance counters to topology
  */
 int kfd_iommu_add_perf_counters(struct kfd_topology_device *kdev)
diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h
index ed46dc754a2a..55dd38d814d9 100644
--- a/drivers/iommu/amd/amd_iommu.h
+++ b/drivers/iommu/amd/amd_iommu.h
@@ -11,7 +11,6 @@
 
 #include "amd_iommu_types.h"
 
-extern int amd_iommu_get_num_iommus(void);
 extern int amd_iommu_init_dma_ops(void);
 extern int amd_iommu_init_passthrough(void);
 extern irqreturn_t amd_iommu_int_thread(int irq, void *data);
diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h
index 474065ed88a4..58e6c3806c09 100644
--- a/include/linux/amd-iommu.h
+++ b/include/linux/amd-iommu.h
@@ -10,6 +10,8 @@
 
 #include <linux/types.h>
 
+struct amd_iommu;
+
 /*
  * This is mainly used to communicate information back-and-forth
  * between SVM and IOMMU for setting up and tearing down posted
@@ -194,4 +196,14 @@ static inline int amd_iommu_deactivate_guest_mode(void *data)
 }
 #endif /* defined(CONFIG_AMD_IOMMU) && defined(CONFIG_IRQ_REMAP) */
 
+int amd_iommu_get_num_iommus(void);
+bool amd_iommu_pc_supported(void);
+u8 amd_iommu_pc_get_max_banks(unsigned int idx);
+u8 amd_iommu_pc_get_max_counters(unsigned int idx);
+int amd_iommu_pc_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, u8 fxn,
+		u64 *value);
+int amd_iommu_pc_get_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, u8 fxn,
+		u64 *value);
+struct amd_iommu *get_amd_iommu(unsigned int idx);
+
 #endif /* _ASM_X86_AMD_IOMMU_H */
-- 
cgit v1.2.3


From c0474a606ecb9326227b4d68059942f9db88a897 Mon Sep 17 00:00:00 2001
From: Lu Baolu <baolu.lu@linux.intel.com>
Date: Sat, 20 Mar 2021 10:54:13 +0800
Subject: iommu/vt-d: Invalidate PASID cache when root/context entry changed

When the Intel IOMMU is operating in the scalable mode, some information
from the root and context table may be used to tag entries in the PASID
cache. Software should invalidate the PASID-cache when changing root or
context table entries.

Suggested-by: Ashok Raj <ashok.raj@intel.com>
Fixes: 7373a8cc38197 ("iommu/vt-d: Setup context and enable RID2PASID support")
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Link: https://lore.kernel.org/r/20210320025415.641201-4-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/intel/iommu.c | 18 +++++++++---------
 include/linux/intel-iommu.h |  1 +
 2 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index b0f901ee6358..c981e69bc107 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -1347,6 +1347,11 @@ static void iommu_set_root_entry(struct intel_iommu *iommu)
 		      readl, (sts & DMA_GSTS_RTPS), sts);
 
 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+
+	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
+	if (sm_supported(iommu))
+		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
+	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
 }
 
 void iommu_flush_write_buffer(struct intel_iommu *iommu)
@@ -2430,6 +2435,10 @@ static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn
 				   (((u16)bus) << 8) | devfn,
 				   DMA_CCMD_MASK_NOBIT,
 				   DMA_CCMD_DEVICE_INVL);
+
+	if (sm_supported(iommu))
+		qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
+
 	iommu->flush.flush_iotlb(iommu,
 				 did_old,
 				 0,
@@ -3278,8 +3287,6 @@ static int __init init_dmars(void)
 		register_pasid_allocator(iommu);
 #endif
 		iommu_set_root_entry(iommu);
-		iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
-		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
 	}
 
 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
@@ -3469,12 +3476,7 @@ static int init_iommu_hw(void)
 		}
 
 		iommu_flush_write_buffer(iommu);
-
 		iommu_set_root_entry(iommu);
-
-		iommu->flush.flush_context(iommu, 0, 0, 0,
-					   DMA_CCMD_GLOBAL_INVL);
-		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
 		iommu_enable_translation(iommu);
 		iommu_disable_protect_mem_regions(iommu);
 	}
@@ -3857,8 +3859,6 @@ static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
 		goto disable_iommu;
 
 	iommu_set_root_entry(iommu);
-	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
-	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
 	iommu_enable_translation(iommu);
 
 	iommu_disable_protect_mem_regions(iommu);
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index e0f8c2ade3e8..03faf20a6817 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -378,6 +378,7 @@ enum {
 /* PASID cache invalidation granu */
 #define QI_PC_ALL_PASIDS	0
 #define QI_PC_PASID_SEL		1
+#define QI_PC_GLOBAL		3
 
 #define QI_EIOTLB_ADDR(addr)	((u64)(addr) & VTD_PAGE_MASK)
 #define QI_EIOTLB_IH(ih)	(((u64)ih) << 6)
-- 
cgit v1.2.3


From 9de07a4e8d4cb269f9876b2ffa282b5ffd09e05b Mon Sep 17 00:00:00 2001
From: John Chen <johnchen902@gmail.com>
Date: Tue, 30 Mar 2021 19:33:19 +0800
Subject: HID: input: map battery capacity (00850065)

This is the capacity in percentage, relative to design capacity.
Specifically, it is present in Apple Magic Mouse 2.

In contrast, usage 00850064 is also the capacity in percentage, but is
relative to full capacity. It is not mapped here because I don't have
such a device.

Signed-off-by: John Chen <johnchen902@gmail.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/hid/hid-debug.c |  1 +
 drivers/hid/hid-input.c | 22 ++++++++++++++++------
 include/linux/hid.h     |  3 +++
 3 files changed, 20 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hid/hid-debug.c b/drivers/hid/hid-debug.c
index d7eaf9100370..59f8d716d78f 100644
--- a/drivers/hid/hid-debug.c
+++ b/drivers/hid/hid-debug.c
@@ -417,6 +417,7 @@ static const struct hid_usage_entry hid_usage_table[] = {
     { 0x85, 0x44, "Charging" },
     { 0x85, 0x45, "Discharging" },
     { 0x85, 0x4b, "NeedReplacement" },
+    { 0x85, 0x65, "AbsoluteStateOfCharge" },
     { 0x85, 0x66, "RemainingCapacity" },
     { 0x85, 0x68, "RunTimeToEmpty" },
     { 0x85, 0x6a, "AverageTimeToFull" },
diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c
index 236bccd37760..18f5e28d475c 100644
--- a/drivers/hid/hid-input.c
+++ b/drivers/hid/hid-input.c
@@ -435,7 +435,8 @@ static int hidinput_get_battery_property(struct power_supply *psy,
 	return ret;
 }
 
-static int hidinput_setup_battery(struct hid_device *dev, unsigned report_type, struct hid_field *field)
+static int hidinput_setup_battery(struct hid_device *dev, unsigned report_type,
+				  struct hid_field *field, bool is_percentage)
 {
 	struct power_supply_desc *psy_desc;
 	struct power_supply_config psy_cfg = { .drv_data = dev, };
@@ -475,7 +476,7 @@ static int hidinput_setup_battery(struct hid_device *dev, unsigned report_type,
 	min = field->logical_minimum;
 	max = field->logical_maximum;
 
-	if (quirks & HID_BATTERY_QUIRK_PERCENT) {
+	if (is_percentage || (quirks & HID_BATTERY_QUIRK_PERCENT)) {
 		min = 0;
 		max = 100;
 	}
@@ -552,7 +553,7 @@ static void hidinput_update_battery(struct hid_device *dev, int value)
 }
 #else  /* !CONFIG_HID_BATTERY_STRENGTH */
 static int hidinput_setup_battery(struct hid_device *dev, unsigned report_type,
-				  struct hid_field *field)
+				  struct hid_field *field, bool is_percentage)
 {
 	return 0;
 }
@@ -806,7 +807,7 @@ static void hidinput_configure_usage(struct hid_input *hidinput, struct hid_fiel
 			break;
 
 		case 0x3b: /* Battery Strength */
-			hidinput_setup_battery(device, HID_INPUT_REPORT, field);
+			hidinput_setup_battery(device, HID_INPUT_REPORT, field, false);
 			usage->type = EV_PWR;
 			return;
 
@@ -1068,7 +1069,16 @@ static void hidinput_configure_usage(struct hid_input *hidinput, struct hid_fiel
 	case HID_UP_GENDEVCTRLS:
 		switch (usage->hid) {
 		case HID_DC_BATTERYSTRENGTH:
-			hidinput_setup_battery(device, HID_INPUT_REPORT, field);
+			hidinput_setup_battery(device, HID_INPUT_REPORT, field, false);
+			usage->type = EV_PWR;
+			return;
+		}
+		goto unknown;
+
+	case HID_UP_BATTERY:
+		switch (usage->hid) {
+		case HID_BAT_ABSOLUTESTATEOFCHARGE:
+			hidinput_setup_battery(device, HID_INPUT_REPORT, field, true);
 			usage->type = EV_PWR;
 			return;
 		}
@@ -1672,7 +1682,7 @@ static void report_features(struct hid_device *hid)
 				/* Verify if Battery Strength feature is available */
 				if (usage->hid == HID_DC_BATTERYSTRENGTH)
 					hidinput_setup_battery(hid, HID_FEATURE_REPORT,
-							       rep->field[i]);
+							       rep->field[i], false);
 
 				if (drv->feature_mapping)
 					drv->feature_mapping(hid, rep->field[i], usage);
diff --git a/include/linux/hid.h b/include/linux/hid.h
index ef702b3f56e3..b40e1abbe11d 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -153,6 +153,7 @@ struct hid_item {
 #define HID_UP_CONSUMER		0x000c0000
 #define HID_UP_DIGITIZER	0x000d0000
 #define HID_UP_PID		0x000f0000
+#define HID_UP_BATTERY		0x00850000
 #define HID_UP_HPVENDOR         0xff7f0000
 #define HID_UP_HPVENDOR2        0xff010000
 #define HID_UP_MSVENDOR		0xff000000
@@ -297,6 +298,8 @@ struct hid_item {
 #define HID_DG_TOOLSERIALNUMBER	0x000d005b
 #define HID_DG_LATENCYMODE	0x000d0060
 
+#define HID_BAT_ABSOLUTESTATEOFCHARGE	0x00850065
+
 #define HID_VD_ASUS_CUSTOM_MEDIA_KEYS	0xff310076
 /*
  * HID report types --- Ouch! HID spec says 1 2 3!
-- 
cgit v1.2.3


From 4a35d6a03744ded782c9301f5f5d78ad68ce680f Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Wed, 7 Apr 2021 13:17:10 +0100
Subject: irqdomain: Get rid of irq_create_identity_mapping()

The sole user of irq_create_identity_mapping() having been converted,
get rid of the unused helper.

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 include/linux/irqdomain.h | 6 ------
 kernel/irq/irqdomain.c    | 3 ---
 2 files changed, 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 33cacc8af26d..d2c61de208a8 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -419,12 +419,6 @@ extern int irq_create_strict_mappings(struct irq_domain *domain,
 				      unsigned int irq_base,
 				      irq_hw_number_t hwirq_base, int count);
 
-static inline int irq_create_identity_mapping(struct irq_domain *host,
-					      irq_hw_number_t hwirq)
-{
-	return irq_create_strict_mappings(host, hwirq, hwirq, 1);
-}
-
 extern const struct irq_domain_ops irq_domain_simple_ops;
 
 /* stock xlate functions */
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index d10ab1d689d5..35c5a99f8884 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -715,9 +715,6 @@ EXPORT_SYMBOL_GPL(irq_create_mapping_affinity);
  * locations. For use by controllers that already have static mappings
  * to insert in to the domain.
  *
- * Non-linear users can use irq_create_identity_mapping() for IRQ-at-a-time
- * domain insertion.
- *
  * 0 is returned upon success, while any failure to establish a static
  * mapping is treated as an error.
  */
-- 
cgit v1.2.3


From a8cf291bdac5d415eadb55e79df1fca8c3f0dfef Mon Sep 17 00:00:00 2001
From: Jianyong Wu <jianyong.wu@arm.com>
Date: Wed, 9 Dec 2020 14:09:26 +0800
Subject: ptp: Reorganize ptp_kvm.c to make it arch-independent

Currently, the ptp_kvm module contains a lot of x86-specific code.
Let's move this code into a new arch-specific file in the same directory,
and rename the arch-independent file to ptp_kvm_common.c.

Acked-by: Richard Cochran <richardcochran@gmail.com>
Reviewed-by: Andre Przywara <andre.przywara@arm.com>
Signed-off-by: Jianyong Wu <jianyong.wu@arm.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20201209060932.212364-4-jianyong.wu@arm.com
---
 drivers/ptp/Makefile         |   1 +
 drivers/ptp/ptp_kvm.c        | 197 -------------------------------------------
 drivers/ptp/ptp_kvm_common.c | 157 ++++++++++++++++++++++++++++++++++
 drivers/ptp/ptp_kvm_x86.c    |  97 +++++++++++++++++++++
 include/linux/ptp_kvm.h      |  19 +++++
 5 files changed, 274 insertions(+), 197 deletions(-)
 delete mode 100644 drivers/ptp/ptp_kvm.c
 create mode 100644 drivers/ptp/ptp_kvm_common.c
 create mode 100644 drivers/ptp/ptp_kvm_x86.c
 create mode 100644 include/linux/ptp_kvm.h

(limited to 'include/linux')

diff --git a/drivers/ptp/Makefile b/drivers/ptp/Makefile
index db5aef3bddc6..d11eeb5811d1 100644
--- a/drivers/ptp/Makefile
+++ b/drivers/ptp/Makefile
@@ -4,6 +4,7 @@
 #
 
 ptp-y					:= ptp_clock.o ptp_chardev.o ptp_sysfs.o
+ptp_kvm-$(CONFIG_X86)			:= ptp_kvm_x86.o ptp_kvm_common.o
 obj-$(CONFIG_PTP_1588_CLOCK)		+= ptp.o
 obj-$(CONFIG_PTP_1588_CLOCK_DTE)	+= ptp_dte.o
 obj-$(CONFIG_PTP_1588_CLOCK_INES)	+= ptp_ines.o
diff --git a/drivers/ptp/ptp_kvm.c b/drivers/ptp/ptp_kvm.c
deleted file mode 100644
index 658d33fc3195..000000000000
--- a/drivers/ptp/ptp_kvm.c
+++ /dev/null
@@ -1,197 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Virtual PTP 1588 clock for use with KVM guests
- *
- * Copyright (C) 2017 Red Hat Inc.
- */
-#include <linux/device.h>
-#include <linux/err.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <uapi/linux/kvm_para.h>
-#include <asm/kvm_para.h>
-#include <asm/pvclock.h>
-#include <asm/kvmclock.h>
-#include <uapi/asm/kvm_para.h>
-
-#include <linux/ptp_clock_kernel.h>
-
-struct kvm_ptp_clock {
-	struct ptp_clock *ptp_clock;
-	struct ptp_clock_info caps;
-};
-
-static DEFINE_SPINLOCK(kvm_ptp_lock);
-
-static struct pvclock_vsyscall_time_info *hv_clock;
-
-static struct kvm_clock_pairing clock_pair;
-static phys_addr_t clock_pair_gpa;
-
-static int ptp_kvm_get_time_fn(ktime_t *device_time,
-			       struct system_counterval_t *system_counter,
-			       void *ctx)
-{
-	unsigned long ret;
-	struct timespec64 tspec;
-	unsigned version;
-	int cpu;
-	struct pvclock_vcpu_time_info *src;
-
-	spin_lock(&kvm_ptp_lock);
-
-	preempt_disable_notrace();
-	cpu = smp_processor_id();
-	src = &hv_clock[cpu].pvti;
-
-	do {
-		/*
-		 * We are using a TSC value read in the hosts
-		 * kvm_hc_clock_pairing handling.
-		 * So any changes to tsc_to_system_mul
-		 * and tsc_shift or any other pvclock
-		 * data invalidate that measurement.
-		 */
-		version = pvclock_read_begin(src);
-
-		ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING,
-				     clock_pair_gpa,
-				     KVM_CLOCK_PAIRING_WALLCLOCK);
-		if (ret != 0) {
-			pr_err_ratelimited("clock pairing hypercall ret %lu\n", ret);
-			spin_unlock(&kvm_ptp_lock);
-			preempt_enable_notrace();
-			return -EOPNOTSUPP;
-		}
-
-		tspec.tv_sec = clock_pair.sec;
-		tspec.tv_nsec = clock_pair.nsec;
-		ret = __pvclock_read_cycles(src, clock_pair.tsc);
-	} while (pvclock_read_retry(src, version));
-
-	preempt_enable_notrace();
-
-	system_counter->cycles = ret;
-	system_counter->cs = &kvm_clock;
-
-	*device_time = timespec64_to_ktime(tspec);
-
-	spin_unlock(&kvm_ptp_lock);
-
-	return 0;
-}
-
-static int ptp_kvm_getcrosststamp(struct ptp_clock_info *ptp,
-				  struct system_device_crosststamp *xtstamp)
-{
-	return get_device_system_crosststamp(ptp_kvm_get_time_fn, NULL,
-					     NULL, xtstamp);
-}
-
-/*
- * PTP clock operations
- */
-
-static int ptp_kvm_adjfreq(struct ptp_clock_info *ptp, s32 ppb)
-{
-	return -EOPNOTSUPP;
-}
-
-static int ptp_kvm_adjtime(struct ptp_clock_info *ptp, s64 delta)
-{
-	return -EOPNOTSUPP;
-}
-
-static int ptp_kvm_settime(struct ptp_clock_info *ptp,
-			   const struct timespec64 *ts)
-{
-	return -EOPNOTSUPP;
-}
-
-static int ptp_kvm_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts)
-{
-	unsigned long ret;
-	struct timespec64 tspec;
-
-	spin_lock(&kvm_ptp_lock);
-
-	ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING,
-			     clock_pair_gpa,
-			     KVM_CLOCK_PAIRING_WALLCLOCK);
-	if (ret != 0) {
-		pr_err_ratelimited("clock offset hypercall ret %lu\n", ret);
-		spin_unlock(&kvm_ptp_lock);
-		return -EOPNOTSUPP;
-	}
-
-	tspec.tv_sec = clock_pair.sec;
-	tspec.tv_nsec = clock_pair.nsec;
-	spin_unlock(&kvm_ptp_lock);
-
-	memcpy(ts, &tspec, sizeof(struct timespec64));
-
-	return 0;
-}
-
-static int ptp_kvm_enable(struct ptp_clock_info *ptp,
-			  struct ptp_clock_request *rq, int on)
-{
-	return -EOPNOTSUPP;
-}
-
-static const struct ptp_clock_info ptp_kvm_caps = {
-	.owner		= THIS_MODULE,
-	.name		= "KVM virtual PTP",
-	.max_adj	= 0,
-	.n_ext_ts	= 0,
-	.n_pins		= 0,
-	.pps		= 0,
-	.adjfreq	= ptp_kvm_adjfreq,
-	.adjtime	= ptp_kvm_adjtime,
-	.gettime64	= ptp_kvm_gettime,
-	.settime64	= ptp_kvm_settime,
-	.enable		= ptp_kvm_enable,
-	.getcrosststamp = ptp_kvm_getcrosststamp,
-};
-
-/* module operations */
-
-static struct kvm_ptp_clock kvm_ptp_clock;
-
-static void __exit ptp_kvm_exit(void)
-{
-	ptp_clock_unregister(kvm_ptp_clock.ptp_clock);
-}
-
-static int __init ptp_kvm_init(void)
-{
-	long ret;
-
-	if (!kvm_para_available())
-		return -ENODEV;
-
-	clock_pair_gpa = slow_virt_to_phys(&clock_pair);
-	hv_clock = pvclock_get_pvti_cpu0_va();
-
-	if (!hv_clock)
-		return -ENODEV;
-
-	ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, clock_pair_gpa,
-			KVM_CLOCK_PAIRING_WALLCLOCK);
-	if (ret == -KVM_ENOSYS || ret == -KVM_EOPNOTSUPP)
-		return -ENODEV;
-
-	kvm_ptp_clock.caps = ptp_kvm_caps;
-
-	kvm_ptp_clock.ptp_clock = ptp_clock_register(&kvm_ptp_clock.caps, NULL);
-
-	return PTR_ERR_OR_ZERO(kvm_ptp_clock.ptp_clock);
-}
-
-module_init(ptp_kvm_init);
-module_exit(ptp_kvm_exit);
-
-MODULE_AUTHOR("Marcelo Tosatti <mtosatti@redhat.com>");
-MODULE_DESCRIPTION("PTP clock using KVMCLOCK");
-MODULE_LICENSE("GPL");
diff --git a/drivers/ptp/ptp_kvm_common.c b/drivers/ptp/ptp_kvm_common.c
new file mode 100644
index 000000000000..721ddcede5e1
--- /dev/null
+++ b/drivers/ptp/ptp_kvm_common.c
@@ -0,0 +1,157 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Virtual PTP 1588 clock for use with KVM guests
+ *
+ * Copyright (C) 2017 Red Hat Inc.
+ */
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/ptp_kvm.h>
+#include <uapi/linux/kvm_para.h>
+#include <asm/kvm_para.h>
+#include <uapi/asm/kvm_para.h>
+
+#include <linux/ptp_clock_kernel.h>
+
+struct kvm_ptp_clock {
+	struct ptp_clock *ptp_clock;
+	struct ptp_clock_info caps;
+};
+
+static DEFINE_SPINLOCK(kvm_ptp_lock);
+
+static int ptp_kvm_get_time_fn(ktime_t *device_time,
+			       struct system_counterval_t *system_counter,
+			       void *ctx)
+{
+	long ret;
+	u64 cycle;
+	struct timespec64 tspec;
+	struct clocksource *cs;
+
+	spin_lock(&kvm_ptp_lock);
+
+	preempt_disable_notrace();
+	ret = kvm_arch_ptp_get_crosststamp(&cycle, &tspec, &cs);
+	if (ret) {
+		spin_unlock(&kvm_ptp_lock);
+		preempt_enable_notrace();
+		return ret;
+	}
+
+	preempt_enable_notrace();
+
+	system_counter->cycles = cycle;
+	system_counter->cs = cs;
+
+	*device_time = timespec64_to_ktime(tspec);
+
+	spin_unlock(&kvm_ptp_lock);
+
+	return 0;
+}
+
+static int ptp_kvm_getcrosststamp(struct ptp_clock_info *ptp,
+				  struct system_device_crosststamp *xtstamp)
+{
+	return get_device_system_crosststamp(ptp_kvm_get_time_fn, NULL,
+					     NULL, xtstamp);
+}
+
+/*
+ * PTP clock operations
+ */
+
+static int ptp_kvm_adjfreq(struct ptp_clock_info *ptp, s32 ppb)
+{
+	return -EOPNOTSUPP;
+}
+
+static int ptp_kvm_adjtime(struct ptp_clock_info *ptp, s64 delta)
+{
+	return -EOPNOTSUPP;
+}
+
+static int ptp_kvm_settime(struct ptp_clock_info *ptp,
+			   const struct timespec64 *ts)
+{
+	return -EOPNOTSUPP;
+}
+
+static int ptp_kvm_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts)
+{
+	long ret;
+	struct timespec64 tspec;
+
+	spin_lock(&kvm_ptp_lock);
+
+	ret = kvm_arch_ptp_get_clock(&tspec);
+	if (ret) {
+		spin_unlock(&kvm_ptp_lock);
+		return ret;
+	}
+
+	spin_unlock(&kvm_ptp_lock);
+
+	memcpy(ts, &tspec, sizeof(struct timespec64));
+
+	return 0;
+}
+
+static int ptp_kvm_enable(struct ptp_clock_info *ptp,
+			  struct ptp_clock_request *rq, int on)
+{
+	return -EOPNOTSUPP;
+}
+
+static const struct ptp_clock_info ptp_kvm_caps = {
+	.owner		= THIS_MODULE,
+	.name		= "KVM virtual PTP",
+	.max_adj	= 0,
+	.n_ext_ts	= 0,
+	.n_pins		= 0,
+	.pps		= 0,
+	.adjfreq	= ptp_kvm_adjfreq,
+	.adjtime	= ptp_kvm_adjtime,
+	.gettime64	= ptp_kvm_gettime,
+	.settime64	= ptp_kvm_settime,
+	.enable		= ptp_kvm_enable,
+	.getcrosststamp = ptp_kvm_getcrosststamp,
+};
+
+/* module operations */
+
+static struct kvm_ptp_clock kvm_ptp_clock;
+
+static void __exit ptp_kvm_exit(void)
+{
+	ptp_clock_unregister(kvm_ptp_clock.ptp_clock);
+}
+
+static int __init ptp_kvm_init(void)
+{
+	long ret;
+
+	ret = kvm_arch_ptp_init();
+	if (ret) {
+		pr_err("fail to initialize ptp_kvm");
+		return ret;
+	}
+
+	kvm_ptp_clock.caps = ptp_kvm_caps;
+
+	kvm_ptp_clock.ptp_clock = ptp_clock_register(&kvm_ptp_clock.caps, NULL);
+
+	return PTR_ERR_OR_ZERO(kvm_ptp_clock.ptp_clock);
+}
+
+module_init(ptp_kvm_init);
+module_exit(ptp_kvm_exit);
+
+MODULE_AUTHOR("Marcelo Tosatti <mtosatti@redhat.com>");
+MODULE_DESCRIPTION("PTP clock using KVMCLOCK");
+MODULE_LICENSE("GPL");
diff --git a/drivers/ptp/ptp_kvm_x86.c b/drivers/ptp/ptp_kvm_x86.c
new file mode 100644
index 000000000000..3dd519dfc473
--- /dev/null
+++ b/drivers/ptp/ptp_kvm_x86.c
@@ -0,0 +1,97 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Virtual PTP 1588 clock for use with KVM guests
+ *
+ * Copyright (C) 2017 Red Hat Inc.
+ */
+
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <asm/pvclock.h>
+#include <asm/kvmclock.h>
+#include <linux/module.h>
+#include <uapi/asm/kvm_para.h>
+#include <uapi/linux/kvm_para.h>
+#include <linux/ptp_clock_kernel.h>
+#include <linux/ptp_kvm.h>
+
+struct pvclock_vsyscall_time_info *hv_clock;
+
+static phys_addr_t clock_pair_gpa;
+static struct kvm_clock_pairing clock_pair;
+
+int kvm_arch_ptp_init(void)
+{
+	long ret;
+
+	if (!kvm_para_available())
+		return -ENODEV;
+
+	clock_pair_gpa = slow_virt_to_phys(&clock_pair);
+	hv_clock = pvclock_get_pvti_cpu0_va();
+	if (!hv_clock)
+		return -ENODEV;
+
+	ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, clock_pair_gpa,
+			     KVM_CLOCK_PAIRING_WALLCLOCK);
+	if (ret == -KVM_ENOSYS || ret == -KVM_EOPNOTSUPP)
+		return -ENODEV;
+
+	return 0;
+}
+
+/* Fetch the KVM_CLOCK_PAIRING_WALLCLOCK time into @ts; 0 or -EOPNOTSUPP. */
+int kvm_arch_ptp_get_clock(struct timespec64 *ts)
+{
+	long ret;
+
+	ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, clock_pair_gpa,
+			     KVM_CLOCK_PAIRING_WALLCLOCK);
+	if (ret != 0) {
+		pr_err_ratelimited("clock offset hypercall ret %ld\n", ret);
+		return -EOPNOTSUPP;
+	}
+
+	ts->tv_sec = clock_pair.sec;
+	ts->tv_nsec = clock_pair.nsec;
+
+	return 0;
+}
+
+/* Pair the wall clock with kvm_clock cycles, retrying on pvclock updates. */
+int kvm_arch_ptp_get_crosststamp(u64 *cycle, struct timespec64 *tspec,
+			      struct clocksource **cs)
+{
+	struct pvclock_vcpu_time_info *src;
+	unsigned int version;
+	long ret;
+	int cpu;
+
+	cpu = smp_processor_id();
+	src = &hv_clock[cpu].pvti;
+
+	do {
+		/*
+		 * We are using a TSC value read in the host's
+		 * kvm_hc_clock_pairing handling.
+		 * So any changes to tsc_to_system_mul
+		 * and tsc_shift or any other pvclock
+		 * data invalidate that measurement.
+		 */
+		version = pvclock_read_begin(src);
+
+		ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, clock_pair_gpa,
+				     KVM_CLOCK_PAIRING_WALLCLOCK);
+		if (ret != 0) {
+			pr_err_ratelimited("clock pairing hypercall ret %ld\n", ret);
+			return -EOPNOTSUPP;
+		}
+		tspec->tv_sec = clock_pair.sec;
+		tspec->tv_nsec = clock_pair.nsec;
+		*cycle = __pvclock_read_cycles(src, clock_pair.tsc);
+	} while (pvclock_read_retry(src, version));
+
+	*cs = &kvm_clock;
+
+	return 0;
+}
diff --git a/include/linux/ptp_kvm.h b/include/linux/ptp_kvm.h
new file mode 100644
index 000000000000..f960a719f0d5
--- /dev/null
+++ b/include/linux/ptp_kvm.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Virtual PTP 1588 clock for use with KVM guests
+ *
+ * Copyright (C) 2017 Red Hat Inc.
+ */
+
+#ifndef _PTP_KVM_H_
+#define _PTP_KVM_H_
+
+struct timespec64;
+struct clocksource;
+
+int kvm_arch_ptp_init(void);
+int kvm_arch_ptp_get_clock(struct timespec64 *ts);
+int kvm_arch_ptp_get_crosststamp(u64 *cycle,
+		struct timespec64 *tspec, struct clocksource **cs);
+
+#endif /* _PTP_KVM_H_ */
-- 
cgit v1.2.3


From b2c67cbe9f447312f5cdd7c6641b463f2349aec0 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 9 Dec 2020 14:09:27 +0800
Subject: time: Add mechanism to recognize clocksource in time_get_snapshot

System time snapshots are not conveying information about the current
clocksource which was used, but callers like the PTP KVM guest
implementation have the requirement to evaluate the clocksource type to
select the appropriate mechanism.

Introduce a clocksource id field in struct clocksource which is by default
set to CSID_GENERIC (0). Clocksource implementations can set that field to
a value which allows the clocksource to be identified.

Store the clocksource id of the current clocksource in the
system_time_snapshot so callers can evaluate which clocksource was used to
take the snapshot and act accordingly.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Jianyong Wu <jianyong.wu@arm.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20201209060932.212364-5-jianyong.wu@arm.com
---
 include/linux/clocksource.h     |  6 ++++++
 include/linux/clocksource_ids.h | 11 +++++++++++
 include/linux/timekeeping.h     | 12 +++++++-----
 kernel/time/clocksource.c       |  2 ++
 kernel/time/timekeeping.c       |  1 +
 5 files changed, 27 insertions(+), 5 deletions(-)
 create mode 100644 include/linux/clocksource_ids.h

(limited to 'include/linux')

diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 86d143db6523..1290d0dce840 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -17,6 +17,7 @@
 #include <linux/timer.h>
 #include <linux/init.h>
 #include <linux/of.h>
+#include <linux/clocksource_ids.h>
 #include <asm/div64.h>
 #include <asm/io.h>
 
@@ -62,6 +63,10 @@ struct module;
  *			400-499: Perfect
  *				The ideal clocksource. A must-use where
  *				available.
+ * @id:			Defaults to CSID_GENERIC. The id value is captured
+ *			in certain snapshot functions to allow callers to
+ *			validate the clocksource from which the snapshot was
+ *			taken.
  * @flags:		Flags describing special properties
  * @enable:		Optional function to enable the clocksource
  * @disable:		Optional function to disable the clocksource
@@ -100,6 +105,7 @@ struct clocksource {
 	const char		*name;
 	struct list_head	list;
 	int			rating;
+	enum clocksource_ids	id;
 	enum vdso_clock_mode	vdso_clock_mode;
 	unsigned long		flags;
 
diff --git a/include/linux/clocksource_ids.h b/include/linux/clocksource_ids.h
new file mode 100644
index 000000000000..4d8e19e05328
--- /dev/null
+++ b/include/linux/clocksource_ids.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_CLOCKSOURCE_IDS_H
+#define _LINUX_CLOCKSOURCE_IDS_H
+
+/* Enum to give clocksources a unique identifier */
+enum clocksource_ids {
+	CSID_GENERIC		= 0,
+	CSID_MAX,
+};
+
+#endif
diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index c6792cf01bc7..78a98bdff76d 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -3,6 +3,7 @@
 #define _LINUX_TIMEKEEPING_H
 
 #include <linux/errno.h>
+#include <linux/clocksource_ids.h>
 
 /* Included from linux/ktime.h */
 
@@ -243,11 +244,12 @@ struct ktime_timestamps {
  * @cs_was_changed_seq:	The sequence number of clocksource change events
  */
 struct system_time_snapshot {
-	u64		cycles;
-	ktime_t		real;
-	ktime_t		raw;
-	unsigned int	clock_was_set_seq;
-	u8		cs_was_changed_seq;
+	u64			cycles;
+	ktime_t			real;
+	ktime_t			raw;
+	enum clocksource_ids	cs_id;
+	unsigned int		clock_was_set_seq;
+	u8			cs_was_changed_seq;
 };
 
 /**
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index cce484a2cc7c..4fe1df894ee5 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -920,6 +920,8 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
 
 	clocksource_arch_init(cs);
 
+	if (WARN_ON_ONCE((unsigned int)cs->id >= CSID_MAX))
+		cs->id = CSID_GENERIC;
 	if (cs->vdso_clock_mode < 0 ||
 	    cs->vdso_clock_mode >= VDSO_CLOCKMODE_MAX) {
 		pr_warn("clocksource %s registered with invalid VDSO mode %d. Disabling VDSO support.\n",
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 6aee5768c86f..06f55f9258bf 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1048,6 +1048,7 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
 	do {
 		seq = read_seqcount_begin(&tk_core.seq);
 		now = tk_clock_read(&tk->tkr_mono);
+		systime_snapshot->cs_id = tk->tkr_mono.clock->id;
 		systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq;
 		systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq;
 		base_real = ktime_add(tk->tkr_mono.base,
-- 
cgit v1.2.3


From 100148d0fc7dcf8672fe0ac83f44dc5749b4da5c Mon Sep 17 00:00:00 2001
From: Jianyong Wu <jianyong.wu@arm.com>
Date: Wed, 9 Dec 2020 14:09:28 +0800
Subject: clocksource: Add clocksource id for arm arch counter

Add clocksource id to the ARM generic counter so that it can be easily
identified from callers such as ptp_kvm.

Cc: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Andre Przywara <andre.przywara@arm.com>
Signed-off-by: Jianyong Wu <jianyong.wu@arm.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20201209060932.212364-6-jianyong.wu@arm.com
---
 drivers/clocksource/arm_arch_timer.c | 2 ++
 include/linux/clocksource_ids.h      | 1 +
 2 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c
index d0177824c518..8f12e223703f 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -16,6 +16,7 @@
 #include <linux/cpu_pm.h>
 #include <linux/clockchips.h>
 #include <linux/clocksource.h>
+#include <linux/clocksource_ids.h>
 #include <linux/interrupt.h>
 #include <linux/of_irq.h>
 #include <linux/of_address.h>
@@ -191,6 +192,7 @@ static u64 arch_counter_read_cc(const struct cyclecounter *cc)
 
 static struct clocksource clocksource_counter = {
 	.name	= "arch_sys_counter",
+	.id	= CSID_ARM_ARCH_COUNTER,
 	.rating	= 400,
 	.read	= arch_counter_read,
 	.mask	= CLOCKSOURCE_MASK(56),
diff --git a/include/linux/clocksource_ids.h b/include/linux/clocksource_ids.h
index 4d8e19e05328..16775d7d8f8d 100644
--- a/include/linux/clocksource_ids.h
+++ b/include/linux/clocksource_ids.h
@@ -5,6 +5,7 @@
 /* Enum to give clocksources a unique identifier */
 enum clocksource_ids {
 	CSID_GENERIC		= 0,
+	CSID_ARM_ARCH_COUNTER,
 	CSID_MAX,
 };
 
-- 
cgit v1.2.3


From 3bf725699bf62494b3e179f1795f08c7d749f061 Mon Sep 17 00:00:00 2001
From: Jianyong Wu <jianyong.wu@arm.com>
Date: Wed, 9 Dec 2020 14:09:29 +0800
Subject: KVM: arm64: Add support for the KVM PTP service

Implement the hypervisor side of the KVM PTP interface.

The service offers wall time and cycle count from host to guest.
The caller must specify whether they want the host's view of
either the virtual or physical counter.

Signed-off-by: Jianyong Wu <jianyong.wu@arm.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20201209060932.212364-7-jianyong.wu@arm.com
---
 Documentation/virt/kvm/api.rst         | 10 +++++++
 Documentation/virt/kvm/arm/index.rst   |  1 +
 Documentation/virt/kvm/arm/ptp_kvm.rst | 25 ++++++++++++++++
 arch/arm64/kvm/arm.c                   |  1 +
 arch/arm64/kvm/hypercalls.c            | 53 ++++++++++++++++++++++++++++++++++
 include/linux/arm-smccc.h              | 16 ++++++++++
 include/uapi/linux/kvm.h               |  1 +
 7 files changed, 107 insertions(+)
 create mode 100644 Documentation/virt/kvm/arm/ptp_kvm.rst

(limited to 'include/linux')

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 38e327d4b479..987d99e39887 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6724,3 +6724,13 @@ vcpu_info is set.
 The KVM_XEN_HVM_CONFIG_RUNSTATE flag indicates that the runstate-related
 features KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR/_CURRENT/_DATA/_ADJUST are
 supported by the KVM_XEN_VCPU_SET_ATTR/KVM_XEN_VCPU_GET_ATTR ioctls.
+
+8.31 KVM_CAP_PTP_KVM
+--------------------
+
+:Architectures: arm64
+
+This capability indicates that the KVM virtual PTP service is
+supported in the host. A VMM can check whether the service is
+available to the guest on migration.
+
diff --git a/Documentation/virt/kvm/arm/index.rst b/Documentation/virt/kvm/arm/index.rst
index 3e2b2aba90fc..78a9b670aafe 100644
--- a/Documentation/virt/kvm/arm/index.rst
+++ b/Documentation/virt/kvm/arm/index.rst
@@ -10,3 +10,4 @@ ARM
    hyp-abi
    psci
    pvtime
+   ptp_kvm
diff --git a/Documentation/virt/kvm/arm/ptp_kvm.rst b/Documentation/virt/kvm/arm/ptp_kvm.rst
new file mode 100644
index 000000000000..68cffb50d8bf
--- /dev/null
+++ b/Documentation/virt/kvm/arm/ptp_kvm.rst
@@ -0,0 +1,25 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+PTP_KVM support for arm/arm64
+=============================
+
+PTP_KVM is used for high precision time sync between host and guests.
+It relies on transferring the wall clock and counter value from the
+host to the guest using a KVM-specific hypercall.
+
+* ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID: 0x86000001
+
+This hypercall uses the SMC32/HVC32 calling convention:
+
+ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID
+    =============    ==========    ==========
+    Function ID:     (uint32)      0x86000001
+    Arguments:       (uint32)      KVM_PTP_VIRT_COUNTER(0)
+                                   KVM_PTP_PHYS_COUNTER(1)
+    Return Values:   (int32)       NOT_SUPPORTED(-1) on error, or
+                     (uint32)      Upper 32 bits of wall clock time (r0)
+                     (uint32)      Lower 32 bits of wall clock time (r1)
+                     (uint32)      Upper 32 bits of counter (r2)
+                     (uint32)      Lower 32 bits of counter (r3)
+    Endianness:                    No Restrictions.
+    =============    ==========    ==========
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 7f06ba76698d..46401798c644 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -206,6 +206,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_ARM_INJECT_EXT_DABT:
 	case KVM_CAP_SET_GUEST_DEBUG:
 	case KVM_CAP_VCPU_ATTRIBUTES:
+	case KVM_CAP_PTP_KVM:
 		r = 1;
 		break;
 	case KVM_CAP_ARM_SET_DEVICE_ADDR:
diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
index 78d32c34d49c..30da78f72b3b 100644
--- a/arch/arm64/kvm/hypercalls.c
+++ b/arch/arm64/kvm/hypercalls.c
@@ -9,6 +9,55 @@
 #include <kvm/arm_hypercalls.h>
 #include <kvm/arm_psci.h>
 
+static void kvm_ptp_get_time(struct kvm_vcpu *vcpu, u64 *val)
+{
+	struct system_time_snapshot systime_snapshot;
+	u64 cycles = ~0UL;
+	u32 feature;
+
+	/*
+	 * system time and counter value must be captured at the same
+	 * time to keep consistency and precision.
+	 */
+	ktime_get_snapshot(&systime_snapshot);
+
+	/*
+	 * This is only valid if the current clocksource is the
+	 * architected counter, as this is the only one the guest
+	 * can see.
+	 */
+	if (systime_snapshot.cs_id != CSID_ARM_ARCH_COUNTER)
+		return;
+
+	/*
+	 * The guest selects one of the two reference counters
+	 * (virtual or physical) with the first argument of the SMCCC
+	 * call. In case the identifier is not supported, error out.
+	 */
+	feature = smccc_get_arg1(vcpu);
+	switch (feature) {
+	case KVM_PTP_VIRT_COUNTER:
+		cycles = systime_snapshot.cycles - vcpu_read_sys_reg(vcpu, CNTVOFF_EL2);
+		break;
+	case KVM_PTP_PHYS_COUNTER:
+		cycles = systime_snapshot.cycles;
+		break;
+	default:
+		return;
+	}
+
+	/*
+	 * This relies on the top bit of val[0] never being set for
+	 * valid values of system time, because that is *really* far
+	 * in the future (about 292 years from 1970, and at that stage
+	 * nobody will give a damn about it).
+	 */
+	val[0] = upper_32_bits(systime_snapshot.real);
+	val[1] = lower_32_bits(systime_snapshot.real);
+	val[2] = upper_32_bits(cycles);
+	val[3] = lower_32_bits(cycles);
+}
+
 int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
 {
 	u32 func_id = smccc_get_function(vcpu);
@@ -79,6 +128,10 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
 		break;
 	case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID:
 		val[0] = BIT(ARM_SMCCC_KVM_FUNC_FEATURES);
+		val[0] |= BIT(ARM_SMCCC_KVM_FUNC_PTP);
+		break;
+	case ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID:
+		kvm_ptp_get_time(vcpu, val);
 		break;
 	case ARM_SMCCC_TRNG_VERSION:
 	case ARM_SMCCC_TRNG_FEATURES:
diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h
index 1a27bd9493fe..6861489a1890 100644
--- a/include/linux/arm-smccc.h
+++ b/include/linux/arm-smccc.h
@@ -103,6 +103,7 @@
 
 /* KVM "vendor specific" services */
 #define ARM_SMCCC_KVM_FUNC_FEATURES		0
+#define ARM_SMCCC_KVM_FUNC_PTP			1
 #define ARM_SMCCC_KVM_FUNC_FEATURES_2		127
 #define ARM_SMCCC_KVM_NUM_FUNCS			128
 
@@ -114,6 +115,21 @@
 
 #define SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED	1
 
+/*
+ * ptp_kvm is a feature used for time sync between vm and host.
+ * ptp_kvm module in guest kernel will get service from host using
+ * this hypercall ID.
+ */
+#define ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID				\
+	ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,				\
+			   ARM_SMCCC_SMC_32,				\
+			   ARM_SMCCC_OWNER_VENDOR_HYP,			\
+			   ARM_SMCCC_KVM_FUNC_PTP)
+
+/* ptp_kvm counter type ID */
+#define KVM_PTP_VIRT_COUNTER			0
+#define KVM_PTP_PHYS_COUNTER			1
+
 /* Paravirtualised time calls (defined by ARM DEN0057A) */
 #define ARM_SMCCC_HV_PV_TIME_FEATURES				\
 	ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,			\
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index f6afee209620..0e0f70c0d0dc 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1078,6 +1078,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_DIRTY_LOG_RING 192
 #define KVM_CAP_X86_BUS_LOCK_EXIT 193
 #define KVM_CAP_PPC_DAWR1 194
+#define KVM_CAP_PTP_KVM 195
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
cgit v1.2.3


From e84dff1bf0eaccd0231ecf02a8f5c9830d7d34dc Mon Sep 17 00:00:00 2001
From: Wan Jiabing <wanjiabing@vivo.com>
Date: Wed, 24 Mar 2021 15:29:11 +0800
Subject: PM: core: Remove duplicate declaration from header file

struct device is declared twice, so remove the duplicate.

Signed-off-by: Wan Jiabing <wanjiabing@vivo.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/pm.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pm.h b/include/linux/pm.h
index 482313a8ccfc..c9657408fee1 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -39,7 +39,6 @@ static inline void pm_vt_switch_unregister(struct device *dev)
  * Device power management
  */
 
-struct device;
 
 #ifdef CONFIG_PM
 extern const char power_group_name[];		/* = "power" */
-- 
cgit v1.2.3


From 2ab80d46fead0309d7f190d8023c8d64b2ffcbd5 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 29 Mar 2021 20:15:19 +0200
Subject: cpuidle: Use s64 as exit_latency_ns and target_residency_ns data type

Subsequent changes will cause the exit_latency_ns and target_residency_ns
fields in struct cpuidle_state to be used in computations in which data
type conversions to u64 may turn a negative number close to zero into
a very large positive number leading to incorrect results.

In preparation for that, change the data type of the fields mentioned
above to s64, but ensure that they will not be negative themselves.

No intentional functional impact.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpuidle/driver.c | 4 ++++
 include/linux/cpuidle.h  | 4 ++--
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c
index 4070e573bf43..f70aa17e2a8e 100644
--- a/drivers/cpuidle/driver.c
+++ b/drivers/cpuidle/driver.c
@@ -181,9 +181,13 @@ static void __cpuidle_driver_init(struct cpuidle_driver *drv)
 		 */
 		if (s->target_residency > 0)
 			s->target_residency_ns = s->target_residency * NSEC_PER_USEC;
+		else if (s->target_residency_ns < 0)
+			s->target_residency_ns = 0;
 
 		if (s->exit_latency > 0)
 			s->exit_latency_ns = s->exit_latency * NSEC_PER_USEC;
+		else if (s->exit_latency_ns < 0)
+			s->exit_latency_ns =  0;
 	}
 }
 
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index bd605b5585cf..fce476275e16 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -49,8 +49,8 @@ struct cpuidle_state {
 	char		name[CPUIDLE_NAME_LEN];
 	char		desc[CPUIDLE_DESC_LEN];
 
-	u64		exit_latency_ns;
-	u64		target_residency_ns;
+	s64		exit_latency_ns;
+	s64		target_residency_ns;
 	unsigned int	flags;
 	unsigned int	exit_latency; /* in US */
 	int		power_usage; /* in mW */
-- 
cgit v1.2.3


From b3084079c1779645222d5ea4e9da88d38026cae8 Mon Sep 17 00:00:00 2001
From: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Date: Wed, 31 Mar 2021 13:57:12 +0300
Subject: clk: mux: provide devm_clk_hw_register_mux()

Add devm_clk_hw_register_mux() - devres-managed version of
clk_hw_register_mux().

Signed-off-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Reviewed-by: Abhinav Kumar <abhinavk@codeaurora.org>
Acked-by: Stephen Boyd <sboyd@kernel.org>
Link: https://lore.kernel.org/r/20210331105735.3690009-2-dmitry.baryshkov@linaro.org
Signed-off-by: Rob Clark <robdclark@chromium.org>
---
 drivers/clk/clk-mux.c        | 35 +++++++++++++++++++++++++++++++++++
 include/linux/clk-provider.h | 13 +++++++++++++
 2 files changed, 48 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/clk/clk-mux.c b/drivers/clk/clk-mux.c
index e54e79714818..20582aae7a35 100644
--- a/drivers/clk/clk-mux.c
+++ b/drivers/clk/clk-mux.c
@@ -8,6 +8,7 @@
  */
 
 #include <linux/clk-provider.h>
+#include <linux/device.h>
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/io.h>
@@ -206,6 +207,40 @@ struct clk_hw *__clk_hw_register_mux(struct device *dev, struct device_node *np,
 }
 EXPORT_SYMBOL_GPL(__clk_hw_register_mux);
 
+static void devm_clk_hw_release_mux(struct device *dev, void *res)
+{
+	clk_hw_unregister_mux(*(struct clk_hw **)res);
+}
+
+/* devres-managed __clk_hw_register_mux(): released through @dev's devres. */
+struct clk_hw *__devm_clk_hw_register_mux(struct device *dev, struct device_node *np,
+		const char *name, u8 num_parents,
+		const char * const *parent_names,
+		const struct clk_hw **parent_hws,
+		const struct clk_parent_data *parent_data,
+		unsigned long flags, void __iomem *reg, u8 shift, u32 mask,
+		u8 clk_mux_flags, u32 *table, spinlock_t *lock)
+{
+	struct clk_hw **slot, *mux;
+
+	slot = devres_alloc(devm_clk_hw_release_mux, sizeof(*slot), GFP_KERNEL);
+	if (!slot)
+		return ERR_PTR(-ENOMEM);
+
+	mux = __clk_hw_register_mux(dev, np, name, num_parents, parent_names, parent_hws,
+				       parent_data, flags, reg, shift, mask,
+				       clk_mux_flags, table, lock);
+	if (IS_ERR(mux)) {
+		devres_free(slot);
+		return mux;
+	}
+
+	*slot = mux;
+	devres_add(dev, slot);
+	return mux;
+}
+EXPORT_SYMBOL_GPL(__devm_clk_hw_register_mux);
+
 struct clk *clk_register_mux_table(struct device *dev, const char *name,
 		const char * const *parent_names, u8 num_parents,
 		unsigned long flags, void __iomem *reg, u8 shift, u32 mask,
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 58f6fe866ae9..3eb15e0262f5 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -868,6 +868,13 @@ struct clk_hw *__clk_hw_register_mux(struct device *dev, struct device_node *np,
 		const struct clk_parent_data *parent_data,
 		unsigned long flags, void __iomem *reg, u8 shift, u32 mask,
 		u8 clk_mux_flags, u32 *table, spinlock_t *lock);
+struct clk_hw *__devm_clk_hw_register_mux(struct device *dev, struct device_node *np,
+		const char *name, u8 num_parents,
+		const char * const *parent_names,
+		const struct clk_hw **parent_hws,
+		const struct clk_parent_data *parent_data,
+		unsigned long flags, void __iomem *reg, u8 shift, u32 mask,
+		u8 clk_mux_flags, u32 *table, spinlock_t *lock);
 struct clk *clk_register_mux_table(struct device *dev, const char *name,
 		const char * const *parent_names, u8 num_parents,
 		unsigned long flags, void __iomem *reg, u8 shift, u32 mask,
@@ -902,6 +909,12 @@ struct clk *clk_register_mux_table(struct device *dev, const char *name,
 	__clk_hw_register_mux((dev), NULL, (name), (num_parents), NULL, NULL, \
 			      (parent_data), (flags), (reg), (shift),	      \
 			      BIT((width)) - 1, (clk_mux_flags), NULL, (lock))
+#define devm_clk_hw_register_mux(dev, name, parent_names, num_parents, flags, reg, \
+			    shift, width, clk_mux_flags, lock)		      \
+	__devm_clk_hw_register_mux((dev), NULL, (name), (num_parents),	      \
+			      (parent_names), NULL, NULL, (flags), (reg),     \
+			      (shift), BIT((width)) - 1, (clk_mux_flags),     \
+			      NULL, (lock))
 
 int clk_mux_val_to_index(struct clk_hw *hw, u32 *table, unsigned int flags,
 			 unsigned int val);
-- 
cgit v1.2.3


From f4b43ac0b0af1d115effd08133046a694ac33dc2 Mon Sep 17 00:00:00 2001
From: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Date: Wed, 31 Mar 2021 13:57:13 +0300
Subject: clk: divider: add devm_clk_hw_register_divider

Add devm_clk_hw_register_divider() - devres version of
clk_hw_register_divider().

Signed-off-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Reviewed-by: Abhinav Kumar <abhinavk@codeaurora.org>
Acked-by: Stephen Boyd <sboyd@kernel.org>
Link: https://lore.kernel.org/r/20210331105735.3690009-3-dmitry.baryshkov@linaro.org
Signed-off-by: Rob Clark <robdclark@chromium.org>
---
 include/linux/clk-provider.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 3eb15e0262f5..162a2e5546a3 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -785,6 +785,23 @@ struct clk *clk_register_divider_table(struct device *dev, const char *name,
 				  (parent_data), (flags), (reg), (shift),     \
 				  (width), (clk_divider_flags), (table),      \
 				  (lock))
+/**
+ * devm_clk_hw_register_divider - register a divider clock with the clock framework
+ * @dev: device registering this clock
+ * @name: name of this clock
+ * @parent_name: name of clock's parent
+ * @flags: framework-specific flags
+ * @reg: register address to adjust divider
+ * @shift: number of bits to shift the bitfield
+ * @width: width of the bitfield
+ * @clk_divider_flags: divider-specific flags for this clock
+ * @lock: shared register lock for this clock
+ */
+#define devm_clk_hw_register_divider(dev, name, parent_name, flags, reg, shift,    \
+				width, clk_divider_flags, lock)		      \
+	__devm_clk_hw_register_divider((dev), NULL, (name), (parent_name), NULL,   \
+				  NULL, (flags), (reg), (shift), (width),     \
+				  (clk_divider_flags), NULL, (lock))
 /**
  * devm_clk_hw_register_divider_table - register a table based divider clock
  * with the clock framework (devres variant)
-- 
cgit v1.2.3


From 56f15e2cb1f77fbcf9df38de7e5dcb4b37070196 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 6 Apr 2021 17:23:59 -0700
Subject: ethtool: document PHY tunable callbacks

Add missing kdoc for phy tunable callbacks.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 3583f7fc075c..5c631a298994 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -410,6 +410,8 @@ struct ethtool_pause_stats {
  * @get_ethtool_phy_stats: Return extended statistics about the PHY device.
  *	This is only useful if the device maintains PHY statistics and
  *	cannot use the standard PHY library helpers.
+ * @get_phy_tunable: Read the value of a PHY tunable.
+ * @set_phy_tunable: Set the value of a PHY tunable.
  *
  * All operations are optional (i.e. the function pointer may be set
  * to %NULL) and callers must take this into account.  Callers must
-- 
cgit v1.2.3


From f0ebc2b6b7df7716749445cda26734b3826a48cf Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 6 Apr 2021 17:28:25 -0700
Subject: ethtool: un-kdocify extended link state

Extended link state structures and enums use kdoc headers
but then do not describe any of the members.

Convert to normal comments.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h      |  4 +---
 include/uapi/linux/ethtool.h | 26 ++++++--------------------
 2 files changed, 7 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index ec4cd3921c67..a2b1a21ee7fd 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -87,9 +87,7 @@ u32 ethtool_op_get_link(struct net_device *dev);
 int ethtool_op_get_ts_info(struct net_device *dev, struct ethtool_ts_info *eti);
 
 
-/**
- * struct ethtool_link_ext_state_info - link extended state and substate.
- */
+/* Link extended state and substate. */
 struct ethtool_link_ext_state_info {
 	enum ethtool_link_ext_state link_ext_state;
 	union {
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index cde753bb2093..dc87ba092891 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -579,9 +579,7 @@ struct ethtool_pauseparam {
 	__u32	tx_pause;
 };
 
-/**
- * enum ethtool_link_ext_state - link extended state
- */
+/* Link extended state */
 enum ethtool_link_ext_state {
 	ETHTOOL_LINK_EXT_STATE_AUTONEG,
 	ETHTOOL_LINK_EXT_STATE_LINK_TRAINING_FAILURE,
@@ -595,10 +593,7 @@ enum ethtool_link_ext_state {
 	ETHTOOL_LINK_EXT_STATE_OVERHEAT,
 };
 
-/**
- * enum ethtool_link_ext_substate_autoneg - more information in addition to
- * ETHTOOL_LINK_EXT_STATE_AUTONEG.
- */
+/* More information in addition to ETHTOOL_LINK_EXT_STATE_AUTONEG. */
 enum ethtool_link_ext_substate_autoneg {
 	ETHTOOL_LINK_EXT_SUBSTATE_AN_NO_PARTNER_DETECTED = 1,
 	ETHTOOL_LINK_EXT_SUBSTATE_AN_ACK_NOT_RECEIVED,
@@ -608,9 +603,7 @@ enum ethtool_link_ext_substate_autoneg {
 	ETHTOOL_LINK_EXT_SUBSTATE_AN_NO_HCD,
 };
 
-/**
- * enum ethtool_link_ext_substate_link_training - more information in addition to
- * ETHTOOL_LINK_EXT_STATE_LINK_TRAINING_FAILURE.
+/* More information in addition to ETHTOOL_LINK_EXT_STATE_LINK_TRAINING_FAILURE.
  */
 enum ethtool_link_ext_substate_link_training {
 	ETHTOOL_LINK_EXT_SUBSTATE_LT_KR_FRAME_LOCK_NOT_ACQUIRED = 1,
@@ -619,9 +612,7 @@ enum ethtool_link_ext_substate_link_training {
 	ETHTOOL_LINK_EXT_SUBSTATE_LT_REMOTE_FAULT,
 };
 
-/**
- * enum ethtool_link_ext_substate_logical_mismatch - more information in addition
- * to ETHTOOL_LINK_EXT_STATE_LINK_LOGICAL_MISMATCH.
+/* More information in addition to ETHTOOL_LINK_EXT_STATE_LINK_LOGICAL_MISMATCH.
  */
 enum ethtool_link_ext_substate_link_logical_mismatch {
 	ETHTOOL_LINK_EXT_SUBSTATE_LLM_PCS_DID_NOT_ACQUIRE_BLOCK_LOCK = 1,
@@ -631,19 +622,14 @@ enum ethtool_link_ext_substate_link_logical_mismatch {
 	ETHTOOL_LINK_EXT_SUBSTATE_LLM_RS_FEC_IS_NOT_LOCKED,
 };
 
-/**
- * enum ethtool_link_ext_substate_bad_signal_integrity - more information in
- * addition to ETHTOOL_LINK_EXT_STATE_BAD_SIGNAL_INTEGRITY.
+/* More information in addition to ETHTOOL_LINK_EXT_STATE_BAD_SIGNAL_INTEGRITY.
  */
 enum ethtool_link_ext_substate_bad_signal_integrity {
 	ETHTOOL_LINK_EXT_SUBSTATE_BSI_LARGE_NUMBER_OF_PHYSICAL_ERRORS = 1,
 	ETHTOOL_LINK_EXT_SUBSTATE_BSI_UNSUPPORTED_RATE,
 };
 
-/**
- * enum ethtool_link_ext_substate_cable_issue - more information in
- * addition to ETHTOOL_LINK_EXT_STATE_CABLE_ISSUE.
- */
+/* More information in addition to ETHTOOL_LINK_EXT_STATE_CABLE_ISSUE. */
 enum ethtool_link_ext_substate_cable_issue {
 	ETHTOOL_LINK_EXT_SUBSTATE_CI_UNSUPPORTED_CABLE = 1,
 	ETHTOOL_LINK_EXT_SUBSTATE_CI_CABLE_TEST_FAILURE,
-- 
cgit v1.2.3


From d9c65de0c1e1574d2cc8007dbe02291fe47db1d9 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 6 Apr 2021 17:28:27 -0700
Subject: ethtool: fix kdoc in headers

Fix remaining issues with kdoc in the ethtool headers.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h      | 9 +++++++--
 include/uapi/linux/ethtool.h | 6 ++++++
 2 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index a2b1a21ee7fd..7c88dfff7420 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -290,6 +290,9 @@ struct ethtool_pause_stats {
  *	do not attach ext_substate attribute to netlink message). If link_ext_state
  *	and link_ext_substate are unknown, return -ENODATA. If not implemented,
  *	link_ext_state and link_ext_substate will not be sent to userspace.
+ * @get_eeprom_len: Read range of EEPROM addresses for validation of
+ *	@get_eeprom and @set_eeprom requests.
+ *	Returns 0 if device does not support EEPROM access.
  * @get_eeprom: Read data from the device EEPROM.
  *	Should fill in the magic field.  Don't need to check len for zero
  *	or wraparound.  Fill in the data argument with the eeprom values
@@ -382,6 +385,8 @@ struct ethtool_pause_stats {
  * @get_module_eeprom: Get the eeprom information from the plug-in module
  * @get_eee: Get Energy-Efficient (EEE) supported and status.
  * @set_eee: Set EEE status (enable/disable) as well as LPI timers.
+ * @get_tunable: Read the value of a driver / device tunable.
+ * @set_tunable: Set the value of a driver / device tunable.
  * @get_per_queue_coalesce: Get interrupt coalescing parameters per queue.
  *	It must check that the given queue number is valid. If neither a RX nor
  *	a TX queue has this number, return -EINVAL. If only a RX queue or a TX
@@ -545,8 +550,8 @@ struct phy_tdr_config;
  * @get_sset_count: Get number of strings that @get_strings will write.
  * @get_strings: Return a set of strings that describe the requested objects
  * @get_stats: Return extended statistics about the PHY device.
- * @start_cable_test - Start a cable test
- * @start_cable_test_tdr - Start a Time Domain Reflectometry cable test
+ * @start_cable_test: Start a cable test
+ * @start_cable_test_tdr: Start a Time Domain Reflectometry cable test
  *
  * All operations are optional (i.e. the function pointer may be set to %NULL)
  * and callers must take this into account. Callers must hold the RTNL lock.
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index c9c18e88c215..5afea692a3f7 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -659,6 +659,7 @@ enum ethtool_link_ext_substate_cable_issue {
  *	now deprecated
  * @ETH_SS_FEATURES: Device feature names
  * @ETH_SS_RSS_HASH_FUNCS: RSS hush function names
+ * @ETH_SS_TUNABLES: tunable names
  * @ETH_SS_PHY_STATS: Statistic names, for use with %ETHTOOL_GPHYSTATS
  * @ETH_SS_PHY_TUNABLES: PHY tunable names
  * @ETH_SS_LINK_MODES: link mode names
@@ -668,6 +669,8 @@ enum ethtool_link_ext_substate_cable_issue {
  * @ETH_SS_TS_TX_TYPES: timestamping Tx types
  * @ETH_SS_TS_RX_FILTERS: timestamping Rx filters
  * @ETH_SS_UDP_TUNNEL_TYPES: UDP tunnel types
+ *
+ * @ETH_SS_COUNT: number of defined string sets
  */
 enum ethtool_stringset {
 	ETH_SS_TEST		= 0,
@@ -1962,8 +1965,11 @@ enum ethtool_reset_flags {
  *	autonegotiation; 0 if unknown or not applicable.  Read-only.
  * @transceiver: Used to distinguish different possible PHY types,
  *	reported consistently by PHYLIB.  Read-only.
+ * @master_slave_cfg: Master/slave port mode.
+ * @master_slave_state: Master/slave port state.
  * @reserved: Reserved for future use; see the note on reserved space.
  * @reserved1: Reserved for future use; see the note on reserved space.
+ * @link_mode_masks: Variable length bitmaps.
  *
  * If autonegotiation is disabled, the speed and @duplex represent the
  * fixed link mode and are writable if the driver supports multiple
-- 
cgit v1.2.3


From 2a3d15f270efa50d78d8a32d895e9d5396668f3a Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Tue, 6 Apr 2021 16:40:26 -0300
Subject: vfio/mdev: Add missing typesafety around mdev_device

The mdev API should accept and pass a 'struct mdev_device *' in all
places, not pass a 'struct device *' and cast it internally with
to_mdev_device(). Particularly in its struct mdev_driver functions, the
whole point of a bus's struct device_driver wrapper is to provide type
safety compared to the default struct device_driver.

Further, the driver core standard is for bus drivers to expose their
device structure in their public headers that can be used with
container_of() inlines and '&foo->dev' to go between the class levels, and
'&foo->dev' to be used with dev_err/etc driver core helper functions. Move
'struct mdev_device' to mdev.h

Once done, this allows moving some one-instruction exported functions to
static inlines, which in turn allows removing one of the two grotesque
symbol_get()'s related to mdev in the core code.

Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Message-Id: <3-v2-d36939638fc6+d54-vfio2_jgg@nvidia.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 Documentation/driver-api/vfio-mediated-device.rst |  4 +-
 drivers/vfio/mdev/mdev_core.c                     | 64 +++--------------------
 drivers/vfio/mdev/mdev_driver.c                   |  4 +-
 drivers/vfio/mdev/mdev_private.h                  | 23 ++------
 drivers/vfio/mdev/mdev_sysfs.c                    | 26 +++++----
 drivers/vfio/mdev/vfio_mdev.c                     |  7 ++-
 drivers/vfio/vfio_iommu_type1.c                   | 25 +++------
 include/linux/mdev.h                              | 58 +++++++++++++++-----
 8 files changed, 83 insertions(+), 128 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/driver-api/vfio-mediated-device.rst b/Documentation/driver-api/vfio-mediated-device.rst
index 25eb7d5b834b..c43c1dc33333 100644
--- a/Documentation/driver-api/vfio-mediated-device.rst
+++ b/Documentation/driver-api/vfio-mediated-device.rst
@@ -105,8 +105,8 @@ structure to represent a mediated device's driver::
       */
      struct mdev_driver {
 	     const char *name;
-	     int  (*probe)  (struct device *dev);
-	     void (*remove) (struct device *dev);
+	     int  (*probe)  (struct mdev_device *dev);
+	     void (*remove) (struct mdev_device *dev);
 	     struct device_driver    driver;
      };
 
diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c
index 6de97d25a3f8..057922a1707e 100644
--- a/drivers/vfio/mdev/mdev_core.c
+++ b/drivers/vfio/mdev/mdev_core.c
@@ -33,36 +33,6 @@ struct device *mdev_parent_dev(struct mdev_device *mdev)
 }
 EXPORT_SYMBOL(mdev_parent_dev);
 
-void *mdev_get_drvdata(struct mdev_device *mdev)
-{
-	return mdev->driver_data;
-}
-EXPORT_SYMBOL(mdev_get_drvdata);
-
-void mdev_set_drvdata(struct mdev_device *mdev, void *data)
-{
-	mdev->driver_data = data;
-}
-EXPORT_SYMBOL(mdev_set_drvdata);
-
-struct device *mdev_dev(struct mdev_device *mdev)
-{
-	return &mdev->dev;
-}
-EXPORT_SYMBOL(mdev_dev);
-
-struct mdev_device *mdev_from_dev(struct device *dev)
-{
-	return dev_is_mdev(dev) ? to_mdev_device(dev) : NULL;
-}
-EXPORT_SYMBOL(mdev_from_dev);
-
-const guid_t *mdev_uuid(struct mdev_device *mdev)
-{
-	return &mdev->uuid;
-}
-EXPORT_SYMBOL(mdev_uuid);
-
 /* Should be called holding parent_list_lock */
 static struct mdev_parent *__find_parent_device(struct device *dev)
 {
@@ -107,7 +77,7 @@ static void mdev_device_remove_common(struct mdev_device *mdev)
 	int ret;
 
 	type = to_mdev_type(mdev->type_kobj);
-	mdev_remove_sysfs_files(&mdev->dev, type);
+	mdev_remove_sysfs_files(mdev, type);
 	device_del(&mdev->dev);
 	parent = mdev->parent;
 	lockdep_assert_held(&parent->unreg_sem);
@@ -122,12 +92,10 @@ static void mdev_device_remove_common(struct mdev_device *mdev)
 
 static int mdev_device_remove_cb(struct device *dev, void *data)
 {
-	if (dev_is_mdev(dev)) {
-		struct mdev_device *mdev;
+	struct mdev_device *mdev = mdev_from_dev(dev);
 
-		mdev = to_mdev_device(dev);
+	if (mdev)
 		mdev_device_remove_common(mdev);
-	}
 	return 0;
 }
 
@@ -332,7 +300,7 @@ int mdev_device_create(struct kobject *kobj,
 	if (ret)
 		goto add_fail;
 
-	ret = mdev_create_sysfs_files(&mdev->dev, type);
+	ret = mdev_create_sysfs_files(mdev, type);
 	if (ret)
 		goto sysfs_fail;
 
@@ -354,13 +322,11 @@ mdev_fail:
 	return ret;
 }
 
-int mdev_device_remove(struct device *dev)
+int mdev_device_remove(struct mdev_device *mdev)
 {
-	struct mdev_device *mdev, *tmp;
+	struct mdev_device *tmp;
 	struct mdev_parent *parent;
 
-	mdev = to_mdev_device(dev);
-
 	mutex_lock(&mdev_list_lock);
 	list_for_each_entry(tmp, &mdev_list, next) {
 		if (tmp == mdev)
@@ -390,24 +356,6 @@ int mdev_device_remove(struct device *dev)
 	return 0;
 }
 
-int mdev_set_iommu_device(struct device *dev, struct device *iommu_device)
-{
-	struct mdev_device *mdev = to_mdev_device(dev);
-
-	mdev->iommu_device = iommu_device;
-
-	return 0;
-}
-EXPORT_SYMBOL(mdev_set_iommu_device);
-
-struct device *mdev_get_iommu_device(struct device *dev)
-{
-	struct mdev_device *mdev = to_mdev_device(dev);
-
-	return mdev->iommu_device;
-}
-EXPORT_SYMBOL(mdev_get_iommu_device);
-
 static int __init mdev_init(void)
 {
 	return mdev_bus_register();
diff --git a/drivers/vfio/mdev/mdev_driver.c b/drivers/vfio/mdev/mdev_driver.c
index 0d3223aee20b..44c3ba7e56d9 100644
--- a/drivers/vfio/mdev/mdev_driver.c
+++ b/drivers/vfio/mdev/mdev_driver.c
@@ -48,7 +48,7 @@ static int mdev_probe(struct device *dev)
 		return ret;
 
 	if (drv && drv->probe) {
-		ret = drv->probe(dev);
+		ret = drv->probe(mdev);
 		if (ret)
 			mdev_detach_iommu(mdev);
 	}
@@ -62,7 +62,7 @@ static int mdev_remove(struct device *dev)
 	struct mdev_device *mdev = to_mdev_device(dev);
 
 	if (drv && drv->remove)
-		drv->remove(dev);
+		drv->remove(mdev);
 
 	mdev_detach_iommu(mdev);
 
diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h
index fdbb3bee99a9..97e2225f7f49 100644
--- a/drivers/vfio/mdev/mdev_private.h
+++ b/drivers/vfio/mdev/mdev_private.h
@@ -24,23 +24,6 @@ struct mdev_parent {
 	struct rw_semaphore unreg_sem;
 };
 
-struct mdev_device {
-	struct device dev;
-	struct mdev_parent *parent;
-	guid_t uuid;
-	void *driver_data;
-	struct list_head next;
-	struct kobject *type_kobj;
-	struct device *iommu_device;
-	bool active;
-};
-
-static inline struct mdev_device *to_mdev_device(struct device *dev)
-{
-	return container_of(dev, struct mdev_device, dev);
-}
-#define dev_is_mdev(d)		((d)->bus == &mdev_bus_type)
-
 struct mdev_type {
 	struct kobject kobj;
 	struct kobject *devices_kobj;
@@ -57,11 +40,11 @@ struct mdev_type {
 int  parent_create_sysfs_files(struct mdev_parent *parent);
 void parent_remove_sysfs_files(struct mdev_parent *parent);
 
-int  mdev_create_sysfs_files(struct device *dev, struct mdev_type *type);
-void mdev_remove_sysfs_files(struct device *dev, struct mdev_type *type);
+int  mdev_create_sysfs_files(struct mdev_device *mdev, struct mdev_type *type);
+void mdev_remove_sysfs_files(struct mdev_device *mdev, struct mdev_type *type);
 
 int  mdev_device_create(struct kobject *kobj,
 			struct device *dev, const guid_t *uuid);
-int  mdev_device_remove(struct device *dev);
+int  mdev_device_remove(struct mdev_device *dev);
 
 #endif /* MDEV_PRIVATE_H */
diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c
index 367ff5412a38..18114f3e090a 100644
--- a/drivers/vfio/mdev/mdev_sysfs.c
+++ b/drivers/vfio/mdev/mdev_sysfs.c
@@ -225,6 +225,7 @@ create_err:
 static ssize_t remove_store(struct device *dev, struct device_attribute *attr,
 			    const char *buf, size_t count)
 {
+	struct mdev_device *mdev = to_mdev_device(dev);
 	unsigned long val;
 
 	if (kstrtoul(buf, 0, &val) < 0)
@@ -233,7 +234,7 @@ static ssize_t remove_store(struct device *dev, struct device_attribute *attr,
 	if (val && device_remove_file_self(dev, attr)) {
 		int ret;
 
-		ret = mdev_device_remove(dev);
+		ret = mdev_device_remove(mdev);
 		if (ret)
 			return ret;
 	}
@@ -248,34 +249,37 @@ static const struct attribute *mdev_device_attrs[] = {
 	NULL,
 };
 
-int  mdev_create_sysfs_files(struct device *dev, struct mdev_type *type)
+int mdev_create_sysfs_files(struct mdev_device *mdev, struct mdev_type *type)
 {
+	struct kobject *kobj = &mdev->dev.kobj;
 	int ret;
 
-	ret = sysfs_create_link(type->devices_kobj, &dev->kobj, dev_name(dev));
+	ret = sysfs_create_link(type->devices_kobj, kobj, dev_name(&mdev->dev));
 	if (ret)
 		return ret;
 
-	ret = sysfs_create_link(&dev->kobj, &type->kobj, "mdev_type");
+	ret = sysfs_create_link(kobj, &type->kobj, "mdev_type");
 	if (ret)
 		goto type_link_failed;
 
-	ret = sysfs_create_files(&dev->kobj, mdev_device_attrs);
+	ret = sysfs_create_files(kobj, mdev_device_attrs);
 	if (ret)
 		goto create_files_failed;
 
 	return ret;
 
 create_files_failed:
-	sysfs_remove_link(&dev->kobj, "mdev_type");
+	sysfs_remove_link(kobj, "mdev_type");
 type_link_failed:
-	sysfs_remove_link(type->devices_kobj, dev_name(dev));
+	sysfs_remove_link(type->devices_kobj, dev_name(&mdev->dev));
 	return ret;
 }
 
-void mdev_remove_sysfs_files(struct device *dev, struct mdev_type *type)
+void mdev_remove_sysfs_files(struct mdev_device *mdev, struct mdev_type *type)
 {
-	sysfs_remove_files(&dev->kobj, mdev_device_attrs);
-	sysfs_remove_link(&dev->kobj, "mdev_type");
-	sysfs_remove_link(type->devices_kobj, dev_name(dev));
+	struct kobject *kobj = &mdev->dev.kobj;
+
+	sysfs_remove_files(kobj, mdev_device_attrs);
+	sysfs_remove_link(kobj, "mdev_type");
+	sysfs_remove_link(type->devices_kobj, dev_name(&mdev->dev));
 }
diff --git a/drivers/vfio/mdev/vfio_mdev.c b/drivers/vfio/mdev/vfio_mdev.c
index ae7e322fbe3c..91b7b8b9eb9c 100644
--- a/drivers/vfio/mdev/vfio_mdev.c
+++ b/drivers/vfio/mdev/vfio_mdev.c
@@ -124,9 +124,8 @@ static const struct vfio_device_ops vfio_mdev_dev_ops = {
 	.request	= vfio_mdev_request,
 };
 
-static int vfio_mdev_probe(struct device *dev)
+static int vfio_mdev_probe(struct mdev_device *mdev)
 {
-	struct mdev_device *mdev = to_mdev_device(dev);
 	struct vfio_device *vdev;
 	int ret;
 
@@ -144,9 +143,9 @@ static int vfio_mdev_probe(struct device *dev)
 	return 0;
 }
 
-static void vfio_mdev_remove(struct device *dev)
+static void vfio_mdev_remove(struct mdev_device *mdev)
 {
-	struct vfio_device *vdev = dev_get_drvdata(dev);
+	struct vfio_device *vdev = dev_get_drvdata(&mdev->dev);
 
 	vfio_unregister_group_dev(vdev);
 	kfree(vdev);
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 5fb1bb52b057..53b4a6345a23 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -1933,28 +1933,13 @@ static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions,
 	return ret;
 }
 
-static struct device *vfio_mdev_get_iommu_device(struct device *dev)
-{
-	struct device *(*fn)(struct device *dev);
-	struct device *iommu_device;
-
-	fn = symbol_get(mdev_get_iommu_device);
-	if (fn) {
-		iommu_device = fn(dev);
-		symbol_put(mdev_get_iommu_device);
-
-		return iommu_device;
-	}
-
-	return NULL;
-}
-
 static int vfio_mdev_attach_domain(struct device *dev, void *data)
 {
+	struct mdev_device *mdev = to_mdev_device(dev);
 	struct iommu_domain *domain = data;
 	struct device *iommu_device;
 
-	iommu_device = vfio_mdev_get_iommu_device(dev);
+	iommu_device = mdev_get_iommu_device(mdev);
 	if (iommu_device) {
 		if (iommu_dev_feature_enabled(iommu_device, IOMMU_DEV_FEAT_AUX))
 			return iommu_aux_attach_device(domain, iommu_device);
@@ -1967,10 +1952,11 @@ static int vfio_mdev_attach_domain(struct device *dev, void *data)
 
 static int vfio_mdev_detach_domain(struct device *dev, void *data)
 {
+	struct mdev_device *mdev = to_mdev_device(dev);
 	struct iommu_domain *domain = data;
 	struct device *iommu_device;
 
-	iommu_device = vfio_mdev_get_iommu_device(dev);
+	iommu_device = mdev_get_iommu_device(mdev);
 	if (iommu_device) {
 		if (iommu_dev_feature_enabled(iommu_device, IOMMU_DEV_FEAT_AUX))
 			iommu_aux_detach_device(domain, iommu_device);
@@ -2018,9 +2004,10 @@ static bool vfio_bus_is_mdev(struct bus_type *bus)
 
 static int vfio_mdev_iommu_device(struct device *dev, void *data)
 {
+	struct mdev_device *mdev = to_mdev_device(dev);
 	struct device **old = data, *new;
 
-	new = vfio_mdev_get_iommu_device(dev);
+	new = mdev_get_iommu_device(mdev);
 	if (!new || (*old && *old != new))
 		return -EINVAL;
 
diff --git a/include/linux/mdev.h b/include/linux/mdev.h
index 27eb383cb95d..52f7ea19dd0f 100644
--- a/include/linux/mdev.h
+++ b/include/linux/mdev.h
@@ -10,7 +10,21 @@
 #ifndef MDEV_H
 #define MDEV_H
 
-struct mdev_device;
+struct mdev_device {
+	struct device dev;
+	struct mdev_parent *parent;
+	guid_t uuid;
+	void *driver_data;
+	struct list_head next;
+	struct kobject *type_kobj;
+	struct device *iommu_device;
+	bool active;
+};
+
+static inline struct mdev_device *to_mdev_device(struct device *dev)
+{
+	return container_of(dev, struct mdev_device, dev);
+}
 
 /*
  * Called by the parent device driver to set the device which represents
@@ -19,12 +33,17 @@ struct mdev_device;
  *
  * @dev: the mediated device that iommu will isolate.
  * @iommu_device: a pci device which represents the iommu for @dev.
- *
- * Return 0 for success, otherwise negative error value.
  */
-int mdev_set_iommu_device(struct device *dev, struct device *iommu_device);
+static inline void mdev_set_iommu_device(struct mdev_device *mdev,
+					 struct device *iommu_device)
+{
+	mdev->iommu_device = iommu_device;
+}
 
-struct device *mdev_get_iommu_device(struct device *dev);
+static inline struct device *mdev_get_iommu_device(struct mdev_device *mdev)
+{
+	return mdev->iommu_device;
+}
 
 /**
  * struct mdev_parent_ops - Structure to be registered for each parent device to
@@ -126,16 +145,25 @@ struct mdev_type_attribute mdev_type_attr_##_name =		\
  **/
 struct mdev_driver {
 	const char *name;
-	int  (*probe)(struct device *dev);
-	void (*remove)(struct device *dev);
+	int (*probe)(struct mdev_device *dev);
+	void (*remove)(struct mdev_device *dev);
 	struct device_driver driver;
 };
 
 #define to_mdev_driver(drv)	container_of(drv, struct mdev_driver, driver)
 
-void *mdev_get_drvdata(struct mdev_device *mdev);
-void mdev_set_drvdata(struct mdev_device *mdev, void *data);
-const guid_t *mdev_uuid(struct mdev_device *mdev);
+static inline void *mdev_get_drvdata(struct mdev_device *mdev)
+{
+	return mdev->driver_data;
+}
+static inline void mdev_set_drvdata(struct mdev_device *mdev, void *data)
+{
+	mdev->driver_data = data;
+}
+static inline const guid_t *mdev_uuid(struct mdev_device *mdev)
+{
+	return &mdev->uuid;
+}
 
 extern struct bus_type mdev_bus_type;
 
@@ -146,7 +174,13 @@ int mdev_register_driver(struct mdev_driver *drv, struct module *owner);
 void mdev_unregister_driver(struct mdev_driver *drv);
 
 struct device *mdev_parent_dev(struct mdev_device *mdev);
-struct device *mdev_dev(struct mdev_device *mdev);
-struct mdev_device *mdev_from_dev(struct device *dev);
+static inline struct device *mdev_dev(struct mdev_device *mdev)
+{
+	return &mdev->dev;
+}
+static inline struct mdev_device *mdev_from_dev(struct device *dev)
+{
+	return dev->bus == &mdev_bus_type ? to_mdev_device(dev) : NULL;
+}
 
 #endif /* MDEV_H */
-- 
cgit v1.2.3


From 91b9969d9c6bb7c02253bbfc536bfd892f636fdc Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Tue, 6 Apr 2021 16:40:27 -0300
Subject: vfio/mdev: Simplify driver registration

This is only done once, so we don't need to generate code to initialize a
structure stored in the ELF .data segment. Fill in the three required
.driver members directly instead of copying data into them during
mdev_register_driver().

Further, the to_mdev_driver() function doesn't belong in a public header,
just inline it into the two places that need it. Finally, we can now
clearly see that 'drv' derived from dev->driver cannot be NULL, firstly
because the driver core forbids it, and secondly because NULL won't pass
through the container_of(). Remove the dead code.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Message-Id: <4-v2-d36939638fc6+d54-vfio2_jgg@nvidia.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 Documentation/driver-api/vfio-mediated-device.rst |  5 +----
 drivers/vfio/mdev/mdev_driver.c                   | 15 +++++++--------
 drivers/vfio/mdev/vfio_mdev.c                     |  8 ++++++--
 include/linux/mdev.h                              |  6 +-----
 4 files changed, 15 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/driver-api/vfio-mediated-device.rst b/Documentation/driver-api/vfio-mediated-device.rst
index c43c1dc33333..1779b85f014e 100644
--- a/Documentation/driver-api/vfio-mediated-device.rst
+++ b/Documentation/driver-api/vfio-mediated-device.rst
@@ -98,13 +98,11 @@ structure to represent a mediated device's driver::
 
      /*
       * struct mdev_driver [2] - Mediated device's driver
-      * @name: driver name
       * @probe: called when new device created
       * @remove: called when device removed
       * @driver: device driver structure
       */
      struct mdev_driver {
-	     const char *name;
 	     int  (*probe)  (struct mdev_device *dev);
 	     void (*remove) (struct mdev_device *dev);
 	     struct device_driver    driver;
@@ -115,8 +113,7 @@ to register and unregister itself with the core driver:
 
 * Register::
 
-    extern int  mdev_register_driver(struct mdev_driver *drv,
-				   struct module *owner);
+    extern int  mdev_register_driver(struct mdev_driver *drv);
 
 * Unregister::
 
diff --git a/drivers/vfio/mdev/mdev_driver.c b/drivers/vfio/mdev/mdev_driver.c
index 44c3ba7e56d9..041699571b7e 100644
--- a/drivers/vfio/mdev/mdev_driver.c
+++ b/drivers/vfio/mdev/mdev_driver.c
@@ -39,7 +39,8 @@ static void mdev_detach_iommu(struct mdev_device *mdev)
 
 static int mdev_probe(struct device *dev)
 {
-	struct mdev_driver *drv = to_mdev_driver(dev->driver);
+	struct mdev_driver *drv =
+		container_of(dev->driver, struct mdev_driver, driver);
 	struct mdev_device *mdev = to_mdev_device(dev);
 	int ret;
 
@@ -47,7 +48,7 @@ static int mdev_probe(struct device *dev)
 	if (ret)
 		return ret;
 
-	if (drv && drv->probe) {
+	if (drv->probe) {
 		ret = drv->probe(mdev);
 		if (ret)
 			mdev_detach_iommu(mdev);
@@ -58,10 +59,11 @@ static int mdev_probe(struct device *dev)
 
 static int mdev_remove(struct device *dev)
 {
-	struct mdev_driver *drv = to_mdev_driver(dev->driver);
+	struct mdev_driver *drv =
+		container_of(dev->driver, struct mdev_driver, driver);
 	struct mdev_device *mdev = to_mdev_device(dev);
 
-	if (drv && drv->remove)
+	if (drv->remove)
 		drv->remove(mdev);
 
 	mdev_detach_iommu(mdev);
@@ -79,16 +81,13 @@ EXPORT_SYMBOL_GPL(mdev_bus_type);
 /**
  * mdev_register_driver - register a new MDEV driver
  * @drv: the driver to register
- * @owner: module owner of driver to be registered
  *
  * Returns a negative value on error, otherwise 0.
  **/
-int mdev_register_driver(struct mdev_driver *drv, struct module *owner)
+int mdev_register_driver(struct mdev_driver *drv)
 {
 	/* initialize common driver fields */
-	drv->driver.name = drv->name;
 	drv->driver.bus = &mdev_bus_type;
-	drv->driver.owner = owner;
 
 	/* register with core */
 	return driver_register(&drv->driver);
diff --git a/drivers/vfio/mdev/vfio_mdev.c b/drivers/vfio/mdev/vfio_mdev.c
index 91b7b8b9eb9c..cc9507ed85a1 100644
--- a/drivers/vfio/mdev/vfio_mdev.c
+++ b/drivers/vfio/mdev/vfio_mdev.c
@@ -152,14 +152,18 @@ static void vfio_mdev_remove(struct mdev_device *mdev)
 }
 
 static struct mdev_driver vfio_mdev_driver = {
-	.name	= "vfio_mdev",
+	.driver = {
+		.name = "vfio_mdev",
+		.owner = THIS_MODULE,
+		.mod_name = KBUILD_MODNAME,
+	},
 	.probe	= vfio_mdev_probe,
 	.remove	= vfio_mdev_remove,
 };
 
 static int __init vfio_mdev_init(void)
 {
-	return mdev_register_driver(&vfio_mdev_driver, THIS_MODULE);
+	return mdev_register_driver(&vfio_mdev_driver);
 }
 
 static void __exit vfio_mdev_exit(void)
diff --git a/include/linux/mdev.h b/include/linux/mdev.h
index 52f7ea19dd0f..cb771c712da0 100644
--- a/include/linux/mdev.h
+++ b/include/linux/mdev.h
@@ -137,21 +137,17 @@ struct mdev_type_attribute mdev_type_attr_##_name =		\
 
 /**
  * struct mdev_driver - Mediated device driver
- * @name: driver name
  * @probe: called when new device created
  * @remove: called when device removed
  * @driver: device driver structure
  *
  **/
 struct mdev_driver {
-	const char *name;
 	int (*probe)(struct mdev_device *dev);
 	void (*remove)(struct mdev_device *dev);
 	struct device_driver driver;
 };
 
-#define to_mdev_driver(drv)	container_of(drv, struct mdev_driver, driver)
-
 static inline void *mdev_get_drvdata(struct mdev_device *mdev)
 {
 	return mdev->driver_data;
@@ -170,7 +166,7 @@ extern struct bus_type mdev_bus_type;
 int mdev_register_device(struct device *dev, const struct mdev_parent_ops *ops);
 void mdev_unregister_device(struct device *dev);
 
-int mdev_register_driver(struct mdev_driver *drv, struct module *owner);
+int mdev_register_driver(struct mdev_driver *drv);
 void mdev_unregister_driver(struct mdev_driver *drv);
 
 struct device *mdev_parent_dev(struct mdev_device *mdev);
-- 
cgit v1.2.3


From 417fd5bf242d7691c15fe0bd705ab76c69276572 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Tue, 6 Apr 2021 16:40:28 -0300
Subject: vfio/mdev: Use struct mdev_type in struct mdev_device

The kobj pointer in mdev_device is actually pointing at a struct
mdev_type. Use the proper type so things are understandable.

There are a number of places that are confused and passing both the mdev
and the mtype as function arguments; fix these to derive the mtype
directly from the mdev to remove the redundancy.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Message-Id: <5-v2-d36939638fc6+d54-vfio2_jgg@nvidia.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/mdev/mdev_core.c    | 16 ++++++----------
 drivers/vfio/mdev/mdev_private.h |  7 +++----
 drivers/vfio/mdev/mdev_sysfs.c   | 11 ++++++-----
 include/linux/mdev.h             |  4 +++-
 4 files changed, 18 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c
index 057922a1707e..5ca0efa5266b 100644
--- a/drivers/vfio/mdev/mdev_core.c
+++ b/drivers/vfio/mdev/mdev_core.c
@@ -73,11 +73,9 @@ static void mdev_put_parent(struct mdev_parent *parent)
 static void mdev_device_remove_common(struct mdev_device *mdev)
 {
 	struct mdev_parent *parent;
-	struct mdev_type *type;
 	int ret;
 
-	type = to_mdev_type(mdev->type_kobj);
-	mdev_remove_sysfs_files(mdev, type);
+	mdev_remove_sysfs_files(mdev);
 	device_del(&mdev->dev);
 	parent = mdev->parent;
 	lockdep_assert_held(&parent->unreg_sem);
@@ -241,13 +239,11 @@ static void mdev_device_release(struct device *dev)
 	mdev_device_free(mdev);
 }
 
-int mdev_device_create(struct kobject *kobj,
-		       struct device *dev, const guid_t *uuid)
+int mdev_device_create(struct mdev_type *type, const guid_t *uuid)
 {
 	int ret;
 	struct mdev_device *mdev, *tmp;
 	struct mdev_parent *parent;
-	struct mdev_type *type = to_mdev_type(kobj);
 
 	parent = mdev_get_parent(type->parent);
 	if (!parent)
@@ -285,14 +281,14 @@ int mdev_device_create(struct kobject *kobj,
 	}
 
 	device_initialize(&mdev->dev);
-	mdev->dev.parent  = dev;
+	mdev->dev.parent = parent->dev;
 	mdev->dev.bus     = &mdev_bus_type;
 	mdev->dev.release = mdev_device_release;
 	dev_set_name(&mdev->dev, "%pUl", uuid);
 	mdev->dev.groups = parent->ops->mdev_attr_groups;
-	mdev->type_kobj = kobj;
+	mdev->type = type;
 
-	ret = parent->ops->create(kobj, mdev);
+	ret = parent->ops->create(&type->kobj, mdev);
 	if (ret)
 		goto ops_create_fail;
 
@@ -300,7 +296,7 @@ int mdev_device_create(struct kobject *kobj,
 	if (ret)
 		goto add_fail;
 
-	ret = mdev_create_sysfs_files(mdev, type);
+	ret = mdev_create_sysfs_files(mdev);
 	if (ret)
 		goto sysfs_fail;
 
diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h
index 97e2225f7f49..f12e34e16ab9 100644
--- a/drivers/vfio/mdev/mdev_private.h
+++ b/drivers/vfio/mdev/mdev_private.h
@@ -40,11 +40,10 @@ struct mdev_type {
 int  parent_create_sysfs_files(struct mdev_parent *parent);
 void parent_remove_sysfs_files(struct mdev_parent *parent);
 
-int  mdev_create_sysfs_files(struct mdev_device *mdev, struct mdev_type *type);
-void mdev_remove_sysfs_files(struct mdev_device *mdev, struct mdev_type *type);
+int  mdev_create_sysfs_files(struct mdev_device *mdev);
+void mdev_remove_sysfs_files(struct mdev_device *mdev);
 
-int  mdev_device_create(struct kobject *kobj,
-			struct device *dev, const guid_t *uuid);
+int mdev_device_create(struct mdev_type *kobj, const guid_t *uuid);
 int  mdev_device_remove(struct mdev_device *dev);
 
 #endif /* MDEV_PRIVATE_H */
diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c
index 18114f3e090a..bcfe48d56e8a 100644
--- a/drivers/vfio/mdev/mdev_sysfs.c
+++ b/drivers/vfio/mdev/mdev_sysfs.c
@@ -67,7 +67,7 @@ static ssize_t create_store(struct kobject *kobj, struct device *dev,
 	if (ret)
 		return ret;
 
-	ret = mdev_device_create(kobj, dev, &uuid);
+	ret = mdev_device_create(to_mdev_type(kobj), &uuid);
 	if (ret)
 		return ret;
 
@@ -249,8 +249,9 @@ static const struct attribute *mdev_device_attrs[] = {
 	NULL,
 };
 
-int mdev_create_sysfs_files(struct mdev_device *mdev, struct mdev_type *type)
+int mdev_create_sysfs_files(struct mdev_device *mdev)
 {
+	struct mdev_type *type = mdev->type;
 	struct kobject *kobj = &mdev->dev.kobj;
 	int ret;
 
@@ -271,15 +272,15 @@ int mdev_create_sysfs_files(struct mdev_device *mdev, struct mdev_type *type)
 create_files_failed:
 	sysfs_remove_link(kobj, "mdev_type");
 type_link_failed:
-	sysfs_remove_link(type->devices_kobj, dev_name(&mdev->dev));
+	sysfs_remove_link(mdev->type->devices_kobj, dev_name(&mdev->dev));
 	return ret;
 }
 
-void mdev_remove_sysfs_files(struct mdev_device *mdev, struct mdev_type *type)
+void mdev_remove_sysfs_files(struct mdev_device *mdev)
 {
 	struct kobject *kobj = &mdev->dev.kobj;
 
 	sysfs_remove_files(kobj, mdev_device_attrs);
 	sysfs_remove_link(kobj, "mdev_type");
-	sysfs_remove_link(type->devices_kobj, dev_name(&mdev->dev));
+	sysfs_remove_link(mdev->type->devices_kobj, dev_name(&mdev->dev));
 }
diff --git a/include/linux/mdev.h b/include/linux/mdev.h
index cb771c712da0..349e8ac1fe33 100644
--- a/include/linux/mdev.h
+++ b/include/linux/mdev.h
@@ -10,13 +10,15 @@
 #ifndef MDEV_H
 #define MDEV_H
 
+struct mdev_type;
+
 struct mdev_device {
 	struct device dev;
 	struct mdev_parent *parent;
 	guid_t uuid;
 	void *driver_data;
 	struct list_head next;
-	struct kobject *type_kobj;
+	struct mdev_type *type;
 	struct device *iommu_device;
 	bool active;
 };
-- 
cgit v1.2.3


From fbea43239074e16c91048f5ce70378664efbdb99 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Tue, 6 Apr 2021 16:40:33 -0300
Subject: vfio/mdev: Remove duplicate storage of parent in mdev_device

mdev_device->type->parent is the same thing.

The struct mdev_device was relying on the kref on the mdev_parent to also
indirectly hold a kref on the mdev_type pointer. Now that the type holds a
kref on the parent we can directly kref the mdev_type and remove this
implicit relationship.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Message-Id: <10-v2-d36939638fc6+d54-vfio2_jgg@nvidia.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/mdev/mdev_core.c | 13 +++++--------
 drivers/vfio/mdev/vfio_mdev.c | 14 +++++++-------
 include/linux/mdev.h          |  1 -
 3 files changed, 12 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c
index 4caedb3d4fbf..2a20bdaf6142 100644
--- a/drivers/vfio/mdev/mdev_core.c
+++ b/drivers/vfio/mdev/mdev_core.c
@@ -29,7 +29,7 @@ static DEFINE_MUTEX(mdev_list_lock);
 
 struct device *mdev_parent_dev(struct mdev_device *mdev)
 {
-	return mdev->parent->dev;
+	return mdev->type->parent->dev;
 }
 EXPORT_SYMBOL(mdev_parent_dev);
 
@@ -58,12 +58,11 @@ void mdev_release_parent(struct kref *kref)
 /* Caller must hold parent unreg_sem read or write lock */
 static void mdev_device_remove_common(struct mdev_device *mdev)
 {
-	struct mdev_parent *parent;
+	struct mdev_parent *parent = mdev->type->parent;
 	int ret;
 
 	mdev_remove_sysfs_files(mdev);
 	device_del(&mdev->dev);
-	parent = mdev->parent;
 	lockdep_assert_held(&parent->unreg_sem);
 	ret = parent->ops->remove(mdev);
 	if (ret)
@@ -212,7 +211,7 @@ static void mdev_device_release(struct device *dev)
 	struct mdev_device *mdev = to_mdev_device(dev);
 
 	/* Pairs with the get in mdev_device_create() */
-	mdev_put_parent(mdev->parent);
+	kobject_put(&mdev->type->kobj);
 
 	mutex_lock(&mdev_list_lock);
 	list_del(&mdev->next);
@@ -250,9 +249,8 @@ int mdev_device_create(struct mdev_type *type, const guid_t *uuid)
 	mdev->dev.release = mdev_device_release;
 	mdev->dev.groups = parent->ops->mdev_attr_groups;
 	mdev->type = type;
-	mdev->parent = parent;
 	/* Pairs with the put in mdev_device_release() */
-	mdev_get_parent(parent);
+	kobject_get(&type->kobj);
 
 	guid_copy(&mdev->uuid, uuid);
 	list_add(&mdev->next, &mdev_list);
@@ -300,7 +298,7 @@ out_put_device:
 int mdev_device_remove(struct mdev_device *mdev)
 {
 	struct mdev_device *tmp;
-	struct mdev_parent *parent;
+	struct mdev_parent *parent = mdev->type->parent;
 
 	mutex_lock(&mdev_list_lock);
 	list_for_each_entry(tmp, &mdev_list, next) {
@@ -321,7 +319,6 @@ int mdev_device_remove(struct mdev_device *mdev)
 	mdev->active = false;
 	mutex_unlock(&mdev_list_lock);
 
-	parent = mdev->parent;
 	/* Check if parent unregistration has started */
 	if (!down_read_trylock(&parent->unreg_sem))
 		return -ENODEV;
diff --git a/drivers/vfio/mdev/vfio_mdev.c b/drivers/vfio/mdev/vfio_mdev.c
index cc9507ed85a1..922729071c5a 100644
--- a/drivers/vfio/mdev/vfio_mdev.c
+++ b/drivers/vfio/mdev/vfio_mdev.c
@@ -24,7 +24,7 @@
 static int vfio_mdev_open(struct vfio_device *core_vdev)
 {
 	struct mdev_device *mdev = to_mdev_device(core_vdev->dev);
-	struct mdev_parent *parent = mdev->parent;
+	struct mdev_parent *parent = mdev->type->parent;
 
 	int ret;
 
@@ -44,7 +44,7 @@ static int vfio_mdev_open(struct vfio_device *core_vdev)
 static void vfio_mdev_release(struct vfio_device *core_vdev)
 {
 	struct mdev_device *mdev = to_mdev_device(core_vdev->dev);
-	struct mdev_parent *parent = mdev->parent;
+	struct mdev_parent *parent = mdev->type->parent;
 
 	if (likely(parent->ops->release))
 		parent->ops->release(mdev);
@@ -56,7 +56,7 @@ static long vfio_mdev_unlocked_ioctl(struct vfio_device *core_vdev,
 				     unsigned int cmd, unsigned long arg)
 {
 	struct mdev_device *mdev = to_mdev_device(core_vdev->dev);
-	struct mdev_parent *parent = mdev->parent;
+	struct mdev_parent *parent = mdev->type->parent;
 
 	if (unlikely(!parent->ops->ioctl))
 		return -EINVAL;
@@ -68,7 +68,7 @@ static ssize_t vfio_mdev_read(struct vfio_device *core_vdev, char __user *buf,
 			      size_t count, loff_t *ppos)
 {
 	struct mdev_device *mdev = to_mdev_device(core_vdev->dev);
-	struct mdev_parent *parent = mdev->parent;
+	struct mdev_parent *parent = mdev->type->parent;
 
 	if (unlikely(!parent->ops->read))
 		return -EINVAL;
@@ -81,7 +81,7 @@ static ssize_t vfio_mdev_write(struct vfio_device *core_vdev,
 			       loff_t *ppos)
 {
 	struct mdev_device *mdev = to_mdev_device(core_vdev->dev);
-	struct mdev_parent *parent = mdev->parent;
+	struct mdev_parent *parent = mdev->type->parent;
 
 	if (unlikely(!parent->ops->write))
 		return -EINVAL;
@@ -93,7 +93,7 @@ static int vfio_mdev_mmap(struct vfio_device *core_vdev,
 			  struct vm_area_struct *vma)
 {
 	struct mdev_device *mdev = to_mdev_device(core_vdev->dev);
-	struct mdev_parent *parent = mdev->parent;
+	struct mdev_parent *parent = mdev->type->parent;
 
 	if (unlikely(!parent->ops->mmap))
 		return -EINVAL;
@@ -104,7 +104,7 @@ static int vfio_mdev_mmap(struct vfio_device *core_vdev,
 static void vfio_mdev_request(struct vfio_device *core_vdev, unsigned int count)
 {
 	struct mdev_device *mdev = to_mdev_device(core_vdev->dev);
-	struct mdev_parent *parent = mdev->parent;
+	struct mdev_parent *parent = mdev->type->parent;
 
 	if (parent->ops->request)
 		parent->ops->request(mdev, count);
diff --git a/include/linux/mdev.h b/include/linux/mdev.h
index 349e8ac1fe33..fb582adda28a 100644
--- a/include/linux/mdev.h
+++ b/include/linux/mdev.h
@@ -14,7 +14,6 @@ struct mdev_type;
 
 struct mdev_device {
 	struct device dev;
-	struct mdev_parent *parent;
 	guid_t uuid;
 	void *driver_data;
 	struct list_head next;
-- 
cgit v1.2.3


From 15fcc44be0c7afa2945b1896a96ac2ddf09f1fa7 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Tue, 6 Apr 2021 16:40:34 -0300
Subject: vfio/mdev: Add mdev/mtype_get_type_group_id()

This returns the index in the supported_type_groups array that is
associated with the mdev_type attached to the struct mdev_device or its
containing struct kobject.

Each mdev_device can be spawned from exactly one mdev_type, which in turn
originates from exactly one supported_type_group.

Drivers are using weird string calculations to try and get back to this
index; providing direct access to the index removes a bunch of wonky
driver code.

mdev_type->group can be deleted as the group is obtained using the
type_group_id.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Message-Id: <11-v2-d36939638fc6+d54-vfio2_jgg@nvidia.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/mdev/mdev_core.c    | 20 ++++++++++++++++++++
 drivers/vfio/mdev/mdev_private.h |  2 +-
 drivers/vfio/mdev/mdev_sysfs.c   | 15 +++++++++------
 include/linux/mdev.h             |  3 +++
 4 files changed, 33 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c
index 2a20bdaf6142..5ae06f951a09 100644
--- a/drivers/vfio/mdev/mdev_core.c
+++ b/drivers/vfio/mdev/mdev_core.c
@@ -33,6 +33,26 @@ struct device *mdev_parent_dev(struct mdev_device *mdev)
 }
 EXPORT_SYMBOL(mdev_parent_dev);
 
+/*
+ * Return the index in supported_type_groups that this mdev_device was created
+ * from.
+ */
+unsigned int mdev_get_type_group_id(struct mdev_device *mdev)
+{
+	return mdev->type->type_group_id;
+}
+EXPORT_SYMBOL(mdev_get_type_group_id);
+
+/*
+ * Used in mdev_type_attribute sysfs functions to return the index in the
+ * supported_type_groups that the sysfs is called from.
+ */
+unsigned int mtype_get_type_group_id(struct kobject *mtype_kobj)
+{
+	return container_of(mtype_kobj, struct mdev_type, kobj)->type_group_id;
+}
+EXPORT_SYMBOL(mtype_get_type_group_id);
+
 /* Should be called holding parent_list_lock */
 static struct mdev_parent *__find_parent_device(struct device *dev)
 {
diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h
index fddab240ccc3..6999c89db7b1 100644
--- a/drivers/vfio/mdev/mdev_private.h
+++ b/drivers/vfio/mdev/mdev_private.h
@@ -29,7 +29,7 @@ struct mdev_type {
 	struct kobject *devices_kobj;
 	struct mdev_parent *parent;
 	struct list_head next;
-	struct attribute_group *group;
+	unsigned int type_group_id;
 };
 
 #define to_mdev_type_attr(_attr)	\
diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c
index 8c169d12ba7d..712fbc78b12e 100644
--- a/drivers/vfio/mdev/mdev_sysfs.c
+++ b/drivers/vfio/mdev/mdev_sysfs.c
@@ -92,9 +92,11 @@ static struct kobj_type mdev_type_ktype = {
 };
 
 static struct mdev_type *add_mdev_supported_type(struct mdev_parent *parent,
-						 struct attribute_group *group)
+						 unsigned int type_group_id)
 {
 	struct mdev_type *type;
+	struct attribute_group *group =
+		parent->ops->supported_type_groups[type_group_id];
 	int ret;
 
 	if (!group->name) {
@@ -110,6 +112,7 @@ static struct mdev_type *add_mdev_supported_type(struct mdev_parent *parent,
 	type->parent = parent;
 	/* Pairs with the put in mdev_type_release() */
 	mdev_get_parent(parent);
+	type->type_group_id = type_group_id;
 
 	ret = kobject_init_and_add(&type->kobj, &mdev_type_ktype, NULL,
 				   "%s-%s", dev_driver_string(parent->dev),
@@ -135,8 +138,6 @@ static struct mdev_type *add_mdev_supported_type(struct mdev_parent *parent,
 		ret = -ENOMEM;
 		goto attrs_failed;
 	}
-
-	type->group = group;
 	return type;
 
 attrs_failed:
@@ -151,8 +152,11 @@ attr_create_failed:
 
 static void remove_mdev_supported_type(struct mdev_type *type)
 {
+	struct attribute_group *group =
+		type->parent->ops->supported_type_groups[type->type_group_id];
+
 	sysfs_remove_files(&type->kobj,
-			   (const struct attribute **)type->group->attrs);
+			   (const struct attribute **)group->attrs);
 	kobject_put(type->devices_kobj);
 	sysfs_remove_file(&type->kobj, &mdev_type_attr_create.attr);
 	kobject_del(&type->kobj);
@@ -166,8 +170,7 @@ static int add_mdev_supported_type_groups(struct mdev_parent *parent)
 	for (i = 0; parent->ops->supported_type_groups[i]; i++) {
 		struct mdev_type *type;
 
-		type = add_mdev_supported_type(parent,
-					parent->ops->supported_type_groups[i]);
+		type = add_mdev_supported_type(parent, i);
 		if (IS_ERR(type)) {
 			struct mdev_type *ltype, *tmp;
 
diff --git a/include/linux/mdev.h b/include/linux/mdev.h
index fb582adda28a..41e919365223 100644
--- a/include/linux/mdev.h
+++ b/include/linux/mdev.h
@@ -46,6 +46,9 @@ static inline struct device *mdev_get_iommu_device(struct mdev_device *mdev)
 	return mdev->iommu_device;
 }
 
+unsigned int mdev_get_type_group_id(struct mdev_device *mdev);
+unsigned int mtype_get_type_group_id(struct kobject *mtype_kobj);
+
 /**
  * struct mdev_parent_ops - Structure to be registered for each parent device to
  * register the device to mdev module.
-- 
cgit v1.2.3


From 0854fa82c96ca37a35e954b7079c0bfd795affb1 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@gmail.com>
Date: Tue, 6 Apr 2021 23:40:51 -0700
Subject: net: remove the new_ifindex argument from dev_change_net_namespace

There is only one place where we want to specify new_ifindex. In all
other cases, callers pass 0 as new_ifindex. It looks reasonable to add a
low-level function with new_ifindex and to convert
dev_change_net_namespace to a static inline wrapper.

Fixes: eeb85a14ee34 ("net: Allow to specify ifindex when device is moved to another namespace")
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Andrei Vagin <avagin@gmail.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/netvsc_drv.c |  2 +-
 include/linux/netdevice.h       |  8 +++++++-
 net/core/dev.c                  | 10 +++++-----
 net/core/rtnetlink.c            |  4 ++--
 net/ieee802154/core.c           |  4 ++--
 net/wireless/core.c             |  4 ++--
 6 files changed, 19 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 8c0c70e1da77..7349a70af083 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -2354,7 +2354,7 @@ static int netvsc_register_vf(struct net_device *vf_netdev)
 	 */
 	if (!net_eq(dev_net(ndev), dev_net(vf_netdev))) {
 		ret = dev_change_net_namespace(vf_netdev,
-					       dev_net(ndev), "eth%d", 0);
+					       dev_net(ndev), "eth%d");
 		if (ret)
 			netdev_err(vf_netdev,
 				   "could not move to same namespace as %s: %d\n",
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b482236c0e99..5cbc950b34df 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4026,8 +4026,14 @@ void __dev_notify_flags(struct net_device *, unsigned int old_flags,
 int dev_change_name(struct net_device *, const char *);
 int dev_set_alias(struct net_device *, const char *, size_t);
 int dev_get_alias(const struct net_device *, char *, size_t);
+int __dev_change_net_namespace(struct net_device *dev, struct net *net,
+			       const char *pat, int new_ifindex);
+static inline
 int dev_change_net_namespace(struct net_device *dev, struct net *net,
-			     const char *pat, int new_ifindex);
+			     const char *pat)
+{
+	return __dev_change_net_namespace(dev, net, pat, 0);
+}
 int __dev_set_mtu(struct net_device *, int);
 int dev_validate_mtu(struct net_device *dev, int mtu,
 		     struct netlink_ext_ack *extack);
diff --git a/net/core/dev.c b/net/core/dev.c
index 9d1a8fac793f..33ff4a944109 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -11062,7 +11062,7 @@ void unregister_netdev(struct net_device *dev)
 EXPORT_SYMBOL(unregister_netdev);
 
 /**
- *	dev_change_net_namespace - move device to different nethost namespace
+ *	__dev_change_net_namespace - move device to different nethost namespace
  *	@dev: device
  *	@net: network namespace
  *	@pat: If not NULL name pattern to try if the current device name
@@ -11077,8 +11077,8 @@ EXPORT_SYMBOL(unregister_netdev);
  *	Callers must hold the rtnl semaphore.
  */
 
-int dev_change_net_namespace(struct net_device *dev, struct net *net,
-			     const char *pat, int new_ifindex)
+int __dev_change_net_namespace(struct net_device *dev, struct net *net,
+			       const char *pat, int new_ifindex)
 {
 	struct net *net_old = dev_net(dev);
 	int err, new_nsid;
@@ -11202,7 +11202,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net,
 out:
 	return err;
 }
-EXPORT_SYMBOL_GPL(dev_change_net_namespace);
+EXPORT_SYMBOL_GPL(__dev_change_net_namespace);
 
 static int dev_cpu_dead(unsigned int oldcpu)
 {
@@ -11458,7 +11458,7 @@ static void __net_exit default_device_exit(struct net *net)
 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
 		if (__dev_get_by_name(&init_net, fb_name))
 			snprintf(fb_name, IFNAMSIZ, "dev%%d");
-		err = dev_change_net_namespace(dev, &init_net, fb_name, 0);
+		err = dev_change_net_namespace(dev, &init_net, fb_name);
 		if (err) {
 			pr_emerg("%s: failed to move %s to init_net: %d\n",
 				 __func__, dev->name, err);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 9108a7e6c0c0..9f1f55785a6f 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2619,7 +2619,7 @@ static int do_setlink(const struct sk_buff *skb,
 		else
 			new_ifindex = 0;
 
-		err = dev_change_net_namespace(dev, net, ifname, new_ifindex);
+		err = __dev_change_net_namespace(dev, net, ifname, new_ifindex);
 		put_net(net);
 		if (err)
 			goto errout;
@@ -3461,7 +3461,7 @@ replay:
 	if (err < 0)
 		goto out_unregister;
 	if (link_net) {
-		err = dev_change_net_namespace(dev, dest_net, ifname, 0);
+		err = dev_change_net_namespace(dev, dest_net, ifname);
 		if (err < 0)
 			goto out_unregister;
 	}
diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c
index ec3068937fc3..de259b5170ab 100644
--- a/net/ieee802154/core.c
+++ b/net/ieee802154/core.c
@@ -205,7 +205,7 @@ int cfg802154_switch_netns(struct cfg802154_registered_device *rdev,
 		if (!wpan_dev->netdev)
 			continue;
 		wpan_dev->netdev->features &= ~NETIF_F_NETNS_LOCAL;
-		err = dev_change_net_namespace(wpan_dev->netdev, net, "wpan%d", 0);
+		err = dev_change_net_namespace(wpan_dev->netdev, net, "wpan%d");
 		if (err)
 			break;
 		wpan_dev->netdev->features |= NETIF_F_NETNS_LOCAL;
@@ -222,7 +222,7 @@ int cfg802154_switch_netns(struct cfg802154_registered_device *rdev,
 				continue;
 			wpan_dev->netdev->features &= ~NETIF_F_NETNS_LOCAL;
 			err = dev_change_net_namespace(wpan_dev->netdev, net,
-						       "wpan%d", 0);
+						       "wpan%d");
 			WARN_ON(err);
 			wpan_dev->netdev->features |= NETIF_F_NETNS_LOCAL;
 		}
diff --git a/net/wireless/core.c b/net/wireless/core.c
index fabb677b7d58..a2785379df6e 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -165,7 +165,7 @@ int cfg80211_switch_netns(struct cfg80211_registered_device *rdev,
 		if (!wdev->netdev)
 			continue;
 		wdev->netdev->features &= ~NETIF_F_NETNS_LOCAL;
-		err = dev_change_net_namespace(wdev->netdev, net, "wlan%d", 0);
+		err = dev_change_net_namespace(wdev->netdev, net, "wlan%d");
 		if (err)
 			break;
 		wdev->netdev->features |= NETIF_F_NETNS_LOCAL;
@@ -182,7 +182,7 @@ int cfg80211_switch_netns(struct cfg80211_registered_device *rdev,
 				continue;
 			wdev->netdev->features &= ~NETIF_F_NETNS_LOCAL;
 			err = dev_change_net_namespace(wdev->netdev, net,
-							"wlan%d", 0);
+							"wlan%d");
 			WARN_ON(err);
 			wdev->netdev->features |= NETIF_F_NETNS_LOCAL;
 		}
-- 
cgit v1.2.3


From a975d7d8a356ce92872af0b007b101183f0224e2 Mon Sep 17 00:00:00 2001
From: Danielle Ratson <danieller@nvidia.com>
Date: Wed, 7 Apr 2021 13:06:51 +0300
Subject: ethtool: Remove link_mode param and derive link params from driver

Some drivers clear the 'ethtool_link_ksettings' struct in their
get_link_ksettings() callback, before populating it with actual values.
Such drivers will set the new 'link_mode' field to zero, resulting in
user space receiving wrong link mode information given that zero is a
valid value for the field.

Another problem is that some drivers (notably tun) can report random
values in the 'link_mode' field. This can result in a general protection
fault when the field is used as an index to the 'link_mode_params' array
[1].

This happens because such drivers implement their set_link_ksettings()
callback by simply overwriting their private copy of
'ethtool_link_ksettings' struct with the one they get from the stack,
which is not always properly initialized.

Fix these problems by removing 'link_mode' from 'ethtool_link_ksettings'
and instead have drivers call ethtool_params_from_link_mode() with the
current link mode. The function will derive the link parameters (e.g.,
speed) from the link mode and fill them in the 'ethtool_link_ksettings'
struct.

v3:
	* Remove link_mode parameter and derive the link parameters in
	  the driver instead of passing link_mode parameter to ethtool
	  and deriving them there.

v2:
	* Introduce 'cap_link_mode_supported' instead of adding a
	  validity field to 'ethtool_link_ksettings' struct.

[1]
general protection fault, probably for non-canonical address 0xdffffc00f14cc32c: 0000 [#1] PREEMPT SMP KASAN
KASAN: probably user-memory-access in range [0x000000078a661960-0x000000078a661967]
CPU: 0 PID: 8452 Comm: syz-executor360 Not tainted 5.11.0-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
RIP: 0010:__ethtool_get_link_ksettings+0x1a3/0x3a0 net/ethtool/ioctl.c:446
Code: b7 3e fa 83 fd ff 0f 84 30 01 00 00 e8 16 b0 3e fa 48 8d 3c ed 60 d5 69 8a 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <0f> b6 14 02 48 89 f8 83 e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85 b9
RSP: 0018:ffffc900019df7a0 EFLAGS: 00010202
RAX: dffffc0000000000 RBX: ffff888026136008 RCX: 0000000000000000
RDX: 00000000f14cc32c RSI: ffffffff873439ca RDI: 000000078a661960
RBP: 00000000ffff8880 R08: 00000000ffffffff R09: ffff88802613606f
R10: ffffffff873439bc R11: 0000000000000000 R12: 0000000000000000
R13: ffff88802613606c R14: ffff888011d0c210 R15: ffff888011d0c210
FS:  0000000000749300(0000) GS:ffff8880b9c00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00000000004b60f0 CR3: 00000000185c2000 CR4: 00000000001506f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 linkinfo_prepare_data+0xfd/0x280 net/ethtool/linkinfo.c:37
 ethnl_default_notify+0x1dc/0x630 net/ethtool/netlink.c:586
 ethtool_notify+0xbd/0x1f0 net/ethtool/netlink.c:656
 ethtool_set_link_ksettings+0x277/0x330 net/ethtool/ioctl.c:620
 dev_ethtool+0x2b35/0x45d0 net/ethtool/ioctl.c:2842
 dev_ioctl+0x463/0xb70 net/core/dev_ioctl.c:440
 sock_do_ioctl+0x148/0x2d0 net/socket.c:1060
 sock_ioctl+0x477/0x6a0 net/socket.c:1177
 vfs_ioctl fs/ioctl.c:48 [inline]
 __do_sys_ioctl fs/ioctl.c:753 [inline]
 __se_sys_ioctl fs/ioctl.c:739 [inline]
 __x64_sys_ioctl+0x193/0x200 fs/ioctl.c:739
 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

Fixes: c8907043c6ac9 ("ethtool: Get link mode in use instead of speed and duplex parameters")
Signed-off-by: Danielle Ratson <danieller@nvidia.com>
Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/mellanox/mlxsw/spectrum_ethtool.c    | 19 ++++++++++++++-----
 include/linux/ethtool.h                               |  9 ++++++++-
 net/ethtool/common.c                                  | 16 ++++++++++++++++
 net/ethtool/ioctl.c                                   | 18 +-----------------
 4 files changed, 39 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c
index 0bd64169bf81..078601d31cde 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c
@@ -1230,16 +1230,22 @@ mlxsw_sp1_from_ptys_link_mode(struct mlxsw_sp *mlxsw_sp, bool carrier_ok,
 			      u32 ptys_eth_proto,
 			      struct ethtool_link_ksettings *cmd)
 {
+	struct mlxsw_sp1_port_link_mode link;
 	int i;
 
-	cmd->link_mode = -1;
+	cmd->base.speed = SPEED_UNKNOWN;
+	cmd->base.duplex = DUPLEX_UNKNOWN;
+	cmd->lanes = 0;
 
 	if (!carrier_ok)
 		return;
 
 	for (i = 0; i < MLXSW_SP1_PORT_LINK_MODE_LEN; i++) {
-		if (ptys_eth_proto & mlxsw_sp1_port_link_mode[i].mask)
-			cmd->link_mode = mlxsw_sp1_port_link_mode[i].mask_ethtool;
+		if (ptys_eth_proto & mlxsw_sp1_port_link_mode[i].mask) {
+			link = mlxsw_sp1_port_link_mode[i];
+			ethtool_params_from_link_mode(cmd,
+						      link.mask_ethtool);
+		}
 	}
 }
 
@@ -1672,7 +1678,9 @@ mlxsw_sp2_from_ptys_link_mode(struct mlxsw_sp *mlxsw_sp, bool carrier_ok,
 	struct mlxsw_sp2_port_link_mode link;
 	int i;
 
-	cmd->link_mode = -1;
+	cmd->base.speed = SPEED_UNKNOWN;
+	cmd->base.duplex = DUPLEX_UNKNOWN;
+	cmd->lanes = 0;
 
 	if (!carrier_ok)
 		return;
@@ -1680,7 +1688,8 @@ mlxsw_sp2_from_ptys_link_mode(struct mlxsw_sp *mlxsw_sp, bool carrier_ok,
 	for (i = 0; i < MLXSW_SP2_PORT_LINK_MODE_LEN; i++) {
 		if (ptys_eth_proto & mlxsw_sp2_port_link_mode[i].mask) {
 			link = mlxsw_sp2_port_link_mode[i];
-			cmd->link_mode = link.mask_ethtool[1];
+			ethtool_params_from_link_mode(cmd,
+						      link.mask_ethtool[1]);
 		}
 	}
 }
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 7c88dfff7420..cdca84e6dd6b 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -127,7 +127,6 @@ struct ethtool_link_ksettings {
 		__ETHTOOL_DECLARE_LINK_MODE_MASK(lp_advertising);
 	} link_modes;
 	u32	lanes;
-	enum ethtool_link_mode_bit_indices link_mode;
 };
 
 /**
@@ -574,4 +573,12 @@ struct ethtool_phy_ops {
  */
 void ethtool_set_ethtool_phy_ops(const struct ethtool_phy_ops *ops);
 
+/*
+ * ethtool_params_from_link_mode - Derive link parameters from a given link mode
+ * @link_ksettings: Link parameters to be derived from the link mode
+ * @link_mode: Link mode
+ */
+void
+ethtool_params_from_link_mode(struct ethtool_link_ksettings *link_ksettings,
+			      enum ethtool_link_mode_bit_indices link_mode);
 #endif /* _LINUX_ETHTOOL_H */
diff --git a/net/ethtool/common.c b/net/ethtool/common.c
index c6a383dfd6c2..030aa7984a91 100644
--- a/net/ethtool/common.c
+++ b/net/ethtool/common.c
@@ -562,3 +562,19 @@ void ethtool_set_ethtool_phy_ops(const struct ethtool_phy_ops *ops)
 	rtnl_unlock();
 }
 EXPORT_SYMBOL_GPL(ethtool_set_ethtool_phy_ops);
+
+void
+ethtool_params_from_link_mode(struct ethtool_link_ksettings *link_ksettings,
+			      enum ethtool_link_mode_bit_indices link_mode)
+{
+	const struct link_mode_info *link_info;
+
+	if (WARN_ON_ONCE(link_mode >= __ETHTOOL_LINK_MODE_MASK_NBITS))
+		return;
+
+	link_info = &link_mode_params[link_mode];
+	link_ksettings->base.speed = link_info->speed;
+	link_ksettings->lanes = link_info->lanes;
+	link_ksettings->base.duplex = link_info->duplex;
+}
+EXPORT_SYMBOL_GPL(ethtool_params_from_link_mode);
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index 24783b71c584..771688e1b0da 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -426,29 +426,13 @@ struct ethtool_link_usettings {
 int __ethtool_get_link_ksettings(struct net_device *dev,
 				 struct ethtool_link_ksettings *link_ksettings)
 {
-	const struct link_mode_info *link_info;
-	int err;
-
 	ASSERT_RTNL();
 
 	if (!dev->ethtool_ops->get_link_ksettings)
 		return -EOPNOTSUPP;
 
 	memset(link_ksettings, 0, sizeof(*link_ksettings));
-
-	link_ksettings->link_mode = -1;
-	err = dev->ethtool_ops->get_link_ksettings(dev, link_ksettings);
-	if (err)
-		return err;
-
-	if (link_ksettings->link_mode != -1) {
-		link_info = &link_mode_params[link_ksettings->link_mode];
-		link_ksettings->base.speed = link_info->speed;
-		link_ksettings->lanes = link_info->lanes;
-		link_ksettings->base.duplex = link_info->duplex;
-	}
-
-	return 0;
+	return dev->ethtool_ops->get_link_ksettings(dev, link_ksettings);
 }
 EXPORT_SYMBOL(__ethtool_get_link_ksettings);
 
-- 
cgit v1.2.3


From 01fd45f676f1b3785b7cdd5d815f9c31ddcd9dd1 Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan@kernel.org>
Date: Wed, 7 Apr 2021 12:39:21 +0200
Subject: USB: serial: add generic support for TIOCSSERIAL

TIOCSSERIAL is a horrid, underspecified, legacy interface which for most
serial devices is only useful for setting the close_delay and
closing_wait parameters.

The closing_wait parameter determines how long to wait for the transfer
buffers to drain during close and the default timeout of 30 seconds may
not be sufficient at low line speeds. In other cases, when for example
flow is stopped, the default timeout may instead be too long.

Add generic support for TIOCSSERIAL and TIOCGSERIAL with handling of the
three common parameters close_delay, closing_wait and line for the
benefit of all USB serial drivers while still allowing drivers to
implement further functionality through the existing callbacks.

This currently includes a few drivers that report their base baud clock
rate even if that is really only of interest when setting custom
divisors through the deprecated ASYNC_SPD_CUST interface; an interface
which only the FTDI driver actually implements.

Some drivers have also been reporting back a fake UART type, something
which should no longer be needed and will be dropped by a follow-on
patch.

Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Johan Hovold <johan@kernel.org>
---
 drivers/usb/serial/ark3116.c          |  9 +-----
 drivers/usb/serial/f81232.c           | 12 ++------
 drivers/usb/serial/f81534.c           |  8 +-----
 drivers/usb/serial/ftdi_sio.c         | 11 ++------
 drivers/usb/serial/io_edgeport.c      | 12 ++------
 drivers/usb/serial/io_ti.c            | 17 ++---------
 drivers/usb/serial/mos7720.c          | 12 ++------
 drivers/usb/serial/mos7840.c          | 10 +------
 drivers/usb/serial/opticon.c          | 12 ++------
 drivers/usb/serial/option.c           |  2 --
 drivers/usb/serial/pl2303.c           | 10 +------
 drivers/usb/serial/quatech2.c         | 13 ---------
 drivers/usb/serial/ssu100.c           | 13 ---------
 drivers/usb/serial/ti_usb_3410_5052.c | 42 ++-------------------------
 drivers/usb/serial/usb-serial.c       | 53 +++++++++++++++++++++++++++++++----
 drivers/usb/serial/usb-wwan.h         |  4 ---
 drivers/usb/serial/usb_wwan.c         | 42 ---------------------------
 drivers/usb/serial/whiteheat.c        | 12 ++------
 include/linux/usb/serial.h            |  2 +-
 19 files changed, 70 insertions(+), 226 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/serial/ark3116.c b/drivers/usb/serial/ark3116.c
index 957cdd694b1f..c0cf60e9273d 100644
--- a/drivers/usb/serial/ark3116.c
+++ b/drivers/usb/serial/ark3116.c
@@ -385,17 +385,10 @@ err_free:
 	return result;
 }
 
-static int ark3116_get_serial_info(struct tty_struct *tty,
+static void ark3116_get_serial_info(struct tty_struct *tty,
 			struct serial_struct *ss)
 {
-	struct usb_serial_port *port = tty->driver_data;
-
 	ss->type = PORT_16654;
-	ss->line = port->minor;
-	ss->close_delay = 50;
-	ss->closing_wait = 3000;
-
-	return 0;
 }
 
 static int ark3116_tiocmget(struct tty_struct *tty)
diff --git a/drivers/usb/serial/f81232.c b/drivers/usb/serial/f81232.c
index af0fe2a82eb2..5e34b364d94d 100644
--- a/drivers/usb/serial/f81232.c
+++ b/drivers/usb/serial/f81232.c
@@ -820,19 +820,13 @@ static int f81232_carrier_raised(struct usb_serial_port *port)
 	return 0;
 }
 
-static int f81232_get_serial_info(struct tty_struct *tty,
-		struct serial_struct *ss)
+static void f81232_get_serial(struct tty_struct *tty, struct serial_struct *ss)
 {
 	struct usb_serial_port *port = tty->driver_data;
 	struct f81232_private *priv = usb_get_serial_port_data(port);
 
 	ss->type = PORT_16550A;
-	ss->line = port->minor;
 	ss->baud_base = priv->baud_base;
-	ss->close_delay = 50;
-	ss->closing_wait = 3000;
-
-	return 0;
 }
 
 static void  f81232_interrupt_work(struct work_struct *work)
@@ -1023,7 +1017,7 @@ static struct usb_serial_driver f81232_device = {
 	.close =		f81232_close,
 	.dtr_rts =		f81232_dtr_rts,
 	.carrier_raised =	f81232_carrier_raised,
-	.get_serial =		f81232_get_serial_info,
+	.get_serial =		f81232_get_serial,
 	.break_ctl =		f81232_break_ctl,
 	.set_termios =		f81232_set_termios,
 	.tiocmget =		f81232_tiocmget,
@@ -1048,7 +1042,7 @@ static struct usb_serial_driver f81534a_device = {
 	.close =		f81232_close,
 	.dtr_rts =		f81232_dtr_rts,
 	.carrier_raised =	f81232_carrier_raised,
-	.get_serial =		f81232_get_serial_info,
+	.get_serial =		f81232_get_serial,
 	.break_ctl =		f81232_break_ctl,
 	.set_termios =		f81232_set_termios,
 	.tiocmget =		f81232_tiocmget,
diff --git a/drivers/usb/serial/f81534.c b/drivers/usb/serial/f81534.c
index c9f90d437e3a..633de52feaad 100644
--- a/drivers/usb/serial/f81534.c
+++ b/drivers/usb/serial/f81534.c
@@ -1140,8 +1140,7 @@ static void f81534_close(struct usb_serial_port *port)
 	mutex_unlock(&serial_priv->urb_mutex);
 }
 
-static int f81534_get_serial_info(struct tty_struct *tty,
-				  struct serial_struct *ss)
+static void f81534_get_serial_info(struct tty_struct *tty, struct serial_struct *ss)
 {
 	struct usb_serial_port *port = tty->driver_data;
 	struct f81534_port_private *port_priv;
@@ -1149,12 +1148,7 @@ static int f81534_get_serial_info(struct tty_struct *tty,
 	port_priv = usb_get_serial_port_data(port);
 
 	ss->type = PORT_16550A;
-	ss->line = port->minor;
 	ss->baud_base = port_priv->baud_base;
-	ss->close_delay = 50;
-	ss->closing_wait = 3000;
-
-	return 0;
 }
 
 static void f81534_process_per_serial_block(struct usb_serial_port *port,
diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c
index f8a0911f90ea..16d3e50487e6 100644
--- a/drivers/usb/serial/ftdi_sio.c
+++ b/drivers/usb/serial/ftdi_sio.c
@@ -1082,8 +1082,7 @@ static int  ftdi_tiocmset(struct tty_struct *tty,
 			unsigned int set, unsigned int clear);
 static int  ftdi_ioctl(struct tty_struct *tty,
 			unsigned int cmd, unsigned long arg);
-static int get_serial_info(struct tty_struct *tty,
-				struct serial_struct *ss);
+static void get_serial_info(struct tty_struct *tty, struct serial_struct *ss);
 static int set_serial_info(struct tty_struct *tty,
 				struct serial_struct *ss);
 static void ftdi_break_ctl(struct tty_struct *tty, int break_state);
@@ -1477,20 +1476,14 @@ static int read_latency_timer(struct usb_serial_port *port)
 	return 0;
 }
 
-static int get_serial_info(struct tty_struct *tty,
-				struct serial_struct *ss)
+static void get_serial_info(struct tty_struct *tty, struct serial_struct *ss)
 {
 	struct usb_serial_port *port = tty->driver_data;
 	struct ftdi_private *priv = usb_get_serial_port_data(port);
 
-	ss->line = port->minor;
 	ss->flags = priv->flags;
 	ss->baud_base = priv->baud_base;
 	ss->custom_divisor = priv->custom_divisor;
-	ss->close_delay = 50;
-	ss->closing_wait = 3000;
-
-	return 0;
 }
 
 static int set_serial_info(struct tty_struct *tty,
diff --git a/drivers/usb/serial/io_edgeport.c b/drivers/usb/serial/io_edgeport.c
index 3b4809875a71..6b86e68ee2e8 100644
--- a/drivers/usb/serial/io_edgeport.c
+++ b/drivers/usb/serial/io_edgeport.c
@@ -1594,17 +1594,9 @@ static int edge_tiocmget(struct tty_struct *tty)
 	return result;
 }
 
-static int get_serial_info(struct tty_struct *tty,
-				struct serial_struct *ss)
+static void get_serial_info(struct tty_struct *tty, struct serial_struct *ss)
 {
-	struct usb_serial_port *port = tty->driver_data;
-
-	ss->type		= PORT_16550A;
-	ss->line		= port->minor;
-	ss->close_delay		= 50;
-	ss->closing_wait	= 3000;
-
-	return 0;
+	ss->type = PORT_16550A;
 }
 
 
diff --git a/drivers/usb/serial/io_ti.c b/drivers/usb/serial/io_ti.c
index f5aab570fd05..dce994c29afe 100644
--- a/drivers/usb/serial/io_ti.c
+++ b/drivers/usb/serial/io_ti.c
@@ -2433,22 +2433,9 @@ static int edge_tiocmget(struct tty_struct *tty)
 	return result;
 }
 
-static int get_serial_info(struct tty_struct *tty,
-				struct serial_struct *ss)
+static void get_serial_info(struct tty_struct *tty, struct serial_struct *ss)
 {
-	struct usb_serial_port *port = tty->driver_data;
-	unsigned cwait;
-
-	cwait = port->port.closing_wait;
-	if (cwait != ASYNC_CLOSING_WAIT_NONE)
-		cwait = jiffies_to_msecs(cwait) / 10;
-
-	ss->type		= PORT_16550A;
-	ss->line		= port->minor;
-	ss->close_delay		= 50;
-	ss->closing_wait	= cwait;
-
-	return 0;
+	ss->type = PORT_16550A;
 }
 
 static void edge_break(struct tty_struct *tty, int break_state)
diff --git a/drivers/usb/serial/mos7720.c b/drivers/usb/serial/mos7720.c
index 7289d46c3164..4012b448388a 100644
--- a/drivers/usb/serial/mos7720.c
+++ b/drivers/usb/serial/mos7720.c
@@ -1634,17 +1634,9 @@ static int mos7720_tiocmset(struct tty_struct *tty,
 	return 0;
 }
 
-static int get_serial_info(struct tty_struct *tty,
-			   struct serial_struct *ss)
+static void get_serial_info(struct tty_struct *tty, struct serial_struct *ss)
 {
-	struct usb_serial_port *port = tty->driver_data;
-
-	ss->type		= PORT_16550A;
-	ss->line		= port->minor;
-	ss->close_delay		= 50;
-	ss->closing_wait	= 3000;
-
-	return 0;
+	ss->type = PORT_16550A;
 }
 
 static int mos7720_ioctl(struct tty_struct *tty,
diff --git a/drivers/usb/serial/mos7840.c b/drivers/usb/serial/mos7840.c
index 77cbe18a1629..d20fb0a678dc 100644
--- a/drivers/usb/serial/mos7840.c
+++ b/drivers/usb/serial/mos7840.c
@@ -1388,17 +1388,9 @@ static int mos7840_get_lsr_info(struct tty_struct *tty,
  *      function to get information about serial port
  *****************************************************************************/
 
-static int mos7840_get_serial_info(struct tty_struct *tty,
-				   struct serial_struct *ss)
+static void mos7840_get_serial_info(struct tty_struct *tty, struct serial_struct *ss)
 {
-	struct usb_serial_port *port = tty->driver_data;
-
 	ss->type = PORT_16550A;
-	ss->line = port->minor;
-	ss->close_delay = 50;
-	ss->closing_wait = 3000;
-
-	return 0;
 }
 
 /*****************************************************************************
diff --git a/drivers/usb/serial/opticon.c b/drivers/usb/serial/opticon.c
index 1c7e5dc2c272..db84afcf7f1a 100644
--- a/drivers/usb/serial/opticon.c
+++ b/drivers/usb/serial/opticon.c
@@ -352,18 +352,10 @@ static int opticon_tiocmset(struct tty_struct *tty,
 	return 0;
 }
 
-static int get_serial_info(struct tty_struct *tty,
-			   struct serial_struct *ss)
+static void get_serial_info(struct tty_struct *tty, struct serial_struct *ss)
 {
-	struct usb_serial_port *port = tty->driver_data;
-
 	/* fake emulate a 16550 uart to make userspace code happy */
-	ss->type		= PORT_16550A;
-	ss->line		= port->minor;
-	ss->close_delay		= 50;
-	ss->closing_wait	= 3000;
-
-	return 0;
+	ss->type = PORT_16550A;
 }
 
 static int opticon_port_probe(struct usb_serial_port *port)
diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c
index c6969ca72839..3e79a543d3e7 100644
--- a/drivers/usb/serial/option.c
+++ b/drivers/usb/serial/option.c
@@ -2095,8 +2095,6 @@ static struct usb_serial_driver option_1port_device = {
 	.chars_in_buffer   = usb_wwan_chars_in_buffer,
 	.tiocmget          = usb_wwan_tiocmget,
 	.tiocmset          = usb_wwan_tiocmset,
-	.get_serial        = usb_wwan_get_serial_info,
-	.set_serial        = usb_wwan_set_serial_info,
 	.attach            = option_attach,
 	.release           = option_release,
 	.port_probe        = usb_wwan_port_probe,
diff --git a/drivers/usb/serial/pl2303.c b/drivers/usb/serial/pl2303.c
index 1bb870ca7044..64f08a45eb46 100644
--- a/drivers/usb/serial/pl2303.c
+++ b/drivers/usb/serial/pl2303.c
@@ -1048,17 +1048,9 @@ static int pl2303_carrier_raised(struct usb_serial_port *port)
 	return 0;
 }
 
-static int pl2303_get_serial(struct tty_struct *tty,
-			struct serial_struct *ss)
+static void pl2303_get_serial(struct tty_struct *tty, struct serial_struct *ss)
 {
-	struct usb_serial_port *port = tty->driver_data;
-
 	ss->type = PORT_16654;
-	ss->line = port->minor;
-	ss->close_delay = 50;
-	ss->closing_wait = 3000;
-
-	return 0;
 }
 
 static void pl2303_set_break(struct usb_serial_port *port, bool enable)
diff --git a/drivers/usb/serial/quatech2.c b/drivers/usb/serial/quatech2.c
index 0d23e565e0d2..5f2e7f668e68 100644
--- a/drivers/usb/serial/quatech2.c
+++ b/drivers/usb/serial/quatech2.c
@@ -453,18 +453,6 @@ static void qt2_disconnect(struct usb_serial *serial)
 	usb_kill_urb(serial_priv->read_urb);
 }
 
-static int get_serial_info(struct tty_struct *tty,
-			   struct serial_struct *ss)
-{
-	struct usb_serial_port *port = tty->driver_data;
-
-	ss->line		= port->minor;
-	ss->close_delay		= 50;
-	ss->closing_wait	= 3000;
-
-	return 0;
-}
-
 static void qt2_process_status(struct usb_serial_port *port, unsigned char *ch)
 {
 	switch (*ch) {
@@ -975,7 +963,6 @@ static struct usb_serial_driver qt2_device = {
 	.tiocmset            = qt2_tiocmset,
 	.tiocmiwait          = usb_serial_generic_tiocmiwait,
 	.get_icount	     = usb_serial_generic_get_icount,
-	.get_serial          = get_serial_info,
 	.set_termios         = qt2_set_termios,
 };
 
diff --git a/drivers/usb/serial/ssu100.c b/drivers/usb/serial/ssu100.c
index c4616c37f33f..3baf7c0f5a98 100644
--- a/drivers/usb/serial/ssu100.c
+++ b/drivers/usb/serial/ssu100.c
@@ -331,18 +331,6 @@ static int ssu100_open(struct tty_struct *tty, struct usb_serial_port *port)
 	return usb_serial_generic_open(tty, port);
 }
 
-static int get_serial_info(struct tty_struct *tty,
-			   struct serial_struct *ss)
-{
-	struct usb_serial_port *port = tty->driver_data;
-
-	ss->line		= port->minor;
-	ss->close_delay		= 50;
-	ss->closing_wait	= 3000;
-
-	return 0;
-}
-
 static int ssu100_attach(struct usb_serial *serial)
 {
 	return ssu100_initdevice(serial->dev);
@@ -542,7 +530,6 @@ static struct usb_serial_driver ssu100_device = {
 	.tiocmset            = ssu100_tiocmset,
 	.tiocmiwait          = usb_serial_generic_tiocmiwait,
 	.get_icount	     = usb_serial_generic_get_icount,
-	.get_serial          = get_serial_info,
 	.set_termios         = ssu100_set_termios,
 };
 
diff --git a/drivers/usb/serial/ti_usb_3410_5052.c b/drivers/usb/serial/ti_usb_3410_5052.c
index bb50098a0ce6..6df316bdb40f 100644
--- a/drivers/usb/serial/ti_usb_3410_5052.c
+++ b/drivers/usb/serial/ti_usb_3410_5052.c
@@ -328,10 +328,7 @@ static void ti_recv(struct usb_serial_port *port, unsigned char *data,
 static void ti_send(struct ti_port *tport);
 static int ti_set_mcr(struct ti_port *tport, unsigned int mcr);
 static int ti_get_lsr(struct ti_port *tport, u8 *lsr);
-static int ti_get_serial_info(struct tty_struct *tty,
-	struct serial_struct *ss);
-static int ti_set_serial_info(struct tty_struct *tty,
-	struct serial_struct *ss);
+static void ti_get_serial_info(struct tty_struct *tty, struct serial_struct *ss);
 static void ti_handle_new_msr(struct ti_port *tport, u8 msr);
 
 static void ti_stop_read(struct ti_port *tport, struct tty_struct *tty);
@@ -435,7 +432,6 @@ static struct usb_serial_driver ti_1port_device = {
 	.throttle		= ti_throttle,
 	.unthrottle		= ti_unthrottle,
 	.get_serial		= ti_get_serial_info,
-	.set_serial		= ti_set_serial_info,
 	.set_termios		= ti_set_termios,
 	.tiocmget		= ti_tiocmget,
 	.tiocmset		= ti_tiocmset,
@@ -469,7 +465,6 @@ static struct usb_serial_driver ti_2port_device = {
 	.throttle		= ti_throttle,
 	.unthrottle		= ti_unthrottle,
 	.get_serial		= ti_get_serial_info,
-	.set_serial		= ti_set_serial_info,
 	.set_termios		= ti_set_termios,
 	.tiocmget		= ti_tiocmget,
 	.tiocmset		= ti_tiocmset,
@@ -1393,46 +1388,13 @@ free_data:
 }
 
 
-static int ti_get_serial_info(struct tty_struct *tty,
-	struct serial_struct *ss)
+static void ti_get_serial_info(struct tty_struct *tty, struct serial_struct *ss)
 {
 	struct usb_serial_port *port = tty->driver_data;
 	struct ti_port *tport = usb_get_serial_port_data(port);
-	unsigned cwait;
-
-	cwait = port->port.closing_wait;
-	if (cwait != ASYNC_CLOSING_WAIT_NONE)
-		cwait = jiffies_to_msecs(cwait) / 10;
 
 	ss->type = PORT_16550A;
-	ss->line = port->minor;
 	ss->baud_base = tport->tp_tdev->td_is_3410 ? 921600 : 460800;
-	ss->close_delay = 50;
-	ss->closing_wait = cwait;
-
-	return 0;
-}
-
-
-static int ti_set_serial_info(struct tty_struct *tty,
-	struct serial_struct *ss)
-{
-	struct usb_serial_port *port = tty->driver_data;
-	struct tty_port *tport = &port->port;
-	unsigned cwait;
-
-	cwait = ss->closing_wait;
-	if (cwait != ASYNC_CLOSING_WAIT_NONE)
-		cwait = msecs_to_jiffies(10 * ss->closing_wait);
-
-	if (!capable(CAP_SYS_ADMIN)) {
-		if (cwait != tport->closing_wait)
-			return -EPERM;
-	}
-
-	tport->closing_wait = cwait;
-
-	return 0;
 }
 
 
diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c
index f53a830f4094..255f562ef1a0 100644
--- a/drivers/usb/serial/usb-serial.c
+++ b/drivers/usb/serial/usb-serial.c
@@ -437,19 +437,62 @@ static void serial_unthrottle(struct tty_struct *tty)
 static int serial_get_serial(struct tty_struct *tty, struct serial_struct *ss)
 {
 	struct usb_serial_port *port = tty->driver_data;
+	struct tty_port *tport = &port->port;
+	unsigned int close_delay, closing_wait;
+
+	mutex_lock(&tport->mutex);
+
+	close_delay = jiffies_to_msecs(tport->close_delay) / 10;
+	closing_wait = tport->closing_wait;
+	if (closing_wait != ASYNC_CLOSING_WAIT_NONE)
+		closing_wait = jiffies_to_msecs(closing_wait) / 10;
+
+	ss->line = port->minor;
+	ss->close_delay = close_delay;
+	ss->closing_wait = closing_wait;
 
 	if (port->serial->type->get_serial)
-		return port->serial->type->get_serial(tty, ss);
-	return -ENOTTY;
+		port->serial->type->get_serial(tty, ss);
+
+	mutex_unlock(&tport->mutex);
+
+	return 0;
 }
 
 static int serial_set_serial(struct tty_struct *tty, struct serial_struct *ss)
 {
 	struct usb_serial_port *port = tty->driver_data;
+	struct tty_port *tport = &port->port;
+	unsigned int close_delay, closing_wait;
+	int ret = 0;
+
+	close_delay = msecs_to_jiffies(ss->close_delay * 10);
+	closing_wait = ss->closing_wait;
+	if (closing_wait != ASYNC_CLOSING_WAIT_NONE)
+		closing_wait = msecs_to_jiffies(closing_wait * 10);
+
+	mutex_lock(&tport->mutex);
+
+	if (!capable(CAP_SYS_ADMIN)) {
+		if (close_delay != tport->close_delay ||
+				closing_wait != tport->closing_wait) {
+			ret = -EPERM;
+			goto out_unlock;
+		}
+	}
 
-	if (port->serial->type->set_serial)
-		return port->serial->type->set_serial(tty, ss);
-	return -ENOTTY;
+	if (port->serial->type->set_serial) {
+		ret = port->serial->type->set_serial(tty, ss);
+		if (ret)
+			goto out_unlock;
+	}
+
+	tport->close_delay = close_delay;
+	tport->closing_wait = closing_wait;
+out_unlock:
+	mutex_unlock(&tport->mutex);
+
+	return ret;
 }
 
 static int serial_ioctl(struct tty_struct *tty,
diff --git a/drivers/usb/serial/usb-wwan.h b/drivers/usb/serial/usb-wwan.h
index 79dafd98e0a1..b5331d03092f 100644
--- a/drivers/usb/serial/usb-wwan.h
+++ b/drivers/usb/serial/usb-wwan.h
@@ -15,10 +15,6 @@ extern int usb_wwan_write_room(struct tty_struct *tty);
 extern int usb_wwan_tiocmget(struct tty_struct *tty);
 extern int usb_wwan_tiocmset(struct tty_struct *tty,
 			     unsigned int set, unsigned int clear);
-extern int usb_wwan_get_serial_info(struct tty_struct *tty,
-			   struct serial_struct *ss);
-extern int usb_wwan_set_serial_info(struct tty_struct *tty,
-			   struct serial_struct *ss);
 extern int usb_wwan_write(struct tty_struct *tty, struct usb_serial_port *port,
 			  const unsigned char *buf, int count);
 extern int usb_wwan_chars_in_buffer(struct tty_struct *tty);
diff --git a/drivers/usb/serial/usb_wwan.c b/drivers/usb/serial/usb_wwan.c
index 4ea315e5e69b..3eb72c59ede6 100644
--- a/drivers/usb/serial/usb_wwan.c
+++ b/drivers/usb/serial/usb_wwan.c
@@ -132,48 +132,6 @@ int usb_wwan_tiocmset(struct tty_struct *tty,
 }
 EXPORT_SYMBOL(usb_wwan_tiocmset);
 
-int usb_wwan_get_serial_info(struct tty_struct *tty,
-			   struct serial_struct *ss)
-{
-	struct usb_serial_port *port = tty->driver_data;
-
-	ss->line            = port->minor;
-	ss->close_delay	    = jiffies_to_msecs(port->port.close_delay) / 10;
-	ss->closing_wait    = port->port.closing_wait == ASYNC_CLOSING_WAIT_NONE ?
-				 ASYNC_CLOSING_WAIT_NONE :
-				 jiffies_to_msecs(port->port.closing_wait) / 10;
-	return 0;
-}
-EXPORT_SYMBOL(usb_wwan_get_serial_info);
-
-int usb_wwan_set_serial_info(struct tty_struct *tty,
-			   struct serial_struct *ss)
-{
-	struct usb_serial_port *port = tty->driver_data;
-	unsigned int closing_wait, close_delay;
-	int retval = 0;
-
-	close_delay = msecs_to_jiffies(ss->close_delay * 10);
-	closing_wait = ss->closing_wait == ASYNC_CLOSING_WAIT_NONE ?
-			ASYNC_CLOSING_WAIT_NONE :
-			msecs_to_jiffies(ss->closing_wait * 10);
-
-	mutex_lock(&port->port.mutex);
-
-	if (!capable(CAP_SYS_ADMIN)) {
-		if ((close_delay != port->port.close_delay) ||
-		    (closing_wait != port->port.closing_wait))
-			retval = -EPERM;
-	} else {
-		port->port.close_delay  = close_delay;
-		port->port.closing_wait = closing_wait;
-	}
-
-	mutex_unlock(&port->port.mutex);
-	return retval;
-}
-EXPORT_SYMBOL(usb_wwan_set_serial_info);
-
 int usb_wwan_write(struct tty_struct *tty, struct usb_serial_port *port,
 		   const unsigned char *buf, int count)
 {
diff --git a/drivers/usb/serial/whiteheat.c b/drivers/usb/serial/whiteheat.c
index c8b10faa2ff8..6a95c5a0056f 100644
--- a/drivers/usb/serial/whiteheat.c
+++ b/drivers/usb/serial/whiteheat.c
@@ -83,7 +83,7 @@ static void whiteheat_port_remove(struct usb_serial_port *port);
 static int  whiteheat_open(struct tty_struct *tty,
 			struct usb_serial_port *port);
 static void whiteheat_close(struct usb_serial_port *port);
-static int  whiteheat_get_serial(struct tty_struct *tty,
+static void whiteheat_get_serial(struct tty_struct *tty,
 			struct serial_struct *ss);
 static void whiteheat_set_termios(struct tty_struct *tty,
 			struct usb_serial_port *port, struct ktermios *old);
@@ -439,18 +439,10 @@ static int whiteheat_tiocmset(struct tty_struct *tty,
 }
 
 
-static int whiteheat_get_serial(struct tty_struct *tty,
-				struct serial_struct *ss)
+static void whiteheat_get_serial(struct tty_struct *tty, struct serial_struct *ss)
 {
-	struct usb_serial_port *port = tty->driver_data;
-
 	ss->type = PORT_16654;
-	ss->line = port->minor;
 	ss->baud_base = 460800;
-	ss->close_delay = 50;
-	ss->closing_wait = 3000;
-
-	return 0;
 }
 
 
diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h
index e9b90577f50b..8c63fa9bfc74 100644
--- a/include/linux/usb/serial.h
+++ b/include/linux/usb/serial.h
@@ -279,7 +279,7 @@ struct usb_serial_driver {
 	int  (*write_room)(struct tty_struct *tty);
 	int  (*ioctl)(struct tty_struct *tty,
 		      unsigned int cmd, unsigned long arg);
-	int  (*get_serial)(struct tty_struct *tty, struct serial_struct *ss);
+	void (*get_serial)(struct tty_struct *tty, struct serial_struct *ss);
 	int  (*set_serial)(struct tty_struct *tty, struct serial_struct *ss);
 	void (*set_termios)(struct tty_struct *tty,
 			struct usb_serial_port *port, struct ktermios *old);
-- 
cgit v1.2.3


From fd921693fe989afe82600d97b37f54c942a6db6c Mon Sep 17 00:00:00 2001
From: David Stevens <stevensd@chromium.org>
Date: Thu, 8 Apr 2021 18:54:28 +0900
Subject: drm/syncobj: use newly allocated stub fences
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Allocate a new private stub fence in drm_syncobj_assign_null_handle,
instead of using a static stub fence.

When userspace creates a fence with DRM_SYNCOBJ_CREATE_SIGNALED or when
userspace signals a fence via DRM_IOCTL_SYNCOBJ_SIGNAL, the timestamp
obtained when the fence is exported and queried with SYNC_IOC_FILE_INFO
should match when the fence's status was changed from the perspective of
userspace, which is during the respective ioctl.

When a static stub fence started being used by these ioctls, this
behavior changed. Instead, the timestamp returned by SYNC_IOC_FILE_INFO
became the first time anything used the static stub fence, which has no
meaning to userspace.

Signed-off-by: David Stevens <stevensd@chromium.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20210408095428.3983055-1-stevensd@google.com
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/dma-buf/dma-fence.c   | 27 ++++++++++++++++++++++++++-
 drivers/gpu/drm/drm_syncobj.c | 25 +++++++++++++++++++------
 include/linux/dma-fence.h     |  1 +
 3 files changed, 46 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c
index d64fc03929be..ce0f5eff575d 100644
--- a/drivers/dma-buf/dma-fence.c
+++ b/drivers/dma-buf/dma-fence.c
@@ -123,7 +123,9 @@ static const struct dma_fence_ops dma_fence_stub_ops = {
 /**
  * dma_fence_get_stub - return a signaled fence
  *
- * Return a stub fence which is already signaled.
+ * Return a stub fence which is already signaled. The fence's
+ * timestamp corresponds to the first time after boot this
+ * function is called.
  */
 struct dma_fence *dma_fence_get_stub(void)
 {
@@ -141,6 +143,29 @@ struct dma_fence *dma_fence_get_stub(void)
 }
 EXPORT_SYMBOL(dma_fence_get_stub);
 
+/**
+ * dma_fence_allocate_private_stub - return a private, signaled fence
+ *
+ * Return a newly allocated and signaled stub fence.
+ */
+struct dma_fence *dma_fence_allocate_private_stub(void)
+{
+	struct dma_fence *fence;
+
+	fence = kzalloc(sizeof(*fence), GFP_KERNEL);
+	if (fence == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	dma_fence_init(fence,
+		       &dma_fence_stub_ops,
+		       &dma_fence_stub_lock,
+		       0, 0);
+	dma_fence_signal(fence);
+
+	return fence;
+}
+EXPORT_SYMBOL(dma_fence_allocate_private_stub);
+
 /**
  * dma_fence_context_alloc - allocate an array of fence contexts
  * @num: amount of contexts to allocate
diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c
index 6231a8214c25..fdd2ec87cdd1 100644
--- a/drivers/gpu/drm/drm_syncobj.c
+++ b/drivers/gpu/drm/drm_syncobj.c
@@ -350,12 +350,16 @@ EXPORT_SYMBOL(drm_syncobj_replace_fence);
  *
  * Assign a already signaled stub fence to the sync object.
  */
-static void drm_syncobj_assign_null_handle(struct drm_syncobj *syncobj)
+static int drm_syncobj_assign_null_handle(struct drm_syncobj *syncobj)
 {
-	struct dma_fence *fence = dma_fence_get_stub();
+	struct dma_fence *fence = dma_fence_allocate_private_stub();
+
+	if (IS_ERR(fence))
+		return PTR_ERR(fence);
 
 	drm_syncobj_replace_fence(syncobj, fence);
 	dma_fence_put(fence);
+	return 0;
 }
 
 /* 5s default for wait submission */
@@ -478,6 +482,7 @@ EXPORT_SYMBOL(drm_syncobj_free);
 int drm_syncobj_create(struct drm_syncobj **out_syncobj, uint32_t flags,
 		       struct dma_fence *fence)
 {
+	int ret;
 	struct drm_syncobj *syncobj;
 
 	syncobj = kzalloc(sizeof(struct drm_syncobj), GFP_KERNEL);
@@ -488,8 +493,13 @@ int drm_syncobj_create(struct drm_syncobj **out_syncobj, uint32_t flags,
 	INIT_LIST_HEAD(&syncobj->cb_list);
 	spin_lock_init(&syncobj->lock);
 
-	if (flags & DRM_SYNCOBJ_CREATE_SIGNALED)
-		drm_syncobj_assign_null_handle(syncobj);
+	if (flags & DRM_SYNCOBJ_CREATE_SIGNALED) {
+		ret = drm_syncobj_assign_null_handle(syncobj);
+		if (ret < 0) {
+			drm_syncobj_put(syncobj);
+			return ret;
+		}
+	}
 
 	if (fence)
 		drm_syncobj_replace_fence(syncobj, fence);
@@ -1334,8 +1344,11 @@ drm_syncobj_signal_ioctl(struct drm_device *dev, void *data,
 	if (ret < 0)
 		return ret;
 
-	for (i = 0; i < args->count_handles; i++)
-		drm_syncobj_assign_null_handle(syncobjs[i]);
+	for (i = 0; i < args->count_handles; i++) {
+		ret = drm_syncobj_assign_null_handle(syncobjs[i]);
+		if (ret < 0)
+			break;
+	}
 
 	drm_syncobj_array_free(syncobjs, args->count_handles);
 
diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h
index 9f12efaaa93a..6ffb4b2c6371 100644
--- a/include/linux/dma-fence.h
+++ b/include/linux/dma-fence.h
@@ -587,6 +587,7 @@ static inline signed long dma_fence_wait(struct dma_fence *fence, bool intr)
 }
 
 struct dma_fence *dma_fence_get_stub(void);
+struct dma_fence *dma_fence_allocate_private_stub(void);
 u64 dma_fence_context_alloc(unsigned num);
 
 #define DMA_FENCE_TRACE(f, fmt, args...) \
-- 
cgit v1.2.3


From 7c566bb5e4d5fb0d89579a90d8a1f54eaff6f95d Mon Sep 17 00:00:00 2001
From: Hector Martin <marcan@marcan.st>
Date: Thu, 11 Feb 2021 21:35:46 +0900
Subject: asm-generic/io.h:  Add a non-posted variant of ioremap()

ARM64 currently defaults to posted MMIO (nGnRE), but some devices
require the use of non-posted MMIO (nGnRnE). Introduce a new ioremap()
variant to handle this case. ioremap_np() returns NULL on arches that
do not implement this variant.

sparc64 is the only architecture that needs to be touched directly,
because it includes neither of the generic io.h or iomap.h headers.

This adds the IORESOURCE_MEM_NONPOSTED flag, which maps to this
variant and marks a given resource as requiring non-posted mappings.
This is implemented in the resource system because it is a SoC-level
requirement, so existing drivers do not need special-case code to pick
this ioremap variant.

Then this is implemented in devres by introducing devm_ioremap_np(),
and making devm_ioremap_resource() automatically select this variant
when the resource has the IORESOURCE_MEM_NONPOSTED flag set.

Acked-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Hector Martin <marcan@marcan.st>
---
 Documentation/driver-api/driver-model/devres.rst |  1 +
 arch/sparc/include/asm/io_64.h                   |  4 ++++
 include/asm-generic/io.h                         | 22 +++++++++++++++++++++-
 include/asm-generic/iomap.h                      |  9 +++++++++
 include/linux/io.h                               |  2 ++
 include/linux/ioport.h                           |  1 +
 lib/devres.c                                     | 22 ++++++++++++++++++++++
 7 files changed, 60 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst
index cd8b6e657b94..2f45877a539d 100644
--- a/Documentation/driver-api/driver-model/devres.rst
+++ b/Documentation/driver-api/driver-model/devres.rst
@@ -309,6 +309,7 @@ IOMAP
   devm_ioremap()
   devm_ioremap_uc()
   devm_ioremap_wc()
+  devm_ioremap_np()
   devm_ioremap_resource() : checks resource, requests memory region, ioremaps
   devm_ioremap_resource_wc()
   devm_platform_ioremap_resource() : calls devm_ioremap_resource() for platform device
diff --git a/arch/sparc/include/asm/io_64.h b/arch/sparc/include/asm/io_64.h
index 9bb27e5c22f1..9fbfc9574432 100644
--- a/arch/sparc/include/asm/io_64.h
+++ b/arch/sparc/include/asm/io_64.h
@@ -409,6 +409,10 @@ static inline void __iomem *ioremap(unsigned long offset, unsigned long size)
 #define ioremap_uc(X,Y)			ioremap((X),(Y))
 #define ioremap_wc(X,Y)			ioremap((X),(Y))
 #define ioremap_wt(X,Y)			ioremap((X),(Y))
+static inline void __iomem *ioremap_np(unsigned long offset, unsigned long size)
+{
+	return NULL;
+}
 
 static inline void iounmap(volatile void __iomem *addr)
 {
diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h
index c6af40ce03be..082e0c96db6e 100644
--- a/include/asm-generic/io.h
+++ b/include/asm-generic/io.h
@@ -942,7 +942,9 @@ static inline void *phys_to_virt(unsigned long address)
  *
  * ioremap_wc() and ioremap_wt() can provide more relaxed caching attributes
  * for specific drivers if the architecture choses to implement them.  If they
- * are not implemented we fall back to plain ioremap.
+ * are not implemented we fall back to plain ioremap. Conversely, ioremap_np()
+ * can provide stricter non-posted write semantics if the architecture
+ * implements them.
  */
 #ifndef CONFIG_MMU
 #ifndef ioremap
@@ -993,6 +995,24 @@ static inline void __iomem *ioremap_uc(phys_addr_t offset, size_t size)
 {
 	return NULL;
 }
+
+/*
+ * ioremap_np needs an explicit architecture implementation, as it
+ * requests stronger semantics than regular ioremap(). Portable drivers
+ * should instead use one of the higher-level abstractions, like
+ * devm_ioremap_resource(), to choose the correct variant for any given
+ * device and bus. Portable drivers with a good reason to want non-posted
+ * write semantics should always provide an ioremap() fallback in case
+ * ioremap_np() is not available.
+ */
+#ifndef ioremap_np
+#define ioremap_np ioremap_np
+static inline void __iomem *ioremap_np(phys_addr_t offset, size_t size)
+{
+	return NULL;
+}
+#endif
+
 #endif
 
 #ifdef CONFIG_HAS_IOPORT_MAP
diff --git a/include/asm-generic/iomap.h b/include/asm-generic/iomap.h
index 649224664969..9b3eb6d86200 100644
--- a/include/asm-generic/iomap.h
+++ b/include/asm-generic/iomap.h
@@ -101,6 +101,15 @@ extern void ioport_unmap(void __iomem *);
 #define ioremap_wt ioremap
 #endif
 
+#ifndef ARCH_HAS_IOREMAP_NP
+/* See the comment in asm-generic/io.h about ioremap_np(). */
+#define ioremap_np ioremap_np
+static inline void __iomem *ioremap_np(phys_addr_t offset, size_t size)
+{
+	return NULL;
+}
+#endif
+
 #ifdef CONFIG_PCI
 /* Destroy a virtual mapping cookie for a PCI BAR (memory or IO) */
 struct pci_dev;
diff --git a/include/linux/io.h b/include/linux/io.h
index 8394c56babc2..d718354ed3e1 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
@@ -68,6 +68,8 @@ void __iomem *devm_ioremap_uc(struct device *dev, resource_size_t offset,
 				   resource_size_t size);
 void __iomem *devm_ioremap_wc(struct device *dev, resource_size_t offset,
 				   resource_size_t size);
+void __iomem *devm_ioremap_np(struct device *dev, resource_size_t offset,
+				   resource_size_t size);
 void devm_iounmap(struct device *dev, void __iomem *addr);
 int check_signature(const volatile void __iomem *io_addr,
 			const unsigned char *signature, int length);
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 55de385c839c..1de6c2e40c32 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -108,6 +108,7 @@ struct resource {
 #define IORESOURCE_MEM_32BIT		(3<<3)
 #define IORESOURCE_MEM_SHADOWABLE	(1<<5)	/* dup: IORESOURCE_SHADOWABLE */
 #define IORESOURCE_MEM_EXPANSIONROM	(1<<6)
+#define IORESOURCE_MEM_NONPOSTED	(1<<7)
 
 /* PnP I/O specific bits (IORESOURCE_BITS) */
 #define IORESOURCE_IO_16BIT_ADDR	(1<<0)
diff --git a/lib/devres.c b/lib/devres.c
index 2a4ff5d64288..4679dbb1bf5f 100644
--- a/lib/devres.c
+++ b/lib/devres.c
@@ -10,6 +10,7 @@ enum devm_ioremap_type {
 	DEVM_IOREMAP = 0,
 	DEVM_IOREMAP_UC,
 	DEVM_IOREMAP_WC,
+	DEVM_IOREMAP_NP,
 };
 
 void devm_ioremap_release(struct device *dev, void *res)
@@ -42,6 +43,9 @@ static void __iomem *__devm_ioremap(struct device *dev, resource_size_t offset,
 	case DEVM_IOREMAP_WC:
 		addr = ioremap_wc(offset, size);
 		break;
+	case DEVM_IOREMAP_NP:
+		addr = ioremap_np(offset, size);
+		break;
 	}
 
 	if (addr) {
@@ -98,6 +102,21 @@ void __iomem *devm_ioremap_wc(struct device *dev, resource_size_t offset,
 }
 EXPORT_SYMBOL(devm_ioremap_wc);
 
+/**
+ * devm_ioremap_np - Managed ioremap_np()
+ * @dev: Generic device to remap IO address for
+ * @offset: Resource address to map
+ * @size: Size of map
+ *
+ * Managed ioremap_np().  Map is automatically unmapped on driver detach.
+ */
+void __iomem *devm_ioremap_np(struct device *dev, resource_size_t offset,
+			      resource_size_t size)
+{
+	return __devm_ioremap(dev, offset, size, DEVM_IOREMAP_NP);
+}
+EXPORT_SYMBOL(devm_ioremap_np);
+
 /**
  * devm_iounmap - Managed iounmap()
  * @dev: Generic device to unmap for
@@ -128,6 +147,9 @@ __devm_ioremap_resource(struct device *dev, const struct resource *res,
 		return IOMEM_ERR_PTR(-EINVAL);
 	}
 
+	if (type == DEVM_IOREMAP && res->flags & IORESOURCE_MEM_NONPOSTED)
+		type = DEVM_IOREMAP_NP;
+
 	size = resource_size(res);
 
 	if (res->name)
-- 
cgit v1.2.3


From b10eb2d50911f98a8f1cacf00b1b677339593f4c Mon Sep 17 00:00:00 2001
From: Hector Martin <marcan@marcan.st>
Date: Thu, 25 Mar 2021 22:50:19 +0900
Subject: asm-generic/io.h: implement pci_remap_cfgspace using ioremap_np

Now that we have ioremap_np(), we can make pci_remap_cfgspace() default
to it, falling back to ioremap() on platforms where it is not available.

Remove the arm64 implementation, since that is now redundant. Future
cleanups should be able to do the same for other arches, and eventually
make the generic pci_remap_cfgspace() unconditional.

Acked-by: Will Deacon <will@kernel.org>
Signed-off-by: Hector Martin <marcan@marcan.st>
---
 arch/arm64/include/asm/io.h | 10 ----------
 include/linux/io.h          | 16 ++++++++--------
 2 files changed, 8 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h
index 953b8703af60..7fd836bea7eb 100644
--- a/arch/arm64/include/asm/io.h
+++ b/arch/arm64/include/asm/io.h
@@ -171,16 +171,6 @@ extern void __iomem *ioremap_cache(phys_addr_t phys_addr, size_t size);
 #define ioremap_wc(addr, size)		__ioremap((addr), (size), __pgprot(PROT_NORMAL_NC))
 #define ioremap_np(addr, size)		__ioremap((addr), (size), __pgprot(PROT_DEVICE_nGnRnE))
 
-/*
- * PCI configuration space mapping function.
- *
- * The PCI specification disallows posted write configuration transactions.
- * Add an arch specific pci_remap_cfgspace() definition that is implemented
- * through nGnRnE device memory attribute as recommended by the ARM v8
- * Architecture reference manual Issue A.k B2.8.2 "Device memory".
- */
-#define pci_remap_cfgspace(addr, size) __ioremap((addr), (size), __pgprot(PROT_DEVICE_nGnRnE))
-
 /*
  * io{read,write}{16,32,64}be() macros
  */
diff --git a/include/linux/io.h b/include/linux/io.h
index d718354ed3e1..61ff7d6278b6 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
@@ -82,20 +82,20 @@ void devm_memunmap(struct device *dev, void *addr);
 #ifdef CONFIG_PCI
 /*
  * The PCI specifications (Rev 3.0, 3.2.5 "Transaction Ordering and
- * Posting") mandate non-posted configuration transactions. There is
- * no ioremap API in the kernel that can guarantee non-posted write
- * semantics across arches so provide a default implementation for
- * mapping PCI config space that defaults to ioremap(); arches
- * should override it if they have memory mapping implementations that
- * guarantee non-posted writes semantics to make the memory mapping
- * compliant with the PCI specification.
+ * Posting") mandate non-posted configuration transactions. This default
+ * implementation attempts to use the ioremap_np() API to provide this
+ * on arches that support it, and falls back to ioremap() on those that
+ * don't. Overriding this function is deprecated; arches that properly
+ * support non-posted accesses should implement ioremap_np() instead, which
+ * this default implementation can then use to return mappings compliant with
+ * the PCI specification.
  */
 #ifndef pci_remap_cfgspace
 #define pci_remap_cfgspace pci_remap_cfgspace
 static inline void __iomem *pci_remap_cfgspace(phys_addr_t offset,
 					       size_t size)
 {
-	return ioremap(offset, size);
+	return ioremap_np(offset, size) ?: ioremap(offset, size);
 }
 #endif
 #endif
-- 
cgit v1.2.3


From 8a657f71705f9f9c2bf8308e2cfd57b9f329e0d9 Mon Sep 17 00:00:00 2001
From: Hector Martin <marcan@marcan.st>
Date: Mon, 1 Mar 2021 12:36:24 +0900
Subject: arm64: Move ICH_ sysreg bits from arm-gic-v3.h to sysreg.h

These definitions are in arm-gic-v3.h for historical reasons which no
longer apply. Move them to sysreg.h so the AIC driver can use them, as
it needs to peek into vGIC registers to deal with the GIC maintenance
interrupt.

Acked-by: Marc Zyngier <maz@kernel.org>
Acked-by: Will Deacon <will@kernel.org>
Signed-off-by: Hector Martin <marcan@marcan.st>
---
 arch/arm64/include/asm/sysreg.h    | 60 ++++++++++++++++++++++++++++++++++++++
 include/linux/irqchip/arm-gic-v3.h | 56 -----------------------------------
 2 files changed, 60 insertions(+), 56 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index d4a5fca984c3..609dc42ec8c8 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -1032,6 +1032,66 @@
 #define TRFCR_ELx_ExTRE			BIT(1)
 #define TRFCR_ELx_E0TRE			BIT(0)
 
+
+/* GIC Hypervisor interface registers */
+/* ICH_MISR_EL2 bit definitions */
+#define ICH_MISR_EOI		(1 << 0)
+#define ICH_MISR_U		(1 << 1)
+
+/* ICH_LR*_EL2 bit definitions */
+#define ICH_LR_VIRTUAL_ID_MASK	((1ULL << 32) - 1)
+
+#define ICH_LR_EOI		(1ULL << 41)
+#define ICH_LR_GROUP		(1ULL << 60)
+#define ICH_LR_HW		(1ULL << 61)
+#define ICH_LR_STATE		(3ULL << 62)
+#define ICH_LR_PENDING_BIT	(1ULL << 62)
+#define ICH_LR_ACTIVE_BIT	(1ULL << 63)
+#define ICH_LR_PHYS_ID_SHIFT	32
+#define ICH_LR_PHYS_ID_MASK	(0x3ffULL << ICH_LR_PHYS_ID_SHIFT)
+#define ICH_LR_PRIORITY_SHIFT	48
+#define ICH_LR_PRIORITY_MASK	(0xffULL << ICH_LR_PRIORITY_SHIFT)
+
+/* ICH_HCR_EL2 bit definitions */
+#define ICH_HCR_EN		(1 << 0)
+#define ICH_HCR_UIE		(1 << 1)
+#define ICH_HCR_NPIE		(1 << 3)
+#define ICH_HCR_TC		(1 << 10)
+#define ICH_HCR_TALL0		(1 << 11)
+#define ICH_HCR_TALL1		(1 << 12)
+#define ICH_HCR_EOIcount_SHIFT	27
+#define ICH_HCR_EOIcount_MASK	(0x1f << ICH_HCR_EOIcount_SHIFT)
+
+/* ICH_VMCR_EL2 bit definitions */
+#define ICH_VMCR_ACK_CTL_SHIFT	2
+#define ICH_VMCR_ACK_CTL_MASK	(1 << ICH_VMCR_ACK_CTL_SHIFT)
+#define ICH_VMCR_FIQ_EN_SHIFT	3
+#define ICH_VMCR_FIQ_EN_MASK	(1 << ICH_VMCR_FIQ_EN_SHIFT)
+#define ICH_VMCR_CBPR_SHIFT	4
+#define ICH_VMCR_CBPR_MASK	(1 << ICH_VMCR_CBPR_SHIFT)
+#define ICH_VMCR_EOIM_SHIFT	9
+#define ICH_VMCR_EOIM_MASK	(1 << ICH_VMCR_EOIM_SHIFT)
+#define ICH_VMCR_BPR1_SHIFT	18
+#define ICH_VMCR_BPR1_MASK	(7 << ICH_VMCR_BPR1_SHIFT)
+#define ICH_VMCR_BPR0_SHIFT	21
+#define ICH_VMCR_BPR0_MASK	(7 << ICH_VMCR_BPR0_SHIFT)
+#define ICH_VMCR_PMR_SHIFT	24
+#define ICH_VMCR_PMR_MASK	(0xffUL << ICH_VMCR_PMR_SHIFT)
+#define ICH_VMCR_ENG0_SHIFT	0
+#define ICH_VMCR_ENG0_MASK	(1 << ICH_VMCR_ENG0_SHIFT)
+#define ICH_VMCR_ENG1_SHIFT	1
+#define ICH_VMCR_ENG1_MASK	(1 << ICH_VMCR_ENG1_SHIFT)
+
+/* ICH_VTR_EL2 bit definitions */
+#define ICH_VTR_PRI_BITS_SHIFT	29
+#define ICH_VTR_PRI_BITS_MASK	(7 << ICH_VTR_PRI_BITS_SHIFT)
+#define ICH_VTR_ID_BITS_SHIFT	23
+#define ICH_VTR_ID_BITS_MASK	(7 << ICH_VTR_ID_BITS_SHIFT)
+#define ICH_VTR_SEIS_SHIFT	22
+#define ICH_VTR_SEIS_MASK	(1 << ICH_VTR_SEIS_SHIFT)
+#define ICH_VTR_A3V_SHIFT	21
+#define ICH_VTR_A3V_MASK	(1 << ICH_VTR_A3V_SHIFT)
+
 #ifdef __ASSEMBLY__
 
 	.irp	num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30
diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
index f6d092fdb93d..81cbf85f73de 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -575,67 +575,11 @@
 #define ICC_SRE_EL1_DFB			(1U << 1)
 #define ICC_SRE_EL1_SRE			(1U << 0)
 
-/*
- * Hypervisor interface registers (SRE only)
- */
-#define ICH_LR_VIRTUAL_ID_MASK		((1ULL << 32) - 1)
-
-#define ICH_LR_EOI			(1ULL << 41)
-#define ICH_LR_GROUP			(1ULL << 60)
-#define ICH_LR_HW			(1ULL << 61)
-#define ICH_LR_STATE			(3ULL << 62)
-#define ICH_LR_PENDING_BIT		(1ULL << 62)
-#define ICH_LR_ACTIVE_BIT		(1ULL << 63)
-#define ICH_LR_PHYS_ID_SHIFT		32
-#define ICH_LR_PHYS_ID_MASK		(0x3ffULL << ICH_LR_PHYS_ID_SHIFT)
-#define ICH_LR_PRIORITY_SHIFT		48
-#define ICH_LR_PRIORITY_MASK		(0xffULL << ICH_LR_PRIORITY_SHIFT)
-
 /* These are for GICv2 emulation only */
 #define GICH_LR_VIRTUALID		(0x3ffUL << 0)
 #define GICH_LR_PHYSID_CPUID_SHIFT	(10)
 #define GICH_LR_PHYSID_CPUID		(7UL << GICH_LR_PHYSID_CPUID_SHIFT)
 
-#define ICH_MISR_EOI			(1 << 0)
-#define ICH_MISR_U			(1 << 1)
-
-#define ICH_HCR_EN			(1 << 0)
-#define ICH_HCR_UIE			(1 << 1)
-#define ICH_HCR_NPIE			(1 << 3)
-#define ICH_HCR_TC			(1 << 10)
-#define ICH_HCR_TALL0			(1 << 11)
-#define ICH_HCR_TALL1			(1 << 12)
-#define ICH_HCR_EOIcount_SHIFT		27
-#define ICH_HCR_EOIcount_MASK		(0x1f << ICH_HCR_EOIcount_SHIFT)
-
-#define ICH_VMCR_ACK_CTL_SHIFT		2
-#define ICH_VMCR_ACK_CTL_MASK		(1 << ICH_VMCR_ACK_CTL_SHIFT)
-#define ICH_VMCR_FIQ_EN_SHIFT		3
-#define ICH_VMCR_FIQ_EN_MASK		(1 << ICH_VMCR_FIQ_EN_SHIFT)
-#define ICH_VMCR_CBPR_SHIFT		4
-#define ICH_VMCR_CBPR_MASK		(1 << ICH_VMCR_CBPR_SHIFT)
-#define ICH_VMCR_EOIM_SHIFT		9
-#define ICH_VMCR_EOIM_MASK		(1 << ICH_VMCR_EOIM_SHIFT)
-#define ICH_VMCR_BPR1_SHIFT		18
-#define ICH_VMCR_BPR1_MASK		(7 << ICH_VMCR_BPR1_SHIFT)
-#define ICH_VMCR_BPR0_SHIFT		21
-#define ICH_VMCR_BPR0_MASK		(7 << ICH_VMCR_BPR0_SHIFT)
-#define ICH_VMCR_PMR_SHIFT		24
-#define ICH_VMCR_PMR_MASK		(0xffUL << ICH_VMCR_PMR_SHIFT)
-#define ICH_VMCR_ENG0_SHIFT		0
-#define ICH_VMCR_ENG0_MASK		(1 << ICH_VMCR_ENG0_SHIFT)
-#define ICH_VMCR_ENG1_SHIFT		1
-#define ICH_VMCR_ENG1_MASK		(1 << ICH_VMCR_ENG1_SHIFT)
-
-#define ICH_VTR_PRI_BITS_SHIFT		29
-#define ICH_VTR_PRI_BITS_MASK		(7 << ICH_VTR_PRI_BITS_SHIFT)
-#define ICH_VTR_ID_BITS_SHIFT		23
-#define ICH_VTR_ID_BITS_MASK		(7 << ICH_VTR_ID_BITS_SHIFT)
-#define ICH_VTR_SEIS_SHIFT		22
-#define ICH_VTR_SEIS_MASK		(1 << ICH_VTR_SEIS_SHIFT)
-#define ICH_VTR_A3V_SHIFT		21
-#define ICH_VTR_A3V_MASK		(1 << ICH_VTR_A3V_SHIFT)
-
 #define ICC_IAR1_EL1_SPURIOUS		0x3ff
 
 #define ICC_SRE_EL2_SRE			(1 << 0)
-- 
cgit v1.2.3


From 76cde26394114f6af2710c6b2ad6854f1e8ee859 Mon Sep 17 00:00:00 2001
From: Hector Martin <marcan@marcan.st>
Date: Thu, 21 Jan 2021 08:55:15 +0900
Subject: irqchip/apple-aic: Add support for the Apple Interrupt Controller

This is the root interrupt controller used on Apple ARM SoCs such as the
M1. This irqchip driver performs multiple functions:

* Handles both IRQs and FIQs

* Drives the AIC peripheral itself (which handles IRQs)

* Dispatches FIQs to downstream hard-wired clients (currently the ARM
  timer).

* Implements a virtual IPI multiplexer to funnel multiple Linux IPIs
  into a single hardware IPI

Reviewed-by: Marc Zyngier <maz@kernel.org>
Acked-by: Will Deacon <will@kernel.org>
Signed-off-by: Hector Martin <marcan@marcan.st>
---
 MAINTAINERS                     |   2 +
 drivers/irqchip/Kconfig         |   8 +
 drivers/irqchip/Makefile        |   1 +
 drivers/irqchip/irq-apple-aic.c | 852 ++++++++++++++++++++++++++++++++++++++++
 include/linux/cpuhotplug.h      |   1 +
 5 files changed, 864 insertions(+)
 create mode 100644 drivers/irqchip/irq-apple-aic.c

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index b26a7e23c512..e27332ec1f12 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1647,6 +1647,8 @@ C:	irc://chat.freenode.net/asahi-dev
 T:	git https://github.com/AsahiLinux/linux.git
 F:	Documentation/devicetree/bindings/arm/apple.yaml
 F:	Documentation/devicetree/bindings/interrupt-controller/apple,aic.yaml
+F:	drivers/irqchip/irq-apple-aic.c
+F:	include/dt-bindings/interrupt-controller/apple-aic.h
 
 ARM/ARTPEC MACHINE SUPPORT
 M:	Jesper Nilsson <jesper.nilsson@axis.com>
diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig
index 15536e321df5..d3a14f304ec8 100644
--- a/drivers/irqchip/Kconfig
+++ b/drivers/irqchip/Kconfig
@@ -577,4 +577,12 @@ config MST_IRQ
 	help
 	  Support MStar Interrupt Controller.
 
+config APPLE_AIC
+	bool "Apple Interrupt Controller (AIC)"
+	depends on ARM64
+	default ARCH_APPLE
+	help
+	  Support for the Apple Interrupt Controller found on Apple Silicon SoCs,
+	  such as the M1.
+
 endmenu
diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile
index c59b95a0532c..eb6a515f0f64 100644
--- a/drivers/irqchip/Makefile
+++ b/drivers/irqchip/Makefile
@@ -113,3 +113,4 @@ obj-$(CONFIG_LOONGSON_PCH_MSI)		+= irq-loongson-pch-msi.o
 obj-$(CONFIG_MST_IRQ)			+= irq-mst-intc.o
 obj-$(CONFIG_SL28CPLD_INTC)		+= irq-sl28cpld.o
 obj-$(CONFIG_MACH_REALTEK_RTL)		+= irq-realtek-rtl.o
+obj-$(CONFIG_APPLE_AIC)			+= irq-apple-aic.o
diff --git a/drivers/irqchip/irq-apple-aic.c b/drivers/irqchip/irq-apple-aic.c
new file mode 100644
index 000000000000..c179e27062fd
--- /dev/null
+++ b/drivers/irqchip/irq-apple-aic.c
@@ -0,0 +1,852 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright The Asahi Linux Contributors
+ *
+ * Based on irq-lpc32xx:
+ *   Copyright 2015-2016 Vladimir Zapolskiy <vz@mleia.com>
+ * Based on irq-bcm2836:
+ *   Copyright 2015 Broadcom
+ */
+
+/*
+ * AIC is a fairly simple interrupt controller with the following features:
+ *
+ * - 896 level-triggered hardware IRQs
+ *   - Single mask bit per IRQ
+ *   - Per-IRQ affinity setting
+ *   - Automatic masking on event delivery (auto-ack)
+ *   - Software triggering (ORed with hw line)
+ * - 2 per-CPU IPIs (meant as "self" and "other", but they are
+ *   interchangeable if not symmetric)
+ * - Automatic prioritization (single event/ack register per CPU, lower IRQs =
+ *   higher priority)
+ * - Automatic masking on ack
+ * - Default "this CPU" register view and explicit per-CPU views
+ *
+ * In addition, this driver also handles FIQs, as these are routed to the same
+ * IRQ vector. These are used for Fast IPIs (TODO), the ARMv8 timer IRQs, and
+ * performance counters (TODO).
+ *
+ * Implementation notes:
+ *
+ * - This driver creates two IRQ domains, one for HW IRQs and internal FIQs,
+ *   and one for IPIs.
+ * - Since Linux needs more than 2 IPIs, we implement a software IRQ controller
+ *   and funnel all IPIs into one per-CPU IPI (the second "self" IPI is unused).
+ * - FIQ hwirq numbers are assigned after true hwirqs, and are per-cpu.
+ * - DT bindings use 3-cell form (like GIC):
+ *   - <0 nr flags> - hwirq #nr
+ *   - <1 nr flags> - FIQ #nr
+ *     - nr=0  Physical HV timer
+ *     - nr=1  Virtual HV timer
+ *     - nr=2  Physical guest timer
+ *     - nr=3  Virtual guest timer
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/bits.h>
+#include <linux/bitfield.h>
+#include <linux/cpuhotplug.h>
+#include <linux/io.h>
+#include <linux/irqchip.h>
+#include <linux/irqdomain.h>
+#include <linux/limits.h>
+#include <linux/of_address.h>
+#include <linux/slab.h>
+#include <asm/exception.h>
+#include <asm/sysreg.h>
+#include <asm/virt.h>
+
+#include <dt-bindings/interrupt-controller/apple-aic.h>
+
+/*
+ * AIC registers (MMIO)
+ */
+
+#define AIC_INFO		0x0004
+#define AIC_INFO_NR_HW		GENMASK(15, 0)
+
+#define AIC_CONFIG		0x0010
+
+#define AIC_WHOAMI		0x2000
+#define AIC_EVENT		0x2004
+#define AIC_EVENT_TYPE		GENMASK(31, 16)
+#define AIC_EVENT_NUM		GENMASK(15, 0)
+
+#define AIC_EVENT_TYPE_HW	1
+#define AIC_EVENT_TYPE_IPI	4
+#define AIC_EVENT_IPI_OTHER	1
+#define AIC_EVENT_IPI_SELF	2
+
+#define AIC_IPI_SEND		0x2008
+#define AIC_IPI_ACK		0x200c
+#define AIC_IPI_MASK_SET	0x2024
+#define AIC_IPI_MASK_CLR	0x2028
+
+#define AIC_IPI_SEND_CPU(cpu)	BIT(cpu)
+
+#define AIC_IPI_OTHER		BIT(0)
+#define AIC_IPI_SELF		BIT(31)
+
+#define AIC_TARGET_CPU		0x3000
+#define AIC_SW_SET		0x4000
+#define AIC_SW_CLR		0x4080
+#define AIC_MASK_SET		0x4100
+#define AIC_MASK_CLR		0x4180
+
+#define AIC_CPU_IPI_SET(cpu)	(0x5008 + ((cpu) << 7))
+#define AIC_CPU_IPI_CLR(cpu)	(0x500c + ((cpu) << 7))
+#define AIC_CPU_IPI_MASK_SET(cpu) (0x5024 + ((cpu) << 7))
+#define AIC_CPU_IPI_MASK_CLR(cpu) (0x5028 + ((cpu) << 7))
+
+#define MASK_REG(x)		(4 * ((x) >> 5))
+#define MASK_BIT(x)		BIT((x) & GENMASK(4, 0))
+
+/*
+ * IMP-DEF sysregs that control FIQ sources
+ * Note: sysreg-based IPIs are not supported yet.
+ */
+
+/* Core PMC control register */
+#define SYS_IMP_APL_PMCR0_EL1		sys_reg(3, 1, 15, 0, 0)
+#define PMCR0_IMODE			GENMASK(10, 8)
+#define PMCR0_IMODE_OFF			0
+#define PMCR0_IMODE_PMI			1
+#define PMCR0_IMODE_AIC			2
+#define PMCR0_IMODE_HALT		3
+#define PMCR0_IMODE_FIQ			4
+#define PMCR0_IACT			BIT(11)
+
+/* IPI request registers */
+#define SYS_IMP_APL_IPI_RR_LOCAL_EL1	sys_reg(3, 5, 15, 0, 0)
+#define SYS_IMP_APL_IPI_RR_GLOBAL_EL1	sys_reg(3, 5, 15, 0, 1)
+#define IPI_RR_CPU			GENMASK(7, 0)
+/* Cluster only used for the GLOBAL register */
+#define IPI_RR_CLUSTER			GENMASK(23, 16)
+#define IPI_RR_TYPE			GENMASK(29, 28)
+#define IPI_RR_IMMEDIATE		0
+#define IPI_RR_RETRACT			1
+#define IPI_RR_DEFERRED			2
+#define IPI_RR_NOWAKE			3
+
+/* IPI status register */
+#define SYS_IMP_APL_IPI_SR_EL1		sys_reg(3, 5, 15, 1, 1)
+#define IPI_SR_PENDING			BIT(0)
+
+/* Guest timer FIQ enable register */
+#define SYS_IMP_APL_VM_TMR_FIQ_ENA_EL2	sys_reg(3, 5, 15, 1, 3)
+#define VM_TMR_FIQ_ENABLE_V		BIT(0)
+#define VM_TMR_FIQ_ENABLE_P		BIT(1)
+
+/* Deferred IPI countdown register */
+#define SYS_IMP_APL_IPI_CR_EL1		sys_reg(3, 5, 15, 3, 1)
+
+/* Uncore PMC control register */
+#define SYS_IMP_APL_UPMCR0_EL1		sys_reg(3, 7, 15, 0, 4)
+#define UPMCR0_IMODE			GENMASK(18, 16)
+#define UPMCR0_IMODE_OFF		0
+#define UPMCR0_IMODE_AIC		2
+#define UPMCR0_IMODE_HALT		3
+#define UPMCR0_IMODE_FIQ		4
+
+/* Uncore PMC status register */
+#define SYS_IMP_APL_UPMSR_EL1		sys_reg(3, 7, 15, 6, 4)
+#define UPMSR_IACT			BIT(0)
+
+#define AIC_NR_FIQ		4
+#define AIC_NR_SWIPI		32
+
+/*
+ * FIQ hwirq index definitions: FIQ sources use the DT binding defines
+ * directly, except that timers are special. At the irqchip level, the
+ * two timer types are represented by their access method: _EL0 registers
+ * or _EL02 registers. In the DT binding, the timers are represented
+ * by their purpose (HV or guest). This mapping is for when the kernel is
+ * running at EL2 (with VHE). When the kernel is running at EL1, the
+ * mapping differs and aic_irq_domain_translate() performs the remapping.
+ */
+
+#define AIC_TMR_EL0_PHYS	AIC_TMR_HV_PHYS
+#define AIC_TMR_EL0_VIRT	AIC_TMR_HV_VIRT
+#define AIC_TMR_EL02_PHYS	AIC_TMR_GUEST_PHYS
+#define AIC_TMR_EL02_VIRT	AIC_TMR_GUEST_VIRT
+
+struct aic_irq_chip {
+	void __iomem *base;
+	struct irq_domain *hw_domain;
+	struct irq_domain *ipi_domain;
+	int nr_hw;
+	int ipi_hwirq;
+};
+
+static DEFINE_PER_CPU(uint32_t, aic_fiq_unmasked);
+
+static DEFINE_PER_CPU(atomic_t, aic_vipi_flag);
+static DEFINE_PER_CPU(atomic_t, aic_vipi_enable);
+
+static struct aic_irq_chip *aic_irqc;
+
+static void aic_handle_ipi(struct pt_regs *regs);
+
+static u32 aic_ic_read(struct aic_irq_chip *ic, u32 reg)
+{
+	return readl_relaxed(ic->base + reg);
+}
+
+static void aic_ic_write(struct aic_irq_chip *ic, u32 reg, u32 val)
+{
+	writel_relaxed(val, ic->base + reg);
+}
+
+/*
+ * IRQ irqchip
+ */
+
+static void aic_irq_mask(struct irq_data *d)
+{
+	struct aic_irq_chip *ic = irq_data_get_irq_chip_data(d);
+
+	aic_ic_write(ic, AIC_MASK_SET + MASK_REG(irqd_to_hwirq(d)),
+		     MASK_BIT(irqd_to_hwirq(d)));
+}
+
+static void aic_irq_unmask(struct irq_data *d)
+{
+	struct aic_irq_chip *ic = irq_data_get_irq_chip_data(d);
+
+	aic_ic_write(ic, AIC_MASK_CLR + MASK_REG(d->hwirq),
+		     MASK_BIT(irqd_to_hwirq(d)));
+}
+
+static void aic_irq_eoi(struct irq_data *d)
+{
+	/*
+	 * Reading the interrupt reason automatically acknowledges and masks
+	 * the IRQ, so we just unmask it here if needed.
+	 */
+	if (!irqd_irq_disabled(d) && !irqd_irq_masked(d))
+		aic_irq_unmask(d);
+}
+
+static void __exception_irq_entry aic_handle_irq(struct pt_regs *regs)
+{
+	struct aic_irq_chip *ic = aic_irqc;
+	u32 event, type, irq;
+
+	do {
+		/*
+		 * We cannot use a relaxed read here, as reads from DMA buffers
+		 * need to be ordered after the IRQ fires.
+		 */
+		event = readl(ic->base + AIC_EVENT);
+		type = FIELD_GET(AIC_EVENT_TYPE, event);
+		irq = FIELD_GET(AIC_EVENT_NUM, event);
+
+		if (type == AIC_EVENT_TYPE_HW)
+			handle_domain_irq(aic_irqc->hw_domain, irq, regs);
+		else if (type == AIC_EVENT_TYPE_IPI && irq == 1)
+			aic_handle_ipi(regs);
+		else if (event != 0)
+			pr_err_ratelimited("Unknown IRQ event %d, %d\n", type, irq);
+	} while (event);
+
+	/*
+	 * vGIC maintenance interrupts end up here too, so we need to check
+	 * for them separately. This should never trigger if KVM is working
+	 * properly, because it will have already taken care of clearing it
+	 * on guest exit before this handler runs.
+	 */
+	if (is_kernel_in_hyp_mode() && (read_sysreg_s(SYS_ICH_HCR_EL2) & ICH_HCR_EN) &&
+		read_sysreg_s(SYS_ICH_MISR_EL2) != 0) {
+		pr_err_ratelimited("vGIC IRQ fired and not handled by KVM, disabling.\n");
+		sysreg_clear_set_s(SYS_ICH_HCR_EL2, ICH_HCR_EN, 0);
+	}
+}
+
+static int aic_irq_set_affinity(struct irq_data *d,
+				const struct cpumask *mask_val, bool force)
+{
+	irq_hw_number_t hwirq = irqd_to_hwirq(d);
+	struct aic_irq_chip *ic = irq_data_get_irq_chip_data(d);
+	int cpu;
+
+	if (force)
+		cpu = cpumask_first(mask_val);
+	else
+		cpu = cpumask_any_and(mask_val, cpu_online_mask);
+
+	aic_ic_write(ic, AIC_TARGET_CPU + hwirq * 4, BIT(cpu));
+	irq_data_update_effective_affinity(d, cpumask_of(cpu));
+
+	return IRQ_SET_MASK_OK;
+}
+
+static int aic_irq_set_type(struct irq_data *d, unsigned int type)
+{
+	/*
+	 * Some IRQs (e.g. MSIs) implicitly have edge semantics, and we don't
+	 * have a way to find out the type of any given IRQ, so just allow both.
+	 */
+	return (type == IRQ_TYPE_LEVEL_HIGH || type == IRQ_TYPE_EDGE_RISING) ? 0 : -EINVAL;
+}
+
+static struct irq_chip aic_chip = {
+	.name = "AIC",
+	.irq_mask = aic_irq_mask,
+	.irq_unmask = aic_irq_unmask,
+	.irq_eoi = aic_irq_eoi,
+	.irq_set_affinity = aic_irq_set_affinity,
+	.irq_set_type = aic_irq_set_type,
+};
+
+/*
+ * FIQ irqchip
+ */
+
+static unsigned long aic_fiq_get_idx(struct irq_data *d)
+{
+	struct aic_irq_chip *ic = irq_data_get_irq_chip_data(d);
+
+	return irqd_to_hwirq(d) - ic->nr_hw;
+}
+
+static void aic_fiq_set_mask(struct irq_data *d)
+{
+	/* Only the guest timers have real mask bits, unfortunately. */
+	switch (aic_fiq_get_idx(d)) {
+	case AIC_TMR_EL02_PHYS:
+		sysreg_clear_set_s(SYS_IMP_APL_VM_TMR_FIQ_ENA_EL2, VM_TMR_FIQ_ENABLE_P, 0);
+		isb();
+		break;
+	case AIC_TMR_EL02_VIRT:
+		sysreg_clear_set_s(SYS_IMP_APL_VM_TMR_FIQ_ENA_EL2, VM_TMR_FIQ_ENABLE_V, 0);
+		isb();
+		break;
+	default:
+		break;
+	}
+}
+
+static void aic_fiq_clear_mask(struct irq_data *d)
+{
+	switch (aic_fiq_get_idx(d)) {
+	case AIC_TMR_EL02_PHYS:
+		sysreg_clear_set_s(SYS_IMP_APL_VM_TMR_FIQ_ENA_EL2, 0, VM_TMR_FIQ_ENABLE_P);
+		isb();
+		break;
+	case AIC_TMR_EL02_VIRT:
+		sysreg_clear_set_s(SYS_IMP_APL_VM_TMR_FIQ_ENA_EL2, 0, VM_TMR_FIQ_ENABLE_V);
+		isb();
+		break;
+	default:
+		break;
+	}
+}
+
+static void aic_fiq_mask(struct irq_data *d)
+{
+	aic_fiq_set_mask(d);
+	__this_cpu_and(aic_fiq_unmasked, ~BIT(aic_fiq_get_idx(d)));
+}
+
+static void aic_fiq_unmask(struct irq_data *d)
+{
+	aic_fiq_clear_mask(d);
+	__this_cpu_or(aic_fiq_unmasked, BIT(aic_fiq_get_idx(d)));
+}
+
+static void aic_fiq_eoi(struct irq_data *d)
+{
+	/* We mask to ack (where we can), so we need to unmask at EOI. */
+	if (__this_cpu_read(aic_fiq_unmasked) & BIT(aic_fiq_get_idx(d)))
+		aic_fiq_clear_mask(d);
+}
+
+#define TIMER_FIRING(x)                                                        \
+	(((x) & (ARCH_TIMER_CTRL_ENABLE | ARCH_TIMER_CTRL_IT_MASK |            \
+		 ARCH_TIMER_CTRL_IT_STAT)) ==                                  \
+	 (ARCH_TIMER_CTRL_ENABLE | ARCH_TIMER_CTRL_IT_STAT))
+
+static void __exception_irq_entry aic_handle_fiq(struct pt_regs *regs)
+{
+	/*
+	 * It would be really nice if we had a system register that lets us get
+	 * the FIQ source state without having to peek down into sources...
+	 * but such a register does not seem to exist.
+	 *
+	 * So, we have these potential sources to test for:
+	 *  - Fast IPIs (not yet used)
+	 *  - The 4 timers (CNTP, CNTV for each of HV and guest)
+	 *  - Per-core PMCs (not yet supported)
+	 *  - Per-cluster uncore PMCs (not yet supported)
+	 *
+	 * Since not dealing with any of these results in a FIQ storm,
+	 * we check for everything here, even things we don't support yet.
+	 */
+
+	if (read_sysreg_s(SYS_IMP_APL_IPI_SR_EL1) & IPI_SR_PENDING) {
+		pr_err_ratelimited("Fast IPI fired. Acking.\n");
+		write_sysreg_s(IPI_SR_PENDING, SYS_IMP_APL_IPI_SR_EL1);
+	}
+
+	if (TIMER_FIRING(read_sysreg(cntp_ctl_el0)))
+		handle_domain_irq(aic_irqc->hw_domain,
+				  aic_irqc->nr_hw + AIC_TMR_EL0_PHYS, regs);
+
+	if (TIMER_FIRING(read_sysreg(cntv_ctl_el0)))
+		handle_domain_irq(aic_irqc->hw_domain,
+				  aic_irqc->nr_hw + AIC_TMR_EL0_VIRT, regs);
+
+	if (is_kernel_in_hyp_mode()) {
+		uint64_t enabled = read_sysreg_s(SYS_IMP_APL_VM_TMR_FIQ_ENA_EL2);
+
+		if ((enabled & VM_TMR_FIQ_ENABLE_P) &&
+		    TIMER_FIRING(read_sysreg_s(SYS_CNTP_CTL_EL02)))
+			handle_domain_irq(aic_irqc->hw_domain,
+					  aic_irqc->nr_hw + AIC_TMR_EL02_PHYS, regs);
+
+		if ((enabled & VM_TMR_FIQ_ENABLE_V) &&
+		    TIMER_FIRING(read_sysreg_s(SYS_CNTV_CTL_EL02)))
+			handle_domain_irq(aic_irqc->hw_domain,
+					  aic_irqc->nr_hw + AIC_TMR_EL02_VIRT, regs);
+	}
+
+	if ((read_sysreg_s(SYS_IMP_APL_PMCR0_EL1) & (PMCR0_IMODE | PMCR0_IACT)) ==
+			(FIELD_PREP(PMCR0_IMODE, PMCR0_IMODE_FIQ) | PMCR0_IACT)) {
+		/*
+		 * Not supported yet, let's figure out how to handle this when
+		 * we implement these proprietary performance counters. For now,
+		 * just mask it and move on.
+		 */
+		pr_err_ratelimited("PMC FIQ fired. Masking.\n");
+		sysreg_clear_set_s(SYS_IMP_APL_PMCR0_EL1, PMCR0_IMODE | PMCR0_IACT,
+				   FIELD_PREP(PMCR0_IMODE, PMCR0_IMODE_OFF));
+	}
+
+	if (FIELD_GET(UPMCR0_IMODE, read_sysreg_s(SYS_IMP_APL_UPMCR0_EL1)) == UPMCR0_IMODE_FIQ &&
+			(read_sysreg_s(SYS_IMP_APL_UPMSR_EL1) & UPMSR_IACT)) {
+		/* Same story with uncore PMCs */
+		pr_err_ratelimited("Uncore PMC FIQ fired. Masking.\n");
+		sysreg_clear_set_s(SYS_IMP_APL_UPMCR0_EL1, UPMCR0_IMODE,
+				   FIELD_PREP(UPMCR0_IMODE, UPMCR0_IMODE_OFF));
+	}
+}
+
+static int aic_fiq_set_type(struct irq_data *d, unsigned int type)
+{
+	return (type == IRQ_TYPE_LEVEL_HIGH) ? 0 : -EINVAL;
+}
+
+static struct irq_chip fiq_chip = {
+	.name = "AIC-FIQ",
+	.irq_mask = aic_fiq_mask,
+	.irq_unmask = aic_fiq_unmask,
+	.irq_ack = aic_fiq_set_mask,
+	.irq_eoi = aic_fiq_eoi,
+	.irq_set_type = aic_fiq_set_type,
+};
+
+/*
+ * Main IRQ domain
+ */
+
+static int aic_irq_domain_map(struct irq_domain *id, unsigned int irq,
+			      irq_hw_number_t hw)
+{
+	struct aic_irq_chip *ic = id->host_data;
+
+	if (hw < ic->nr_hw) {
+		irq_domain_set_info(id, irq, hw, &aic_chip, id->host_data,
+				    handle_fasteoi_irq, NULL, NULL);
+		irqd_set_single_target(irq_desc_get_irq_data(irq_to_desc(irq)));
+	} else {
+		irq_set_percpu_devid(irq);
+		irq_domain_set_info(id, irq, hw, &fiq_chip, id->host_data,
+				    handle_percpu_devid_irq, NULL, NULL);
+	}
+
+	return 0;
+}
+
+static int aic_irq_domain_translate(struct irq_domain *id,
+				    struct irq_fwspec *fwspec,
+				    unsigned long *hwirq,
+				    unsigned int *type)
+{
+	struct aic_irq_chip *ic = id->host_data;
+
+	if (fwspec->param_count != 3 || !is_of_node(fwspec->fwnode))
+		return -EINVAL;
+
+	switch (fwspec->param[0]) {
+	case AIC_IRQ:
+		if (fwspec->param[1] >= ic->nr_hw)
+			return -EINVAL;
+		*hwirq = fwspec->param[1];
+		break;
+	case AIC_FIQ:
+		if (fwspec->param[1] >= AIC_NR_FIQ)
+			return -EINVAL;
+		*hwirq = ic->nr_hw + fwspec->param[1];
+
+		/*
+		 * In EL1 the non-redirected registers are the guest's,
+		 * not EL2's, so remap the hwirqs to match.
+		 */
+		if (!is_kernel_in_hyp_mode()) {
+			switch (fwspec->param[1]) {
+			case AIC_TMR_GUEST_PHYS:
+				*hwirq = ic->nr_hw + AIC_TMR_EL0_PHYS;
+				break;
+			case AIC_TMR_GUEST_VIRT:
+				*hwirq = ic->nr_hw + AIC_TMR_EL0_VIRT;
+				break;
+			case AIC_TMR_HV_PHYS:
+			case AIC_TMR_HV_VIRT:
+				return -ENOENT;
+			default:
+				break;
+			}
+		}
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	*type = fwspec->param[2] & IRQ_TYPE_SENSE_MASK;
+
+	return 0;
+}
+
+static int aic_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
+				unsigned int nr_irqs, void *arg)
+{
+	unsigned int type = IRQ_TYPE_NONE;
+	struct irq_fwspec *fwspec = arg;
+	irq_hw_number_t hwirq;
+	int i, ret;
+
+	ret = aic_irq_domain_translate(domain, fwspec, &hwirq, &type);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < nr_irqs; i++) {
+		ret = aic_irq_domain_map(domain, virq + i, hwirq + i);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static void aic_irq_domain_free(struct irq_domain *domain, unsigned int virq,
+				unsigned int nr_irqs)
+{
+	int i;
+
+	for (i = 0; i < nr_irqs; i++) {
+		struct irq_data *d = irq_domain_get_irq_data(domain, virq + i);
+
+		irq_set_handler(virq + i, NULL);
+		irq_domain_reset_irq_data(d);
+	}
+}
+
+static const struct irq_domain_ops aic_irq_domain_ops = {
+	.translate	= aic_irq_domain_translate,
+	.alloc		= aic_irq_domain_alloc,
+	.free		= aic_irq_domain_free,
+};
+
+/*
+ * IPI irqchip
+ */
+
+static void aic_ipi_mask(struct irq_data *d)
+{
+	u32 irq_bit = BIT(irqd_to_hwirq(d));
+
+	/* No specific ordering requirements needed here. */
+	atomic_andnot(irq_bit, this_cpu_ptr(&aic_vipi_enable));
+}
+
+static void aic_ipi_unmask(struct irq_data *d)
+{
+	struct aic_irq_chip *ic = irq_data_get_irq_chip_data(d);
+	u32 irq_bit = BIT(irqd_to_hwirq(d));
+
+	atomic_or(irq_bit, this_cpu_ptr(&aic_vipi_enable));
+
+	/*
+	 * The atomic_or() above must complete before the atomic_read()
+	 * below to avoid racing aic_ipi_send_mask().
+	 */
+	smp_mb__after_atomic();
+
+	/*
+	 * If a pending vIPI was unmasked, raise a HW IPI to ourselves.
+	 * No barriers needed here since this is a self-IPI.
+	 */
+	if (atomic_read(this_cpu_ptr(&aic_vipi_flag)) & irq_bit)
+		aic_ic_write(ic, AIC_IPI_SEND, AIC_IPI_SEND_CPU(smp_processor_id()));
+}
+
+static void aic_ipi_send_mask(struct irq_data *d, const struct cpumask *mask)
+{
+	struct aic_irq_chip *ic = irq_data_get_irq_chip_data(d);
+	u32 irq_bit = BIT(irqd_to_hwirq(d));
+	u32 send = 0;
+	int cpu;
+	unsigned long pending;
+
+	for_each_cpu(cpu, mask) {
+		/*
+		 * This sequence is the mirror of the one in aic_ipi_unmask();
+		 * see the comment there. Additionally, release semantics
+		 * ensure that the vIPI flag set is ordered after any shared
+		 * memory accesses that precede it. This therefore also pairs
+		 * with the atomic_fetch_andnot in aic_handle_ipi().
+		 */
+		pending = atomic_fetch_or_release(irq_bit, per_cpu_ptr(&aic_vipi_flag, cpu));
+
+		/*
+		 * The atomic_fetch_or_release() above must complete before the
+		 * atomic_read() below to avoid racing aic_ipi_unmask().
+		 */
+		smp_mb__after_atomic();
+
+		if (!(pending & irq_bit) &&
+		    (atomic_read(per_cpu_ptr(&aic_vipi_enable, cpu)) & irq_bit))
+			send |= AIC_IPI_SEND_CPU(cpu);
+	}
+
+	/*
+	 * The flag writes must complete before the physical IPI is issued
+	 * to another CPU. This is implied by the control dependency on
+	 * the result of atomic_read() above, which is itself
+	 * already ordered after the vIPI flag write.
+	 */
+	if (send)
+		aic_ic_write(ic, AIC_IPI_SEND, send);
+}
+
+static struct irq_chip ipi_chip = {
+	.name = "AIC-IPI",
+	.irq_mask = aic_ipi_mask,
+	.irq_unmask = aic_ipi_unmask,
+	.ipi_send_mask = aic_ipi_send_mask,
+};
+
+/*
+ * IPI IRQ domain
+ */
+
+static void aic_handle_ipi(struct pt_regs *regs)
+{
+	int i;
+	unsigned long enabled, firing;
+
+	/*
+	 * Ack the IPI. We need to order this after the AIC event read, but
+	 * that is enforced by normal MMIO ordering guarantees.
+	 */
+	aic_ic_write(aic_irqc, AIC_IPI_ACK, AIC_IPI_OTHER);
+
+	/*
+	 * The mask read does not need to be ordered. Only we can change
+	 * our own mask anyway, so no races are possible here, as long as
+	 * we are properly in the interrupt handler (which is covered by
+	 * the barrier that is part of the top-level AIC handler's readl()).
+	 */
+	enabled = atomic_read(this_cpu_ptr(&aic_vipi_enable));
+
+	/*
+	 * Clear the IPIs we are about to handle. This pairs with the
+	 * atomic_fetch_or_release() in aic_ipi_send_mask(), and needs to be
+	 * ordered after the aic_ic_write() above (to avoid dropping vIPIs) and
+	 * before IPI handling code (to avoid races handling vIPIs before they
+	 * are signaled). The former is taken care of by the release semantics
+	 * of the write portion, while the latter is taken care of by the
+	 * acquire semantics of the read portion.
+	 */
+	firing = atomic_fetch_andnot(enabled, this_cpu_ptr(&aic_vipi_flag)) & enabled;
+
+	for_each_set_bit(i, &firing, AIC_NR_SWIPI)
+		handle_domain_irq(aic_irqc->ipi_domain, i, regs);
+
+	/*
+	 * No ordering needed here; at worst this just changes the timing of
+	 * when the next IPI will be delivered.
+	 */
+	aic_ic_write(aic_irqc, AIC_IPI_MASK_CLR, AIC_IPI_OTHER);
+}
+
+static int aic_ipi_alloc(struct irq_domain *d, unsigned int virq,
+			 unsigned int nr_irqs, void *args)
+{
+	int i;
+
+	for (i = 0; i < nr_irqs; i++) {
+		irq_set_percpu_devid(virq + i);
+		irq_domain_set_info(d, virq + i, i, &ipi_chip, d->host_data,
+				    handle_percpu_devid_irq, NULL, NULL);
+	}
+
+	return 0;
+}
+
+static void aic_ipi_free(struct irq_domain *d, unsigned int virq, unsigned int nr_irqs)
+{
+	/* Not freeing IPIs */
+}
+
+static const struct irq_domain_ops aic_ipi_domain_ops = {
+	.alloc = aic_ipi_alloc,
+	.free = aic_ipi_free,
+};
+
+static int aic_init_smp(struct aic_irq_chip *irqc, struct device_node *node)
+{
+	struct irq_domain *ipi_domain;
+	int base_ipi;
+
+	ipi_domain = irq_domain_create_linear(irqc->hw_domain->fwnode, AIC_NR_SWIPI,
+					      &aic_ipi_domain_ops, irqc);
+	if (WARN_ON(!ipi_domain))
+		return -ENODEV;
+
+	ipi_domain->flags |= IRQ_DOMAIN_FLAG_IPI_SINGLE;
+	irq_domain_update_bus_token(ipi_domain, DOMAIN_BUS_IPI);
+
+	base_ipi = __irq_domain_alloc_irqs(ipi_domain, -1, AIC_NR_SWIPI,
+					   NUMA_NO_NODE, NULL, false, NULL);
+
+	if (WARN_ON(!base_ipi)) {
+		irq_domain_remove(ipi_domain);
+		return -ENODEV;
+	}
+
+	set_smp_ipi_range(base_ipi, AIC_NR_SWIPI);
+
+	irqc->ipi_domain = ipi_domain;
+
+	return 0;
+}
+
+static int aic_init_cpu(unsigned int cpu)
+{
+	/* Mask all hard-wired per-CPU IRQ/FIQ sources */
+
+	/* Pending Fast IPI FIQs */
+	write_sysreg_s(IPI_SR_PENDING, SYS_IMP_APL_IPI_SR_EL1);
+
+	/* Timer FIQs */
+	sysreg_clear_set(cntp_ctl_el0, 0, ARCH_TIMER_CTRL_IT_MASK);
+	sysreg_clear_set(cntv_ctl_el0, 0, ARCH_TIMER_CTRL_IT_MASK);
+
+	/* EL2-only (VHE mode) IRQ sources */
+	if (is_kernel_in_hyp_mode()) {
+		/* Guest timers */
+		sysreg_clear_set_s(SYS_IMP_APL_VM_TMR_FIQ_ENA_EL2,
+				   VM_TMR_FIQ_ENABLE_V | VM_TMR_FIQ_ENABLE_P, 0);
+
+		/* vGIC maintenance IRQ */
+		sysreg_clear_set_s(SYS_ICH_HCR_EL2, ICH_HCR_EN, 0);
+	}
+
+	/* PMC FIQ */
+	sysreg_clear_set_s(SYS_IMP_APL_PMCR0_EL1, PMCR0_IMODE | PMCR0_IACT,
+			   FIELD_PREP(PMCR0_IMODE, PMCR0_IMODE_OFF));
+
+	/* Uncore PMC FIQ */
+	sysreg_clear_set_s(SYS_IMP_APL_UPMCR0_EL1, UPMCR0_IMODE,
+			   FIELD_PREP(UPMCR0_IMODE, UPMCR0_IMODE_OFF));
+
+	/* Commit all of the above */
+	isb();
+
+	/*
+	 * Make sure the kernel's idea of logical CPU order is the same as AIC's.
+	 * If we ever end up with a mismatch here, we will have to introduce
+	 * a mapping table similar to what other irqchip drivers do.
+	 */
+	WARN_ON(aic_ic_read(aic_irqc, AIC_WHOAMI) != smp_processor_id());
+
+	/*
+	 * Always keep IPIs unmasked at the hardware level (except auto-masking
+	 * by AIC during processing). We manage masks at the vIPI level.
+	 */
+	aic_ic_write(aic_irqc, AIC_IPI_ACK, AIC_IPI_SELF | AIC_IPI_OTHER);
+	aic_ic_write(aic_irqc, AIC_IPI_MASK_SET, AIC_IPI_SELF);
+	aic_ic_write(aic_irqc, AIC_IPI_MASK_CLR, AIC_IPI_OTHER);
+
+	/* Initialize the local mask state */
+	__this_cpu_write(aic_fiq_unmasked, 0);
+
+	return 0;
+}
+
+static int __init aic_of_ic_init(struct device_node *node, struct device_node *parent)
+{
+	int i;
+	void __iomem *regs;
+	u32 info;
+	struct aic_irq_chip *irqc;
+
+	regs = of_iomap(node, 0);
+	if (WARN_ON(!regs))
+		return -EIO;
+
+	irqc = kzalloc(sizeof(*irqc), GFP_KERNEL);
+	if (!irqc)
+		return -ENOMEM;
+
+	aic_irqc = irqc;
+	irqc->base = regs;
+
+	info = aic_ic_read(irqc, AIC_INFO);
+	irqc->nr_hw = FIELD_GET(AIC_INFO_NR_HW, info);
+
+	irqc->hw_domain = irq_domain_create_linear(of_node_to_fwnode(node),
+						   irqc->nr_hw + AIC_NR_FIQ,
+						   &aic_irq_domain_ops, irqc);
+	if (WARN_ON(!irqc->hw_domain)) {
+		iounmap(irqc->base);
+		kfree(irqc);
+		return -ENODEV;
+	}
+
+	irq_domain_update_bus_token(irqc->hw_domain, DOMAIN_BUS_WIRED);
+
+	if (aic_init_smp(irqc, node)) {
+		irq_domain_remove(irqc->hw_domain);
+		iounmap(irqc->base);
+		kfree(irqc);
+		return -ENODEV;
+	}
+
+	set_handle_irq(aic_handle_irq);
+	set_handle_fiq(aic_handle_fiq);
+
+	for (i = 0; i < BITS_TO_U32(irqc->nr_hw); i++)
+		aic_ic_write(irqc, AIC_MASK_SET + i * 4, U32_MAX);
+	for (i = 0; i < BITS_TO_U32(irqc->nr_hw); i++)
+		aic_ic_write(irqc, AIC_SW_CLR + i * 4, U32_MAX);
+	for (i = 0; i < irqc->nr_hw; i++)
+		aic_ic_write(irqc, AIC_TARGET_CPU + i * 4, 1);
+
+	if (!is_kernel_in_hyp_mode())
+		pr_info("Kernel running in EL1, mapping interrupts");
+
+	cpuhp_setup_state(CPUHP_AP_IRQ_APPLE_AIC_STARTING,
+			  "irqchip/apple-aic/ipi:starting",
+			  aic_init_cpu, NULL);
+
+	pr_info("Initialized with %d IRQs, %d FIQs, %d vIPIs\n",
+		irqc->nr_hw, AIC_NR_FIQ, AIC_NR_SWIPI);
+
+	return 0;
+}
+
+IRQCHIP_DECLARE(apple_m1_aic, "apple,aic", aic_of_ic_init);
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index f14adb882338..f56eee992c75 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -100,6 +100,7 @@ enum cpuhp_state {
 	CPUHP_AP_CPU_PM_STARTING,
 	CPUHP_AP_IRQ_GIC_STARTING,
 	CPUHP_AP_IRQ_HIP04_STARTING,
+	CPUHP_AP_IRQ_APPLE_AIC_STARTING,
 	CPUHP_AP_IRQ_ARMADA_XP_STARTING,
 	CPUHP_AP_IRQ_BCM2836_STARTING,
 	CPUHP_AP_IRQ_MIPS_GIC_STARTING,
-- 
cgit v1.2.3


From 0d66ccc1627013c95f1e7ef10b95b8451cd7834e Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 1 Apr 2021 16:23:42 -0700
Subject: jump_label: Provide CONFIG-driven build state defaults

As shown in the comment in jump_label.h, choosing the initial state of
static branches changes the assembly layout. If the condition is expected
to be likely it's inline, and if unlikely it is out of line via a jump.

A few places in the kernel use (or could be using) a CONFIG to choose the
default state, which would give a small performance benefit to their
compile-time declared default. Provide the infrastructure to do this.

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20210401232347.2791257-2-keescook@chromium.org
---
 include/linux/jump_label.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index d92691262f51..05f5554d860f 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -382,6 +382,21 @@ struct static_key_false {
 		[0 ... (count) - 1] = STATIC_KEY_FALSE_INIT,	\
 	}
 
+#define _DEFINE_STATIC_KEY_1(name)	DEFINE_STATIC_KEY_TRUE(name)
+#define _DEFINE_STATIC_KEY_0(name)	DEFINE_STATIC_KEY_FALSE(name)
+#define DEFINE_STATIC_KEY_MAYBE(cfg, name)			\
+	__PASTE(_DEFINE_STATIC_KEY_, IS_ENABLED(cfg))(name)
+
+#define _DEFINE_STATIC_KEY_RO_1(name)	DEFINE_STATIC_KEY_TRUE_RO(name)
+#define _DEFINE_STATIC_KEY_RO_0(name)	DEFINE_STATIC_KEY_FALSE_RO(name)
+#define DEFINE_STATIC_KEY_MAYBE_RO(cfg, name)			\
+	__PASTE(_DEFINE_STATIC_KEY_RO_, IS_ENABLED(cfg))(name)
+
+#define _DECLARE_STATIC_KEY_1(name)	DECLARE_STATIC_KEY_TRUE(name)
+#define _DECLARE_STATIC_KEY_0(name)	DECLARE_STATIC_KEY_FALSE(name)
+#define DECLARE_STATIC_KEY_MAYBE(cfg, name)			\
+	__PASTE(_DECLARE_STATIC_KEY_, IS_ENABLED(cfg))(name)
+
 extern bool ____wrong_branch_error(void);
 
 #define static_key_enabled(x)							\
@@ -482,6 +497,10 @@ extern bool ____wrong_branch_error(void);
 
 #endif /* CONFIG_JUMP_LABEL */
 
+#define static_branch_maybe(config, x)					\
+	(IS_ENABLED(config) ? static_branch_likely(x)			\
+			    : static_branch_unlikely(x))
+
 /*
  * Advanced usage; refcount, branch is enabled when: count != 0
  */
-- 
cgit v1.2.3


From 51cba1ebc60df9c4ce034a9f5441169c0d0956c0 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 1 Apr 2021 16:23:43 -0700
Subject: init_on_alloc: Optimize static branches

The state of CONFIG_INIT_ON_ALLOC_DEFAULT_ON (and ...ON_FREE...) did not
change the assembly ordering of the static branches: they were always out
of line. Use the new jump_label macros to check the CONFIG settings to
default to the "expected" state, which slightly optimizes the resulting
assembly code.

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Alexander Potapenko <glider@google.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Link: https://lore.kernel.org/r/20210401232347.2791257-3-keescook@chromium.org
---
 include/linux/mm.h | 10 ++++++----
 mm/page_alloc.c    |  4 ++--
 mm/slab.h          |  6 ++++--
 3 files changed, 12 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8ba434287387..616dcaf08d99 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2904,18 +2904,20 @@ static inline void kernel_poison_pages(struct page *page, int numpages) { }
 static inline void kernel_unpoison_pages(struct page *page, int numpages) { }
 #endif
 
-DECLARE_STATIC_KEY_FALSE(init_on_alloc);
+DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
 static inline bool want_init_on_alloc(gfp_t flags)
 {
-	if (static_branch_unlikely(&init_on_alloc))
+	if (static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON,
+				&init_on_alloc))
 		return true;
 	return flags & __GFP_ZERO;
 }
 
-DECLARE_STATIC_KEY_FALSE(init_on_free);
+DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
 static inline bool want_init_on_free(void)
 {
-	return static_branch_unlikely(&init_on_free);
+	return static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON,
+				   &init_on_free);
 }
 
 extern bool _debug_pagealloc_enabled_early;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cfc72873961d..e2f19bf948db 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -167,10 +167,10 @@ unsigned long totalcma_pages __read_mostly;
 
 int percpu_pagelist_fraction;
 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
-DEFINE_STATIC_KEY_FALSE(init_on_alloc);
+DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
 EXPORT_SYMBOL(init_on_alloc);
 
-DEFINE_STATIC_KEY_FALSE(init_on_free);
+DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
 EXPORT_SYMBOL(init_on_free);
 
 static bool _init_on_alloc_enabled_early __read_mostly
diff --git a/mm/slab.h b/mm/slab.h
index 076582f58f68..774c7221efdc 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -601,7 +601,8 @@ static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { }
 
 static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c)
 {
-	if (static_branch_unlikely(&init_on_alloc)) {
+	if (static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON,
+				&init_on_alloc)) {
 		if (c->ctor)
 			return false;
 		if (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON))
@@ -613,7 +614,8 @@ static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c)
 
 static inline bool slab_want_init_on_free(struct kmem_cache *c)
 {
-	if (static_branch_unlikely(&init_on_free))
+	if (static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON,
+				&init_on_free))
 		return !(c->ctor ||
 			 (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)));
 	return false;
-- 
cgit v1.2.3


From 39218ff4c625dbf2e68224024fe0acaa60bcd51a Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 1 Apr 2021 16:23:44 -0700
Subject: stack: Optionally randomize kernel stack offset each syscall

This provides the ability for architectures to enable kernel stack base
address offset randomization. This feature is controlled by the boot
param "randomize_kstack_offset=on/off", with its default value set by
CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT.

This feature is based on the original idea from the last public release
of PaX's RANDKSTACK feature: https://pax.grsecurity.net/docs/randkstack.txt
All the credit for the original idea goes to the PaX team. Note that
the design and implementation of this upstream randomize_kstack_offset
feature differs greatly from the RANDKSTACK feature (see below).

Reasoning for the feature:

This feature aims to make the various stack-based attacks that rely on
a deterministic stack structure harder. We have had many such attacks in
the past (just to name a few):

https://jon.oberheide.org/files/infiltrate12-thestackisback.pdf
https://jon.oberheide.org/files/stackjacking-infiltrate11.pdf
https://googleprojectzero.blogspot.com/2016/06/exploiting-recursion-in-linux-kernel_20.html

As Linux kernel stack protections have been constantly improving
(vmap-based stack allocation with guard pages, removal of thread_info,
STACKLEAK), attackers have had to find new ways for their exploits
to work. They have done so, continuing to rely on the kernel's stack
determinism, in situations where VMAP_STACK and THREAD_INFO_IN_TASK_STRUCT
were not relevant. For example, the following recent attacks would have
been hampered if the stack offset was non-deterministic between syscalls:

https://repositorio-aberto.up.pt/bitstream/10216/125357/2/374717.pdf
(page 70: targeting the pt_regs copy with linear stack overflow)

https://a13xp0p0v.github.io/2020/02/15/CVE-2019-18683.html
(leaked stack address from one syscall as a target during next syscall)

The main idea is that since the stack offset is randomized on each system
call, it is harder for an attack to reliably land in any particular place
on the thread stack, even with address exposures, as the stack base will
change on the next syscall. Also, since randomization is performed after
placing pt_regs, the ptrace-based approach[1] to discover the randomized
offset during a long-running syscall should not be possible.

Design description:

During most of the kernel's execution, it runs on the "thread stack",
which is pretty deterministic in its structure: it is fixed in size,
and on every entry from userspace to kernel on a syscall the thread
stack starts construction from an address fetched from the per-cpu
cpu_current_top_of_stack variable. The first element to be pushed to the
thread stack is the pt_regs struct that stores all required CPU registers
and syscall parameters. Finally the specific syscall function is called,
with the stack being used as the kernel executes the resulting request.

The goal of randomize_kstack_offset feature is to add a random offset
after the pt_regs has been pushed to the stack and before the rest of the
thread stack is used during the syscall processing, and to change it every
time a process issues a syscall. The source of randomness is currently
architecture-defined (but x86 is using the low byte of rdtsc()). Future
improvements for different entropy sources are possible, but out of scope
for this patch. Furthermore, to add more unpredictability, new offsets
are chosen at the end of syscalls (the timing of which should be less
easy to measure from userspace than at syscall entry time), and stored
in a per-CPU variable, so that the life of the value does not stay
explicitly tied to a single task.

As suggested by Andy Lutomirski, the offset is added using alloca()
and an empty asm() statement with an output constraint, since it avoids
changes to assembly syscall entry code, to the unwinder, and provides
correct stack alignment as defined by the compiler.

In order to make this available by default with zero performance impact
for those that don't want it, it is boot-time selectable with static
branches. This way, if the overhead is not wanted, it can just be
left turned off with no performance impact.

The generated assembly for x86_64 with GCC looks like this:

...
ffffffff81003977: 65 8b 05 02 ea 00 7f  mov %gs:0x7f00ea02(%rip),%eax
					    # 12380 <kstack_offset>
ffffffff8100397e: 25 ff 03 00 00        and $0x3ff,%eax
ffffffff81003983: 48 83 c0 0f           add $0xf,%rax
ffffffff81003987: 25 f8 07 00 00        and $0x7f8,%eax
ffffffff8100398c: 48 29 c4              sub %rax,%rsp
ffffffff8100398f: 48 8d 44 24 0f        lea 0xf(%rsp),%rax
ffffffff81003994: 48 83 e0 f0           and $0xfffffffffffffff0,%rax
...

As a result of the above stack alignment, this patch introduces about
5 bits of randomness after pt_regs is spilled to the thread stack on
x86_64, and 6 bits on x86_32 (since it has 1 fewer bit required for
stack alignment). The amount of entropy could be adjusted based on how
much of the stack space we wish to trade for security.

My measure of syscall performance overhead (on x86_64):

lmbench: /usr/lib/lmbench/bin/x86_64-linux-gnu/lat_syscall -N 10000 null
    randomize_kstack_offset=y	Simple syscall: 0.7082 microseconds
    randomize_kstack_offset=n	Simple syscall: 0.7016 microseconds

So, roughly 0.9% overhead growth for a no-op syscall, which is very
manageable. And for people that don't want this, it's off by default.

There are two gotchas with using the alloca() trick. First,
compilers that have Stack Clash protection (-fstack-clash-protection)
enabled by default (e.g. Ubuntu[3]) add pagesize stack probes to
any dynamic stack allocations. While the randomization offset is
always less than a page, the resulting assembly would still contain
(unreachable!) probing routines, bloating the resulting assembly. To
avoid this, -fno-stack-clash-protection is unconditionally added to
the kernel Makefile since this is the only dynamic stack allocation in
the kernel (now that VLAs have been removed) and it is provably safe
from Stack Clash style attacks.

The second gotcha with alloca() is a negative interaction with
-fstack-protector*, in that it sees the alloca() as an array allocation,
which triggers the unconditional addition of the stack canary function
pre/post-amble which slows down syscalls regardless of the static
branch. In order to avoid adding this unneeded check and its associated
performance impact, architectures need to carefully remove uses of
-fstack-protector-strong (or -fstack-protector) in the compilation units
that use the add_random_kstack_offset() macro and to audit the resulting stack
mitigation coverage (to make sure no desired coverage disappears). No
change is visible for this on x86 because the stack protector is already
unconditionally disabled for the compilation unit, but the change is
required on arm64. There is, unfortunately, no attribute that can be
used to disable stack protector for specific functions.

Comparison to PaX RANDKSTACK feature:

The RANDKSTACK feature randomizes the location of the stack start
(cpu_current_top_of_stack), i.e. including the location of pt_regs
structure itself on the stack. Initially this patch followed the same
approach, but during the recent discussions[2], it has been determined
to be of little value since, if ptrace functionality is available for
an attacker, they can use PTRACE_PEEKUSR/PTRACE_POKEUSR to read/write
different offsets in the pt_regs struct, observe the cache behavior of
the pt_regs accesses, and figure out the random stack offset. Another
difference is that the random offset is stored in a per-cpu variable,
rather than having it be per-thread. As a result, these implementations
differ a fair bit in their implementation details and results, though
obviously the intent is similar.

[1] https://lore.kernel.org/kernel-hardening/2236FBA76BA1254E88B949DDB74E612BA4BC57C1@IRSMSX102.ger.corp.intel.com/
[2] https://lore.kernel.org/kernel-hardening/20190329081358.30497-1-elena.reshetova@intel.com/
[3] https://lists.ubuntu.com/archives/ubuntu-devel/2019-June/040741.html

Co-developed-by: Elena Reshetova <elena.reshetova@intel.com>
Signed-off-by: Elena Reshetova <elena.reshetova@intel.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20210401232347.2791257-4-keescook@chromium.org
---
 Documentation/admin-guide/kernel-parameters.txt | 11 +++++
 Makefile                                        |  4 ++
 arch/Kconfig                                    | 23 +++++++++++
 include/linux/randomize_kstack.h                | 54 +++++++++++++++++++++++++
 init/main.c                                     | 23 +++++++++++
 5 files changed, 115 insertions(+)
 create mode 100644 include/linux/randomize_kstack.h

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 04545725f187..bee8644a192e 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4061,6 +4061,17 @@
 			fully seed the kernel's CRNG. Default is controlled
 			by CONFIG_RANDOM_TRUST_CPU.
 
+	randomize_kstack_offset=
+			[KNL] Enable or disable kernel stack offset
+			randomization, which provides roughly 5 bits of
+			entropy, frustrating memory corruption attacks
+			that depend on stack address determinism or
+			cross-syscall address exposures. This is only
+			available on architectures that have defined
+			CONFIG_HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET.
+			Format: <bool>  (1/Y/y=enable, 0/N/n=disable)
+			Default is CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT.
+
 	ras=option[,option,...]	[KNL] RAS-specific options
 
 		cec_disable	[X86]
diff --git a/Makefile b/Makefile
index cc77fd45ca64..d3bf50326d69 100644
--- a/Makefile
+++ b/Makefile
@@ -813,6 +813,10 @@ KBUILD_CFLAGS	+= -ftrivial-auto-var-init=zero
 KBUILD_CFLAGS	+= -enable-trivial-auto-var-init-zero-knowing-it-will-be-removed-from-clang
 endif
 
+# While VLAs have been removed, GCC produces unreachable stack probes
+# for the randomize_kstack_offset feature. Disable it for all compilers.
+KBUILD_CFLAGS	+= $(call cc-option, -fno-stack-clash-protection)
+
 DEBUG_CFLAGS	:=
 
 # Workaround for GCC versions < 5.0
diff --git a/arch/Kconfig b/arch/Kconfig
index ecfd3520b676..6b11c825fc36 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1054,6 +1054,29 @@ config VMAP_STACK
 	  backing virtual mappings with real shadow memory, and KASAN_VMALLOC
 	  must be enabled.
 
+config HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
+	def_bool n
+	help
+	  An arch should select this symbol if it can support kernel stack
+	  offset randomization with calls to add_random_kstack_offset()
+	  during syscall entry and choose_random_kstack_offset() during
+	  syscall exit. Careful removal of -fstack-protector-strong and
+	  -fstack-protector should also be applied to the entry code and
+	  closely examined, as the artificial stack bump looks like an array
+	  to the compiler, so it will attempt to add canary checks regardless
+	  of the static branch state.
+
+config RANDOMIZE_KSTACK_OFFSET_DEFAULT
+	bool "Randomize kernel stack offset on syscall entry"
+	depends on HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
+	help
+	  The kernel stack offset can be randomized (after pt_regs) by
+	  roughly 5 bits of entropy, frustrating memory corruption
+	  attacks that depend on stack address determinism or
+	  cross-syscall address exposures. This feature is controlled
+	  by kernel boot param "randomize_kstack_offset=on/off", and this
+	  config chooses the default boot state.
+
 config ARCH_OPTIONAL_KERNEL_RWX
 	def_bool n
 
diff --git a/include/linux/randomize_kstack.h b/include/linux/randomize_kstack.h
new file mode 100644
index 000000000000..fd80fab663a9
--- /dev/null
+++ b/include/linux/randomize_kstack.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _LINUX_RANDOMIZE_KSTACK_H
+#define _LINUX_RANDOMIZE_KSTACK_H
+
+#include <linux/kernel.h>
+#include <linux/jump_label.h>
+#include <linux/percpu-defs.h>
+
+DECLARE_STATIC_KEY_MAYBE(CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT,
+			 randomize_kstack_offset);
+DECLARE_PER_CPU(u32, kstack_offset);
+
+/*
+ * Do not use this anywhere else in the kernel. This is used here because
+ * it provides an arch-agnostic way to grow the stack with correct
+ * alignment. Also, since this use is being explicitly masked to a max of
+ * 10 bits, stack-clash style attacks are unlikely. For more details see
+ * "VLAs" in Documentation/process/deprecated.rst
+ */
+void *__builtin_alloca(size_t size);
+/*
+ * Use, at most, 10 bits of entropy. We explicitly cap this to keep the
+ * "VLA" from being unbounded (see above). 10 bits leaves enough room for
+ * per-arch offset masks to reduce entropy (by removing higher bits, since
+ * high entropy may overly constrain usable stack space), and for
+ * compiler/arch-specific stack alignment to remove the lower bits.
+ */
+#define KSTACK_OFFSET_MAX(x)	((x) & 0x3FF)
+
+/*
+ * These macros must be used during syscall entry when interrupts and
+ * preempt are disabled, and after user registers have been stored to
+ * the stack.
+ */
+#define add_random_kstack_offset() do {					\
+	if (static_branch_maybe(CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT,	\
+				&randomize_kstack_offset)) {		\
+		u32 offset = raw_cpu_read(kstack_offset);		\
+		u8 *ptr = __builtin_alloca(KSTACK_OFFSET_MAX(offset));	\
+		/* Keep allocation even after "ptr" loses scope. */	\
+		asm volatile("" : "=o"(*ptr) :: "memory");		\
+	}								\
+} while (0)
+
+#define choose_random_kstack_offset(rand) do {				\
+	if (static_branch_maybe(CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT,	\
+				&randomize_kstack_offset)) {		\
+		u32 offset = raw_cpu_read(kstack_offset);		\
+		offset ^= (rand);					\
+		raw_cpu_write(kstack_offset, offset);			\
+	}								\
+} while (0)
+
+#endif
diff --git a/init/main.c b/init/main.c
index 53b278845b88..f498aac26e8c 100644
--- a/init/main.c
+++ b/init/main.c
@@ -844,6 +844,29 @@ static void __init mm_init(void)
 	pti_init();
 }
 
+#ifdef CONFIG_HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
+DEFINE_STATIC_KEY_MAYBE_RO(CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT,
+			   randomize_kstack_offset);
+DEFINE_PER_CPU(u32, kstack_offset);
+
+static int __init early_randomize_kstack_offset(char *buf)
+{
+	int ret;
+	bool bool_result;
+
+	ret = kstrtobool(buf, &bool_result);
+	if (ret)
+		return ret;
+
+	if (bool_result)
+		static_branch_enable(&randomize_kstack_offset);
+	else
+		static_branch_disable(&randomize_kstack_offset);
+	return 0;
+}
+early_param("randomize_kstack_offset", early_randomize_kstack_offset);
+#endif
+
 void __init __weak arch_call_rest_init(void)
 {
 	rest_init();
-- 
cgit v1.2.3


From 25de4ce5ed02994aea8bc111d133308f6fd62566 Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Tue, 23 Mar 2021 09:43:26 +0200
Subject: clocksource/drivers/timer-ti-dm: Handle dra7 timer wrap errata i940

There is a timer wrap issue on dra7 for the ARM architected timer.
In a typical clock configuration the timer fails to wrap after 388 days.

To work around the issue, we need to use timer-ti-dm percpu timers instead.

Let's configure dmtimer3 and 4 as percpu timers by default, and warn about
the issue if the dtb is not configured properly.

Let's do this as a single patch so it can be backported to v5.8 and later
kernels easily. Note that this patch depends on earlier timer-ti-dm
systimer posted mode fixes, and a preparatory clockevent patch
"clocksource/drivers/timer-ti-dm: Prepare to handle dra7 timer wrap issue".

For more information, please see the errata for "AM572x Sitara Processors
Silicon Revisions 1.1, 2.0":

https://www.ti.com/lit/er/sprz429m/sprz429m.pdf

The concept is based on earlier reference patches done by Tero Kristo and
Keerthy.

Cc: Keerthy <j-keerthy@ti.com>
Cc: Tero Kristo <kristo@kernel.org>
Signed-off-by: Tony Lindgren <tony@atomide.com>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Link: https://lore.kernel.org/r/20210323074326.28302-3-tony@atomide.com
---
 arch/arm/boot/dts/dra7-l4.dtsi             |  4 +-
 arch/arm/boot/dts/dra7.dtsi                | 20 ++++++++
 drivers/clocksource/timer-ti-dm-systimer.c | 76 ++++++++++++++++++++++++++++++
 include/linux/cpuhotplug.h                 |  1 +
 4 files changed, 99 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/boot/dts/dra7-l4.dtsi b/arch/arm/boot/dts/dra7-l4.dtsi
index 3bf90d9e3335..a294a02f2d23 100644
--- a/arch/arm/boot/dts/dra7-l4.dtsi
+++ b/arch/arm/boot/dts/dra7-l4.dtsi
@@ -1168,7 +1168,7 @@
 			};
 		};
 
-		target-module@34000 {			/* 0x48034000, ap 7 46.0 */
+		timer3_target: target-module@34000 {	/* 0x48034000, ap 7 46.0 */
 			compatible = "ti,sysc-omap4-timer", "ti,sysc";
 			reg = <0x34000 0x4>,
 			      <0x34010 0x4>;
@@ -1195,7 +1195,7 @@
 			};
 		};
 
-		target-module@36000 {			/* 0x48036000, ap 9 4e.0 */
+		timer4_target: target-module@36000 {	/* 0x48036000, ap 9 4e.0 */
 			compatible = "ti,sysc-omap4-timer", "ti,sysc";
 			reg = <0x36000 0x4>,
 			      <0x36010 0x4>;
diff --git a/arch/arm/boot/dts/dra7.dtsi b/arch/arm/boot/dts/dra7.dtsi
index ce1194744f84..53d68786a61f 100644
--- a/arch/arm/boot/dts/dra7.dtsi
+++ b/arch/arm/boot/dts/dra7.dtsi
@@ -46,6 +46,7 @@
 
 	timer {
 		compatible = "arm,armv7-timer";
+		status = "disabled";	/* See ARM architected timer wrap erratum i940 */
 		interrupts = <GIC_PPI 13 (GIC_CPU_MASK_SIMPLE(2) | IRQ_TYPE_LEVEL_LOW)>,
 			     <GIC_PPI 14 (GIC_CPU_MASK_SIMPLE(2) | IRQ_TYPE_LEVEL_LOW)>,
 			     <GIC_PPI 11 (GIC_CPU_MASK_SIMPLE(2) | IRQ_TYPE_LEVEL_LOW)>,
@@ -1241,3 +1242,22 @@
 		assigned-clock-parents = <&sys_32k_ck>;
 	};
 };
+
+/* Local timers, see ARM architected timer wrap erratum i940 */
+&timer3_target {
+	ti,no-reset-on-init;
+	ti,no-idle;
+	timer@0 {
+		assigned-clocks = <&l4per_clkctrl DRA7_L4PER_TIMER3_CLKCTRL 24>;
+		assigned-clock-parents = <&timer_sys_clk_div>;
+	};
+};
+
+&timer4_target {
+	ti,no-reset-on-init;
+	ti,no-idle;
+	timer@0 {
+		assigned-clocks = <&l4per_clkctrl DRA7_L4PER_TIMER4_CLKCTRL 24>;
+		assigned-clock-parents = <&timer_sys_clk_div>;
+	};
+};
diff --git a/drivers/clocksource/timer-ti-dm-systimer.c b/drivers/clocksource/timer-ti-dm-systimer.c
index 330803150d85..b6f97960d8ee 100644
--- a/drivers/clocksource/timer-ti-dm-systimer.c
+++ b/drivers/clocksource/timer-ti-dm-systimer.c
@@ -2,6 +2,7 @@
 #include <linux/clk.h>
 #include <linux/clocksource.h>
 #include <linux/clockchips.h>
+#include <linux/cpuhotplug.h>
 #include <linux/interrupt.h>
 #include <linux/io.h>
 #include <linux/iopoll.h>
@@ -630,6 +631,78 @@ err_out_free:
 	return error;
 }
 
+/* Dmtimer as percpu timer. See dra7 ARM architected timer wrap erratum i940 */
+static DEFINE_PER_CPU(struct dmtimer_clockevent, dmtimer_percpu_timer);
+
+static int __init dmtimer_percpu_timer_init(struct device_node *np, int cpu)
+{
+	struct dmtimer_clockevent *clkevt;
+	int error;
+
+	if (!cpu_possible(cpu))
+		return -EINVAL;
+
+	if (!of_property_read_bool(np->parent, "ti,no-reset-on-init") ||
+	    !of_property_read_bool(np->parent, "ti,no-idle"))
+		pr_warn("Incomplete dtb for percpu dmtimer %pOF\n", np->parent);
+
+	clkevt = per_cpu_ptr(&dmtimer_percpu_timer, cpu);
+
+	error = dmtimer_clkevt_init_common(clkevt, np, CLOCK_EVT_FEAT_ONESHOT,
+					   cpumask_of(cpu), "percpu-dmtimer",
+					   500);
+	if (error)
+		return error;
+
+	return 0;
+}
+
+/* See TRM for timer internal resynch latency */
+static int omap_dmtimer_starting_cpu(unsigned int cpu)
+{
+	struct dmtimer_clockevent *clkevt = per_cpu_ptr(&dmtimer_percpu_timer, cpu);
+	struct clock_event_device *dev = &clkevt->dev;
+	struct dmtimer_systimer *t = &clkevt->t;
+
+	clockevents_config_and_register(dev, t->rate, 3, ULONG_MAX);
+	irq_force_affinity(dev->irq, cpumask_of(cpu));
+
+	return 0;
+}
+
+static int __init dmtimer_percpu_timer_startup(void)
+{
+	struct dmtimer_clockevent *clkevt = per_cpu_ptr(&dmtimer_percpu_timer, 0);
+	struct dmtimer_systimer *t = &clkevt->t;
+
+	if (t->sysc) {
+		cpuhp_setup_state(CPUHP_AP_TI_GP_TIMER_STARTING,
+				  "clockevents/omap/gptimer:starting",
+				  omap_dmtimer_starting_cpu, NULL);
+	}
+
+	return 0;
+}
+subsys_initcall(dmtimer_percpu_timer_startup);
+
+static int __init dmtimer_percpu_quirk_init(struct device_node *np, u32 pa)
+{
+	struct device_node *arm_timer;
+
+	arm_timer = of_find_compatible_node(NULL, NULL, "arm,armv7-timer");
+	if (of_device_is_available(arm_timer)) {
+		pr_warn_once("ARM architected timer wrap issue i940 detected\n");
+		return 0;
+	}
+
+	if (pa == 0x48034000)		/* dra7 dmtimer3 */
+		return dmtimer_percpu_timer_init(np, 0);
+	else if (pa == 0x48036000)	/* dra7 dmtimer4 */
+		return dmtimer_percpu_timer_init(np, 1);
+
+	return 0;
+}
+
 /* Clocksource */
 static struct dmtimer_clocksource *
 to_dmtimer_clocksource(struct clocksource *cs)
@@ -763,6 +836,9 @@ static int __init dmtimer_systimer_init(struct device_node *np)
 	if (clockevent == pa)
 		return dmtimer_clockevent_init(np);
 
+	if (of_machine_is_compatible("ti,dra7"))
+		return dmtimer_percpu_quirk_init(np, pa);
+
 	return 0;
 }
 
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index f14adb882338..cc7c3fda2aa6 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -135,6 +135,7 @@ enum cpuhp_state {
 	CPUHP_AP_RISCV_TIMER_STARTING,
 	CPUHP_AP_CLINT_TIMER_STARTING,
 	CPUHP_AP_CSKY_TIMER_STARTING,
+	CPUHP_AP_TI_GP_TIMER_STARTING,
 	CPUHP_AP_HYPERV_TIMER_STARTING,
 	CPUHP_AP_KVM_STARTING,
 	CPUHP_AP_KVM_ARM_VGIC_INIT_STARTING,
-- 
cgit v1.2.3


From d3c4a43d9291279c28b26757351a6ab72c110753 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 6 Apr 2021 08:22:55 +0200
Subject: block: refactor blk_drop_partitions

Move the busy check and disk-wide sync into the only caller, so that
the remainder can be shared with del_gendisk.  Also pass the gendisk
instead of the bdev as that is all that is needed.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20210406062303.811835-5-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk.h             |  1 -
 block/genhd.c           | 11 +----------
 block/partitions/core.c | 14 +++-----------
 fs/block_dev.c          |  8 +++++---
 include/linux/genhd.h   |  2 +-
 5 files changed, 10 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk.h b/block/blk.h
index 8f4337c5a9e6..8b3591aee0a5 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -349,7 +349,6 @@ char *disk_name(struct gendisk *hd, int partno, char *buf);
 #define ADDPART_FLAG_NONE	0
 #define ADDPART_FLAG_RAID	1
 #define ADDPART_FLAG_WHOLEDISK	2
-void delete_partition(struct block_device *part);
 int bdev_add_partition(struct block_device *bdev, int partno,
 		sector_t start, sector_t length);
 int bdev_del_partition(struct block_device *bdev, int partno);
diff --git a/block/genhd.c b/block/genhd.c
index 15f99da4543f..8303ec67fd70 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -667,9 +667,6 @@ EXPORT_SYMBOL(device_add_disk_no_queue_reg);
  */
 void del_gendisk(struct gendisk *disk)
 {
-	struct disk_part_iter piter;
-	struct block_device *part;
-
 	might_sleep();
 
 	if (WARN_ON_ONCE(!disk->queue))
@@ -683,13 +680,7 @@ void del_gendisk(struct gendisk *disk)
 	 * disk is marked as dead (GENHD_FL_UP cleared).
 	 */
 	down_write(&bdev_lookup_sem);
-
-	/* invalidate stuff */
-	disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
-	while ((part = disk_part_iter_next(&piter)))
-		delete_partition(part);
-	disk_part_iter_exit(&piter);
-
+	blk_drop_partitions(disk);
 	fsync_bdev(disk->part0);
 	__invalidate_device(disk->part0, true);
 
diff --git a/block/partitions/core.c b/block/partitions/core.c
index 8c1735292940..536f7c5bb0b6 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -285,7 +285,7 @@ struct device_type part_type = {
  * Must be called either with bd_mutex held, before a disk can be opened or
  * after all disk users are gone.
  */
-void delete_partition(struct block_device *part)
+static void delete_partition(struct block_device *part)
 {
 	fsync_bdev(part);
 	__invalidate_device(part, true);
@@ -526,23 +526,15 @@ static bool disk_unlock_native_capacity(struct gendisk *disk)
 	}
 }
 
-int blk_drop_partitions(struct block_device *bdev)
+void blk_drop_partitions(struct gendisk *disk)
 {
 	struct disk_part_iter piter;
 	struct block_device *part;
 
-	if (bdev->bd_part_count)
-		return -EBUSY;
-
-	sync_blockdev(bdev);
-	invalidate_bdev(bdev);
-
-	disk_part_iter_init(&piter, bdev->bd_disk, DISK_PITER_INCL_EMPTY);
+	disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
 	while ((part = disk_part_iter_next(&piter)))
 		delete_partition(part);
 	disk_part_iter_exit(&piter);
-
-	return 0;
 }
 
 static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev,
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 92ed7d5df677..594a1bee9dd9 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1243,9 +1243,11 @@ int bdev_disk_changed(struct block_device *bdev, bool invalidate)
 	clear_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
 
 rescan:
-	ret = blk_drop_partitions(bdev);
-	if (ret)
-		return ret;
+	if (bdev->bd_part_count)
+		return -EBUSY;
+	sync_blockdev(bdev);
+	invalidate_bdev(bdev);
+	blk_drop_partitions(disk);
 
 	/*
 	 * Historically we only set the capacity to zero for devices that
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index f364619092cc..16178a935c40 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -273,7 +273,7 @@ static inline sector_t get_capacity(struct gendisk *disk)
 
 int bdev_disk_changed(struct block_device *bdev, bool invalidate);
 int blk_add_partitions(struct gendisk *disk, struct block_device *bdev);
-int blk_drop_partitions(struct block_device *bdev);
+void blk_drop_partitions(struct gendisk *disk);
 
 extern struct gendisk *__alloc_disk_node(int minors, int node_id);
 extern void put_disk(struct gendisk *disk);
-- 
cgit v1.2.3


From 3212135a718b06be38811f2d9a320ae842e76409 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 6 Apr 2021 08:23:02 +0200
Subject: block: remove disk_part_iter

Just open code the xa_for_each in the remaining user.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20210406062303.811835-12-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/genhd.c         | 92 ++++++++-------------------------------------------
 include/linux/genhd.h | 19 -----------
 2 files changed, 13 insertions(+), 98 deletions(-)

(limited to 'include/linux')

diff --git a/block/genhd.c b/block/genhd.c
index cbfe1ff19360..39ca97b0edc6 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -161,81 +161,6 @@ static void part_in_flight_rw(struct block_device *part,
 		inflight[1] = 0;
 }
 
-/**
- * disk_part_iter_init - initialize partition iterator
- * @piter: iterator to initialize
- * @disk: disk to iterate over
- * @flags: DISK_PITER_* flags
- *
- * Initialize @piter so that it iterates over partitions of @disk.
- *
- * CONTEXT:
- * Don't care.
- */
-void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk,
-			  unsigned int flags)
-{
-	piter->disk = disk;
-	piter->part = NULL;
-	if (flags & (DISK_PITER_INCL_PART0 | DISK_PITER_INCL_EMPTY_PART0))
-		piter->idx = 0;
-	else
-		piter->idx = 1;
-	piter->flags = flags;
-}
-
-/**
- * disk_part_iter_next - proceed iterator to the next partition and return it
- * @piter: iterator of interest
- *
- * Proceed @piter to the next partition and return it.
- *
- * CONTEXT:
- * Don't care.
- */
-struct block_device *disk_part_iter_next(struct disk_part_iter *piter)
-{
-	struct block_device *part;
-	unsigned long idx;
-
-	/* put the last partition */
-	disk_part_iter_exit(piter);
-
-	rcu_read_lock();
-	xa_for_each_start(&piter->disk->part_tbl, idx, part, piter->idx) {
-		if (!bdev_nr_sectors(part) &&
-		    !(piter->flags & DISK_PITER_INCL_EMPTY) &&
-		    !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 &&
-		      piter->idx == 0))
-			continue;
-
-		piter->part = bdgrab(part);
-		if (!piter->part)
-			continue;
-		piter->idx = idx + 1;
-		break;
-	}
-	rcu_read_unlock();
-
-	return piter->part;
-}
-
-/**
- * disk_part_iter_exit - finish up partition iteration
- * @piter: iter of interest
- *
- * Called when iteration is over.  Cleans up @piter.
- *
- * CONTEXT:
- * Don't care.
- */
-void disk_part_iter_exit(struct disk_part_iter *piter)
-{
-	if (piter->part)
-		bdput(piter->part);
-	piter->part = NULL;
-}
-
 /*
  * Can be deleted altogether. Later.
  *
@@ -472,13 +397,22 @@ static char *bdevt_str(dev_t devt, char *buf)
 
 void disk_uevent(struct gendisk *disk, enum kobject_action action)
 {
-	struct disk_part_iter piter;
 	struct block_device *part;
+	unsigned long idx;
+
+	rcu_read_lock();
+	xa_for_each(&disk->part_tbl, idx, part) {
+		if (bdev_is_partition(part) && !bdev_nr_sectors(part))
+			continue;
+		if (!bdgrab(part))
+			continue;
 
-	disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY_PART0);
-	while ((part = disk_part_iter_next(&piter)))
+		rcu_read_unlock();
 		kobject_uevent(bdev_kobj(part), action);
-	disk_part_iter_exit(&piter);
+		bdput(part);
+		rcu_read_lock();
+	}
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(disk_uevent);
 
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 16178a935c40..7e9660ea967d 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -204,25 +204,6 @@ static inline dev_t disk_devt(struct gendisk *disk)
 
 void disk_uevent(struct gendisk *disk, enum kobject_action action);
 
-/*
- * Smarter partition iterator without context limits.
- */
-#define DISK_PITER_INCL_EMPTY	(1 << 1) /* include 0-sized parts */
-#define DISK_PITER_INCL_PART0	(1 << 2) /* include partition 0 */
-#define DISK_PITER_INCL_EMPTY_PART0 (1 << 3) /* include empty partition 0 */
-
-struct disk_part_iter {
-	struct gendisk		*disk;
-	struct block_device	*part;
-	unsigned long		idx;
-	unsigned int		flags;
-};
-
-extern void disk_part_iter_init(struct disk_part_iter *piter,
-				 struct gendisk *disk, unsigned int flags);
-struct block_device *disk_part_iter_next(struct disk_part_iter *piter);
-extern void disk_part_iter_exit(struct disk_part_iter *piter);
-
 /* block/genhd.c */
 extern void device_add_disk(struct device *parent, struct gendisk *disk,
 			    const struct attribute_group **groups);
-- 
cgit v1.2.3


From 35eb1f5033cf301bda48a22f2edcaed2fda70bfd Mon Sep 17 00:00:00 2001
From: Wan Jiabing <wanjiabing@vivo.com>
Date: Tue, 30 Mar 2021 14:39:23 +0800
Subject: powercap: RAPL: Fix struct declaration in header file

struct rapl_package is declared twice in intel_rapl.h, once at line 80
and once earlier.

Code inspection suggests that the first instance should be struct
rapl_domain rather than rapl_package, so change it.

Signed-off-by: Wan Jiabing <wanjiabing@vivo.com>
[ rjw: Subject and changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/intel_rapl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h
index 50b8398ffd21..93780834fc8f 100644
--- a/include/linux/intel_rapl.h
+++ b/include/linux/intel_rapl.h
@@ -33,7 +33,7 @@ enum rapl_domain_reg_id {
 	RAPL_DOMAIN_REG_MAX,
 };
 
-struct rapl_package;
+struct rapl_domain;
 
 enum rapl_primitives {
 	ENERGY_COUNTER,
-- 
cgit v1.2.3


From 2efa3377230bab432fdeabe5828d19d2d49a96d2 Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Thu, 1 Apr 2021 22:15:37 +0800
Subject: freezer: Remove unused inline function try_to_freeze_nowarn()

There is no caller in the tree, so it can be removed.

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/freezer.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index 27828145ca09..0621c5f86c39 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -279,7 +279,6 @@ static inline int freeze_kernel_threads(void) { return -ENOSYS; }
 static inline void thaw_processes(void) {}
 static inline void thaw_kernel_threads(void) {}
 
-static inline bool try_to_freeze_nowarn(void) { return false; }
 static inline bool try_to_freeze(void) { return false; }
 
 static inline void freezer_do_not_count(void) {}
-- 
cgit v1.2.3


From 953c1fd96b1a70bcbbfb10973c2126eba8d891c7 Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Fri, 2 Apr 2021 14:14:22 +0800
Subject: PM: runtime: Replace inline function pm_runtime_callbacks_present()

Commit 9a7875461fd0 ("PM: runtime: Replace pm_runtime_callbacks_present()")
forgot to change the inline version.

Fixes: 9a7875461fd0 ("PM: runtime: Replace pm_runtime_callbacks_present()")
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/pm_runtime.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index b492ae00cc90..6c08a085367b 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -265,7 +265,7 @@ static inline void pm_runtime_no_callbacks(struct device *dev) {}
 static inline void pm_runtime_irq_safe(struct device *dev) {}
 static inline bool pm_runtime_is_irq_safe(struct device *dev) { return false; }
 
-static inline bool pm_runtime_callbacks_present(struct device *dev) { return false; }
+static inline bool pm_runtime_has_no_callbacks(struct device *dev) { return false; }
 static inline void pm_runtime_mark_last_busy(struct device *dev) {}
 static inline void __pm_runtime_use_autosuspend(struct device *dev,
 						bool use) {}
-- 
cgit v1.2.3


From 8eb99e9a64a07ea08070591bdc2615526a103e62 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Wed, 7 Apr 2021 19:58:20 +0200
Subject: ACPI: utils: Add acpi_reduced_hardware() helper

Add a getter for the acpi_gbl_reduced_hardware variable so that modules
can check if they are running on an ACPI reduced-hw platform or not.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/utils.c    | 11 +++++++++++
 include/acpi/acpi_bus.h |  1 +
 include/linux/acpi.h    |  5 +++++
 3 files changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/acpi/utils.c b/drivers/acpi/utils.c
index 682edd913b3b..b20774c48c74 100644
--- a/drivers/acpi/utils.c
+++ b/drivers/acpi/utils.c
@@ -872,6 +872,17 @@ acpi_dev_get_first_match_dev(const char *hid, const char *uid, s64 hrv)
 }
 EXPORT_SYMBOL(acpi_dev_get_first_match_dev);
 
+/**
+ * acpi_reduced_hardware - Return if this is an ACPI-reduced-hw machine
+ *
+ * Return true when running on an ACPI-reduced-hw machine, false otherwise.
+ */
+bool acpi_reduced_hardware(void)
+{
+	return acpi_gbl_reduced_hardware;
+}
+EXPORT_SYMBOL_GPL(acpi_reduced_hardware);
+
 /*
  * acpi_backlight= handling, this is done here rather then in video_detect.c
  * because __setup cannot be used in modules.
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index f28b097c658f..d631cb52283e 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -78,6 +78,7 @@ acpi_evaluate_dsm_typed(acpi_handle handle, const guid_t *guid, u64 rev,
 
 bool acpi_dev_found(const char *hid);
 bool acpi_dev_present(const char *hid, const char *uid, s64 hrv);
+bool acpi_reduced_hardware(void);
 
 #ifdef CONFIG_ACPI
 
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 3bdcfc4401b7..e2e6db8313c8 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -748,6 +748,11 @@ acpi_dev_get_first_match_dev(const char *hid, const char *uid, s64 hrv)
 	return NULL;
 }
 
+static inline bool acpi_reduced_hardware(void)
+{
+	return false;
+}
+
 static inline void acpi_dev_put(struct acpi_device *adev) {}
 
 static inline bool is_acpi_node(const struct fwnode_handle *fwnode)
-- 
cgit v1.2.3


From d08a745729646f407277e904b02991458f20d261 Mon Sep 17 00:00:00 2001
From: Angela Czubak <acz@semihalf.com>
Date: Thu, 8 Apr 2021 12:37:59 +0200
Subject: resource: Prevent irqresource_disabled() from erasing flags

Some Chromebooks use hard-coded interrupts in their ACPI tables.
This is an excerpt as dumped on Relm:

...
            Name (_HID, "ELAN0001")  // _HID: Hardware ID
            Name (_DDN, "Elan Touchscreen ")  // _DDN: DOS Device Name
            Name (_UID, 0x05)  // _UID: Unique ID
            Name (ISTP, Zero)
            Method (_CRS, 0, NotSerialized)  // _CRS: Current Resource Settings
            {
                Name (BUF0, ResourceTemplate ()
                {
                    I2cSerialBusV2 (0x0010, ControllerInitiated, 0x00061A80,
                        AddressingMode7Bit, "\\_SB.I2C1",
                        0x00, ResourceConsumer, , Exclusive,
                        )
                    Interrupt (ResourceConsumer, Edge, ActiveLow, Exclusive, ,, )
                    {
                        0x000000B8,
                    }
                })
                Return (BUF0) /* \_SB_.I2C1.ETSA._CRS.BUF0 */
            }
...

This interrupt is hard-coded to 0xB8 = 184 which is too high to be mapped
to IO-APIC, so no triggering information is propagated as acpi_register_gsi()
fails and irqresource_disabled() is issued, which leads to erasing triggering
and polarity information.

Do not overwrite flags as it leads to erasing triggering and polarity
information which might be useful in case of hard-coded interrupts.
This way the information can be read later on even though mapping to
APIC domain failed.

Signed-off-by: Angela Czubak <acz@semihalf.com>
[ rjw: Changelog rearrangement ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/ioport.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 55de385c839c..647744d8514e 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -331,7 +331,7 @@ static inline void irqresource_disabled(struct resource *res, u32 irq)
 {
 	res->start = irq;
 	res->end = irq;
-	res->flags = IORESOURCE_IRQ | IORESOURCE_DISABLED | IORESOURCE_UNSET;
+	res->flags |= IORESOURCE_IRQ | IORESOURCE_DISABLED | IORESOURCE_UNSET;
 }
 
 extern struct address_space *iomem_get_mapping(void);
-- 
cgit v1.2.3


From 9885d016ffa9465f91498e7a70c413c30446ad49 Mon Sep 17 00:00:00 2001
From: Marek Behún <kabel@kernel.org>
Date: Wed, 7 Apr 2021 22:22:49 +0200
Subject: net: phy: marvell10g: add separate structure for 88X3340
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The 88X3340 contains 4 cores similar to 88X3310, but there is a
difference: it does not support xaui host mode. Instead the
corresponding MACTYPE means
  rxaui / 5gbase-r / 2500base-x / sgmii without AN

Signed-off-by: Marek Behún <kabel@kernel.org>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/marvell10g.c | 58 ++++++++++++++++++++++++++++++++++++++++++--
 include/linux/marvell_phy.h  |  6 ++++-
 2 files changed, 61 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/marvell10g.c b/drivers/net/phy/marvell10g.c
index 2dc1317e601e..f74dfd993d8b 100644
--- a/drivers/net/phy/marvell10g.c
+++ b/drivers/net/phy/marvell10g.c
@@ -557,6 +557,21 @@ static int mv3310_init_interface(struct phy_device *phydev, int mactype)
 	return 0;
 }
 
+static int mv3340_init_interface(struct phy_device *phydev, int mactype)
+{
+	struct mv3310_priv *priv = dev_get_drvdata(&phydev->mdio.dev);
+	int err = 0;
+
+	priv->rate_match = false;
+
+	if (mactype == MV_V2_3340_PORT_CTRL_MACTYPE_RXAUI_NO_SGMII_AN)
+		priv->const_interface = PHY_INTERFACE_MODE_RXAUI;
+	else
+		err = mv3310_init_interface(phydev, mactype);
+
+	return err;
+}
+
 static int mv3310_config_init(struct phy_device *phydev)
 {
 	struct mv3310_priv *priv = dev_get_drvdata(&phydev->mdio.dev);
@@ -884,6 +899,16 @@ static void mv3310_init_supported_interfaces(unsigned long *mask)
 	__set_bit(PHY_INTERFACE_MODE_USXGMII, mask);
 }
 
+static void mv3340_init_supported_interfaces(unsigned long *mask)
+{
+	__set_bit(PHY_INTERFACE_MODE_SGMII, mask);
+	__set_bit(PHY_INTERFACE_MODE_2500BASEX, mask);
+	__set_bit(PHY_INTERFACE_MODE_5GBASER, mask);
+	__set_bit(PHY_INTERFACE_MODE_RXAUI, mask);
+	__set_bit(PHY_INTERFACE_MODE_10GBASER, mask);
+	__set_bit(PHY_INTERFACE_MODE_USXGMII, mask);
+}
+
 static void mv2110_init_supported_interfaces(unsigned long *mask)
 {
 	__set_bit(PHY_INTERFACE_MODE_SGMII, mask);
@@ -903,6 +928,16 @@ static const struct mv3310_chip mv3310_type = {
 #endif
 };
 
+static const struct mv3310_chip mv3340_type = {
+	.init_supported_interfaces = mv3340_init_supported_interfaces,
+	.get_mactype = mv3310_get_mactype,
+	.init_interface = mv3340_init_interface,
+
+#ifdef CONFIG_HWMON
+	.hwmon_read_temp_reg = mv3310_hwmon_read_temp_reg,
+#endif
+};
+
 static const struct mv3310_chip mv2110_type = {
 	.init_supported_interfaces = mv2110_init_supported_interfaces,
 	.get_mactype = mv2110_get_mactype,
@@ -916,7 +951,7 @@ static const struct mv3310_chip mv2110_type = {
 static struct phy_driver mv3310_drivers[] = {
 	{
 		.phy_id		= MARVELL_PHY_ID_88X3310,
-		.phy_id_mask	= MARVELL_PHY_ID_MASK,
+		.phy_id_mask	= MARVELL_PHY_ID_88X33X0_MASK,
 		.name		= "mv88x3310",
 		.driver_data	= &mv3310_type,
 		.get_features	= mv3310_get_features,
@@ -932,6 +967,24 @@ static struct phy_driver mv3310_drivers[] = {
 		.remove		= mv3310_remove,
 		.set_loopback	= genphy_c45_loopback,
 	},
+	{
+		.phy_id		= MARVELL_PHY_ID_88X3340,
+		.phy_id_mask	= MARVELL_PHY_ID_88X33X0_MASK,
+		.name		= "mv88x3340",
+		.driver_data	= &mv3340_type,
+		.get_features	= mv3310_get_features,
+		.config_init	= mv3310_config_init,
+		.probe		= mv3310_probe,
+		.suspend	= mv3310_suspend,
+		.resume		= mv3310_resume,
+		.config_aneg	= mv3310_config_aneg,
+		.aneg_done	= mv3310_aneg_done,
+		.read_status	= mv3310_read_status,
+		.get_tunable	= mv3310_get_tunable,
+		.set_tunable	= mv3310_set_tunable,
+		.remove		= mv3310_remove,
+		.set_loopback	= genphy_c45_loopback,
+	},
 	{
 		.phy_id		= MARVELL_PHY_ID_88E2110,
 		.phy_id_mask	= MARVELL_PHY_ID_MASK,
@@ -954,7 +1007,8 @@ static struct phy_driver mv3310_drivers[] = {
 module_phy_driver(mv3310_drivers);
 
 static struct mdio_device_id __maybe_unused mv3310_tbl[] = {
-	{ MARVELL_PHY_ID_88X3310, MARVELL_PHY_ID_MASK },
+	{ MARVELL_PHY_ID_88X3310, MARVELL_PHY_ID_88X33X0_MASK },
+	{ MARVELL_PHY_ID_88X3340, MARVELL_PHY_ID_88X33X0_MASK },
 	{ MARVELL_PHY_ID_88E2110, MARVELL_PHY_ID_MASK },
 	{ },
 };
diff --git a/include/linux/marvell_phy.h b/include/linux/marvell_phy.h
index 274abd5fbac3..6b11a5411082 100644
--- a/include/linux/marvell_phy.h
+++ b/include/linux/marvell_phy.h
@@ -22,10 +22,14 @@
 #define MARVELL_PHY_ID_88E1545		0x01410ea0
 #define MARVELL_PHY_ID_88E1548P		0x01410ec0
 #define MARVELL_PHY_ID_88E3016		0x01410e60
-#define MARVELL_PHY_ID_88X3310		0x002b09a0
 #define MARVELL_PHY_ID_88E2110		0x002b09b0
 #define MARVELL_PHY_ID_88X2222		0x01410f10
 
+/* PHY IDs and mask for Alaska 10G PHYs */
+#define MARVELL_PHY_ID_88X33X0_MASK	0xfffffff8
+#define MARVELL_PHY_ID_88X3310		0x002b09a0
+#define MARVELL_PHY_ID_88X3340		0x002b09a8
+
 /* Marvel 88E1111 in Finisar SFP module with modified PHY ID */
 #define MARVELL_PHY_ID_88E1111_FINISAR	0x01ff0cc0
 
-- 
cgit v1.2.3


From fbe82b3db3e58edca33a7e7d9157eb7bdda6e537 Mon Sep 17 00:00:00 2001
From: Tian Tao <tiantao6@hisilicon.com>
Date: Thu, 8 Apr 2021 15:00:41 +0800
Subject: net: qed: remove unused including <linux/version.h>

Remove the include of <linux/version.h>, which is not needed.

Signed-off-by: Tian Tao <tiantao6@hisilicon.com>
Signed-off-by: Zhiqi Song <songzhiqi1@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/qed/qed_ll2_if.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/qed/qed_ll2_if.h b/include/linux/qed/qed_ll2_if.h
index 2f64ed79cee9..ea273ba1c991 100644
--- a/include/linux/qed/qed_ll2_if.h
+++ b/include/linux/qed/qed_ll2_if.h
@@ -12,7 +12,6 @@
 #include <linux/netdevice.h>
 #include <linux/pci.h>
 #include <linux/skbuff.h>
-#include <linux/version.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/qed/qed_if.h>
-- 
cgit v1.2.3


From 714638e02d94fa28c9e030d13d03e663fe24925e Mon Sep 17 00:00:00 2001
From: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Date: Mon, 29 Mar 2021 13:50:36 +0300
Subject: i2c: Add support for software nodes

This makes it possible for the drivers to assign complete
software fwnodes to the devices instead of only the device
properties in those nodes.

Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Signed-off-by: Wolfram Sang <wsa@kernel.org>
---
 drivers/i2c/i2c-core-base.c | 15 ++++++++++++++-
 include/linux/i2c.h         |  4 +++-
 2 files changed, 17 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c
index 63ebf722a424..266b2013b1f1 100644
--- a/drivers/i2c/i2c-core-base.c
+++ b/drivers/i2c/i2c-core-base.c
@@ -919,15 +919,27 @@ i2c_new_client_device(struct i2c_adapter *adap, struct i2c_board_info const *inf
 		}
 	}
 
+	if (info->swnode) {
+		status = device_add_software_node(&client->dev, info->swnode);
+		if (status) {
+			dev_err(&adap->dev,
+				"Failed to add software node to client %s: %d\n",
+				client->name, status);
+			goto out_free_props;
+		}
+	}
+
 	status = device_register(&client->dev);
 	if (status)
-		goto out_free_props;
+		goto out_remove_swnode;
 
 	dev_dbg(&adap->dev, "client [%s] registered with bus id %s\n",
 		client->name, dev_name(&client->dev));
 
 	return client;
 
+out_remove_swnode:
+	device_remove_software_node(&client->dev);
 out_free_props:
 	if (info->properties)
 		device_remove_properties(&client->dev);
@@ -960,6 +972,7 @@ void i2c_unregister_device(struct i2c_client *client)
 
 	if (ACPI_COMPANION(&client->dev))
 		acpi_device_clear_enumerated(ACPI_COMPANION(&client->dev));
+	device_remove_software_node(&client->dev);
 	device_unregister(&client->dev);
 }
 EXPORT_SYMBOL_GPL(i2c_unregister_device);
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 56622658b215..cb1f882a3e88 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -391,7 +391,8 @@ static inline bool i2c_detect_slave_mode(struct device *dev) { return false; }
  * @platform_data: stored in i2c_client.dev.platform_data
  * @of_node: pointer to OpenFirmware device node
  * @fwnode: device node supplied by the platform firmware
- * @properties: additional device properties for the device
+ * @properties: Deprecated - use swnode instead
+ * @swnode: software node for the device
  * @resources: resources associated with the device
  * @num_resources: number of resources in the @resources array
  * @irq: stored in i2c_client.irq
@@ -416,6 +417,7 @@ struct i2c_board_info {
 	struct device_node *of_node;
 	struct fwnode_handle *fwnode;
 	const struct property_entry *properties;
+	const struct software_node *swnode;
 	const struct resource *resources;
 	unsigned int	num_resources;
 	int		irq;
-- 
cgit v1.2.3


From cf68fffb66d60d96209446bfc4a15291dc5a5d41 Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Thu, 8 Apr 2021 11:28:26 -0700
Subject: add support for Clang CFI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This change adds support for Clang’s forward-edge Control Flow
Integrity (CFI) checking. With CONFIG_CFI_CLANG, the compiler
injects a runtime check before each indirect function call to ensure
the target is a valid function with the correct static type. This
restricts possible call targets and makes it more difficult for
an attacker to exploit bugs that allow the modification of stored
function pointers. For more details, see:

  https://clang.llvm.org/docs/ControlFlowIntegrity.html

Clang requires CONFIG_LTO_CLANG to be enabled with CFI to gain
visibility to possible call targets. Kernel modules are supported
with Clang’s cross-DSO CFI mode, which allows checking between
independently compiled components.

With CFI enabled, the compiler injects a __cfi_check() function into
the kernel and each module for validating local call targets. For
cross-module calls that cannot be validated locally, the compiler
calls the global __cfi_slowpath_diag() function, which determines
the target module and calls the correct __cfi_check() function. This
patch includes a slowpath implementation that uses __module_address()
to resolve call targets, and with CONFIG_CFI_CLANG_SHADOW enabled, a
shadow map that speeds up module look-ups by ~3x.

Clang implements indirect call checking using jump tables and
offers two methods of generating them. With canonical jump tables,
the compiler renames each address-taken function to <function>.cfi
and points the original symbol to a jump table entry, which passes
__cfi_check() validation. This isn’t compatible with stand-alone
assembly code, which the compiler doesn’t instrument, and would
cause indirect calls to assembly code to fail. Therefore, we
default to using non-canonical jump tables instead, where the compiler
generates a local jump table entry <function>.cfi_jt for each
address-taken function, and replaces all references to the function
with the address of the jump table entry.

Note that because non-canonical jump table addresses are local
to each component, they break cross-module function address
equality. Specifically, the address of a global function will be
different in each module, as it's replaced with the address of a local
jump table entry. If this address is passed to a different module,
it won’t match the address of the same function taken there. This
may break code that relies on comparing addresses passed from other
components.

CFI checking can be disabled in a function with the __nocfi attribute.
Additionally, CFI can be disabled for an entire compilation unit by
filtering out CC_FLAGS_CFI.

By default, CFI failures result in a kernel panic to stop a potential
exploit. CONFIG_CFI_PERMISSIVE enables a permissive mode, where the
kernel prints out a rate-limited warning instead, and allows execution
to continue. This option is helpful for locating type mismatches, but
should only be enabled during development.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20210408182843.1754385-2-samitolvanen@google.com
---
 Makefile                          |  17 ++
 arch/Kconfig                      |  45 ++++++
 include/asm-generic/bug.h         |  16 ++
 include/asm-generic/vmlinux.lds.h |  20 ++-
 include/linux/cfi.h               |  41 +++++
 include/linux/compiler-clang.h    |   2 +
 include/linux/compiler_types.h    |   4 +
 include/linux/init.h              |   2 +-
 include/linux/module.h            |  13 +-
 init/Kconfig                      |   2 +-
 kernel/Makefile                   |   4 +
 kernel/cfi.c                      | 329 ++++++++++++++++++++++++++++++++++++++
 kernel/module.c                   |  43 +++++
 scripts/Makefile.modfinal         |   2 +-
 14 files changed, 534 insertions(+), 6 deletions(-)
 create mode 100644 include/linux/cfi.h
 create mode 100644 kernel/cfi.c

(limited to 'include/linux')

diff --git a/Makefile b/Makefile
index cc77fd45ca64..427f75249d5e 100644
--- a/Makefile
+++ b/Makefile
@@ -920,6 +920,23 @@ KBUILD_AFLAGS	+= -fno-lto
 export CC_FLAGS_LTO
 endif
 
+ifdef CONFIG_CFI_CLANG
+CC_FLAGS_CFI	:= -fsanitize=cfi \
+		   -fsanitize-cfi-cross-dso \
+		   -fno-sanitize-cfi-canonical-jump-tables \
+		   -fno-sanitize-trap=cfi \
+		   -fno-sanitize-blacklist
+
+ifdef CONFIG_CFI_PERMISSIVE
+CC_FLAGS_CFI	+= -fsanitize-recover=cfi
+endif
+
+# If LTO flags are filtered out, we must also filter out CFI.
+CC_FLAGS_LTO	+= $(CC_FLAGS_CFI)
+KBUILD_CFLAGS	+= $(CC_FLAGS_CFI)
+export CC_FLAGS_CFI
+endif
+
 ifdef CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_32B
 KBUILD_CFLAGS += -falign-functions=32
 endif
diff --git a/arch/Kconfig b/arch/Kconfig
index ecfd3520b676..f6a85ba6cba2 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -692,6 +692,51 @@ config LTO_CLANG_THIN
 	  If unsure, say Y.
 endchoice
 
+config ARCH_SUPPORTS_CFI_CLANG
+	bool
+	help
+	  An architecture should select this option if it can support Clang's
+	  Control-Flow Integrity (CFI) checking.
+
+config CFI_CLANG
+	bool "Use Clang's Control Flow Integrity (CFI)"
+	depends on LTO_CLANG && ARCH_SUPPORTS_CFI_CLANG
+	# Clang >= 12:
+	# - https://bugs.llvm.org/show_bug.cgi?id=46258
+	# - https://bugs.llvm.org/show_bug.cgi?id=47479
+	depends on CLANG_VERSION >= 120000
+	select KALLSYMS
+	help
+	  This option enables Clang’s forward-edge Control Flow Integrity
+	  (CFI) checking, where the compiler injects a runtime check to each
+	  indirect function call to ensure the target is a valid function with
+	  the correct static type. This restricts possible call targets and
+	  makes it more difficult for an attacker to exploit bugs that allow
+	  the modification of stored function pointers. More information can be
+	  found from Clang's documentation:
+
+	    https://clang.llvm.org/docs/ControlFlowIntegrity.html
+
+config CFI_CLANG_SHADOW
+	bool "Use CFI shadow to speed up cross-module checks"
+	default y
+	depends on CFI_CLANG && MODULES
+	help
+	  If you select this option, the kernel builds a fast look-up table of
+	  CFI check functions in loaded modules to reduce performance overhead.
+
+	  If unsure, say Y.
+
+config CFI_PERMISSIVE
+	bool "Use CFI in permissive mode"
+	depends on CFI_CLANG
+	help
+	  When selected, Control Flow Integrity (CFI) violations result in a
+	  warning instead of a kernel panic. This option should only be used
+	  for finding indirect call type mismatches during development.
+
+	  If unsure, say N.
+
 config HAVE_ARCH_WITHIN_STACK_FRAMES
 	bool
 	help
diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index 76a10e0dca9f..b402494883b6 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -241,6 +241,22 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
 # define WARN_ON_SMP(x)			({0;})
 #endif
 
+/*
+ * WARN_ON_FUNCTION_MISMATCH() warns if a value doesn't match a
+ * function address, and can be useful for catching issues with
+ * callback functions, for example.
+ *
+ * With CONFIG_CFI_CLANG, the warning is disabled because the
+ * compiler replaces function addresses taken in C code with
+ * local jump table addresses, which breaks cross-module function
+ * address equality.
+ */
+#if defined(CONFIG_CFI_CLANG) && defined(CONFIG_MODULES)
+# define WARN_ON_FUNCTION_MISMATCH(x, fn) ({ 0; })
+#else
+# define WARN_ON_FUNCTION_MISMATCH(x, fn) WARN_ON_ONCE((x) != (fn))
+#endif
+
 #endif /* __ASSEMBLY__ */
 
 #endif
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 0331d5d49551..40a9c101565e 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -544,6 +544,22 @@
 	. = ALIGN((align));						\
 	__end_rodata = .;
 
+
+/*
+ * .text..L.cfi.jumptable.* contain Control-Flow Integrity (CFI)
+ * jump table entries.
+ */
+#ifdef CONFIG_CFI_CLANG
+#define TEXT_CFI_JT							\
+		. = ALIGN(PMD_SIZE);					\
+		__cfi_jt_start = .;					\
+		*(.text..L.cfi.jumptable .text..L.cfi.jumptable.*)	\
+		. = ALIGN(PMD_SIZE);					\
+		__cfi_jt_end = .;
+#else
+#define TEXT_CFI_JT
+#endif
+
 /*
  * Non-instrumentable text section
  */
@@ -570,6 +586,7 @@
 		NOINSTR_TEXT						\
 		*(.text..refcount)					\
 		*(.ref.text)						\
+		TEXT_CFI_JT						\
 	MEM_KEEP(init.text*)						\
 	MEM_KEEP(exit.text*)						\
 
@@ -974,7 +991,8 @@
  * keep any .init_array.* sections.
  * https://bugs.llvm.org/show_bug.cgi?id=46478
  */
-#if defined(CONFIG_GCOV_KERNEL) || defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KCSAN)
+#if defined(CONFIG_GCOV_KERNEL) || defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KCSAN) || \
+	defined(CONFIG_CFI_CLANG)
 # ifdef CONFIG_CONSTRUCTORS
 #  define SANITIZER_DISCARDS						\
 	*(.eh_frame)
diff --git a/include/linux/cfi.h b/include/linux/cfi.h
new file mode 100644
index 000000000000..879744aaa6e0
--- /dev/null
+++ b/include/linux/cfi.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Clang Control Flow Integrity (CFI) support.
+ *
+ * Copyright (C) 2021 Google LLC
+ */
+#ifndef _LINUX_CFI_H
+#define _LINUX_CFI_H
+
+#ifdef CONFIG_CFI_CLANG
+typedef void (*cfi_check_fn)(uint64_t id, void *ptr, void *diag);
+
+/* Compiler-generated function in each module, and the kernel */
+extern void __cfi_check(uint64_t id, void *ptr, void *diag);
+
+/*
+ * Force the compiler to generate a CFI jump table entry for a function
+ * and store the jump table address to __cfi_jt_<function>.
+ */
+#define __CFI_ADDRESSABLE(fn, __attr) \
+	const void *__cfi_jt_ ## fn __visible __attr = (void *)&fn
+
+#ifdef CONFIG_CFI_CLANG_SHADOW
+
+extern void cfi_module_add(struct module *mod, unsigned long base_addr);
+extern void cfi_module_remove(struct module *mod, unsigned long base_addr);
+
+#else
+
+static inline void cfi_module_add(struct module *mod, unsigned long base_addr) {}
+static inline void cfi_module_remove(struct module *mod, unsigned long base_addr) {}
+
+#endif /* CONFIG_CFI_CLANG_SHADOW */
+
+#else /* !CONFIG_CFI_CLANG */
+
+#define __CFI_ADDRESSABLE(fn, __attr)
+
+#endif /* CONFIG_CFI_CLANG */
+
+#endif /* _LINUX_CFI_H */
diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h
index d217c382b02d..6de9d0c9377e 100644
--- a/include/linux/compiler-clang.h
+++ b/include/linux/compiler-clang.h
@@ -61,3 +61,5 @@
 #if __has_feature(shadow_call_stack)
 # define __noscs	__attribute__((__no_sanitize__("shadow-call-stack")))
 #endif
+
+#define __nocfi		__attribute__((__no_sanitize__("cfi")))
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index e5dd5a4ae946..796935a37e37 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -242,6 +242,10 @@ struct ftrace_likely_data {
 # define __noscs
 #endif
 
+#ifndef __nocfi
+# define __nocfi
+#endif
+
 #ifndef asm_volatile_goto
 #define asm_volatile_goto(x...) asm goto(x)
 #endif
diff --git a/include/linux/init.h b/include/linux/init.h
index 31f54de58429..b3ea15348fbd 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -47,7 +47,7 @@
 
 /* These are for everybody (although not all archs will actually
    discard it in modules) */
-#define __init		__section(".init.text") __cold  __latent_entropy __noinitretpoline
+#define __init		__section(".init.text") __cold  __latent_entropy __noinitretpoline __nocfi
 #define __initdata	__section(".init.data")
 #define __initconst	__section(".init.rodata")
 #define __exitdata	__section(".exit.data")
diff --git a/include/linux/module.h b/include/linux/module.h
index da4b6fbe8ebe..8100bb477d86 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -26,6 +26,7 @@
 #include <linux/tracepoint-defs.h>
 #include <linux/srcu.h>
 #include <linux/static_call_types.h>
+#include <linux/cfi.h>
 
 #include <linux/percpu.h>
 #include <asm/module.h>
@@ -128,13 +129,17 @@ extern void cleanup_module(void);
 #define module_init(initfn)					\
 	static inline initcall_t __maybe_unused __inittest(void)		\
 	{ return initfn; }					\
-	int init_module(void) __copy(initfn) __attribute__((alias(#initfn)));
+	int init_module(void) __copy(initfn)			\
+		__attribute__((alias(#initfn)));		\
+	__CFI_ADDRESSABLE(init_module, __initdata);
 
 /* This is only required if you want to be unloadable. */
 #define module_exit(exitfn)					\
 	static inline exitcall_t __maybe_unused __exittest(void)		\
 	{ return exitfn; }					\
-	void cleanup_module(void) __copy(exitfn) __attribute__((alias(#exitfn)));
+	void cleanup_module(void) __copy(exitfn)		\
+		__attribute__((alias(#exitfn)));		\
+	__CFI_ADDRESSABLE(cleanup_module, __exitdata);
 
 #endif
 
@@ -376,6 +381,10 @@ struct module {
 	const s32 *crcs;
 	unsigned int num_syms;
 
+#ifdef CONFIG_CFI_CLANG
+	cfi_check_fn cfi_check;
+#endif
+
 	/* Kernel parameters. */
 #ifdef CONFIG_SYSFS
 	struct mutex param_lock;
diff --git a/init/Kconfig b/init/Kconfig
index 5f5c776ef192..5babea38e346 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -2296,7 +2296,7 @@ endif # MODULES
 
 config MODULES_TREE_LOOKUP
 	def_bool y
-	depends on PERF_EVENTS || TRACING
+	depends on PERF_EVENTS || TRACING || CFI_CLANG
 
 config INIT_ALL_POSSIBLE
 	bool
diff --git a/kernel/Makefile b/kernel/Makefile
index 320f1f3941b7..e8a6715f38dc 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -41,6 +41,9 @@ KCSAN_SANITIZE_kcov.o := n
 UBSAN_SANITIZE_kcov.o := n
 CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector
 
+# Don't instrument error handlers
+CFLAGS_REMOVE_cfi.o := $(CC_FLAGS_CFI)
+
 obj-y += sched/
 obj-y += locking/
 obj-y += power/
@@ -111,6 +114,7 @@ obj-$(CONFIG_BPF) += bpf/
 obj-$(CONFIG_KCSAN) += kcsan/
 obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o
 obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call.o
+obj-$(CONFIG_CFI_CLANG) += cfi.o
 
 obj-$(CONFIG_PERF_EVENTS) += events/
 
diff --git a/kernel/cfi.c b/kernel/cfi.c
new file mode 100644
index 000000000000..e17a56639766
--- /dev/null
+++ b/kernel/cfi.c
@@ -0,0 +1,329 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Clang Control Flow Integrity (CFI) error and slowpath handling.
+ *
+ * Copyright (C) 2021 Google LLC
+ */
+
+#include <linux/hardirq.h>
+#include <linux/kallsyms.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/printk.h>
+#include <linux/ratelimit.h>
+#include <linux/rcupdate.h>
+#include <linux/vmalloc.h>
+#include <asm/cacheflush.h>
+#include <asm/set_memory.h>
+
+/* Compiler-defined handler names */
+#ifdef CONFIG_CFI_PERMISSIVE
+#define cfi_failure_handler	__ubsan_handle_cfi_check_fail
+#else
+#define cfi_failure_handler	__ubsan_handle_cfi_check_fail_abort
+#endif
+
+static inline void handle_cfi_failure(void *ptr)
+{
+	if (IS_ENABLED(CONFIG_CFI_PERMISSIVE))
+		WARN_RATELIMIT(1, "CFI failure (target: %pS):\n", ptr);
+	else
+		panic("CFI failure (target: %pS)\n", ptr);
+}
+
+#ifdef CONFIG_MODULES
+#ifdef CONFIG_CFI_CLANG_SHADOW
+/*
+ * Index type. A 16-bit index can address at most (2^16)-2 pages (taking
+ * into account SHADOW_INVALID), i.e. ~256M with 4k pages.
+ */
+typedef u16 shadow_t;
+#define SHADOW_INVALID		((shadow_t)~0UL)
+
+struct cfi_shadow {
+	/* Page index for the beginning of the shadow */
+	unsigned long base;
+	/* An array of __cfi_check locations (as indices to the shadow) */
+	shadow_t shadow[1];
+} __packed;
+
+/*
+ * The shadow covers ~128M from the beginning of the module region. If
+ * the region is larger, we fall back to __module_address for the rest.
+ */
+#define __SHADOW_RANGE		(_UL(SZ_128M) >> PAGE_SHIFT)
+
+/* The in-memory size of struct cfi_shadow, always at least one page */
+#define __SHADOW_PAGES		((__SHADOW_RANGE * sizeof(shadow_t)) >> PAGE_SHIFT)
+#define SHADOW_PAGES		max(1UL, __SHADOW_PAGES)
+#define SHADOW_SIZE		(SHADOW_PAGES << PAGE_SHIFT)
+
+/* The actual size of the shadow array, minus metadata */
+#define SHADOW_ARR_SIZE		(SHADOW_SIZE - offsetof(struct cfi_shadow, shadow))
+#define SHADOW_ARR_SLOTS	(SHADOW_ARR_SIZE / sizeof(shadow_t))
+
+static DEFINE_MUTEX(shadow_update_lock);
+static struct cfi_shadow __rcu *cfi_shadow __read_mostly;
+
+/* Returns the index in the shadow for the given address */
+static inline int ptr_to_shadow(const struct cfi_shadow *s, unsigned long ptr)
+{
+	unsigned long index;
+	unsigned long page = ptr >> PAGE_SHIFT;
+
+	if (unlikely(page < s->base))
+		return -1; /* Outside of module area */
+
+	index = page - s->base;
+
+	if (index >= SHADOW_ARR_SLOTS)
+		return -1; /* Cannot be addressed with shadow */
+
+	return (int)index;
+}
+
+/* Returns the page address for an index in the shadow */
+static inline unsigned long shadow_to_ptr(const struct cfi_shadow *s,
+	int index)
+{
+	if (unlikely(index < 0 || index >= SHADOW_ARR_SLOTS))
+		return 0;
+
+	return (s->base + index) << PAGE_SHIFT;
+}
+
+/* Returns the __cfi_check function address for the given shadow location */
+static inline unsigned long shadow_to_check_fn(const struct cfi_shadow *s,
+	int index)
+{
+	if (unlikely(index < 0 || index >= SHADOW_ARR_SLOTS))
+		return 0;
+
+	if (unlikely(s->shadow[index] == SHADOW_INVALID))
+		return 0;
+
+	/* __cfi_check is always page aligned */
+	return (s->base + s->shadow[index]) << PAGE_SHIFT;
+}
+
+static void prepare_next_shadow(const struct cfi_shadow __rcu *prev,
+		struct cfi_shadow *next)
+{
+	int i, index, check;
+
+	/* Mark everything invalid */
+	memset(next->shadow, 0xFF, SHADOW_ARR_SIZE);
+
+	if (!prev)
+		return; /* No previous shadow */
+
+	/* If the base address didn't change, an update is not needed */
+	if (prev->base == next->base) {
+		memcpy(next->shadow, prev->shadow, SHADOW_ARR_SIZE);
+		return;
+	}
+
+	/* Convert the previous shadow to the new address range */
+	for (i = 0; i < SHADOW_ARR_SLOTS; ++i) {
+		if (prev->shadow[i] == SHADOW_INVALID)
+			continue;
+
+		index = ptr_to_shadow(next, shadow_to_ptr(prev, i));
+		if (index < 0)
+			continue;
+
+		check = ptr_to_shadow(next,
+				shadow_to_check_fn(prev, prev->shadow[i]));
+		if (check < 0)
+			continue;
+
+		next->shadow[index] = (shadow_t)check;
+	}
+}
+
+static void add_module_to_shadow(struct cfi_shadow *s, struct module *mod,
+			unsigned long min_addr, unsigned long max_addr)
+{
+	int check_index;
+	unsigned long check = (unsigned long)mod->cfi_check;
+	unsigned long ptr;
+
+	if (unlikely(!PAGE_ALIGNED(check))) {
+		pr_warn("cfi: not using shadow for module %s\n", mod->name);
+		return;
+	}
+
+	check_index = ptr_to_shadow(s, check);
+	if (check_index < 0)
+		return; /* Module not addressable with shadow */
+
+	/* For each page, store the check function index in the shadow */
+	for (ptr = min_addr; ptr <= max_addr; ptr += PAGE_SIZE) {
+		int index = ptr_to_shadow(s, ptr);
+
+		if (index >= 0) {
+			/* Each page must only contain one module */
+			WARN_ON_ONCE(s->shadow[index] != SHADOW_INVALID);
+			s->shadow[index] = (shadow_t)check_index;
+		}
+	}
+}
+
+static void remove_module_from_shadow(struct cfi_shadow *s, struct module *mod,
+		unsigned long min_addr, unsigned long max_addr)
+{
+	unsigned long ptr;
+
+	for (ptr = min_addr; ptr <= max_addr; ptr += PAGE_SIZE) {
+		int index = ptr_to_shadow(s, ptr);
+
+		if (index >= 0)
+			s->shadow[index] = SHADOW_INVALID;
+	}
+}
+
+typedef void (*update_shadow_fn)(struct cfi_shadow *, struct module *,
+			unsigned long min_addr, unsigned long max_addr);
+
+static void update_shadow(struct module *mod, unsigned long base_addr,
+		update_shadow_fn fn)
+{
+	struct cfi_shadow *prev;
+	struct cfi_shadow *next;
+	unsigned long min_addr, max_addr;
+
+	next = vmalloc(SHADOW_SIZE);
+
+	mutex_lock(&shadow_update_lock);
+	prev = rcu_dereference_protected(cfi_shadow,
+					 mutex_is_locked(&shadow_update_lock));
+
+	if (next) {
+		next->base = base_addr >> PAGE_SHIFT;
+		prepare_next_shadow(prev, next);
+
+		min_addr = (unsigned long)mod->core_layout.base;
+		max_addr = min_addr + mod->core_layout.text_size;
+		fn(next, mod, min_addr & PAGE_MASK, max_addr & PAGE_MASK);
+
+		set_memory_ro((unsigned long)next, SHADOW_PAGES);
+	}
+
+	rcu_assign_pointer(cfi_shadow, next);
+	mutex_unlock(&shadow_update_lock);
+	synchronize_rcu();
+
+	if (prev) {
+		set_memory_rw((unsigned long)prev, SHADOW_PAGES);
+		vfree(prev);
+	}
+}
+
+void cfi_module_add(struct module *mod, unsigned long base_addr)
+{
+	update_shadow(mod, base_addr, add_module_to_shadow);
+}
+
+void cfi_module_remove(struct module *mod, unsigned long base_addr)
+{
+	update_shadow(mod, base_addr, remove_module_from_shadow);
+}
+
+static inline cfi_check_fn ptr_to_check_fn(const struct cfi_shadow __rcu *s,
+	unsigned long ptr)
+{
+	int index;
+
+	if (unlikely(!s))
+		return NULL; /* No shadow available */
+
+	index = ptr_to_shadow(s, ptr);
+	if (index < 0)
+		return NULL; /* Cannot be addressed with shadow */
+
+	return (cfi_check_fn)shadow_to_check_fn(s, index);
+}
+
+static inline cfi_check_fn find_shadow_check_fn(unsigned long ptr)
+{
+	cfi_check_fn fn;
+
+	rcu_read_lock_sched();
+	fn = ptr_to_check_fn(rcu_dereference_sched(cfi_shadow), ptr);
+	rcu_read_unlock_sched();
+
+	return fn;
+}
+
+#else /* !CONFIG_CFI_CLANG_SHADOW */
+
+static inline cfi_check_fn find_shadow_check_fn(unsigned long ptr)
+{
+	return NULL;
+}
+
+#endif /* CONFIG_CFI_CLANG_SHADOW */
+
+static inline cfi_check_fn find_module_check_fn(unsigned long ptr)
+{
+	cfi_check_fn fn = NULL;
+	struct module *mod;
+
+	rcu_read_lock_sched();
+	mod = __module_address(ptr);
+	if (mod)
+		fn = mod->cfi_check;
+	rcu_read_unlock_sched();
+
+	return fn;
+}
+
+static inline cfi_check_fn find_check_fn(unsigned long ptr)
+{
+	cfi_check_fn fn = NULL;
+
+	if (is_kernel_text(ptr))
+		return __cfi_check;
+
+	/*
+	 * Indirect call checks can happen when RCU is not watching. Both
+	 * the shadow and __module_address use RCU, so we need to wake it
+	 * up if necessary.
+	 */
+	RCU_NONIDLE({
+		if (IS_ENABLED(CONFIG_CFI_CLANG_SHADOW))
+			fn = find_shadow_check_fn(ptr);
+
+		if (!fn)
+			fn = find_module_check_fn(ptr);
+	});
+
+	return fn;
+}
+
+void __cfi_slowpath_diag(uint64_t id, void *ptr, void *diag)
+{
+	cfi_check_fn fn = find_check_fn((unsigned long)ptr);
+
+	if (likely(fn))
+		fn(id, ptr, diag);
+	else /* Don't allow unchecked modules */
+		handle_cfi_failure(ptr);
+}
+EXPORT_SYMBOL(__cfi_slowpath_diag);
+
+#else /* !CONFIG_MODULES */
+
+void __cfi_slowpath_diag(uint64_t id, void *ptr, void *diag)
+{
+	handle_cfi_failure(ptr); /* No modules */
+}
+EXPORT_SYMBOL(__cfi_slowpath_diag);
+
+#endif /* CONFIG_MODULES */
+
+void cfi_failure_handler(void *data, void *ptr, void *vtable)
+{
+	handle_cfi_failure(ptr);
+}
+EXPORT_SYMBOL(cfi_failure_handler);
diff --git a/kernel/module.c b/kernel/module.c
index 30479355ab85..20fb004e7d8d 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2146,6 +2146,8 @@ void __weak module_arch_freeing_init(struct module *mod)
 {
 }
 
+static void cfi_cleanup(struct module *mod);
+
 /* Free a module, remove from lists, etc. */
 static void free_module(struct module *mod)
 {
@@ -2187,6 +2189,9 @@ static void free_module(struct module *mod)
 	synchronize_rcu();
 	mutex_unlock(&module_mutex);
 
+	/* Clean up CFI for the module. */
+	cfi_cleanup(mod);
+
 	/* This may be empty, but that's OK */
 	module_arch_freeing_init(mod);
 	module_memfree(mod->init_layout.base);
@@ -3866,6 +3871,8 @@ static int unknown_module_param_cb(char *param, char *val, const char *modname,
 	return 0;
 }
 
+static void cfi_init(struct module *mod);
+
 /*
  * Allocate and load the module: note that size of section 0 is always
  * zero, and we rely on this for optional sections.
@@ -3997,6 +4004,9 @@ static int load_module(struct load_info *info, const char __user *uargs,
 
 	flush_module_icache(mod);
 
+	/* Setup CFI for the module. */
+	cfi_init(mod);
+
 	/* Now copy in args */
 	mod->args = strndup_user(uargs, ~0UL >> 1);
 	if (IS_ERR(mod->args)) {
@@ -4070,6 +4080,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
 	synchronize_rcu();
 	kfree(mod->args);
  free_arch_cleanup:
+	cfi_cleanup(mod);
 	module_arch_cleanup(mod);
  free_modinfo:
 	free_modinfo(mod);
@@ -4415,6 +4426,38 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
 #endif /* CONFIG_LIVEPATCH */
 #endif /* CONFIG_KALLSYMS */
 
+static void cfi_init(struct module *mod)
+{
+#ifdef CONFIG_CFI_CLANG
+	initcall_t *init;
+	exitcall_t *exit;
+
+	rcu_read_lock_sched();
+	mod->cfi_check = (cfi_check_fn)
+		find_kallsyms_symbol_value(mod, "__cfi_check");
+	init = (initcall_t *)
+		find_kallsyms_symbol_value(mod, "__cfi_jt_init_module");
+	exit = (exitcall_t *)
+		find_kallsyms_symbol_value(mod, "__cfi_jt_cleanup_module");
+	rcu_read_unlock_sched();
+
+	/* Fix init/exit functions to point to the CFI jump table */
+	if (init)
+		mod->init = *init;
+	if (exit)
+		mod->exit = *exit;
+
+	cfi_module_add(mod, module_addr_min);
+#endif
+}
+
+static void cfi_cleanup(struct module *mod)
+{
+#ifdef CONFIG_CFI_CLANG
+	cfi_module_remove(mod, module_addr_min);
+#endif
+}
+
 /* Maximum number of characters written by module_flags() */
 #define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4)
 
diff --git a/scripts/Makefile.modfinal b/scripts/Makefile.modfinal
index 735e11e9041b..dd87cea9fba7 100644
--- a/scripts/Makefile.modfinal
+++ b/scripts/Makefile.modfinal
@@ -23,7 +23,7 @@ modname = $(notdir $(@:.mod.o=))
 part-of-module = y
 
 quiet_cmd_cc_o_c = CC [M]  $@
-      cmd_cc_o_c = $(CC) $(c_flags) -c -o $@ $<
+      cmd_cc_o_c = $(CC) $(filter-out $(CC_FLAGS_CFI), $(c_flags)) -c -o $@ $<
 
 %.mod.o: %.mod.c FORCE
 	$(call if_changed_dep,cc_o_c)
-- 
cgit v1.2.3


From ff301ceb5299551c3650d0e07ba879b766da4cc0 Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Thu, 8 Apr 2021 11:28:27 -0700
Subject: cfi: add __cficanonical

With CONFIG_CFI_CLANG, the compiler replaces a function address taken
in C code with the address of a local jump table entry, which passes
runtime indirect call checks. However, the compiler won't replace
addresses taken in assembly code, which will result in a CFI failure
if we later jump to such an address in instrumented C code. The code
generated for the non-canonical jump table looks like this:

  <noncanonical.cfi_jt>: /* In C, &noncanonical points here */
	jmp noncanonical
  ...
  <noncanonical>:        /* function body */
	...

This change adds the __cficanonical attribute, which tells the
compiler to use a canonical jump table for the function instead. This
means the compiler will rename the actual function to <function>.cfi
and point the original symbol to the jump table entry instead:

  <canonical>:           /* jump table entry */
	jmp canonical.cfi
  ...
  <canonical.cfi>:       /* function body */
	...

As a result, an address taken in assembly or in other non-instrumented
code always points to the jump table and can therefore be used for
indirect calls in instrumented code without tripping CFI checks.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>   # pci.h
Tested-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20210408182843.1754385-3-samitolvanen@google.com
---
 include/linux/compiler-clang.h | 1 +
 include/linux/compiler_types.h | 4 ++++
 include/linux/init.h           | 4 ++--
 include/linux/pci.h            | 4 ++--
 4 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h
index 6de9d0c9377e..adbe76b203e2 100644
--- a/include/linux/compiler-clang.h
+++ b/include/linux/compiler-clang.h
@@ -63,3 +63,4 @@
 #endif
 
 #define __nocfi		__attribute__((__no_sanitize__("cfi")))
+#define __cficanonical	__attribute__((__cfi_canonical_jump_table__))
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 796935a37e37..d29bda7f6ebd 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -246,6 +246,10 @@ struct ftrace_likely_data {
 # define __nocfi
 #endif
 
+#ifndef __cficanonical
+# define __cficanonical
+#endif
+
 #ifndef asm_volatile_goto
 #define asm_volatile_goto(x...) asm goto(x)
 #endif
diff --git a/include/linux/init.h b/include/linux/init.h
index b3ea15348fbd..045ad1650ed1 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -220,8 +220,8 @@ extern bool initcall_debug;
 	__initcall_name(initstub, __iid, id)
 
 #define __define_initcall_stub(__stub, fn)			\
-	int __init __stub(void);				\
-	int __init __stub(void)					\
+	int __init __cficanonical __stub(void);			\
+	int __init __cficanonical __stub(void)			\
 	{ 							\
 		return fn();					\
 	}							\
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 86c799c97b77..39684b72db91 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1944,8 +1944,8 @@ enum pci_fixup_pass {
 #ifdef CONFIG_LTO_CLANG
 #define __DECLARE_PCI_FIXUP_SECTION(sec, name, vendor, device, class,	\
 				  class_shift, hook, stub)		\
-	void stub(struct pci_dev *dev);					\
-	void stub(struct pci_dev *dev)					\
+	void __cficanonical stub(struct pci_dev *dev);			\
+	void __cficanonical stub(struct pci_dev *dev)			\
 	{ 								\
 		hook(dev); 						\
 	}								\
-- 
cgit v1.2.3


From 5caf968262df0ec7a3377fb67d4a6bfa979cb028 Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Thu, 8 Apr 2021 11:28:28 -0700
Subject: mm: add generic function_nocfi macro

With CONFIG_CFI_CLANG, the compiler replaces function addresses
in instrumented C code with jump table addresses. This means that
__pa_symbol(function) returns the physical address of the jump table
entry instead of the actual function, which may not work as the jump
table code will immediately jump to a virtual address that may not be
mapped.

To avoid this address space confusion, this change adds a generic
definition for function_nocfi(), which architectures that support CFI
can override. The typical implementation would use inline assembly
to take the function address, which avoids compiler instrumentation.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20210408182843.1754385-4-samitolvanen@google.com
---
 include/linux/mm.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8ba434287387..22cce9c7dd05 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -124,6 +124,16 @@ extern int mmap_rnd_compat_bits __read_mostly;
 #define lm_alias(x)	__va(__pa_symbol(x))
 #endif
 
+/*
+ * With CONFIG_CFI_CLANG, the compiler replaces function addresses in
+ * instrumented C code with jump table addresses. Architectures that
+ * support CFI can define this macro to return the actual function address
+ * when needed.
+ */
+#ifndef function_nocfi
+#define function_nocfi(x) (x)
+#endif
+
 /*
  * To prevent common memory management code establishing
  * a zero page mapping on a read fault.
-- 
cgit v1.2.3


From 9f5b4009980f369acb80b72235b2d66c3fd6eca6 Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Thu, 8 Apr 2021 11:28:33 -0700
Subject: bpf: disable CFI in dispatcher functions

BPF dispatcher functions are patched at runtime to perform direct
instead of indirect calls. Disable CFI for the dispatcher functions to
avoid conflicts.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20210408182843.1754385-9-samitolvanen@google.com
---
 include/linux/bpf.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 3625f019767d..2f46f98479e1 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -650,7 +650,7 @@ struct bpf_dispatcher {
 	struct bpf_ksym ksym;
 };
 
-static __always_inline unsigned int bpf_dispatcher_nop_func(
+static __always_inline __nocfi unsigned int bpf_dispatcher_nop_func(
 	const void *ctx,
 	const struct bpf_insn *insnsi,
 	unsigned int (*bpf_func)(const void *,
@@ -678,7 +678,7 @@ void bpf_trampoline_put(struct bpf_trampoline *tr);
 }
 
 #define DEFINE_BPF_DISPATCHER(name)					\
-	noinline unsigned int bpf_dispatcher_##name##_func(		\
+	noinline __nocfi unsigned int bpf_dispatcher_##name##_func(	\
 		const void *ctx,					\
 		const struct bpf_insn *insnsi,				\
 		unsigned int (*bpf_func)(const void *,			\
-- 
cgit v1.2.3


From 4f0f586bf0c898233d8f316f471a21db2abd522d Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Thu, 8 Apr 2021 11:28:34 -0700
Subject: treewide: Change list_sort to use const pointers

list_sort() internally casts the comparison function passed to it
to a different type with constant struct list_head pointers, and
uses this pointer to call the functions, which trips indirect call
Control-Flow Integrity (CFI) checking.

Instead of removing the consts, this change defines the
list_cmp_func_t type and changes the comparison function types of
all list_sort() callers to use const pointers, thus avoiding type
mismatches.

Suggested-by: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Kees Cook <keescook@chromium.org>
Tested-by: Nick Desaulniers <ndesaulniers@google.com>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20210408182843.1754385-10-samitolvanen@google.com
---
 arch/arm64/kvm/vgic/vgic-its.c                         |  8 ++++----
 arch/arm64/kvm/vgic/vgic.c                             |  3 ++-
 block/blk-mq-sched.c                                   |  3 ++-
 block/blk-mq.c                                         |  3 ++-
 drivers/acpi/nfit/core.c                               |  3 ++-
 drivers/acpi/numa/hmat.c                               |  3 ++-
 drivers/clk/keystone/sci-clk.c                         |  4 ++--
 drivers/gpu/drm/drm_modes.c                            |  3 ++-
 drivers/gpu/drm/i915/gt/intel_engine_user.c            |  3 ++-
 drivers/gpu/drm/i915/gvt/debugfs.c                     |  2 +-
 drivers/gpu/drm/i915/selftests/i915_gem_gtt.c          |  3 ++-
 drivers/gpu/drm/radeon/radeon_cs.c                     |  4 ++--
 drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c |  3 ++-
 drivers/interconnect/qcom/bcm-voter.c                  |  2 +-
 drivers/md/raid5.c                                     |  3 ++-
 drivers/misc/sram.c                                    |  4 ++--
 drivers/nvme/host/core.c                               |  3 ++-
 drivers/pci/controller/cadence/pcie-cadence-host.c     |  3 ++-
 drivers/spi/spi-loopback-test.c                        |  3 ++-
 fs/btrfs/raid56.c                                      |  3 ++-
 fs/btrfs/tree-log.c                                    |  3 ++-
 fs/btrfs/volumes.c                                     |  3 ++-
 fs/ext4/fsmap.c                                        |  4 ++--
 fs/gfs2/glock.c                                        |  3 ++-
 fs/gfs2/log.c                                          |  2 +-
 fs/gfs2/lops.c                                         |  3 ++-
 fs/iomap/buffered-io.c                                 |  3 ++-
 fs/ubifs/gc.c                                          |  7 ++++---
 fs/ubifs/replay.c                                      |  4 ++--
 fs/xfs/scrub/bitmap.c                                  |  4 ++--
 fs/xfs/xfs_bmap_item.c                                 |  4 ++--
 fs/xfs/xfs_buf.c                                       |  6 +++---
 fs/xfs/xfs_extent_busy.c                               |  4 ++--
 fs/xfs/xfs_extent_busy.h                               |  3 ++-
 fs/xfs/xfs_extfree_item.c                              |  4 ++--
 fs/xfs/xfs_refcount_item.c                             |  4 ++--
 fs/xfs/xfs_rmap_item.c                                 |  4 ++--
 include/linux/list_sort.h                              |  7 ++++---
 lib/list_sort.c                                        | 17 ++++++-----------
 lib/test_list_sort.c                                   |  3 ++-
 net/tipc/name_table.c                                  |  4 ++--
 41 files changed, 90 insertions(+), 72 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index 40cbaca81333..b9518f94bd43 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -2190,8 +2190,8 @@ static int vgic_its_restore_ite(struct vgic_its *its, u32 event_id,
 	return offset;
 }
 
-static int vgic_its_ite_cmp(void *priv, struct list_head *a,
-			    struct list_head *b)
+static int vgic_its_ite_cmp(void *priv, const struct list_head *a,
+			    const struct list_head *b)
 {
 	struct its_ite *itea = container_of(a, struct its_ite, ite_list);
 	struct its_ite *iteb = container_of(b, struct its_ite, ite_list);
@@ -2329,8 +2329,8 @@ static int vgic_its_restore_dte(struct vgic_its *its, u32 id,
 	return offset;
 }
 
-static int vgic_its_device_cmp(void *priv, struct list_head *a,
-			       struct list_head *b)
+static int vgic_its_device_cmp(void *priv, const struct list_head *a,
+			       const struct list_head *b)
 {
 	struct its_device *deva = container_of(a, struct its_device, dev_list);
 	struct its_device *devb = container_of(b, struct its_device, dev_list);
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
index 1c597c9885fa..15b666200f0b 100644
--- a/arch/arm64/kvm/vgic/vgic.c
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -255,7 +255,8 @@ static struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq)
  * Return negative if "a" sorts before "b", 0 to preserve order, and positive
  * to sort "b" before "a".
  */
-static int vgic_irq_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int vgic_irq_cmp(void *priv, const struct list_head *a,
+			const struct list_head *b)
 {
 	struct vgic_irq *irqa = container_of(a, struct vgic_irq, ap_list);
 	struct vgic_irq *irqb = container_of(b, struct vgic_irq, ap_list);
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index e1e997af89a0..3ebd6f10f728 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -75,7 +75,8 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
 	blk_mq_run_hw_queue(hctx, true);
 }
 
-static int sched_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int sched_rq_cmp(void *priv, const struct list_head *a,
+			const struct list_head *b)
 {
 	struct request *rqa = container_of(a, struct request, queuelist);
 	struct request *rqb = container_of(b, struct request, queuelist);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index d4d7c1caa439..4e3a70ab5be1 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1895,7 +1895,8 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
 	spin_unlock(&ctx->lock);
 }
 
-static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int plug_rq_cmp(void *priv, const struct list_head *a,
+		       const struct list_head *b)
 {
 	struct request *rqa = container_of(a, struct request, queuelist);
 	struct request *rqb = container_of(b, struct request, queuelist);
diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 8c5dde628405..d15e3ee93b5b 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -1195,7 +1195,8 @@ static int __nfit_mem_init(struct acpi_nfit_desc *acpi_desc,
 	return 0;
 }
 
-static int nfit_mem_cmp(void *priv, struct list_head *_a, struct list_head *_b)
+static int nfit_mem_cmp(void *priv, const struct list_head *_a,
+		const struct list_head *_b)
 {
 	struct nfit_mem *a = container_of(_a, typeof(*a), list);
 	struct nfit_mem *b = container_of(_b, typeof(*b), list);
diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c
index cb73a5d6ea76..137a5dd880c2 100644
--- a/drivers/acpi/numa/hmat.c
+++ b/drivers/acpi/numa/hmat.c
@@ -558,7 +558,8 @@ static bool hmat_update_best(u8 type, u32 value, u32 *best)
 	return updated;
 }
 
-static int initiator_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int initiator_cmp(void *priv, const struct list_head *a,
+			 const struct list_head *b)
 {
 	struct memory_initiator *ia;
 	struct memory_initiator *ib;
diff --git a/drivers/clk/keystone/sci-clk.c b/drivers/clk/keystone/sci-clk.c
index aaf31abe1c8f..7e1b136e71ae 100644
--- a/drivers/clk/keystone/sci-clk.c
+++ b/drivers/clk/keystone/sci-clk.c
@@ -503,8 +503,8 @@ static int ti_sci_scan_clocks_from_fw(struct sci_clk_provider *provider)
 
 #else
 
-static int _cmp_sci_clk_list(void *priv, struct list_head *a,
-			     struct list_head *b)
+static int _cmp_sci_clk_list(void *priv, const struct list_head *a,
+			     const struct list_head *b)
 {
 	struct sci_clk *ca = container_of(a, struct sci_clk, node);
 	struct sci_clk *cb = container_of(b, struct sci_clk, node);
diff --git a/drivers/gpu/drm/drm_modes.c b/drivers/gpu/drm/drm_modes.c
index 1ac67d4505e0..6662d0457ad6 100644
--- a/drivers/gpu/drm/drm_modes.c
+++ b/drivers/gpu/drm/drm_modes.c
@@ -1290,7 +1290,8 @@ EXPORT_SYMBOL(drm_mode_prune_invalid);
  * Negative if @lh_a is better than @lh_b, zero if they're equivalent, or
  * positive if @lh_b is better than @lh_a.
  */
-static int drm_mode_compare(void *priv, struct list_head *lh_a, struct list_head *lh_b)
+static int drm_mode_compare(void *priv, const struct list_head *lh_a,
+			    const struct list_head *lh_b)
 {
 	struct drm_display_mode *a = list_entry(lh_a, struct drm_display_mode, head);
 	struct drm_display_mode *b = list_entry(lh_b, struct drm_display_mode, head);
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_user.c b/drivers/gpu/drm/i915/gt/intel_engine_user.c
index 34e6096f196e..da21d2a10cc9 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_user.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_user.c
@@ -49,7 +49,8 @@ static const u8 uabi_classes[] = {
 	[VIDEO_ENHANCEMENT_CLASS] = I915_ENGINE_CLASS_VIDEO_ENHANCE,
 };
 
-static int engine_cmp(void *priv, struct list_head *A, struct list_head *B)
+static int engine_cmp(void *priv, const struct list_head *A,
+		      const struct list_head *B)
 {
 	const struct intel_engine_cs *a =
 		container_of((struct rb_node *)A, typeof(*a), uabi_node);
diff --git a/drivers/gpu/drm/i915/gvt/debugfs.c b/drivers/gpu/drm/i915/gvt/debugfs.c
index 62e6a14ad58e..9f1c209d9251 100644
--- a/drivers/gpu/drm/i915/gvt/debugfs.c
+++ b/drivers/gpu/drm/i915/gvt/debugfs.c
@@ -41,7 +41,7 @@ struct diff_mmio {
 
 /* Compare two diff_mmio items. */
 static int mmio_offset_compare(void *priv,
-	struct list_head *a, struct list_head *b)
+	const struct list_head *a, const struct list_head *b)
 {
 	struct diff_mmio *ma;
 	struct diff_mmio *mb;
diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
index c1adea8765a9..52b9c39e0155 100644
--- a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
@@ -1076,7 +1076,8 @@ static int igt_ppgtt_shrink_boom(void *arg)
 	return exercise_ppgtt(arg, shrink_boom);
 }
 
-static int sort_holes(void *priv, struct list_head *A, struct list_head *B)
+static int sort_holes(void *priv, const struct list_head *A,
+		      const struct list_head *B)
 {
 	struct drm_mm_node *a = list_entry(A, typeof(*a), hole_stack);
 	struct drm_mm_node *b = list_entry(B, typeof(*b), hole_stack);
diff --git a/drivers/gpu/drm/radeon/radeon_cs.c b/drivers/gpu/drm/radeon/radeon_cs.c
index 35e937d39b51..1a5c3db1d53b 100644
--- a/drivers/gpu/drm/radeon/radeon_cs.c
+++ b/drivers/gpu/drm/radeon/radeon_cs.c
@@ -393,8 +393,8 @@ int radeon_cs_parser_init(struct radeon_cs_parser *p, void *data)
 	return 0;
 }
 
-static int cmp_size_smaller_first(void *priv, struct list_head *a,
-				  struct list_head *b)
+static int cmp_size_smaller_first(void *priv, const struct list_head *a,
+				  const struct list_head *b)
 {
 	struct radeon_bo_list *la = list_entry(a, struct radeon_bo_list, tv.head);
 	struct radeon_bo_list *lb = list_entry(b, struct radeon_bo_list, tv.head);
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c
index d399523206c7..29d71267af78 100644
--- a/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c
+++ b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c
@@ -83,7 +83,8 @@ usnic_uiom_interval_node_alloc(long int start, long int last, int ref_cnt,
 	return interval;
 }
 
-static int interval_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int interval_cmp(void *priv, const struct list_head *a,
+			const struct list_head *b)
 {
 	struct usnic_uiom_interval_node *node_a, *node_b;
 
diff --git a/drivers/interconnect/qcom/bcm-voter.c b/drivers/interconnect/qcom/bcm-voter.c
index 1cc565bce2f4..d1591a28b743 100644
--- a/drivers/interconnect/qcom/bcm-voter.c
+++ b/drivers/interconnect/qcom/bcm-voter.c
@@ -39,7 +39,7 @@ struct bcm_voter {
 	u32 tcs_wait;
 };
 
-static int cmp_vcd(void *priv, struct list_head *a, struct list_head *b)
+static int cmp_vcd(void *priv, const struct list_head *a, const struct list_head *b)
 {
 	const struct qcom_icc_bcm *bcm_a = list_entry(a, struct qcom_icc_bcm, list);
 	const struct qcom_icc_bcm *bcm_b = list_entry(b, struct qcom_icc_bcm, list);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 5d57a5bd171f..841e1c1aa5e6 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -953,7 +953,8 @@ static void dispatch_bio_list(struct bio_list *tmp)
 		submit_bio_noacct(bio);
 }
 
-static int cmp_stripe(void *priv, struct list_head *a, struct list_head *b)
+static int cmp_stripe(void *priv, const struct list_head *a,
+		      const struct list_head *b)
 {
 	const struct r5pending_data *da = list_entry(a,
 				struct r5pending_data, sibling);
diff --git a/drivers/misc/sram.c b/drivers/misc/sram.c
index 6c1a23cb3e8c..202bf951e909 100644
--- a/drivers/misc/sram.c
+++ b/drivers/misc/sram.c
@@ -144,8 +144,8 @@ static void sram_free_partitions(struct sram_dev *sram)
 	}
 }
 
-static int sram_reserve_cmp(void *priv, struct list_head *a,
-					struct list_head *b)
+static int sram_reserve_cmp(void *priv, const struct list_head *a,
+					const struct list_head *b)
 {
 	struct sram_reserve *ra = list_entry(a, struct sram_reserve, list);
 	struct sram_reserve *rb = list_entry(b, struct sram_reserve, list);
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 0896e21642be..5eaaa51a5e30 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3855,7 +3855,8 @@ out_unlock:
 	return ret;
 }
 
-static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int ns_cmp(void *priv, const struct list_head *a,
+		const struct list_head *b)
 {
 	struct nvme_ns *nsa = container_of(a, struct nvme_ns, list);
 	struct nvme_ns *nsb = container_of(b, struct nvme_ns, list);
diff --git a/drivers/pci/controller/cadence/pcie-cadence-host.c b/drivers/pci/controller/cadence/pcie-cadence-host.c
index 73dcf8cf98fb..ae1c55503513 100644
--- a/drivers/pci/controller/cadence/pcie-cadence-host.c
+++ b/drivers/pci/controller/cadence/pcie-cadence-host.c
@@ -345,7 +345,8 @@ static int cdns_pcie_host_bar_config(struct cdns_pcie_rc *rc,
 	return 0;
 }
 
-static int cdns_pcie_host_dma_ranges_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int cdns_pcie_host_dma_ranges_cmp(void *priv, const struct list_head *a,
+					 const struct list_head *b)
 {
 	struct resource_entry *entry1, *entry2;
 
diff --git a/drivers/spi/spi-loopback-test.c b/drivers/spi/spi-loopback-test.c
index df981e55c24c..f1cf2232f0b5 100644
--- a/drivers/spi/spi-loopback-test.c
+++ b/drivers/spi/spi-loopback-test.c
@@ -454,7 +454,8 @@ struct rx_ranges {
 	u8 *end;
 };
 
-static int rx_ranges_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int rx_ranges_cmp(void *priv, const struct list_head *a,
+			 const struct list_head *b)
 {
 	struct rx_ranges *rx_a = list_entry(a, struct rx_ranges, list);
 	struct rx_ranges *rx_b = list_entry(b, struct rx_ranges, list);
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 8c31357f08ed..f4139de63b2e 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1634,7 +1634,8 @@ struct btrfs_plug_cb {
 /*
  * rbios on the plug list are sorted for easier merging.
  */
-static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int plug_cmp(void *priv, const struct list_head *a,
+		    const struct list_head *b)
 {
 	struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
 						 plug_list);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 92a368627791..00a88bd8105e 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4136,7 +4136,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int extent_cmp(void *priv, const struct list_head *a,
+		      const struct list_head *b)
 {
 	struct extent_map *em1, *em2;
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 1c6810bbaf8b..912dd8b9f156 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1224,7 +1224,8 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
 	return 0;
 }
 
-static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int devid_cmp(void *priv, const struct list_head *a,
+		     const struct list_head *b)
 {
 	struct btrfs_device *dev1, *dev2;
 
diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c
index 4c2a9fe30067..4493ef0c715e 100644
--- a/fs/ext4/fsmap.c
+++ b/fs/ext4/fsmap.c
@@ -354,8 +354,8 @@ static unsigned int ext4_getfsmap_find_sb(struct super_block *sb,
 
 /* Compare two fsmap items. */
 static int ext4_getfsmap_compare(void *priv,
-				 struct list_head *a,
-				 struct list_head *b)
+				 const struct list_head *a,
+				 const struct list_head *b)
 {
 	struct ext4_fsmap *fa;
 	struct ext4_fsmap *fb;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 9567520d79f7..c06a6cdf05de 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1732,7 +1732,8 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
 	spin_unlock(&gl->gl_lockref.lock);
 }
 
-static int glock_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int glock_cmp(void *priv, const struct list_head *a,
+		     const struct list_head *b)
 {
 	struct gfs2_glock *gla, *glb;
 
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 6410281546f9..88649b43fcff 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -695,7 +695,7 @@ void log_flush_wait(struct gfs2_sbd *sdp)
 	}
 }
 
-static int ip_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int ip_cmp(void *priv, const struct list_head *a, const struct list_head *b)
 {
 	struct gfs2_inode *ipa, *ipb;
 
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index a82f4747aa8d..b4809967efc6 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -634,7 +634,8 @@ static void gfs2_check_magic(struct buffer_head *bh)
 	kunmap_atomic(kaddr);
 }
 
-static int blocknr_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int blocknr_cmp(void *priv, const struct list_head *a,
+		       const struct list_head *b)
 {
 	struct gfs2_bufdata *bda, *bdb;
 
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 414769a6ad11..0129e6bab985 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1155,7 +1155,8 @@ iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends,
 EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);
 
 static int
-iomap_ioend_compare(void *priv, struct list_head *a, struct list_head *b)
+iomap_ioend_compare(void *priv, const struct list_head *a,
+		const struct list_head *b)
 {
 	struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list);
 	struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list);
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index a4aaeea63893..dc3e26e9ed7b 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -102,7 +102,8 @@ static int switch_gc_head(struct ubifs_info *c)
  * This function compares data nodes @a and @b. Returns %1 if @a has greater
  * inode or block number, and %-1 otherwise.
  */
-static int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int data_nodes_cmp(void *priv, const struct list_head *a,
+			  const struct list_head *b)
 {
 	ino_t inuma, inumb;
 	struct ubifs_info *c = priv;
@@ -145,8 +146,8 @@ static int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
  * first and sorted by length in descending order. Directory entry nodes go
  * after inode nodes and are sorted in ascending hash valuer order.
  */
-static int nondata_nodes_cmp(void *priv, struct list_head *a,
-			     struct list_head *b)
+static int nondata_nodes_cmp(void *priv, const struct list_head *a,
+			     const struct list_head *b)
 {
 	ino_t inuma, inumb;
 	struct ubifs_info *c = priv;
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 0f8a6a16421b..4d17e5382b74 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -298,8 +298,8 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
  * entries @a and @b by comparing their sequence numer.  Returns %1 if @a has
  * greater sequence number and %-1 otherwise.
  */
-static int replay_entries_cmp(void *priv, struct list_head *a,
-			      struct list_head *b)
+static int replay_entries_cmp(void *priv, const struct list_head *a,
+			      const struct list_head *b)
 {
 	struct ubifs_info *c = priv;
 	struct replay_entry *ra, *rb;
diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c
index f88694f22d05..813b5f219113 100644
--- a/fs/xfs/scrub/bitmap.c
+++ b/fs/xfs/scrub/bitmap.c
@@ -63,8 +63,8 @@ xbitmap_init(
 static int
 xbitmap_range_cmp(
 	void			*priv,
-	struct list_head	*a,
-	struct list_head	*b)
+	const struct list_head	*a,
+	const struct list_head	*b)
 {
 	struct xbitmap_range	*ap;
 	struct xbitmap_range	*bp;
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 2344757ede63..e3a691937e92 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -265,8 +265,8 @@ xfs_trans_log_finish_bmap_update(
 static int
 xfs_bmap_update_diff_items(
 	void				*priv,
-	struct list_head		*a,
-	struct list_head		*b)
+	const struct list_head		*a,
+	const struct list_head		*b)
 {
 	struct xfs_bmap_intent		*ba;
 	struct xfs_bmap_intent		*bb;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 37a1d12762d8..592800c8852f 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -2124,9 +2124,9 @@ xfs_buf_delwri_queue(
  */
 static int
 xfs_buf_cmp(
-	void		*priv,
-	struct list_head *a,
-	struct list_head *b)
+	void			*priv,
+	const struct list_head	*a,
+	const struct list_head	*b)
 {
 	struct xfs_buf	*ap = container_of(a, struct xfs_buf, b_list);
 	struct xfs_buf	*bp = container_of(b, struct xfs_buf, b_list);
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index ef17c1f6db32..a4075685d9eb 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -629,8 +629,8 @@ xfs_extent_busy_wait_all(
 int
 xfs_extent_busy_ag_cmp(
 	void			*priv,
-	struct list_head	*l1,
-	struct list_head	*l2)
+	const struct list_head	*l1,
+	const struct list_head	*l2)
 {
 	struct xfs_extent_busy	*b1 =
 		container_of(l1, struct xfs_extent_busy, list);
diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
index 990ab3891971..8aea07100092 100644
--- a/fs/xfs/xfs_extent_busy.h
+++ b/fs/xfs/xfs_extent_busy.h
@@ -58,7 +58,8 @@ void
 xfs_extent_busy_wait_all(struct xfs_mount *mp);
 
 int
-xfs_extent_busy_ag_cmp(void *priv, struct list_head *a, struct list_head *b);
+xfs_extent_busy_ag_cmp(void *priv, const struct list_head *a,
+	const struct list_head *b);
 
 static inline void xfs_extent_busy_sort(struct list_head *list)
 {
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 93223ebb3372..2424230ca2c3 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -397,8 +397,8 @@ xfs_trans_free_extent(
 static int
 xfs_extent_free_diff_items(
 	void				*priv,
-	struct list_head		*a,
-	struct list_head		*b)
+	const struct list_head		*a,
+	const struct list_head		*b)
 {
 	struct xfs_mount		*mp = priv;
 	struct xfs_extent_free_item	*ra;
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 07ebccbbf4df..746f4eda724c 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -269,8 +269,8 @@ xfs_trans_log_finish_refcount_update(
 static int
 xfs_refcount_update_diff_items(
 	void				*priv,
-	struct list_head		*a,
-	struct list_head		*b)
+	const struct list_head		*a,
+	const struct list_head		*b)
 {
 	struct xfs_mount		*mp = priv;
 	struct xfs_refcount_intent	*ra;
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 49cebd68b672..dc4f0c9f0897 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -337,8 +337,8 @@ xfs_trans_log_finish_rmap_update(
 static int
 xfs_rmap_update_diff_items(
 	void				*priv,
-	struct list_head		*a,
-	struct list_head		*b)
+	const struct list_head		*a,
+	const struct list_head		*b)
 {
 	struct xfs_mount		*mp = priv;
 	struct xfs_rmap_intent		*ra;
diff --git a/include/linux/list_sort.h b/include/linux/list_sort.h
index 20f178c24e9d..453105f74e05 100644
--- a/include/linux/list_sort.h
+++ b/include/linux/list_sort.h
@@ -6,8 +6,9 @@
 
 struct list_head;
 
+typedef int __attribute__((nonnull(2,3))) (*list_cmp_func_t)(void *,
+		const struct list_head *, const struct list_head *);
+
 __attribute__((nonnull(2,3)))
-void list_sort(void *priv, struct list_head *head,
-	       int (*cmp)(void *priv, struct list_head *a,
-			  struct list_head *b));
+void list_sort(void *priv, struct list_head *head, list_cmp_func_t cmp);
 #endif
diff --git a/lib/list_sort.c b/lib/list_sort.c
index 52f0c258c895..a926d96ffd44 100644
--- a/lib/list_sort.c
+++ b/lib/list_sort.c
@@ -7,16 +7,13 @@
 #include <linux/list_sort.h>
 #include <linux/list.h>
 
-typedef int __attribute__((nonnull(2,3))) (*cmp_func)(void *,
-		struct list_head const *, struct list_head const *);
-
 /*
  * Returns a list organized in an intermediate format suited
  * to chaining of merge() calls: null-terminated, no reserved or
  * sentinel head node, "prev" links not maintained.
  */
 __attribute__((nonnull(2,3,4)))
-static struct list_head *merge(void *priv, cmp_func cmp,
+static struct list_head *merge(void *priv, list_cmp_func_t cmp,
 				struct list_head *a, struct list_head *b)
 {
 	struct list_head *head, **tail = &head;
@@ -52,7 +49,7 @@ static struct list_head *merge(void *priv, cmp_func cmp,
  * throughout.
  */
 __attribute__((nonnull(2,3,4,5)))
-static void merge_final(void *priv, cmp_func cmp, struct list_head *head,
+static void merge_final(void *priv, list_cmp_func_t cmp, struct list_head *head,
 			struct list_head *a, struct list_head *b)
 {
 	struct list_head *tail = head;
@@ -185,9 +182,7 @@ static void merge_final(void *priv, cmp_func cmp, struct list_head *head,
  * 2^(k+1) - 1 (second merge of case 5 when x == 2^(k-1) - 1).
  */
 __attribute__((nonnull(2,3)))
-void list_sort(void *priv, struct list_head *head,
-		int (*cmp)(void *priv, struct list_head *a,
-			struct list_head *b))
+void list_sort(void *priv, struct list_head *head, list_cmp_func_t cmp)
 {
 	struct list_head *list = head->next, *pending = NULL;
 	size_t count = 0;	/* Count of pending */
@@ -227,7 +222,7 @@ void list_sort(void *priv, struct list_head *head,
 		if (likely(bits)) {
 			struct list_head *a = *tail, *b = a->prev;
 
-			a = merge(priv, (cmp_func)cmp, b, a);
+			a = merge(priv, cmp, b, a);
 			/* Install the merged result in place of the inputs */
 			a->prev = b->prev;
 			*tail = a;
@@ -249,10 +244,10 @@ void list_sort(void *priv, struct list_head *head,
 
 		if (!next)
 			break;
-		list = merge(priv, (cmp_func)cmp, pending, list);
+		list = merge(priv, cmp, pending, list);
 		pending = next;
 	}
 	/* The final merge, rebuilding prev links */
-	merge_final(priv, (cmp_func)cmp, head, pending, list);
+	merge_final(priv, cmp, head, pending, list);
 }
 EXPORT_SYMBOL(list_sort);
diff --git a/lib/test_list_sort.c b/lib/test_list_sort.c
index 1f017d3b610e..00daaf23316f 100644
--- a/lib/test_list_sort.c
+++ b/lib/test_list_sort.c
@@ -56,7 +56,8 @@ static int __init check(struct debug_el *ela, struct debug_el *elb)
 	return 0;
 }
 
-static int __init cmp(void *priv, struct list_head *a, struct list_head *b)
+static int __init cmp(void *priv, const struct list_head *a,
+		      const struct list_head *b)
 {
 	struct debug_el *ela, *elb;
 
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index ee5ac40ea2b6..f8141443f2e2 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -397,8 +397,8 @@ static struct publication *tipc_service_remove_publ(struct service_range *sr,
  * Code reused: time_after32() for the same purpose
  */
 #define publication_after(pa, pb) time_after32((pa)->id, (pb)->id)
-static int tipc_publ_sort(void *priv, struct list_head *a,
-			  struct list_head *b)
+static int tipc_publ_sort(void *priv, const struct list_head *a,
+			  const struct list_head *b)
 {
 	struct publication *pa, *pb;
 
-- 
cgit v1.2.3


From 2361db89aaadfb671db6911b0063e01ec8922c28 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 9 Mar 2021 17:43:38 -0800
Subject: libnvdimm: Notify disk drivers to revalidate region read-only

Previous kernels allowed the BLKROSET to override the disk's read-only
status. With that situation fixed the pmem driver needs to rely on
notification events to reevaluate the disk read-only status after the
host region has been marked read-write.

Recall that when libnvdimm determines that the persistent memory has
lost persistence (for example lack of energy to flush from DRAM to FLASH
on an NVDIMM-N device) it marks the region read-only, but that state can
be overridden by the user via:

   echo 0 > /sys/bus/nd/devices/regionX/read_only

...to date there is no notification that the region has restored
persistence, so the user override is the only recovery.

Fixes: 52f019d43c22 ("block: add a hard-readonly flag to struct gendisk")
Reported-by: kernel test robot <lkp@intel.com>
Reported-by: Vishal Verma <vishal.l.verma@intel.com>
Tested-by: Vishal Verma <vishal.l.verma@intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Ming Lei <ming.lei@redhat.com>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: Hannes Reinecke <hare@suse.de>
Cc: Jens Axboe <axboe@kernel.dk>
Link: https://lore.kernel.org/r/161534060720.528671.2341213328968989192.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/bus.c         | 14 ++++++--------
 drivers/nvdimm/pmem.c        | 37 +++++++++++++++++++++++++++++++++----
 drivers/nvdimm/region_devs.c |  7 +++++++
 include/linux/nd.h           |  1 +
 4 files changed, 47 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 48f0985ca8a0..3a777d0073b7 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -631,16 +631,14 @@ void nvdimm_check_and_set_ro(struct gendisk *disk)
 	struct nd_region *nd_region = to_nd_region(dev->parent);
 	int disk_ro = get_disk_ro(disk);
 
-	/*
-	 * Upgrade to read-only if the region is read-only preserve as
-	 * read-only if the disk is already read-only.
-	 */
-	if (disk_ro || nd_region->ro == disk_ro)
+	/* catch the disk up with the region ro state */
+	if (disk_ro == nd_region->ro)
 		return;
 
-	dev_info(dev, "%s read-only, marking %s read-only\n",
-			dev_name(&nd_region->dev), disk->disk_name);
-	set_disk_ro(disk, 1);
+	dev_info(dev, "%s read-%s, marking %s read-%s\n",
+		 dev_name(&nd_region->dev), nd_region->ro ? "only" : "write",
+		 disk->disk_name, nd_region->ro ? "only" : "write");
+	set_disk_ro(disk, nd_region->ro);
 }
 EXPORT_SYMBOL(nvdimm_check_and_set_ro);
 
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index b8a85bfb2e95..7daac795db39 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -26,6 +26,7 @@
 #include <linux/mm.h>
 #include <asm/cacheflush.h>
 #include "pmem.h"
+#include "btt.h"
 #include "pfn.h"
 #include "nd.h"
 
@@ -585,7 +586,7 @@ static void nd_pmem_shutdown(struct device *dev)
 	nvdimm_flush(to_nd_region(dev->parent), NULL);
 }
 
-static void nd_pmem_notify(struct device *dev, enum nvdimm_event event)
+static void pmem_revalidate_poison(struct device *dev)
 {
 	struct nd_region *nd_region;
 	resource_size_t offset = 0, end_trunc = 0;
@@ -595,9 +596,6 @@ static void nd_pmem_notify(struct device *dev, enum nvdimm_event event)
 	struct range range;
 	struct kernfs_node *bb_state;
 
-	if (event != NVDIMM_REVALIDATE_POISON)
-		return;
-
 	if (is_nd_btt(dev)) {
 		struct nd_btt *nd_btt = to_nd_btt(dev);
 
@@ -635,6 +633,37 @@ static void nd_pmem_notify(struct device *dev, enum nvdimm_event event)
 		sysfs_notify_dirent(bb_state);
 }
 
+static void pmem_revalidate_region(struct device *dev)
+{
+	struct pmem_device *pmem;
+
+	if (is_nd_btt(dev)) {
+		struct nd_btt *nd_btt = to_nd_btt(dev);
+		struct btt *btt = nd_btt->btt;
+
+		nvdimm_check_and_set_ro(btt->btt_disk);
+		return;
+	}
+
+	pmem = dev_get_drvdata(dev);
+	nvdimm_check_and_set_ro(pmem->disk);
+}
+
+static void nd_pmem_notify(struct device *dev, enum nvdimm_event event)
+{
+	switch (event) {
+	case NVDIMM_REVALIDATE_POISON:
+		pmem_revalidate_poison(dev);
+		break;
+	case NVDIMM_REVALIDATE_REGION:
+		pmem_revalidate_region(dev);
+		break;
+	default:
+		dev_WARN_ONCE(dev, 1, "notify: unknown event: %d\n", event);
+		break;
+	}
+}
+
 MODULE_ALIAS("pmem");
 MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_IO);
 MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_PMEM);
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index ef23119db574..51870eb51da6 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -518,6 +518,12 @@ static ssize_t read_only_show(struct device *dev,
 	return sprintf(buf, "%d\n", nd_region->ro);
 }
 
+static int revalidate_read_only(struct device *dev, void *data)
+{
+	nd_device_notify(dev, NVDIMM_REVALIDATE_REGION);
+	return 0;
+}
+
 static ssize_t read_only_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t len)
 {
@@ -529,6 +535,7 @@ static ssize_t read_only_store(struct device *dev,
 		return rc;
 
 	nd_region->ro = ro;
+	device_for_each_child(dev, NULL, revalidate_read_only);
 	return len;
 }
 static DEVICE_ATTR_RW(read_only);
diff --git a/include/linux/nd.h b/include/linux/nd.h
index cec526c8043d..ee9ad76afbba 100644
--- a/include/linux/nd.h
+++ b/include/linux/nd.h
@@ -11,6 +11,7 @@
 
 enum nvdimm_event {
 	NVDIMM_REVALIDATE_POISON,
+	NVDIMM_REVALIDATE_REGION,
 };
 
 enum nvdimm_claim_class {
-- 
cgit v1.2.3


From 9432bbd969c667fc9c4b1c140c5a745ff2a7b540 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 23 Mar 2021 16:49:03 +0100
Subject: static_call: Relax static_call_update() function argument type

static_call_update() had stronger type requirements than regular C;
relax them to match. Instead of requiring that the @func argument has the
exact matching type, allow any type which C is willing to promote to the
right (function) pointer type. Specifically this allows (void *)
arguments.

This cleans up a bunch of static_call_update() callers for
PREEMPT_DYNAMIC and should get around silly GCC11 warnings for free.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/YFoN7nCl8OfGtpeh@hirez.programming.kicks-ass.net
---
 include/linux/static_call.h |  4 ++--
 kernel/sched/core.c         | 18 +++++++++---------
 2 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/static_call.h b/include/linux/static_call.h
index 85ecc789f4ff..8d50f62420ca 100644
--- a/include/linux/static_call.h
+++ b/include/linux/static_call.h
@@ -113,9 +113,9 @@ extern void arch_static_call_transform(void *site, void *tramp, void *func, bool
 
 #define static_call_update(name, func)					\
 ({									\
-	BUILD_BUG_ON(!__same_type(*(func), STATIC_CALL_TRAMP(name)));	\
+	typeof(&STATIC_CALL_TRAMP(name)) __F = (func);			\
 	__static_call_update(&STATIC_CALL_KEY(name),			\
-			     STATIC_CALL_TRAMP_ADDR(name), func);	\
+			     STATIC_CALL_TRAMP_ADDR(name), __F);	\
 })
 
 #ifdef CONFIG_HAVE_STATIC_CALL_INLINE
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 98191218d891..67f989001894 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5396,25 +5396,25 @@ static void sched_dynamic_update(int mode)
 	switch (mode) {
 	case preempt_dynamic_none:
 		static_call_update(cond_resched, __cond_resched);
-		static_call_update(might_resched, (typeof(&__cond_resched)) __static_call_return0);
-		static_call_update(preempt_schedule, (typeof(&preempt_schedule)) NULL);
-		static_call_update(preempt_schedule_notrace, (typeof(&preempt_schedule_notrace)) NULL);
-		static_call_update(irqentry_exit_cond_resched, (typeof(&irqentry_exit_cond_resched)) NULL);
+		static_call_update(might_resched, (void *)&__static_call_return0);
+		static_call_update(preempt_schedule, NULL);
+		static_call_update(preempt_schedule_notrace, NULL);
+		static_call_update(irqentry_exit_cond_resched, NULL);
 		pr_info("Dynamic Preempt: none\n");
 		break;
 
 	case preempt_dynamic_voluntary:
 		static_call_update(cond_resched, __cond_resched);
 		static_call_update(might_resched, __cond_resched);
-		static_call_update(preempt_schedule, (typeof(&preempt_schedule)) NULL);
-		static_call_update(preempt_schedule_notrace, (typeof(&preempt_schedule_notrace)) NULL);
-		static_call_update(irqentry_exit_cond_resched, (typeof(&irqentry_exit_cond_resched)) NULL);
+		static_call_update(preempt_schedule, NULL);
+		static_call_update(preempt_schedule_notrace, NULL);
+		static_call_update(irqentry_exit_cond_resched, NULL);
 		pr_info("Dynamic Preempt: voluntary\n");
 		break;
 
 	case preempt_dynamic_full:
-		static_call_update(cond_resched, (typeof(&__cond_resched)) __static_call_return0);
-		static_call_update(might_resched, (typeof(&__cond_resched)) __static_call_return0);
+		static_call_update(cond_resched, (void *)&__static_call_return0);
+		static_call_update(might_resched, (void *)&__static_call_return0);
 		static_call_update(preempt_schedule, __preempt_schedule_func);
 		static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func);
 		static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
-- 
cgit v1.2.3


From 1a7a6e8072ea0e4582de2da63a9088841fde798e Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Tue, 6 Apr 2021 09:30:36 +0200
Subject: pwm: Clarify which state pwm_get_state() returns
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Given that lowlevel drivers usually cannot implement exactly what a
consumer requests with pwm_apply_state() there is some rounding
involved.

pwm_get_state() returns the setting that was requested most recently by
the consumer (as opposed to what was actually implemented in hardware in
reply to the last request). Clarify this in the function kerneldoc.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 Documentation/driver-api/pwm.rst | 6 +++++-
 include/linux/pwm.h              | 5 +++++
 2 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/Documentation/driver-api/pwm.rst b/Documentation/driver-api/pwm.rst
index ab62f1bb0366..381f3c46cdac 100644
--- a/Documentation/driver-api/pwm.rst
+++ b/Documentation/driver-api/pwm.rst
@@ -55,7 +55,11 @@ several parameter at once. For example, if you see pwm_config() and
 pwm_{enable,disable}() calls in the same function, this probably means you
 should switch to pwm_apply_state().
 
-The PWM user API also allows one to query the PWM state with pwm_get_state().
+The PWM user API also allows one to query the last applied PWM state with
+pwm_get_state(). Note this is different to what the driver has
+actually implemented if the request cannot be implemented exactly with the
+hardware in use. There is currently no way for consumers to get the actually
+implemented settings.
 
 In addition to the PWM state, the PWM API also exposes PWM arguments, which
 are the reference PWM config one should use on this PWM.
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 8f4eefd129aa..5bb90af4997e 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -91,6 +91,11 @@ struct pwm_device {
  * pwm_get_state() - retrieve the current PWM state
  * @pwm: PWM device
  * @state: state to fill with the current PWM state
+ *
+ * The returned PWM state represents the state that was applied by a previous call to
+ * pwm_apply_state(). Drivers may have to slightly tweak that state before programming it to
+ * hardware. If pwm_apply_state() was never called, this returns either the current hardware
+ * state (if supported) or the default settings.
  */
 static inline void pwm_get_state(const struct pwm_device *pwm,
 				 struct pwm_state *state)
-- 
cgit v1.2.3


From a503d1628c9c341dc5e1a26272e38182dca3e823 Mon Sep 17 00:00:00 2001
From: Jarvis Jiang <jarvis.w.jiang@gmail.com>
Date: Thu, 8 Apr 2021 03:02:20 -0700
Subject: bus: mhi: fix typo in comments for struct mhi_channel_config

The word 'rung' is a typo in below comment, fix it.
* @event_ring: The event rung index that services this channel

Signed-off-by: Jarvis Jiang <jarvis.w.jiang@gmail.com>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Link: https://lore.kernel.org/r/20210408100220.3853-1-jarvis.w.jiang@gmail.com
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
---
 include/linux/mhi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mhi.h b/include/linux/mhi.h
index d095fba37d1e..944aa3aa3035 100644
--- a/include/linux/mhi.h
+++ b/include/linux/mhi.h
@@ -205,7 +205,7 @@ enum mhi_db_brst_mode {
  * @num: The number assigned to this channel
  * @num_elements: The number of elements that can be queued to this channel
  * @local_elements: The local ring length of the channel
- * @event_ring: The event rung index that services this channel
+ * @event_ring: The event ring index that services this channel
  * @dir: Direction that data may flow on this channel
  * @type: Channel type
  * @ee_mask: Execution Environment mask for this channel
-- 
cgit v1.2.3


From ae196ddb0d3186bc08e529b8ea4bf62161ddfce2 Mon Sep 17 00:00:00 2001
From: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Date: Wed, 7 Apr 2021 09:55:52 +0300
Subject: usb: typec: Port mapping utility

Adding functions that can be used to link/unlink ports -
USB ports, TBT3/USB4 ports, DisplayPorts and so on - to
the USB Type-C connectors they are attached to inside a
system. The symlink that is created for the port device is
named "connector".

Initially only ACPI is supported. ACPI port object shares
the _PLD (Physical Location of Device) with the USB Type-C
connector that it's attached to.

Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Link: https://lore.kernel.org/r/20210407065555.88110-2-heikki.krogerus@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/typec/Makefile      |   2 +-
 drivers/usb/typec/class.c       |   7 +-
 drivers/usb/typec/class.h       |   9 ++
 drivers/usb/typec/port-mapper.c | 219 ++++++++++++++++++++++++++++++++++++++++
 include/linux/usb/typec.h       |  13 +++
 5 files changed, 248 insertions(+), 2 deletions(-)
 create mode 100644 drivers/usb/typec/port-mapper.c

(limited to 'include/linux')

diff --git a/drivers/usb/typec/Makefile b/drivers/usb/typec/Makefile
index 1fb8b6668b1b..a0adb8947a30 100644
--- a/drivers/usb/typec/Makefile
+++ b/drivers/usb/typec/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_TYPEC)		+= typec.o
-typec-y				:= class.o mux.o bus.o
+typec-y				:= class.o mux.o bus.o port-mapper.o
 obj-$(CONFIG_TYPEC)		+= altmodes/
 obj-$(CONFIG_TYPEC_TCPM)	+= tcpm/
 obj-$(CONFIG_TYPEC_UCSI)	+= ucsi/
diff --git a/drivers/usb/typec/class.c b/drivers/usb/typec/class.c
index d3e100238635..ff199e2d26c7 100644
--- a/drivers/usb/typec/class.c
+++ b/drivers/usb/typec/class.c
@@ -18,7 +18,7 @@
 
 static DEFINE_IDA(typec_index_ida);
 
-static struct class typec_class = {
+struct class typec_class = {
 	.name = "typec",
 	.owner = THIS_MODULE,
 };
@@ -1601,6 +1601,7 @@ static void typec_release(struct device *dev)
 	ida_destroy(&port->mode_ids);
 	typec_switch_put(port->sw);
 	typec_mux_put(port->mux);
+	free_pld(port->pld);
 	kfree(port->cap);
 	kfree(port);
 }
@@ -1983,6 +1984,8 @@ struct typec_port *typec_register_port(struct device *parent,
 
 	ida_init(&port->mode_ids);
 	mutex_init(&port->port_type_lock);
+	mutex_init(&port->port_list_lock);
+	INIT_LIST_HEAD(&port->port_list);
 
 	port->id = id;
 	port->ops = cap->ops;
@@ -2024,6 +2027,8 @@ struct typec_port *typec_register_port(struct device *parent,
 		return ERR_PTR(ret);
 	}
 
+	port->pld = get_pld(&port->dev);
+
 	return port;
 }
 EXPORT_SYMBOL_GPL(typec_register_port);
diff --git a/drivers/usb/typec/class.h b/drivers/usb/typec/class.h
index d414be58d122..52294f7020a8 100644
--- a/drivers/usb/typec/class.h
+++ b/drivers/usb/typec/class.h
@@ -54,6 +54,11 @@ struct typec_port {
 
 	const struct typec_capability	*cap;
 	const struct typec_operations   *ops;
+
+	struct list_head		port_list;
+	struct mutex			port_list_lock; /* Port list lock */
+
+	void				*pld;
 };
 
 #define to_typec_port(_dev_) container_of(_dev_, struct typec_port, dev)
@@ -72,5 +77,9 @@ extern const struct device_type typec_port_dev_type;
 #define is_typec_port(dev) ((dev)->type == &typec_port_dev_type)
 
 extern struct class typec_mux_class;
+extern struct class typec_class;
+
+void *get_pld(struct device *dev);
+void free_pld(void *pld);
 
 #endif /* __USB_TYPEC_CLASS__ */
diff --git a/drivers/usb/typec/port-mapper.c b/drivers/usb/typec/port-mapper.c
new file mode 100644
index 000000000000..5bee7a97242f
--- /dev/null
+++ b/drivers/usb/typec/port-mapper.c
@@ -0,0 +1,219 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * USB Type-C Connector Class Port Mapping Utility
+ *
+ * Copyright (C) 2021, Intel Corporation
+ * Author: Heikki Krogerus <heikki.krogerus@linux.intel.com>
+ */
+
+#include <linux/acpi.h>
+#include <linux/usb.h>
+#include <linux/usb/typec.h>
+
+#include "class.h"
+
+struct port_node {
+	struct list_head list;
+	struct device *dev;
+	void *pld;
+};
+
+static int acpi_pld_match(const struct acpi_pld_info *pld1,
+			  const struct acpi_pld_info *pld2)
+{
+	if (!pld1 || !pld2)
+		return 0;
+
+	/*
+	 * To speed things up, first checking only the group_position. It seems
+	 * to often have the first unique value in the _PLD.
+	 */
+	if (pld1->group_position == pld2->group_position)
+		return !memcmp(pld1, pld2, sizeof(struct acpi_pld_info));
+
+	return 0;
+}
+
+void *get_pld(struct device *dev)
+{
+#ifdef CONFIG_ACPI
+	struct acpi_pld_info *pld;
+	acpi_status status;
+
+	if (!has_acpi_companion(dev))
+		return NULL;
+
+	status = acpi_get_physical_device_location(ACPI_HANDLE(dev), &pld);
+	if (ACPI_FAILURE(status))
+		return NULL;
+
+	return pld;
+#else
+	return NULL;
+#endif
+}
+
+void free_pld(void *pld)
+{
+#ifdef CONFIG_ACPI
+	ACPI_FREE(pld);
+#endif
+}
+
+static int __link_port(struct typec_port *con, struct port_node *node)
+{
+	int ret;
+
+	ret = sysfs_create_link(&node->dev->kobj, &con->dev.kobj, "connector");
+	if (ret)
+		return ret;
+
+	ret = sysfs_create_link(&con->dev.kobj, &node->dev->kobj,
+				dev_name(node->dev));
+	if (ret) {
+		sysfs_remove_link(&node->dev->kobj, "connector");
+		return ret;
+	}
+
+	list_add_tail(&node->list, &con->port_list);
+
+	return 0;
+}
+
+static int link_port(struct typec_port *con, struct port_node *node)
+{
+	int ret;
+
+	mutex_lock(&con->port_list_lock);
+	ret = __link_port(con, node);
+	mutex_unlock(&con->port_list_lock);
+
+	return ret;
+}
+
+static void __unlink_port(struct typec_port *con, struct port_node *node)
+{
+	sysfs_remove_link(&con->dev.kobj, dev_name(node->dev));
+	sysfs_remove_link(&node->dev->kobj, "connector");
+	list_del(&node->list);
+}
+
+static void unlink_port(struct typec_port *con, struct port_node *node)
+{
+	mutex_lock(&con->port_list_lock);
+	__unlink_port(con, node);
+	mutex_unlock(&con->port_list_lock);
+}
+
+static struct port_node *create_port_node(struct device *port)
+{
+	struct port_node *node;
+
+	node = kzalloc(sizeof(*node), GFP_KERNEL);
+	if (!node)
+		return ERR_PTR(-ENOMEM);
+
+	node->dev = get_device(port);
+	node->pld = get_pld(port);
+
+	return node;
+}
+
+static void remove_port_node(struct port_node *node)
+{
+	put_device(node->dev);
+	free_pld(node->pld);
+	kfree(node);
+}
+
+static int connector_match(struct device *dev, const void *data)
+{
+	const struct port_node *node = data;
+
+	if (!is_typec_port(dev))
+		return 0;
+
+	return acpi_pld_match(to_typec_port(dev)->pld, node->pld);
+}
+
+static struct device *find_connector(struct port_node *node)
+{
+	if (!node->pld)
+		return NULL;
+
+	return class_find_device(&typec_class, NULL, node, connector_match);
+}
+
+/**
+ * typec_link_port - Link a port to its connector
+ * @port: The port device
+ *
+ * Find the connector of @port and create symlink named "connector" for it.
+ * Returns 0 on success, or errno in case of a failure.
+ *
+ * NOTE. The function increments the reference count of @port on success.
+ */
+int typec_link_port(struct device *port)
+{
+	struct device *connector;
+	struct port_node *node;
+	int ret = 0;
+
+	node = create_port_node(port);
+	if (IS_ERR(node))
+		return PTR_ERR(node);
+
+	connector = find_connector(node);
+	if (!connector)
+		goto remove_node;
+
+	ret = link_port(to_typec_port(connector), node);
+	if (ret)
+		goto put_connector;
+
+	return 0;
+
+put_connector:
+	put_device(connector);
+remove_node:
+	remove_port_node(node);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(typec_link_port);
+
+static int port_match_and_unlink(struct device *connector, void *port)
+{
+	struct port_node *node;
+	struct port_node *tmp;
+	int ret = 0;
+
+	if (!is_typec_port(connector))
+		return 0;
+
+	mutex_lock(&to_typec_port(connector)->port_list_lock);
+	list_for_each_entry_safe(node, tmp, &to_typec_port(connector)->port_list, list) {
+		ret = node->dev == port;
+		if (ret) {
+			unlink_port(to_typec_port(connector), node);
+			remove_port_node(node);
+			put_device(connector);
+			break;
+		}
+	}
+	mutex_unlock(&to_typec_port(connector)->port_list_lock);
+
+	return ret;
+}
+
+/**
+ * typec_unlink_port - Unlink port from its connector
+ * @port: The port device
+ *
+ * Removes the symlink "connector" and decrements the reference count of @port.
+ */
+void typec_unlink_port(struct device *port)
+{
+	class_for_each_device(&typec_class, NULL, port, port_match_and_unlink);
+}
+EXPORT_SYMBOL_GPL(typec_unlink_port);
diff --git a/include/linux/usb/typec.h b/include/linux/usb/typec.h
index 91b4303ca305..e2714722b0c9 100644
--- a/include/linux/usb/typec.h
+++ b/include/linux/usb/typec.h
@@ -298,4 +298,17 @@ int typec_find_port_data_role(const char *name);
 void typec_partner_set_svdm_version(struct typec_partner *partner,
 				    enum usb_pd_svdm_ver svdm_version);
 int typec_get_negotiated_svdm_version(struct typec_port *port);
+
+#if IS_REACHABLE(CONFIG_TYPEC)
+int typec_link_port(struct device *port);
+void typec_unlink_port(struct device *port);
+#else
+static inline int typec_link_port(struct device *port)
+{
+	return 0;
+}
+
+static inline void typec_unlink_port(struct device *port) { }
+#endif
+
 #endif /* __LINUX_USB_TYPEC_H */
-- 
cgit v1.2.3


From b433c4c789d612cf58739a772bbddbd949bafd20 Mon Sep 17 00:00:00 2001
From: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Date: Wed, 7 Apr 2021 09:55:54 +0300
Subject: usb: Iterator for ports

Introducing usb_for_each_port(). It works the same way as
usb_for_each_dev(), but instead of going through every USB
device in the system, it walks through the USB ports in the
system.

Acked-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Link: https://lore.kernel.org/r/20210407065555.88110-4-heikki.krogerus@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/core/usb.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/usb.h    |  9 +++++++++
 2 files changed, 55 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c
index 2ce3667ec6fa..62368c4ed37a 100644
--- a/drivers/usb/core/usb.c
+++ b/drivers/usb/core/usb.c
@@ -398,6 +398,52 @@ int usb_for_each_dev(void *data, int (*fn)(struct usb_device *, void *))
 }
 EXPORT_SYMBOL_GPL(usb_for_each_dev);
 
+struct each_hub_arg {
+	void *data;
+	int (*fn)(struct device *, void *);
+};
+
+static int __each_hub(struct usb_device *hdev, void *data)
+{
+	struct each_hub_arg *arg = (struct each_hub_arg *)data;
+	struct usb_hub *hub;
+	int ret = 0;
+	int i;
+
+	hub = usb_hub_to_struct_hub(hdev);
+	if (!hub)
+		return 0;
+
+	mutex_lock(&usb_port_peer_mutex);
+
+	for (i = 0; i < hdev->maxchild; i++) {
+		ret = arg->fn(&hub->ports[i]->dev, arg->data);
+		if (ret)
+			break;
+	}
+
+	mutex_unlock(&usb_port_peer_mutex);
+
+	return ret;
+}
+
+/**
+ * usb_for_each_port - iterate over all USB ports in the system
+ * @data: data pointer that will be handed to the callback function
+ * @fn: callback function to be called for each USB port
+ *
+ * Iterate over all USB ports and call @fn for each, passing it @data. If it
+ * returns anything other than 0, we break the iteration prematurely and return
+ * that value.
+ */
+int usb_for_each_port(void *data, int (*fn)(struct device *, void *))
+{
+	struct each_hub_arg arg = {data, fn};
+
+	return usb_for_each_dev(&arg, __each_hub);
+}
+EXPORT_SYMBOL_GPL(usb_for_each_port);
+
 /**
  * usb_release_dev - free a usb device structure when all users of it are finished.
  * @dev: device that's been disconnected
diff --git a/include/linux/usb.h b/include/linux/usb.h
index ddd2f5b2a282..eaae24217e8a 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -882,6 +882,15 @@ extern struct usb_host_interface *usb_find_alt_setting(
 		unsigned int iface_num,
 		unsigned int alt_num);
 
+#if IS_REACHABLE(CONFIG_USB)
+int usb_for_each_port(void *data, int (*fn)(struct device *, void *));
+#else
+static inline int usb_for_each_port(void *data, int (*fn)(struct device *, void *))
+{
+	return 0;
+}
+#endif
+
 /* port claiming functions */
 int usb_hub_claim_port(struct usb_device *hdev, unsigned port1,
 		struct usb_dev_state *owner);
-- 
cgit v1.2.3


From 7b458a4c5d7302947556e12c83cfe4da769665d0 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Fri, 9 Apr 2021 15:40:31 +0200
Subject: usb: typec: Add typec_port_register_altmodes()

This can be used by Type-C controller drivers which use a standard
usb-connector fwnode, with altmodes sub-node, to describe the available
altmodes.

Note there is no devicetree bindings documentation for the altmodes
node, this is deliberate. ATM the fwnodes used to register the altmodes
are only used internally to pass platform info from a drivers/platform/x86
driver to the type-c subsystem.

When a devicetree user of this functionality comes up and the dt-bindings
have been hashed out the internal use can be adjusted to match the
dt-bindings.

Currently the typec_port_register_altmodes() function expects
an "altmodes" child fwnode on port->dev with this "altmodes" fwnode having
child fwnodes itself with each child containing 2 integer properties:

1. A "svid" property, which sets the id of the altmode, e.g. displayport
altmode has a svid of 0xff01.

2. A "vdo" property, typically used as a bitmask describing the
capabilities of the altmode, the bits in the vdo are specified in the
specification of the altmode.

Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Link: https://lore.kernel.org/r/20210409134033.105834-2-hdegoede@redhat.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/typec/class.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/usb/typec.h |  6 ++++++
 2 files changed, 60 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/usb/typec/class.c b/drivers/usb/typec/class.c
index f1c2d823c650..b9429c9f65f6 100644
--- a/drivers/usb/typec/class.c
+++ b/drivers/usb/typec/class.c
@@ -1923,6 +1923,60 @@ typec_port_register_altmode(struct typec_port *port,
 }
 EXPORT_SYMBOL_GPL(typec_port_register_altmode);
 
+void typec_port_register_altmodes(struct typec_port *port,
+	const struct typec_altmode_ops *ops, void *drvdata,
+	struct typec_altmode **altmodes, size_t n)
+{
+	struct fwnode_handle *altmodes_node, *child;
+	struct typec_altmode_desc desc;
+	struct typec_altmode *alt;
+	size_t index = 0;
+	u32 svid, vdo;
+	int ret;
+
+	altmodes_node = device_get_named_child_node(&port->dev, "altmodes");
+	if (!altmodes_node)
+		return; /* No altmodes specified */
+
+	fwnode_for_each_child_node(altmodes_node, child) {
+		ret = fwnode_property_read_u32(child, "svid", &svid);
+		if (ret) {
+			dev_err(&port->dev, "Error reading svid for altmode %s\n",
+				fwnode_get_name(child));
+			continue;
+		}
+
+		ret = fwnode_property_read_u32(child, "vdo", &vdo);
+		if (ret) {
+			dev_err(&port->dev, "Error reading vdo for altmode %s\n",
+				fwnode_get_name(child));
+			continue;
+		}
+
+		if (index >= n) {
+			dev_err(&port->dev, "Error not enough space for altmode %s\n",
+				fwnode_get_name(child));
+			continue;
+		}
+
+		desc.svid = svid;
+		desc.vdo = vdo;
+		desc.mode = index + 1;
+		alt = typec_port_register_altmode(port, &desc);
+		if (IS_ERR(alt)) {
+			dev_err(&port->dev, "Error registering altmode %s\n",
+				fwnode_get_name(child));
+			continue;
+		}
+
+		alt->ops = ops;
+		typec_altmode_set_drvdata(alt, drvdata);
+		altmodes[index] = alt;
+		index++;
+	}
+}
+EXPORT_SYMBOL_GPL(typec_port_register_altmodes);
+
 /**
  * typec_register_port - Register a USB Type-C Port
  * @parent: Parent device
diff --git a/include/linux/usb/typec.h b/include/linux/usb/typec.h
index e2714722b0c9..e2e44bb1dad8 100644
--- a/include/linux/usb/typec.h
+++ b/include/linux/usb/typec.h
@@ -17,6 +17,7 @@ struct typec_partner;
 struct typec_cable;
 struct typec_plug;
 struct typec_port;
+struct typec_altmode_ops;
 
 struct fwnode_handle;
 struct device;
@@ -138,6 +139,11 @@ struct typec_altmode
 struct typec_altmode
 *typec_port_register_altmode(struct typec_port *port,
 			     const struct typec_altmode_desc *desc);
+
+void typec_port_register_altmodes(struct typec_port *port,
+	const struct typec_altmode_ops *ops, void *drvdata,
+	struct typec_altmode **altmodes, size_t n);
+
 void typec_unregister_altmode(struct typec_altmode *altmode);
 
 struct typec_port *typec_altmode2port(struct typec_altmode *alt);
-- 
cgit v1.2.3


From b37c38484375f5a204500e0b10b40da268090995 Mon Sep 17 00:00:00 2001
From: Marek Behún <kabel@kernel.org>
Date: Fri, 9 Apr 2021 13:27:04 -0700
Subject: treewide: change my e-mail address, fix my name
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Change my e-mail address to kabel@kernel.org, and fix my name in
non-code parts (add diacritical mark).

Link: https://lkml.kernel.org/r/20210325171123.28093-2-kabel@kernel.org
Signed-off-by: Marek Behún <kabel@kernel.org>
Cc: Bartosz Golaszewski <bgolaszewski@baylibre.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Jassi Brar <jassisinghbrar@gmail.com>
Cc: Linus Walleij <linus.walleij@linaro.org>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/ABI/testing/debugfs-moxtet                       |  4 ++--
 Documentation/ABI/testing/debugfs-turris-mox-rwtm              |  2 +-
 Documentation/ABI/testing/sysfs-bus-moxtet-devices             |  6 +++---
 Documentation/ABI/testing/sysfs-class-led-driver-turris-omnia  |  2 +-
 Documentation/ABI/testing/sysfs-firmware-turris-mox-rwtm       | 10 +++++-----
 .../devicetree/bindings/leds/cznic,turris-omnia-leds.yaml      |  2 +-
 MAINTAINERS                                                    |  2 +-
 arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts         |  2 +-
 drivers/bus/moxtet.c                                           |  4 ++--
 drivers/firmware/turris-mox-rwtm.c                             |  4 ++--
 drivers/gpio/gpio-moxtet.c                                     |  4 ++--
 drivers/leds/leds-turris-omnia.c                               |  4 ++--
 drivers/mailbox/armada-37xx-rwtm-mailbox.c                     |  4 ++--
 drivers/watchdog/armada_37xx_wdt.c                             |  4 ++--
 include/dt-bindings/bus/moxtet.h                               |  2 +-
 include/linux/armada-37xx-rwtm-mailbox.h                       |  2 +-
 include/linux/moxtet.h                                         |  2 +-
 17 files changed, 30 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/debugfs-moxtet b/Documentation/ABI/testing/debugfs-moxtet
index 6eee10c3d5a1..637d8587d03d 100644
--- a/Documentation/ABI/testing/debugfs-moxtet
+++ b/Documentation/ABI/testing/debugfs-moxtet
@@ -1,7 +1,7 @@
 What:		/sys/kernel/debug/moxtet/input
 Date:		March 2019
 KernelVersion:	5.3
-Contact:	Marek Behún <marek.behun@nic.cz>
+Contact:	Marek Behún <kabel@kernel.org>
 Description:	(Read) Read input from the shift registers, in hexadecimal.
 		Returns N+1 bytes, where N is the number of Moxtet connected
 		modules. The first byte is from the CPU board itself.
@@ -19,7 +19,7 @@ Description:	(Read) Read input from the shift registers, in hexadecimal.
 What:		/sys/kernel/debug/moxtet/output
 Date:		March 2019
 KernelVersion:	5.3
-Contact:	Marek Behún <marek.behun@nic.cz>
+Contact:	Marek Behún <kabel@kernel.org>
 Description:	(RW) Read last written value to the shift registers, in
 		hexadecimal, or write values to the shift registers, also
 		in hexadecimal.
diff --git a/Documentation/ABI/testing/debugfs-turris-mox-rwtm b/Documentation/ABI/testing/debugfs-turris-mox-rwtm
index 326df1b74707..813987d5de4e 100644
--- a/Documentation/ABI/testing/debugfs-turris-mox-rwtm
+++ b/Documentation/ABI/testing/debugfs-turris-mox-rwtm
@@ -1,7 +1,7 @@
 What:		/sys/kernel/debug/turris-mox-rwtm/do_sign
 Date:		Jun 2020
 KernelVersion:	5.8
-Contact:	Marek Behún <marek.behun@nic.cz>
+Contact:	Marek Behún <kabel@kernel.org>
 Description:
 
 		======= ===========================================================
diff --git a/Documentation/ABI/testing/sysfs-bus-moxtet-devices b/Documentation/ABI/testing/sysfs-bus-moxtet-devices
index 4a6d61b44f3f..32dccc00d57d 100644
--- a/Documentation/ABI/testing/sysfs-bus-moxtet-devices
+++ b/Documentation/ABI/testing/sysfs-bus-moxtet-devices
@@ -1,17 +1,17 @@
 What:		/sys/bus/moxtet/devices/moxtet-<name>.<addr>/module_description
 Date:		March 2019
 KernelVersion:	5.3
-Contact:	Marek Behún <marek.behun@nic.cz>
+Contact:	Marek Behún <kabel@kernel.org>
 Description:	(Read) Moxtet module description. Format: string
 
 What:		/sys/bus/moxtet/devices/moxtet-<name>.<addr>/module_id
 Date:		March 2019
 KernelVersion:	5.3
-Contact:	Marek Behún <marek.behun@nic.cz>
+Contact:	Marek Behún <kabel@kernel.org>
 Description:	(Read) Moxtet module ID. Format: %x
 
 What:		/sys/bus/moxtet/devices/moxtet-<name>.<addr>/module_name
 Date:		March 2019
 KernelVersion:	5.3
-Contact:	Marek Behún <marek.behun@nic.cz>
+Contact:	Marek Behún <kabel@kernel.org>
 Description:	(Read) Moxtet module name. Format: string
diff --git a/Documentation/ABI/testing/sysfs-class-led-driver-turris-omnia b/Documentation/ABI/testing/sysfs-class-led-driver-turris-omnia
index 795a5de12fc1..c4d46970c1cf 100644
--- a/Documentation/ABI/testing/sysfs-class-led-driver-turris-omnia
+++ b/Documentation/ABI/testing/sysfs-class-led-driver-turris-omnia
@@ -1,7 +1,7 @@
 What:		/sys/class/leds/<led>/device/brightness
 Date:		July 2020
 KernelVersion:	5.9
-Contact:	Marek Behún <marek.behun@nic.cz>
+Contact:	Marek Behún <kabel@kernel.org>
 Description:	(RW) On the front panel of the Turris Omnia router there is also
 		a button which can be used to control the intensity of all the
 		LEDs at once, so that if they are too bright, user can dim them.
diff --git a/Documentation/ABI/testing/sysfs-firmware-turris-mox-rwtm b/Documentation/ABI/testing/sysfs-firmware-turris-mox-rwtm
index b8631f5a29c4..ea5e5b489bc7 100644
--- a/Documentation/ABI/testing/sysfs-firmware-turris-mox-rwtm
+++ b/Documentation/ABI/testing/sysfs-firmware-turris-mox-rwtm
@@ -1,21 +1,21 @@
 What:		/sys/firmware/turris-mox-rwtm/board_version
 Date:		August 2019
 KernelVersion:	5.4
-Contact:	Marek Behún <marek.behun@nic.cz>
+Contact:	Marek Behún <kabel@kernel.org>
 Description:	(Read) Board version burned into eFuses of this Turris Mox board.
 		Format: %i
 
 What:		/sys/firmware/turris-mox-rwtm/mac_address*
 Date:		August 2019
 KernelVersion:	5.4
-Contact:	Marek Behún <marek.behun@nic.cz>
+Contact:	Marek Behún <kabel@kernel.org>
 Description:	(Read) MAC addresses burned into eFuses of this Turris Mox board.
 		Format: %pM
 
 What:		/sys/firmware/turris-mox-rwtm/pubkey
 Date:		August 2019
 KernelVersion:	5.4
-Contact:	Marek Behún <marek.behun@nic.cz>
+Contact:	Marek Behún <kabel@kernel.org>
 Description:	(Read) ECDSA public key (in pubkey hex compressed form) computed
 		as pair to the ECDSA private key burned into eFuses of this
 		Turris Mox Board.
@@ -24,7 +24,7 @@ Description:	(Read) ECDSA public key (in pubkey hex compressed form) computed
 What:		/sys/firmware/turris-mox-rwtm/ram_size
 Date:		August 2019
 KernelVersion:	5.4
-Contact:	Marek Behún <marek.behun@nic.cz>
+Contact:	Marek Behún <kabel@kernel.org>
 Description:	(Read) RAM size in MiB of this Turris Mox board as was detected
 		during manufacturing and burned into eFuses. Can be 512 or 1024.
 		Format: %i
@@ -32,6 +32,6 @@ Description:	(Read) RAM size in MiB of this Turris Mox board as was detected
 What:		/sys/firmware/turris-mox-rwtm/serial_number
 Date:		August 2019
 KernelVersion:	5.4
-Contact:	Marek Behún <marek.behun@nic.cz>
+Contact:	Marek Behún <kabel@kernel.org>
 Description:	(Read) Serial number burned into eFuses of this Turris Mox device.
 		Format: %016X
diff --git a/Documentation/devicetree/bindings/leds/cznic,turris-omnia-leds.yaml b/Documentation/devicetree/bindings/leds/cznic,turris-omnia-leds.yaml
index fe7fa25877fd..c7ed2871da06 100644
--- a/Documentation/devicetree/bindings/leds/cznic,turris-omnia-leds.yaml
+++ b/Documentation/devicetree/bindings/leds/cznic,turris-omnia-leds.yaml
@@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml#
 title: CZ.NIC's Turris Omnia LEDs driver
 
 maintainers:
-  - Marek Behún <marek.behun@nic.cz>
+  - Marek Behún <kabel@kernel.org>
 
 description:
   This module adds support for the RGB LEDs found on the front panel of the
diff --git a/MAINTAINERS b/MAINTAINERS
index 2a6f7f5edd5a..ba5cd29d2a48 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1791,7 +1791,7 @@ F:	drivers/pinctrl/pinctrl-gemini.c
 F:	drivers/rtc/rtc-ftrtc010.c
 
 ARM/CZ.NIC TURRIS SUPPORT
-M:	Marek Behun <marek.behun@nic.cz>
+M:	Marek Behun <kabel@kernel.org>
 S:	Maintained
 W:	https://www.turris.cz/
 F:	Documentation/ABI/testing/debugfs-moxtet
diff --git a/arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts b/arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts
index d239ab70ed99..53e817c5f6f3 100644
--- a/arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts
+++ b/arch/arm64/boot/dts/marvell/armada-3720-turris-mox.dts
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: (GPL-2.0+ OR MIT)
 /*
  * Device Tree file for CZ.NIC Turris Mox Board
- * 2019 by Marek Behun <marek.behun@nic.cz>
+ * 2019 by Marek Behún <kabel@kernel.org>
  */
 
 /dts-v1/;
diff --git a/drivers/bus/moxtet.c b/drivers/bus/moxtet.c
index b20fdcbd035b..fd87a59837fa 100644
--- a/drivers/bus/moxtet.c
+++ b/drivers/bus/moxtet.c
@@ -2,7 +2,7 @@
 /*
  * Turris Mox module configuration bus driver
  *
- * Copyright (C) 2019 Marek Behun <marek.behun@nic.cz>
+ * Copyright (C) 2019 Marek Behún <kabel@kernel.org>
  */
 
 #include <dt-bindings/bus/moxtet.h>
@@ -879,6 +879,6 @@ static void __exit moxtet_exit(void)
 }
 module_exit(moxtet_exit);
 
-MODULE_AUTHOR("Marek Behun <marek.behun@nic.cz>");
+MODULE_AUTHOR("Marek Behun <kabel@kernel.org>");
 MODULE_DESCRIPTION("CZ.NIC's Turris Mox module configuration bus");
 MODULE_LICENSE("GPL v2");
diff --git a/drivers/firmware/turris-mox-rwtm.c b/drivers/firmware/turris-mox-rwtm.c
index 50bb2a6d6ccf..62f0d1a5dd32 100644
--- a/drivers/firmware/turris-mox-rwtm.c
+++ b/drivers/firmware/turris-mox-rwtm.c
@@ -2,7 +2,7 @@
 /*
  * Turris Mox rWTM firmware driver
  *
- * Copyright (C) 2019 Marek Behun <marek.behun@nic.cz>
+ * Copyright (C) 2019 Marek Behún <kabel@kernel.org>
  */
 
 #include <linux/armada-37xx-rwtm-mailbox.h>
@@ -547,4 +547,4 @@ module_platform_driver(turris_mox_rwtm_driver);
 
 MODULE_LICENSE("GPL v2");
 MODULE_DESCRIPTION("Turris Mox rWTM firmware driver");
-MODULE_AUTHOR("Marek Behun <marek.behun@nic.cz>");
+MODULE_AUTHOR("Marek Behun <kabel@kernel.org>");
diff --git a/drivers/gpio/gpio-moxtet.c b/drivers/gpio/gpio-moxtet.c
index 8299909318f4..61f9efd6c64f 100644
--- a/drivers/gpio/gpio-moxtet.c
+++ b/drivers/gpio/gpio-moxtet.c
@@ -2,7 +2,7 @@
 /*
  *  Turris Mox Moxtet GPIO expander
  *
- *  Copyright (C) 2018 Marek Behun <marek.behun@nic.cz>
+ *  Copyright (C) 2018 Marek Behún <kabel@kernel.org>
  */
 
 #include <linux/bitops.h>
@@ -174,6 +174,6 @@ static struct moxtet_driver moxtet_gpio_driver = {
 };
 module_moxtet_driver(moxtet_gpio_driver);
 
-MODULE_AUTHOR("Marek Behun <marek.behun@nic.cz>");
+MODULE_AUTHOR("Marek Behun <kabel@kernel.org>");
 MODULE_DESCRIPTION("Turris Mox Moxtet GPIO expander");
 MODULE_LICENSE("GPL v2");
diff --git a/drivers/leds/leds-turris-omnia.c b/drivers/leds/leds-turris-omnia.c
index 7b2f4d0ae3fe..2f9a289ab245 100644
--- a/drivers/leds/leds-turris-omnia.c
+++ b/drivers/leds/leds-turris-omnia.c
@@ -2,7 +2,7 @@
 /*
  * CZ.NIC's Turris Omnia LEDs driver
  *
- * 2020 by Marek Behun <marek.behun@nic.cz>
+ * 2020 by Marek Behún <kabel@kernel.org>
  */
 
 #include <linux/i2c.h>
@@ -287,6 +287,6 @@ static struct i2c_driver omnia_leds_driver = {
 
 module_i2c_driver(omnia_leds_driver);
 
-MODULE_AUTHOR("Marek Behun <marek.behun@nic.cz>");
+MODULE_AUTHOR("Marek Behun <kabel@kernel.org>");
 MODULE_DESCRIPTION("CZ.NIC's Turris Omnia LEDs");
 MODULE_LICENSE("GPL v2");
diff --git a/drivers/mailbox/armada-37xx-rwtm-mailbox.c b/drivers/mailbox/armada-37xx-rwtm-mailbox.c
index 9f2ce7f03c67..456a117a65fd 100644
--- a/drivers/mailbox/armada-37xx-rwtm-mailbox.c
+++ b/drivers/mailbox/armada-37xx-rwtm-mailbox.c
@@ -2,7 +2,7 @@
 /*
  * rWTM BIU Mailbox driver for Armada 37xx
  *
- * Author: Marek Behun <marek.behun@nic.cz>
+ * Author: Marek Behún <kabel@kernel.org>
  */
 
 #include <linux/device.h>
@@ -203,4 +203,4 @@ module_platform_driver(armada_37xx_mbox_driver);
 
 MODULE_LICENSE("GPL v2");
 MODULE_DESCRIPTION("rWTM BIU Mailbox driver for Armada 37xx");
-MODULE_AUTHOR("Marek Behun <marek.behun@nic.cz>");
+MODULE_AUTHOR("Marek Behun <kabel@kernel.org>");
diff --git a/drivers/watchdog/armada_37xx_wdt.c b/drivers/watchdog/armada_37xx_wdt.c
index e5dcb26d85f0..1635f421ef2c 100644
--- a/drivers/watchdog/armada_37xx_wdt.c
+++ b/drivers/watchdog/armada_37xx_wdt.c
@@ -2,7 +2,7 @@
 /*
  * Watchdog driver for Marvell Armada 37xx SoCs
  *
- * Author: Marek Behun <marek.behun@nic.cz>
+ * Author: Marek Behún <kabel@kernel.org>
  */
 
 #include <linux/clk.h>
@@ -366,7 +366,7 @@ static struct platform_driver armada_37xx_wdt_driver = {
 
 module_platform_driver(armada_37xx_wdt_driver);
 
-MODULE_AUTHOR("Marek Behun <marek.behun@nic.cz>");
+MODULE_AUTHOR("Marek Behun <kabel@kernel.org>");
 MODULE_DESCRIPTION("Armada 37xx CPU Watchdog");
 
 MODULE_LICENSE("GPL v2");
diff --git a/include/dt-bindings/bus/moxtet.h b/include/dt-bindings/bus/moxtet.h
index dc9345440ebe..10528de7b3ef 100644
--- a/include/dt-bindings/bus/moxtet.h
+++ b/include/dt-bindings/bus/moxtet.h
@@ -2,7 +2,7 @@
 /*
  * Constant for device tree bindings for Turris Mox module configuration bus
  *
- * Copyright (C) 2019 Marek Behun <marek.behun@nic.cz>
+ * Copyright (C) 2019 Marek Behún <kabel@kernel.org>
  */
 
 #ifndef _DT_BINDINGS_BUS_MOXTET_H
diff --git a/include/linux/armada-37xx-rwtm-mailbox.h b/include/linux/armada-37xx-rwtm-mailbox.h
index 57bb54f6767a..ef4bd705eb65 100644
--- a/include/linux/armada-37xx-rwtm-mailbox.h
+++ b/include/linux/armada-37xx-rwtm-mailbox.h
@@ -2,7 +2,7 @@
 /*
  * rWTM BIU Mailbox driver for Armada 37xx
  *
- * Author: Marek Behun <marek.behun@nic.cz>
+ * Author: Marek Behún <kabel@kernel.org>
  */
 
 #ifndef _LINUX_ARMADA_37XX_RWTM_MAILBOX_H_
diff --git a/include/linux/moxtet.h b/include/linux/moxtet.h
index 490db6886dcc..79184948fab4 100644
--- a/include/linux/moxtet.h
+++ b/include/linux/moxtet.h
@@ -2,7 +2,7 @@
 /*
  * Turris Mox module configuration bus driver
  *
- * Copyright (C) 2019 Marek Behun <marek.behun@nic.cz>
+ * Copyright (C) 2019 Marek Behún <kabel@kernel.org>
  */
 
 #ifndef __LINUX_MOXTET_H
-- 
cgit v1.2.3


From fba863b816049b03f3fbb07b10ebdcfe5c4141f7 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Wed, 7 Apr 2021 17:51:56 +0200
Subject: net: phy: make PHY PM ops a no-op if MAC driver manages PHY PM

Resume callback of the PHY driver is called after the one for the MAC
driver. The PHY driver resume callback calls phy_init_hw(), and this is
potentially problematic if the MAC driver calls phy_start() in its resume
callback. One issue was reported with the fec driver and a KSZ8081 PHY
which seems to become unstable if a soft reset is triggered during aneg.

The new flag allows MAC drivers to indicate that they take care of
suspending/resuming the PHY. Then the MAC PM callbacks can handle
any dependency between MAC and PHY PM.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/phy_device.c | 6 ++++++
 include/linux/phy.h          | 2 ++
 2 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index a009d1769b08..73d29fd5e03d 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -273,6 +273,9 @@ static __maybe_unused int mdio_bus_phy_suspend(struct device *dev)
 {
 	struct phy_device *phydev = to_phy_device(dev);
 
+	if (phydev->mac_managed_pm)
+		return 0;
+
 	/* We must stop the state machine manually, otherwise it stops out of
 	 * control, possibly with the phydev->lock held. Upon resume, netdev
 	 * may call phy routines that try to grab the same lock, and that may
@@ -294,6 +297,9 @@ static __maybe_unused int mdio_bus_phy_resume(struct device *dev)
 	struct phy_device *phydev = to_phy_device(dev);
 	int ret;
 
+	if (phydev->mac_managed_pm)
+		return 0;
+
 	if (!phydev->suspended_by_mdio_bus)
 		goto no_resume;
 
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 8e2cf84b2318..98fb441dd72e 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -493,6 +493,7 @@ struct macsec_ops;
  * @loopback_enabled: Set true if this PHY has been loopbacked successfully.
  * @downshifted_rate: Set true if link speed has been downshifted.
  * @is_on_sfp_module: Set true if PHY is located on an SFP module.
+ * @mac_managed_pm: Set true if MAC driver takes care of suspending/resuming PHY
  * @state: State of the PHY for management purposes
  * @dev_flags: Device-specific flags used by the PHY driver.
  * @irq: IRQ number of the PHY's interrupt (-1 if none)
@@ -567,6 +568,7 @@ struct phy_device {
 	unsigned loopback_enabled:1;
 	unsigned downshifted_rate:1;
 	unsigned is_on_sfp_module:1;
+	unsigned mac_managed_pm:1;
 
 	unsigned autoneg:1;
 	/* The most recently read link state */
-- 
cgit v1.2.3


From 1b8b20868a6d64cfe8174a21b25b74367bdf0560 Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan@kernel.org>
Date: Wed, 7 Apr 2021 11:52:02 +0200
Subject: tty: fix return value for unsupported ioctls

Drivers should return -ENOTTY ("Inappropriate I/O control operation")
when an ioctl isn't supported, while -EINVAL is used for invalid
arguments.

Fix up the TIOCMGET, TIOCMSET and TIOCGICOUNT helpers which returned
-EINVAL when a tty driver did not implement the corresponding
operations.

Note that the TIOCMGET and TIOCMSET helpers predate git and do not get a
corresponding Fixes tag below.

Fixes: d281da7ff6f7 ("tty: Make tiocgicount a handler")
Signed-off-by: Johan Hovold <johan@kernel.org>
Link: https://lore.kernel.org/r/20210407095208.31838-3-johan@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/tty_io.c       | 10 +++++-----
 include/linux/tty_driver.h |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index ebcde1f96adb..7738514a15fe 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -2529,14 +2529,14 @@ out:
  *	@p: pointer to result
  *
  *	Obtain the modem status bits from the tty driver if the feature
- *	is supported. Return -EINVAL if it is not available.
+ *	is supported. Return -ENOTTY if it is not available.
  *
  *	Locking: none (up to the driver)
  */
 
 static int tty_tiocmget(struct tty_struct *tty, int __user *p)
 {
-	int retval = -EINVAL;
+	int retval = -ENOTTY;
 
 	if (tty->ops->tiocmget) {
 		retval = tty->ops->tiocmget(tty);
@@ -2554,7 +2554,7 @@ static int tty_tiocmget(struct tty_struct *tty, int __user *p)
  *	@p: pointer to desired bits
  *
  *	Set the modem status bits from the tty driver if the feature
- *	is supported. Return -EINVAL if it is not available.
+ *	is supported. Return -ENOTTY if it is not available.
  *
  *	Locking: none (up to the driver)
  */
@@ -2566,7 +2566,7 @@ static int tty_tiocmset(struct tty_struct *tty, unsigned int cmd,
 	unsigned int set, clear, val;
 
 	if (tty->ops->tiocmset == NULL)
-		return -EINVAL;
+		return -ENOTTY;
 
 	retval = get_user(val, p);
 	if (retval)
@@ -2606,7 +2606,7 @@ int tty_get_icount(struct tty_struct *tty,
 	if (tty->ops->get_icount)
 		return tty->ops->get_icount(tty, icount);
 	else
-		return -EINVAL;
+		return -ENOTTY;
 }
 EXPORT_SYMBOL_GPL(tty_get_icount);
 
diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h
index 61c3372d3f32..2f719b471d52 100644
--- a/include/linux/tty_driver.h
+++ b/include/linux/tty_driver.h
@@ -228,7 +228,7 @@
  *
  *	Called when the device receives a TIOCGICOUNT ioctl. Passed a kernel
  *	structure to complete. This method is optional and will only be called
- *	if provided (otherwise EINVAL will be returned).
+ *	if provided (otherwise ENOTTY will be returned).
  */
 
 #include <linux/export.h>
-- 
cgit v1.2.3


From 86b20677e8b657c8c9701edd907f356795e9e427 Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan@kernel.org>
Date: Fri, 9 Apr 2021 09:35:12 +0200
Subject: tty: clarify that not all ttys have a class device

Commit 30004ac9c090 ("tty: add tty_struct->dev pointer to corresponding
device instance") added a struct device pointer field to struct
tty_struct which was populated with the corresponding tty class device
during initialisation.

Unfortunately, not all ttys have a class device (e.g. pseudoterminals
and serdev) in which case the device pointer will be set to NULL,
something which has bitten driver authors over the years.

In retrospect perhaps this field should never have been added, but let's
at least document the current behaviour.

Signed-off-by: Johan Hovold <johan@kernel.org>
Link: https://lore.kernel.org/r/20210409073512.6876-1-johan@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/tty.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/tty.h b/include/linux/tty.h
index 51f56e5ec955..a228c0ee484f 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -284,7 +284,7 @@ struct tty_operations;
 struct tty_struct {
 	int	magic;
 	struct kref kref;
-	struct device *dev;
+	struct device *dev;	/* class device or NULL (e.g. ptys, serdev) */
 	struct tty_driver *driver;
 	const struct tty_operations *ops;
 	int index;
-- 
cgit v1.2.3


From 7ee3c61dcd28bf6e290e06ad382f13511dc790e9 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 7 Apr 2021 21:43:39 +0200
Subject: netfilter: bridge: add pre_exit hooks for ebtable unregistration

Just like ip/ip6/arptables, the hooks have to be removed, then
synchronize_rcu() has to be called to make sure no more packets are being
processed before the ruleset data is released.

Place the hook unregistration in the pre_exit hook, then call the new
ebtables pre_exit function from there.

Years ago, when first netns support got added for netfilter+ebtables,
this used an older (now removed) netfilter hook unregister API, that did
an unconditional synchronize_rcu().

Now that all is done with call_rcu, ebtable_{filter,nat,broute} pernet exit
handlers may free the ebtable ruleset while packets are still in flight.

This can only happen on module removal, not during netns exit.

The new function expects the table name, not the table struct.

This is because an upcoming patch set (targeting -next) will remove all
net->xt.{nat,filter,broute}_table instances, which makes it necessary
to avoid external references to those member variables.

The existing APIs will be converted, so follow the upcoming scheme of
passing name + hook type instead.

Fixes: aee12a0a3727e ("ebtables: remove nf_hook_register usage")
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_bridge/ebtables.h |  5 +++--
 net/bridge/netfilter/ebtable_broute.c     |  8 +++++++-
 net/bridge/netfilter/ebtable_filter.c     |  8 +++++++-
 net/bridge/netfilter/ebtable_nat.c        |  8 +++++++-
 net/bridge/netfilter/ebtables.c           | 30 +++++++++++++++++++++++++++---
 5 files changed, 51 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h
index 2f5c4e6ecd8a..3a956145a25c 100644
--- a/include/linux/netfilter_bridge/ebtables.h
+++ b/include/linux/netfilter_bridge/ebtables.h
@@ -110,8 +110,9 @@ extern int ebt_register_table(struct net *net,
 			      const struct ebt_table *table,
 			      const struct nf_hook_ops *ops,
 			      struct ebt_table **res);
-extern void ebt_unregister_table(struct net *net, struct ebt_table *table,
-				 const struct nf_hook_ops *);
+extern void ebt_unregister_table(struct net *net, struct ebt_table *table);
+void ebt_unregister_table_pre_exit(struct net *net, const char *tablename,
+				   const struct nf_hook_ops *ops);
 extern unsigned int ebt_do_table(struct sk_buff *skb,
 				 const struct nf_hook_state *state,
 				 struct ebt_table *table);
diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c
index 66e7af165494..32bc2821027f 100644
--- a/net/bridge/netfilter/ebtable_broute.c
+++ b/net/bridge/netfilter/ebtable_broute.c
@@ -105,14 +105,20 @@ static int __net_init broute_net_init(struct net *net)
 				  &net->xt.broute_table);
 }
 
+static void __net_exit broute_net_pre_exit(struct net *net)
+{
+	ebt_unregister_table_pre_exit(net, "broute", &ebt_ops_broute);
+}
+
 static void __net_exit broute_net_exit(struct net *net)
 {
-	ebt_unregister_table(net, net->xt.broute_table, &ebt_ops_broute);
+	ebt_unregister_table(net, net->xt.broute_table);
 }
 
 static struct pernet_operations broute_net_ops = {
 	.init = broute_net_init,
 	.exit = broute_net_exit,
+	.pre_exit = broute_net_pre_exit,
 };
 
 static int __init ebtable_broute_init(void)
diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c
index 78cb9b21022d..bcf982e12f16 100644
--- a/net/bridge/netfilter/ebtable_filter.c
+++ b/net/bridge/netfilter/ebtable_filter.c
@@ -99,14 +99,20 @@ static int __net_init frame_filter_net_init(struct net *net)
 				  &net->xt.frame_filter);
 }
 
+static void __net_exit frame_filter_net_pre_exit(struct net *net)
+{
+	ebt_unregister_table_pre_exit(net, "filter", ebt_ops_filter);
+}
+
 static void __net_exit frame_filter_net_exit(struct net *net)
 {
-	ebt_unregister_table(net, net->xt.frame_filter, ebt_ops_filter);
+	ebt_unregister_table(net, net->xt.frame_filter);
 }
 
 static struct pernet_operations frame_filter_net_ops = {
 	.init = frame_filter_net_init,
 	.exit = frame_filter_net_exit,
+	.pre_exit = frame_filter_net_pre_exit,
 };
 
 static int __init ebtable_filter_init(void)
diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c
index 0888936ef853..0d092773f816 100644
--- a/net/bridge/netfilter/ebtable_nat.c
+++ b/net/bridge/netfilter/ebtable_nat.c
@@ -99,14 +99,20 @@ static int __net_init frame_nat_net_init(struct net *net)
 				  &net->xt.frame_nat);
 }
 
+static void __net_exit frame_nat_net_pre_exit(struct net *net)
+{
+	ebt_unregister_table_pre_exit(net, "nat", ebt_ops_nat);
+}
+
 static void __net_exit frame_nat_net_exit(struct net *net)
 {
-	ebt_unregister_table(net, net->xt.frame_nat, ebt_ops_nat);
+	ebt_unregister_table(net, net->xt.frame_nat);
 }
 
 static struct pernet_operations frame_nat_net_ops = {
 	.init = frame_nat_net_init,
 	.exit = frame_nat_net_exit,
+	.pre_exit = frame_nat_net_pre_exit,
 };
 
 static int __init ebtable_nat_init(void)
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index ebe33b60efd6..d481ff24a150 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -1232,10 +1232,34 @@ out:
 	return ret;
 }
 
-void ebt_unregister_table(struct net *net, struct ebt_table *table,
-			  const struct nf_hook_ops *ops)
+static struct ebt_table *__ebt_find_table(struct net *net, const char *name)
+{
+	struct ebt_table *t;
+
+	mutex_lock(&ebt_mutex);
+
+	list_for_each_entry(t, &net->xt.tables[NFPROTO_BRIDGE], list) {
+		if (strcmp(t->name, name) == 0) {
+			mutex_unlock(&ebt_mutex);
+			return t;
+		}
+	}
+
+	mutex_unlock(&ebt_mutex);
+	return NULL;
+}
+
+void ebt_unregister_table_pre_exit(struct net *net, const char *name, const struct nf_hook_ops *ops)
+{
+	struct ebt_table *table = __ebt_find_table(net, name);
+
+	if (table)
+		nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
+}
+EXPORT_SYMBOL(ebt_unregister_table_pre_exit);
+
+void ebt_unregister_table(struct net *net, struct ebt_table *table)
 {
-	nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
 	__ebt_unregister_table(net, table);
 }
 
-- 
cgit v1.2.3


From d163a925ebbc6eb5b562b0f1d72c7e817aa75c40 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 7 Apr 2021 21:43:40 +0200
Subject: netfilter: arp_tables: add pre_exit hook for table unregister

This is the same problem that also existed in iptables/ip(6)tables: when
arptable_filter is removed, there is no longer a wait period before the
table/ruleset is freed.

Unregister the hook in pre_exit, then remove the table in the exit
function.
This used to work correctly because the old nf_hook_unregister API
did unconditional synchronize_net.

The per-net hook unregister function uses call_rcu instead.

Fixes: b9e69e127397 ("netfilter: xtables: don't hook tables by default")
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_arp/arp_tables.h |  5 +++--
 net/ipv4/netfilter/arp_tables.c          |  9 +++++++--
 net/ipv4/netfilter/arptable_filter.c     | 10 +++++++++-
 3 files changed, 19 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_arp/arp_tables.h b/include/linux/netfilter_arp/arp_tables.h
index 7d3537c40ec9..26a13294318c 100644
--- a/include/linux/netfilter_arp/arp_tables.h
+++ b/include/linux/netfilter_arp/arp_tables.h
@@ -52,8 +52,9 @@ extern void *arpt_alloc_initial_table(const struct xt_table *);
 int arpt_register_table(struct net *net, const struct xt_table *table,
 			const struct arpt_replace *repl,
 			const struct nf_hook_ops *ops, struct xt_table **res);
-void arpt_unregister_table(struct net *net, struct xt_table *table,
-			   const struct nf_hook_ops *ops);
+void arpt_unregister_table(struct net *net, struct xt_table *table);
+void arpt_unregister_table_pre_exit(struct net *net, struct xt_table *table,
+				    const struct nf_hook_ops *ops);
 extern unsigned int arpt_do_table(struct sk_buff *skb,
 				  const struct nf_hook_state *state,
 				  struct xt_table *table);
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index d1e04d2b5170..6c26533480dd 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -1539,10 +1539,15 @@ out_free:
 	return ret;
 }
 
-void arpt_unregister_table(struct net *net, struct xt_table *table,
-			   const struct nf_hook_ops *ops)
+void arpt_unregister_table_pre_exit(struct net *net, struct xt_table *table,
+				    const struct nf_hook_ops *ops)
 {
 	nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
+}
+EXPORT_SYMBOL(arpt_unregister_table_pre_exit);
+
+void arpt_unregister_table(struct net *net, struct xt_table *table)
+{
 	__arpt_unregister_table(net, table);
 }
 
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index c216b9ad3bb2..6c300ba5634e 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -56,16 +56,24 @@ static int __net_init arptable_filter_table_init(struct net *net)
 	return err;
 }
 
+static void __net_exit arptable_filter_net_pre_exit(struct net *net)
+{
+	if (net->ipv4.arptable_filter)
+		arpt_unregister_table_pre_exit(net, net->ipv4.arptable_filter,
+					       arpfilter_ops);
+}
+
 static void __net_exit arptable_filter_net_exit(struct net *net)
 {
 	if (!net->ipv4.arptable_filter)
 		return;
-	arpt_unregister_table(net, net->ipv4.arptable_filter, arpfilter_ops);
+	arpt_unregister_table(net, net->ipv4.arptable_filter);
 	net->ipv4.arptable_filter = NULL;
 }
 
 static struct pernet_operations arptable_filter_net_ops = {
 	.exit = arptable_filter_net_exit,
+	.pre_exit = arptable_filter_net_pre_exit,
 };
 
 static int __init arptable_filter_init(void)
-- 
cgit v1.2.3


From 4b2b4cc50ba6d607d1611ea6b2046a58d16e45eb Mon Sep 17 00:00:00 2001
From: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Date: Mon, 29 Mar 2021 13:50:47 +0300
Subject: i2c: Remove support for dangling device properties

From now on only accepting complete software nodes.

Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Signed-off-by: Wolfram Sang <wsa@kernel.org>
---
 drivers/i2c/i2c-boardinfo.c | 11 -----------
 drivers/i2c/i2c-core-base.c | 15 +--------------
 include/linux/i2c.h         |  2 --
 3 files changed, 1 insertion(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/i2c/i2c-boardinfo.c b/drivers/i2c/i2c-boardinfo.c
index 8bc51d4e69df..4df8ad092df3 100644
--- a/drivers/i2c/i2c-boardinfo.c
+++ b/drivers/i2c/i2c-boardinfo.c
@@ -47,7 +47,6 @@ EXPORT_SYMBOL_GPL(__i2c_first_dynamic_bus_num);
  *
  * The board info passed can safely be __initdata, but be careful of embedded
  * pointers (for platform_data, functions, etc) since that won't be copied.
- * Device properties are deep-copied though.
  */
 int i2c_register_board_info(int busnum, struct i2c_board_info const *info, unsigned len)
 {
@@ -72,16 +71,6 @@ int i2c_register_board_info(int busnum, struct i2c_board_info const *info, unsig
 		devinfo->busnum = busnum;
 		devinfo->board_info = *info;
 
-		if (info->properties) {
-			devinfo->board_info.properties =
-					property_entries_dup(info->properties);
-			if (IS_ERR(devinfo->board_info.properties)) {
-				status = PTR_ERR(devinfo->board_info.properties);
-				kfree(devinfo);
-				break;
-			}
-		}
-
 		if (info->resources) {
 			devinfo->board_info.resources =
 				kmemdup(info->resources,
diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c
index 266b2013b1f1..d296b5000a1b 100644
--- a/drivers/i2c/i2c-core-base.c
+++ b/drivers/i2c/i2c-core-base.c
@@ -909,23 +909,13 @@ i2c_new_client_device(struct i2c_adapter *adap, struct i2c_board_info const *inf
 
 	i2c_dev_set_name(adap, client, info);
 
-	if (info->properties) {
-		status = device_add_properties(&client->dev, info->properties);
-		if (status) {
-			dev_err(&adap->dev,
-				"Failed to add properties to client %s: %d\n",
-				client->name, status);
-			goto out_err_put_of_node;
-		}
-	}
-
 	if (info->swnode) {
 		status = device_add_software_node(&client->dev, info->swnode);
 		if (status) {
 			dev_err(&adap->dev,
 				"Failed to add software node to client %s: %d\n",
 				client->name, status);
-			goto out_free_props;
+			goto out_err_put_of_node;
 		}
 	}
 
@@ -940,9 +930,6 @@ i2c_new_client_device(struct i2c_adapter *adap, struct i2c_board_info const *inf
 
 out_remove_swnode:
 	device_remove_software_node(&client->dev);
-out_free_props:
-	if (info->properties)
-		device_remove_properties(&client->dev);
 out_err_put_of_node:
 	of_node_put(info->of_node);
 out_err:
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index cb1f882a3e88..54b3ccc71e37 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -391,7 +391,6 @@ static inline bool i2c_detect_slave_mode(struct device *dev) { return false; }
  * @platform_data: stored in i2c_client.dev.platform_data
  * @of_node: pointer to OpenFirmware device node
  * @fwnode: device node supplied by the platform firmware
- * @properties: Deprecated - use swnode instead
  * @swnode: software node for the device
  * @resources: resources associated with the device
  * @num_resources: number of resources in the @resources array
@@ -416,7 +415,6 @@ struct i2c_board_info {
 	void		*platform_data;
 	struct device_node *of_node;
 	struct fwnode_handle *fwnode;
-	const struct property_entry *properties;
 	const struct software_node *swnode;
 	const struct resource *resources;
 	unsigned int	num_resources;
-- 
cgit v1.2.3


From 07740c92ae57ca21204f1e0c6f59272cdf3190cc Mon Sep 17 00:00:00 2001
From: Yicong Yang <yangyicong@hisilicon.com>
Date: Thu, 8 Apr 2021 19:17:17 +0800
Subject: i2c: core: add managed function for adding i2c adapters

Some I2C controller drivers will only unregister the I2C
adapter in their .remove() callback, which can be done
by simply using a managed variant to add the I2C adapter.

So add the managed functions for adding the I2C adapter.

Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Dmitry Osipenko <digetx@gmail.com>
Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
Signed-off-by: Wolfram Sang <wsa@kernel.org>
---
 drivers/i2c/i2c-core-base.c | 26 ++++++++++++++++++++++++++
 include/linux/i2c.h         |  1 +
 2 files changed, 27 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c
index c81cc9a09877..c4c28b585b50 100644
--- a/drivers/i2c/i2c-core-base.c
+++ b/drivers/i2c/i2c-core-base.c
@@ -1693,6 +1693,32 @@ void i2c_del_adapter(struct i2c_adapter *adap)
 }
 EXPORT_SYMBOL(i2c_del_adapter);
 
+static void devm_i2c_del_adapter(void *adapter)
+{
+	i2c_del_adapter(adapter);
+}
+
+/**
+ * devm_i2c_add_adapter - device-managed variant of i2c_add_adapter()
+ * @dev: managing device for adding this I2C adapter
+ * @adapter: the adapter to add
+ * Context: can sleep
+ *
+ * Add adapter with dynamic bus number, same as i2c_add_adapter()
+ * but the adapter will be auto deleted on driver detach.
+ */
+int devm_i2c_add_adapter(struct device *dev, struct i2c_adapter *adapter)
+{
+	int ret;
+
+	ret = i2c_add_adapter(adapter);
+	if (ret)
+		return ret;
+
+	return devm_add_action_or_reset(dev, devm_i2c_del_adapter, adapter);
+}
+EXPORT_SYMBOL_GPL(devm_i2c_add_adapter);
+
 static void i2c_parse_timing(struct device *dev, char *prop_name, u32 *cur_val_p,
 			    u32 def_val, bool use_def)
 {
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index 59b892c5bb05..ce473ddc0378 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -846,6 +846,7 @@ static inline void i2c_mark_adapter_resumed(struct i2c_adapter *adap)
  */
 #if IS_ENABLED(CONFIG_I2C)
 int i2c_add_adapter(struct i2c_adapter *adap);
+int devm_i2c_add_adapter(struct device *dev, struct i2c_adapter *adapter);
 void i2c_del_adapter(struct i2c_adapter *adap);
 int i2c_add_numbered_adapter(struct i2c_adapter *adap);
 
-- 
cgit v1.2.3


From 3b4c747cd32078172dd238929e38a43cfed83580 Mon Sep 17 00:00:00 2001
From: Yicong Yang <yangyicong@hisilicon.com>
Date: Thu, 8 Apr 2021 19:17:18 +0800
Subject: i2c: core: add api to provide frequency mode strings

Some I2C drivers like Designware and HiSilicon will print the
bus frequency mode information, so add a public one that everyone
can make use of.

Tested-by: Jarkko Nikula <jarkko.nikula@linux.intel.com>
Reviewed-by: Jarkko Nikula <jarkko.nikula@linux.intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
Signed-off-by: Wolfram Sang <wsa@kernel.org>
---
 drivers/i2c/i2c-core-base.c | 21 +++++++++++++++++++++
 include/linux/i2c.h         |  3 +++
 2 files changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c
index c4c28b585b50..5c16083694bf 100644
--- a/drivers/i2c/i2c-core-base.c
+++ b/drivers/i2c/i2c-core-base.c
@@ -76,6 +76,27 @@ void i2c_transfer_trace_unreg(void)
 	static_branch_dec(&i2c_trace_msg_key);
 }
 
+const char *i2c_freq_mode_string(u32 bus_freq_hz)
+{
+	switch (bus_freq_hz) {
+	case I2C_MAX_STANDARD_MODE_FREQ:
+		return "Standard Mode (100 kHz)";
+	case I2C_MAX_FAST_MODE_FREQ:
+		return "Fast Mode (400 kHz)";
+	case I2C_MAX_FAST_MODE_PLUS_FREQ:
+		return "Fast Mode Plus (1.0 MHz)";
+	case I2C_MAX_TURBO_MODE_FREQ:
+		return "Turbo Mode (1.4 MHz)";
+	case I2C_MAX_HIGH_SPEED_MODE_FREQ:
+		return "High Speed Mode (3.4 MHz)";
+	case I2C_MAX_ULTRA_FAST_MODE_FREQ:
+		return "Ultra Fast Mode (5.0 MHz)";
+	default:
+		return "Unknown Mode";
+	}
+}
+EXPORT_SYMBOL_GPL(i2c_freq_mode_string);
+
 const struct i2c_device_id *i2c_match_id(const struct i2c_device_id *id,
 						const struct i2c_client *client)
 {
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index ce473ddc0378..e5d8b9dad6bf 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -51,6 +51,9 @@ struct module;
 struct property_entry;
 
 #if IS_ENABLED(CONFIG_I2C)
+/* Return the Frequency mode string based on the bus frequency */
+const char *i2c_freq_mode_string(u32 bus_freq_hz);
+
 /*
  * The master routines are the ones normally used to transmit data to devices
  * on a bus (or read from them). Apart from two basic transfer functions to
-- 
cgit v1.2.3


From 5b5475826c5265cead7ce4ca6d34ec0c566c70aa Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Sun, 21 Mar 2021 18:38:32 -0700
Subject: i2c: ensure timely release of driver-allocated resources

More and more drivers rely on devres to manage their resources, however
if bus' probe() and release() methods are not trivial and control some
of resources as well (for example enable or disable clocks, or attach
device to a power domain), we need to make sure that driver-allocated
resources are released immediately after driver's remove() method
returns, and not postponed until driver core gets around to releasing
resources. To fix that we open a new devres group before calling
driver's probe() and explicitly release it when we return from driver's
remove().

Tested-by: Jeff LaBundy <jeff@labundy.com>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Signed-off-by: Wolfram Sang <wsa@kernel.org>
---
 drivers/i2c/i2c-core-base.c | 21 ++++++++++++++++++++-
 include/linux/i2c.h         |  3 +++
 2 files changed, 23 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c
index 5c16083694bf..4351bf529d0f 100644
--- a/drivers/i2c/i2c-core-base.c
+++ b/drivers/i2c/i2c-core-base.c
@@ -539,6 +539,13 @@ static int i2c_device_probe(struct device *dev)
 	if (status)
 		goto err_clear_wakeup_irq;
 
+	client->devres_group_id = devres_open_group(&client->dev, NULL,
+						    GFP_KERNEL);
+	if (!client->devres_group_id) {
+		status = -ENOMEM;
+		goto err_detach_pm_domain;
+	}
+
 	/*
 	 * When there are no more users of probe(),
 	 * rename probe_new to probe.
@@ -551,11 +558,21 @@ static int i2c_device_probe(struct device *dev)
 	else
 		status = -EINVAL;
 
+	/*
+	 * Note that we are not closing the devres group opened above so
+	 * even resources that were attached to the device after probe is
+	 * run are released when i2c_device_remove() is executed. This is
+	 * needed as some drivers would allocate additional resources,
+	 * for example when updating firmware.
+	 */
+
 	if (status)
-		goto err_detach_pm_domain;
+		goto err_release_driver_resources;
 
 	return 0;
 
+err_release_driver_resources:
+	devres_release_group(&client->dev, client->devres_group_id);
 err_detach_pm_domain:
 	dev_pm_domain_detach(&client->dev, true);
 err_clear_wakeup_irq:
@@ -584,6 +601,8 @@ static int i2c_device_remove(struct device *dev)
 			dev_warn(dev, "remove failed (%pe), will be ignored\n", ERR_PTR(status));
 	}
 
+	devres_release_group(&client->dev, client->devres_group_id);
+
 	dev_pm_domain_detach(&client->dev, true);
 
 	dev_pm_clear_wake_irq(&client->dev);
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index e5d8b9dad6bf..e8f2ac8c9c3d 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -309,6 +309,8 @@ struct i2c_driver {
  *	userspace_devices list
  * @slave_cb: Callback when I2C slave mode of an adapter is used. The adapter
  *	calls it to pass on slave events to the slave driver.
+ * @devres_group_id: id of the devres group that will be created for resources
+ *	acquired when probing this device.
  *
  * An i2c_client identifies a single device (i.e. chip) connected to an
  * i2c bus. The behaviour exposed to Linux is defined by the driver
@@ -337,6 +339,7 @@ struct i2c_client {
 #if IS_ENABLED(CONFIG_I2C_SLAVE)
 	i2c_slave_cb_t slave_cb;	/* callback for slave mode	*/
 #endif
+	void *devres_group_id;		/* ID of probe devres group	*/
 };
 #define to_i2c_client(d) container_of(d, struct i2c_client, dev)
 
-- 
cgit v1.2.3


From 8f7b5054755e48cc7b217a41e3f1891e01338d2f Mon Sep 17 00:00:00 2001
From: Vincenzo Frascino <vincenzo.frascino@arm.com>
Date: Mon, 15 Mar 2021 13:20:14 +0000
Subject: kasan: Add report for async mode

KASAN provides an asynchronous mode of execution.

Add reporting functionality for this mode.

Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Konovalov <andreyknvl@google.com>
Reviewed-by: Andrey Konovalov <andreyknvl@google.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Andrey Konovalov <andreyknvl@google.com>
Tested-by: Andrey Konovalov <andreyknvl@google.com>
Signed-off-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Link: https://lore.kernel.org/r/20210315132019.33202-5-vincenzo.frascino@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 include/linux/kasan.h |  6 ++++++
 mm/kasan/kasan.h      | 16 ++++++++++++++++
 mm/kasan/report.c     | 17 ++++++++++++++++-
 3 files changed, 38 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index b91732bd05d7..e7c20c79b342 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -376,6 +376,12 @@ static inline void *kasan_reset_tag(const void *addr)
 
 #endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS*/
 
+#ifdef CONFIG_KASAN_HW_TAGS
+
+void kasan_report_async(void);
+
+#endif /* CONFIG_KASAN_HW_TAGS */
+
 #ifdef CONFIG_KASAN_SW_TAGS
 void __init kasan_init_sw_tags(void);
 #else
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index a3d0d165d97d..02e9656b857f 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -7,17 +7,33 @@
 #include <linux/stackdepot.h>
 
 #ifdef CONFIG_KASAN_HW_TAGS
+
 #include <linux/static_key.h>
+
 DECLARE_STATIC_KEY_FALSE(kasan_flag_stacktrace);
+extern bool kasan_flag_async __ro_after_init;
+
 static inline bool kasan_stack_collection_enabled(void)
 {
 	return static_branch_unlikely(&kasan_flag_stacktrace);
 }
+
+static inline bool kasan_async_mode_enabled(void)
+{
+	return kasan_flag_async;
+}
 #else
+
 static inline bool kasan_stack_collection_enabled(void)
 {
 	return true;
 }
+
+static inline bool kasan_async_mode_enabled(void)
+{
+	return false;
+}
+
 #endif
 
 extern bool kasan_flag_panic __ro_after_init;
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 87b271206163..8b0843a2cdd7 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -87,7 +87,8 @@ static void start_report(unsigned long *flags)
 
 static void end_report(unsigned long *flags, unsigned long addr)
 {
-	trace_error_report_end(ERROR_DETECTOR_KASAN, addr);
+	if (!kasan_async_mode_enabled())
+		trace_error_report_end(ERROR_DETECTOR_KASAN, addr);
 	pr_err("==================================================================\n");
 	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
 	spin_unlock_irqrestore(&report_lock, *flags);
@@ -360,6 +361,20 @@ void kasan_report_invalid_free(void *object, unsigned long ip)
 	end_report(&flags, (unsigned long)object);
 }
 
+#ifdef CONFIG_KASAN_HW_TAGS
+void kasan_report_async(void)
+{
+	unsigned long flags;
+
+	start_report(&flags);
+	pr_err("BUG: KASAN: invalid-access\n");
+	pr_err("Asynchronous mode enabled: no access details available\n");
+	pr_err("\n");
+	dump_stack();
+	end_report(&flags, 0);
+}
+#endif /* CONFIG_KASAN_HW_TAGS */
+
 static void __kasan_report(unsigned long addr, size_t size, bool is_write,
 				unsigned long ip)
 {
-- 
cgit v1.2.3


From c781ff12a2f37a9795e13bf328e5053d3e69f9e0 Mon Sep 17 00:00:00 2001
From: Vladyslav Tarasiuk <vladyslavt@nvidia.com>
Date: Fri, 9 Apr 2021 11:06:34 +0300
Subject: ethtool: Allow network drivers to dump arbitrary EEPROM data

Define get_module_eeprom_by_page() ethtool callback and implement
netlink infrastructure.

get_module_eeprom_by_page() allows network drivers to dump a part of
module's EEPROM specified by page and bank numbers along with offset and
length. It is effectively a netlink replacement for get_module_info()
and get_module_eeprom() pair, which is needed due to emergence of
complex non-linear EEPROM layouts.

Signed-off-by: Vladyslav Tarasiuk <vladyslavt@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ethtool-netlink.rst |  36 +++++-
 include/linux/ethtool.h                      |  33 +++++-
 include/uapi/linux/ethtool_netlink.h         |  19 +++
 net/ethtool/Makefile                         |   2 +-
 net/ethtool/eeprom.c                         | 171 +++++++++++++++++++++++++++
 net/ethtool/netlink.c                        |  11 ++
 net/ethtool/netlink.h                        |   2 +
 7 files changed, 270 insertions(+), 4 deletions(-)
 create mode 100644 net/ethtool/eeprom.c

(limited to 'include/linux')

diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst
index ce4a69f8308f..bbecffc7b11a 100644
--- a/Documentation/networking/ethtool-netlink.rst
+++ b/Documentation/networking/ethtool-netlink.rst
@@ -1338,6 +1338,38 @@ in an implementation specific way.
 ``ETHTOOL_A_FEC_AUTO`` requests the driver to choose FEC mode based on SFP
 module parameters. This does not mean autonegotiation.
 
+MODULE_EEPROM
+=============
+
+Fetch module EEPROM data dump.
+This interface is designed to allow dumps of at most 1/2 page at once. This
+means only dumps of 128 (or fewer) bytes are allowed, without crossing the half
+page boundary located at offset 128. For pages other than 0 only the high 128
+bytes are accessible.
+
+Request contents:
+
+  =======================================  ======  ==========================
+  ``ETHTOOL_A_MODULE_EEPROM_HEADER``       nested  request header
+  ``ETHTOOL_A_MODULE_EEPROM_OFFSET``       u32     offset within a page
+  ``ETHTOOL_A_MODULE_EEPROM_LENGTH``       u32     amount of bytes to read
+  ``ETHTOOL_A_MODULE_EEPROM_PAGE``         u8      page number
+  ``ETHTOOL_A_MODULE_EEPROM_BANK``         u8      bank number
+  ``ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS``  u8      page I2C address
+  =======================================  ======  ==========================
+
+Kernel response contents:
+
+ +---------------------------------------------+--------+---------------------+
+ | ``ETHTOOL_A_MODULE_EEPROM_HEADER``          | nested | reply header        |
+ +---------------------------------------------+--------+---------------------+
+ | ``ETHTOOL_A_MODULE_EEPROM_DATA``            | binary | array of bytes from |
+ |                                             |        | module EEPROM       |
+ +---------------------------------------------+--------+---------------------+
+
+``ETHTOOL_A_MODULE_EEPROM_DATA`` has an attribute length equal to the number of
+bytes the driver actually read.
+
 Request translation
 ===================
 
@@ -1415,8 +1447,8 @@ are netlink only.
   ``ETHTOOL_GET_DUMP_FLAG``           n/a
   ``ETHTOOL_GET_DUMP_DATA``           n/a
   ``ETHTOOL_GET_TS_INFO``             ``ETHTOOL_MSG_TSINFO_GET``
-  ``ETHTOOL_GMODULEINFO``             n/a
-  ``ETHTOOL_GMODULEEEPROM``           n/a
+  ``ETHTOOL_GMODULEINFO``             ``ETHTOOL_MSG_MODULE_EEPROM_GET``
+  ``ETHTOOL_GMODULEEEPROM``           ``ETHTOOL_MSG_MODULE_EEPROM_GET``
   ``ETHTOOL_GEEE``                    ``ETHTOOL_MSG_EEE_GET``
   ``ETHTOOL_SEEE``                    ``ETHTOOL_MSG_EEE_SET``
   ``ETHTOOL_GRSSH``                   n/a
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 4290e2fa3117..9f6f323af59a 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -81,6 +81,7 @@ enum {
 #define ETH_RSS_HASH_NO_CHANGE	0
 
 struct net_device;
+struct netlink_ext_ack;
 
 /* Some generic methods drivers may use in their ethtool_ops */
 u32 ethtool_op_get_link(struct net_device *dev);
@@ -262,6 +263,31 @@ struct ethtool_pause_stats {
 	u64 rx_pause_frames;
 };
 
+#define ETH_MODULE_EEPROM_PAGE_LEN	128
+#define ETH_MODULE_MAX_I2C_ADDRESS	0x7f
+
+/**
+ * struct ethtool_module_eeprom - EEPROM dump from specified page
+ * @offset: Offset within the specified EEPROM page to begin read, in bytes.
+ * @length: Number of bytes to read.
+ * @page: Page number to read from.
+ * @bank: Page bank number to read from, if applicable by EEPROM spec.
+ * @i2c_address: I2C address of a page. Value of at most 0x7f expected. Most
+ *	EEPROMs use 0x50 or 0x51.
+ * @data: Pointer to buffer with EEPROM data of @length size.
+ *
+ * This can be used to manage pages during EEPROM dump in ethtool and pass
+ * required information to the driver.
+ */
+struct ethtool_module_eeprom {
+	__u32	offset;
+	__u32	length;
+	__u8	page;
+	__u8	bank;
+	__u8	i2c_address;
+	__u8	*data;
+};
+
 /**
  * struct ethtool_ops - optional netdev operations
  * @cap_link_lanes_supported: indicates if the driver supports lanes
@@ -414,6 +440,9 @@ struct ethtool_pause_stats {
  *	cannot use the standard PHY library helpers.
  * @get_phy_tunable: Read the value of a PHY tunable.
  * @set_phy_tunable: Set the value of a PHY tunable.
+ * @get_module_eeprom_by_page: Get a region of plug-in module EEPROM data from
+ *	specified page. Returns a negative error code or the amount of bytes
+ *	read.
  *
  * All operations are optional (i.e. the function pointer may be set
  * to %NULL) and callers must take this into account.  Callers must
@@ -519,6 +548,9 @@ struct ethtool_ops {
 				   const struct ethtool_tunable *, void *);
 	int	(*set_phy_tunable)(struct net_device *,
 				   const struct ethtool_tunable *, const void *);
+	int	(*get_module_eeprom_by_page)(struct net_device *dev,
+					     const struct ethtool_module_eeprom *page,
+					     struct netlink_ext_ack *extack);
 };
 
 int ethtool_check_ops(const struct ethtool_ops *ops);
@@ -542,7 +574,6 @@ int ethtool_virtdev_set_link_ksettings(struct net_device *dev,
 				       const struct ethtool_link_ksettings *cmd,
 				       u32 *dev_speed, u8 *dev_duplex);
 
-struct netlink_ext_ack;
 struct phy_device;
 struct phy_tdr_config;
 
diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h
index 7f1bdb5b31ba..9612dcd48a6a 100644
--- a/include/uapi/linux/ethtool_netlink.h
+++ b/include/uapi/linux/ethtool_netlink.h
@@ -44,6 +44,7 @@ enum {
 	ETHTOOL_MSG_TUNNEL_INFO_GET,
 	ETHTOOL_MSG_FEC_GET,
 	ETHTOOL_MSG_FEC_SET,
+	ETHTOOL_MSG_MODULE_EEPROM_GET,
 
 	/* add new constants above here */
 	__ETHTOOL_MSG_USER_CNT,
@@ -84,6 +85,7 @@ enum {
 	ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY,
 	ETHTOOL_MSG_FEC_GET_REPLY,
 	ETHTOOL_MSG_FEC_NTF,
+	ETHTOOL_MSG_MODULE_EEPROM_GET_REPLY,
 
 	/* add new constants above here */
 	__ETHTOOL_MSG_KERNEL_CNT,
@@ -646,6 +648,23 @@ enum {
 	ETHTOOL_A_FEC_MAX = (__ETHTOOL_A_FEC_CNT - 1)
 };
 
+/* MODULE EEPROM */
+
+enum {
+	ETHTOOL_A_MODULE_EEPROM_UNSPEC,
+	ETHTOOL_A_MODULE_EEPROM_HEADER,			/* nest - _A_HEADER_* */
+
+	ETHTOOL_A_MODULE_EEPROM_OFFSET,			/* u32 */
+	ETHTOOL_A_MODULE_EEPROM_LENGTH,			/* u32 */
+	ETHTOOL_A_MODULE_EEPROM_PAGE,			/* u8 */
+	ETHTOOL_A_MODULE_EEPROM_BANK,			/* u8 */
+	ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS,		/* u8 */
+	ETHTOOL_A_MODULE_EEPROM_DATA,			/* binary */
+
+	__ETHTOOL_A_MODULE_EEPROM_CNT,
+	ETHTOOL_A_MODULE_EEPROM_MAX = (__ETHTOOL_A_MODULE_EEPROM_CNT - 1)
+};
+
 /* generic netlink info */
 #define ETHTOOL_GENL_NAME "ethtool"
 #define ETHTOOL_GENL_VERSION 1
diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile
index c2dc9033a8f7..83842685fd8c 100644
--- a/net/ethtool/Makefile
+++ b/net/ethtool/Makefile
@@ -7,4 +7,4 @@ obj-$(CONFIG_ETHTOOL_NETLINK)	+= ethtool_nl.o
 ethtool_nl-y	:= netlink.o bitset.o strset.o linkinfo.o linkmodes.o \
 		   linkstate.o debug.o wol.o features.o privflags.o rings.o \
 		   channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \
-		   tunnels.o fec.o
+		   tunnels.o fec.o eeprom.o
diff --git a/net/ethtool/eeprom.c b/net/ethtool/eeprom.c
new file mode 100644
index 000000000000..8536dd905da5
--- /dev/null
+++ b/net/ethtool/eeprom.c
@@ -0,0 +1,171 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/ethtool.h>
+#include "netlink.h"
+#include "common.h"
+
+struct eeprom_req_info {
+	struct ethnl_req_info	base;
+	u32			offset;
+	u32			length;
+	u8			page;
+	u8			bank;
+	u8			i2c_address;
+};
+
+struct eeprom_reply_data {
+	struct ethnl_reply_data base;
+	u32			length;
+	u8			*data;
+};
+
+#define MODULE_EEPROM_REQINFO(__req_base) \
+	container_of(__req_base, struct eeprom_req_info, base)
+
+#define MODULE_EEPROM_REPDATA(__reply_base) \
+	container_of(__reply_base, struct eeprom_reply_data, base)
+
+static int eeprom_prepare_data(const struct ethnl_req_info *req_base,
+			       struct ethnl_reply_data *reply_base,
+			       struct genl_info *info)
+{
+	struct eeprom_reply_data *reply = MODULE_EEPROM_REPDATA(reply_base);
+	struct eeprom_req_info *request = MODULE_EEPROM_REQINFO(req_base);
+	struct ethtool_module_eeprom page_data = {0};
+	struct net_device *dev = reply_base->dev;
+	int ret;
+
+	if (!dev->ethtool_ops->get_module_eeprom_by_page)
+		return -EOPNOTSUPP;
+
+	page_data.offset = request->offset;
+	page_data.length = request->length;
+	page_data.i2c_address = request->i2c_address;
+	page_data.page = request->page;
+	page_data.bank = request->bank;
+	page_data.data = kmalloc(page_data.length, GFP_KERNEL);
+	if (!page_data.data)
+		return -ENOMEM;
+
+	ret = ethnl_ops_begin(dev);
+	if (ret)
+		goto err_free;
+
+	ret = dev->ethtool_ops->get_module_eeprom_by_page(dev, &page_data,
+							  info->extack);
+	if (ret < 0)
+		goto err_ops;
+
+	reply->length = ret;
+	reply->data = page_data.data;
+
+	ethnl_ops_complete(dev);
+	return 0;
+
+err_ops:
+	ethnl_ops_complete(dev);
+err_free:
+	kfree(page_data.data);
+	return ret;
+}
+
+static int eeprom_parse_request(struct ethnl_req_info *req_info, struct nlattr **tb,
+				struct netlink_ext_ack *extack)
+{
+	struct eeprom_req_info *request = MODULE_EEPROM_REQINFO(req_info);
+
+	if (!tb[ETHTOOL_A_MODULE_EEPROM_OFFSET] ||
+	    !tb[ETHTOOL_A_MODULE_EEPROM_LENGTH] ||
+	    !tb[ETHTOOL_A_MODULE_EEPROM_PAGE] ||
+	    !tb[ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS])
+		return -EINVAL;
+
+	request->i2c_address = nla_get_u8(tb[ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS]);
+	request->offset = nla_get_u32(tb[ETHTOOL_A_MODULE_EEPROM_OFFSET]);
+	request->length = nla_get_u32(tb[ETHTOOL_A_MODULE_EEPROM_LENGTH]);
+
+	if (!request->length)
+		return -EINVAL;
+
+	/* The following set of conditions limit the API to only dump 1/2
+	 * EEPROM page without crossing low page boundary located at offset 128.
+	 * This means user may only request dumps of length limited to 128 from
+	 * either low 128 bytes or high 128 bytes.
+	 * For pages higher than 0 only high 128 bytes are accessible.
+	 */
+	request->page = nla_get_u8(tb[ETHTOOL_A_MODULE_EEPROM_PAGE]);
+	if (request->page && request->offset < ETH_MODULE_EEPROM_PAGE_LEN) {
+		NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MODULE_EEPROM_PAGE],
+				    "reading from lower half page is allowed for page 0 only");
+		return -EINVAL;
+	}
+
+	if (request->offset < ETH_MODULE_EEPROM_PAGE_LEN &&
+	    request->offset + request->length > ETH_MODULE_EEPROM_PAGE_LEN) {
+		NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MODULE_EEPROM_LENGTH],
+				    "reading cross half page boundary is illegal");
+		return -EINVAL;
+	} else if (request->offset >= ETH_MODULE_EEPROM_PAGE_LEN * 2) {
+		NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MODULE_EEPROM_OFFSET],
+				    "offset is out of bounds");
+		return -EINVAL;
+	} else if (request->offset + request->length > ETH_MODULE_EEPROM_PAGE_LEN * 2) {
+		NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MODULE_EEPROM_LENGTH],
+				    "reading cross page boundary is illegal");
+		return -EINVAL;
+	}
+
+	if (tb[ETHTOOL_A_MODULE_EEPROM_BANK])
+		request->bank = nla_get_u8(tb[ETHTOOL_A_MODULE_EEPROM_BANK]);
+
+	return 0;
+}
+
+static int eeprom_reply_size(const struct ethnl_req_info *req_base,
+			     const struct ethnl_reply_data *reply_base)
+{
+	const struct eeprom_req_info *request = MODULE_EEPROM_REQINFO(req_base);
+
+	return nla_total_size(sizeof(u8) * request->length); /* _EEPROM_DATA */
+}
+
+static int eeprom_fill_reply(struct sk_buff *skb,
+			     const struct ethnl_req_info *req_base,
+			     const struct ethnl_reply_data *reply_base)
+{
+	struct eeprom_reply_data *reply = MODULE_EEPROM_REPDATA(reply_base);
+
+	return nla_put(skb, ETHTOOL_A_MODULE_EEPROM_DATA, reply->length, reply->data);
+}
+
+static void eeprom_cleanup_data(struct ethnl_reply_data *reply_base)
+{
+	struct eeprom_reply_data *reply = MODULE_EEPROM_REPDATA(reply_base);
+
+	kfree(reply->data);
+}
+
+const struct ethnl_request_ops ethnl_module_eeprom_request_ops = {
+	.request_cmd		= ETHTOOL_MSG_MODULE_EEPROM_GET,
+	.reply_cmd		= ETHTOOL_MSG_MODULE_EEPROM_GET_REPLY,
+	.hdr_attr		= ETHTOOL_A_MODULE_EEPROM_HEADER,
+	.req_info_size		= sizeof(struct eeprom_req_info),
+	.reply_data_size	= sizeof(struct eeprom_reply_data),
+
+	.parse_request		= eeprom_parse_request,
+	.prepare_data		= eeprom_prepare_data,
+	.reply_size		= eeprom_reply_size,
+	.fill_reply		= eeprom_fill_reply,
+	.cleanup_data		= eeprom_cleanup_data,
+};
+
+const struct nla_policy ethnl_module_eeprom_get_policy[] = {
+	[ETHTOOL_A_MODULE_EEPROM_HEADER]	= NLA_POLICY_NESTED(ethnl_header_policy),
+	[ETHTOOL_A_MODULE_EEPROM_OFFSET]	= { .type = NLA_U32 },
+	[ETHTOOL_A_MODULE_EEPROM_LENGTH]	= { .type = NLA_U32 },
+	[ETHTOOL_A_MODULE_EEPROM_PAGE]		= { .type = NLA_U8 },
+	[ETHTOOL_A_MODULE_EEPROM_BANK]		= { .type = NLA_U8 },
+	[ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS]	=
+		NLA_POLICY_RANGE(NLA_U8, 0, ETH_MODULE_MAX_I2C_ADDRESS),
+};
+
diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c
index 705a4b201564..5f5d7c4b3d4a 100644
--- a/net/ethtool/netlink.c
+++ b/net/ethtool/netlink.c
@@ -246,6 +246,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = {
 	[ETHTOOL_MSG_EEE_GET]		= &ethnl_eee_request_ops,
 	[ETHTOOL_MSG_FEC_GET]		= &ethnl_fec_request_ops,
 	[ETHTOOL_MSG_TSINFO_GET]	= &ethnl_tsinfo_request_ops,
+	[ETHTOOL_MSG_MODULE_EEPROM_GET]	= &ethnl_module_eeprom_request_ops,
 };
 
 static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb)
@@ -931,6 +932,16 @@ static const struct genl_ops ethtool_genl_ops[] = {
 		.policy = ethnl_fec_set_policy,
 		.maxattr = ARRAY_SIZE(ethnl_fec_set_policy) - 1,
 	},
+	{
+		.cmd	= ETHTOOL_MSG_MODULE_EEPROM_GET,
+		.flags  = GENL_UNS_ADMIN_PERM,
+		.doit	= ethnl_default_doit,
+		.start	= ethnl_default_start,
+		.dumpit	= ethnl_default_dumpit,
+		.done	= ethnl_default_done,
+		.policy = ethnl_module_eeprom_get_policy,
+		.maxattr = ARRAY_SIZE(ethnl_module_eeprom_get_policy) - 1,
+	},
 };
 
 static const struct genl_multicast_group ethtool_nl_mcgrps[] = {
diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h
index 785f7ee45930..4305ac971bb0 100644
--- a/net/ethtool/netlink.h
+++ b/net/ethtool/netlink.h
@@ -345,6 +345,7 @@ extern const struct ethnl_request_ops ethnl_pause_request_ops;
 extern const struct ethnl_request_ops ethnl_eee_request_ops;
 extern const struct ethnl_request_ops ethnl_tsinfo_request_ops;
 extern const struct ethnl_request_ops ethnl_fec_request_ops;
+extern const struct ethnl_request_ops ethnl_module_eeprom_request_ops;
 
 extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1];
 extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1];
@@ -378,6 +379,7 @@ extern const struct nla_policy ethnl_cable_test_tdr_act_policy[ETHTOOL_A_CABLE_T
 extern const struct nla_policy ethnl_tunnel_info_get_policy[ETHTOOL_A_TUNNEL_INFO_HEADER + 1];
 extern const struct nla_policy ethnl_fec_get_policy[ETHTOOL_A_FEC_HEADER + 1];
 extern const struct nla_policy ethnl_fec_set_policy[ETHTOOL_A_FEC_AUTO + 1];
+extern const struct nla_policy ethnl_module_eeprom_get_policy[ETHTOOL_A_MODULE_EEPROM_DATA + 1];
 
 int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info);
 int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info);
-- 
cgit v1.2.3


From e19b0a3474ab9ef90dd110af9f39fc87329755f1 Mon Sep 17 00:00:00 2001
From: Vladyslav Tarasiuk <vladyslavt@nvidia.com>
Date: Fri, 9 Apr 2021 11:06:35 +0300
Subject: net/mlx5: Refactor module EEPROM query

Prepare for ethtool_ops::get_module_eeprom_by_page() implementation by
extracting the common part of mlx5_query_module_eeprom() into a separate
function.

Signed-off-by: Vladyslav Tarasiuk <vladyslavt@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/port.c | 79 +++++++++++++++-----------
 include/linux/mlx5/port.h                      |  9 +++
 2 files changed, 54 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index 4bb219565c58..9b9f870d67a4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -353,67 +353,78 @@ static void mlx5_sfp_eeprom_params_set(u16 *i2c_addr, int *page_num, u16 *offset
 	*offset -= MLX5_EEPROM_PAGE_LENGTH;
 }
 
-int mlx5_query_module_eeprom(struct mlx5_core_dev *dev,
-			     u16 offset, u16 size, u8 *data)
+static int mlx5_query_mcia(struct mlx5_core_dev *dev,
+			   struct mlx5_module_eeprom_query_params *params, u8 *data)
 {
-	int module_num, status, err, page_num = 0;
 	u32 in[MLX5_ST_SZ_DW(mcia_reg)] = {};
 	u32 out[MLX5_ST_SZ_DW(mcia_reg)];
-	u16 i2c_addr = 0;
-	u8 module_id;
+	int status, err;
 	void *ptr;
+	u16 size;
+
+	size = min_t(int, params->size, MLX5_EEPROM_MAX_BYTES);
+
+	MLX5_SET(mcia_reg, in, l, 0);
+	MLX5_SET(mcia_reg, in, size, size);
+	MLX5_SET(mcia_reg, in, module, params->module_number);
+	MLX5_SET(mcia_reg, in, device_address, params->offset);
+	MLX5_SET(mcia_reg, in, page_number, params->page);
+	MLX5_SET(mcia_reg, in, i2c_device_address, params->i2c_address);
 
-	err = mlx5_query_module_num(dev, &module_num);
+	err = mlx5_core_access_reg(dev, in, sizeof(in), out,
+				   sizeof(out), MLX5_REG_MCIA, 0, 0);
 	if (err)
 		return err;
 
-	err = mlx5_query_module_id(dev, module_num, &module_id);
+	status = MLX5_GET(mcia_reg, out, status);
+	if (status) {
+		mlx5_core_err(dev, "query_mcia_reg failed: status: 0x%x\n",
+			      status);
+		return -EIO;
+	}
+
+	ptr = MLX5_ADDR_OF(mcia_reg, out, dword_0);
+	memcpy(data, ptr, size);
+
+	return size;
+}
+
+int mlx5_query_module_eeprom(struct mlx5_core_dev *dev,
+			     u16 offset, u16 size, u8 *data)
+{
+	struct mlx5_module_eeprom_query_params query = {0};
+	u8 module_id;
+	int err;
+
+	err = mlx5_query_module_num(dev, &query.module_number);
+	if (err)
+		return err;
+
+	err = mlx5_query_module_id(dev, query.module_number, &module_id);
 	if (err)
 		return err;
 
 	switch (module_id) {
 	case MLX5_MODULE_ID_SFP:
-		mlx5_sfp_eeprom_params_set(&i2c_addr, &page_num, &offset);
+		mlx5_sfp_eeprom_params_set(&query.i2c_address, &query.page, &query.offset);
 		break;
 	case MLX5_MODULE_ID_QSFP:
 	case MLX5_MODULE_ID_QSFP_PLUS:
 	case MLX5_MODULE_ID_QSFP28:
-		mlx5_qsfp_eeprom_params_set(&i2c_addr, &page_num, &offset);
+		mlx5_qsfp_eeprom_params_set(&query.i2c_address, &query.page, &query.offset);
 		break;
 	default:
 		mlx5_core_err(dev, "Module ID not recognized: 0x%x\n", module_id);
 		return -EINVAL;
 	}
 
-	if (offset + size > MLX5_EEPROM_PAGE_LENGTH)
+	if (query.offset + size > MLX5_EEPROM_PAGE_LENGTH)
 		/* Cross pages read, read until offset 256 in low page */
 		size -= offset + size - MLX5_EEPROM_PAGE_LENGTH;
 
-	size = min_t(int, size, MLX5_EEPROM_MAX_BYTES);
+	query.size = size;
 
-	MLX5_SET(mcia_reg, in, l, 0);
-	MLX5_SET(mcia_reg, in, module, module_num);
-	MLX5_SET(mcia_reg, in, i2c_device_address, i2c_addr);
-	MLX5_SET(mcia_reg, in, page_number, page_num);
-	MLX5_SET(mcia_reg, in, device_address, offset);
-	MLX5_SET(mcia_reg, in, size, size);
-
-	err = mlx5_core_access_reg(dev, in, sizeof(in), out,
-				   sizeof(out), MLX5_REG_MCIA, 0, 0);
-	if (err)
-		return err;
-
-	status = MLX5_GET(mcia_reg, out, status);
-	if (status) {
-		mlx5_core_err(dev, "query_mcia_reg failed: status: 0x%x\n",
-			      status);
-		return -EIO;
-	}
-
-	ptr = MLX5_ADDR_OF(mcia_reg, out, dword_0);
-	memcpy(data, ptr, size);
-
-	return size;
+	return mlx5_query_mcia(dev, &query, data);
 }
 EXPORT_SYMBOL_GPL(mlx5_query_module_eeprom);
 
diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h
index 23edd2db4803..90b87aa82db3 100644
--- a/include/linux/mlx5/port.h
+++ b/include/linux/mlx5/port.h
@@ -62,6 +62,15 @@ enum mlx5_an_status {
 #define MLX5_EEPROM_PAGE_LENGTH		256
 #define MLX5_EEPROM_HIGH_PAGE_LENGTH	128
 
+struct mlx5_module_eeprom_query_params {
+	u16 size;
+	u16 offset;
+	u16 i2c_address;
+	u32 page;
+	u32 bank;
+	u32 module_number;
+};
+
 enum mlx5e_link_mode {
 	MLX5E_1000BASE_CX_SGMII	 = 0,
 	MLX5E_1000BASE_KX	 = 1,
-- 
cgit v1.2.3


From e109d2b204daa223e6d3cdaa369071c3ea96dcbf Mon Sep 17 00:00:00 2001
From: Vladyslav Tarasiuk <vladyslavt@nvidia.com>
Date: Fri, 9 Apr 2021 11:06:36 +0300
Subject: net/mlx5: Implement get_module_eeprom_by_page()

Implement ethtool_ops::get_module_eeprom_by_page() to enable
support of new SFP standards.

Signed-off-by: Vladyslav Tarasiuk <vladyslavt@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   | 44 ++++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/port.c     | 41 ++++++++++++++++++++
 include/linux/mlx5/port.h                          |  2 +
 3 files changed, 87 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index b185a0452629..c8057a44d5ab 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -1770,6 +1770,49 @@ static int mlx5e_get_module_eeprom(struct net_device *netdev,
 	return 0;
 }
 
+static int mlx5e_get_module_eeprom_by_page(struct net_device *netdev,
+					   const struct ethtool_module_eeprom *page_data,
+					   struct netlink_ext_ack *extack)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5_module_eeprom_query_params query;
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u8 *data = page_data->data;
+	int size_read;
+	int i = 0;
+
+	if (!page_data->length)
+		return -EINVAL;
+
+	memset(data, 0, page_data->length);
+
+	query.offset = page_data->offset;
+	query.i2c_address = page_data->i2c_address;
+	query.bank = page_data->bank;
+	query.page = page_data->page;
+	while (i < page_data->length) {
+		query.size = page_data->length - i;
+		size_read = mlx5_query_module_eeprom_by_page(mdev, &query, data + i);
+
+		/* Done reading, return how many bytes was read */
+		if (!size_read)
+			return i;
+
+		if (size_read == -EINVAL)
+			return -EINVAL;
+		if (size_read < 0) {
+			netdev_err(priv->netdev, "%s: mlx5_query_module_eeprom_by_page failed:0x%x\n",
+				   __func__, size_read);
+			return i;
+		}
+
+		i += size_read;
+		query.offset += size_read;
+	}
+
+	return i;
+}
+
 int mlx5e_ethtool_flash_device(struct mlx5e_priv *priv,
 			       struct ethtool_flash *flash)
 {
@@ -2159,6 +2202,7 @@ const struct ethtool_ops mlx5e_ethtool_ops = {
 	.set_wol	   = mlx5e_set_wol,
 	.get_module_info   = mlx5e_get_module_info,
 	.get_module_eeprom = mlx5e_get_module_eeprom,
+	.get_module_eeprom_by_page = mlx5e_get_module_eeprom_by_page,
 	.flash_device      = mlx5e_flash_device,
 	.get_priv_flags    = mlx5e_get_priv_flags,
 	.set_priv_flags    = mlx5e_set_priv_flags,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index 9b9f870d67a4..522a41f8f1e2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -428,6 +428,47 @@ int mlx5_query_module_eeprom(struct mlx5_core_dev *dev,
 }
 EXPORT_SYMBOL_GPL(mlx5_query_module_eeprom);
 
+int mlx5_query_module_eeprom_by_page(struct mlx5_core_dev *dev,
+				     struct mlx5_module_eeprom_query_params *params,
+				     u8 *data)
+{
+	u8 module_id;
+	int err;
+
+	err = mlx5_query_module_num(dev, &params->module_number);
+	if (err)
+		return err;
+
+	err = mlx5_query_module_id(dev, params->module_number, &module_id);
+	if (err)
+		return err;
+
+	switch (module_id) {
+	case MLX5_MODULE_ID_SFP:
+		if (params->page > 0)
+			return -EINVAL;
+		break;
+	case MLX5_MODULE_ID_QSFP:
+	case MLX5_MODULE_ID_QSFP28:
+	case MLX5_MODULE_ID_QSFP_PLUS:
+		if (params->page > 3)
+			return -EINVAL;
+		break;
+	default:
+		mlx5_core_err(dev, "Module ID not recognized: 0x%x\n", module_id);
+		return -EINVAL;
+	}
+
+	if (params->i2c_address != MLX5_I2C_ADDR_HIGH &&
+	    params->i2c_address != MLX5_I2C_ADDR_LOW) {
+		mlx5_core_err(dev, "I2C address not recognized: 0x%x\n", params->i2c_address);
+		return -EINVAL;
+	}
+
+	return mlx5_query_mcia(dev, params, data);
+}
+EXPORT_SYMBOL_GPL(mlx5_query_module_eeprom_by_page);
+
 static int mlx5_query_port_pvlc(struct mlx5_core_dev *dev, u32 *pvlc,
 				int pvlc_size,  u8 local_port)
 {
diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h
index 90b87aa82db3..58d56adb9842 100644
--- a/include/linux/mlx5/port.h
+++ b/include/linux/mlx5/port.h
@@ -209,6 +209,8 @@ void mlx5_query_port_fcs(struct mlx5_core_dev *mdev, bool *supported,
 			 bool *enabled);
 int mlx5_query_module_eeprom(struct mlx5_core_dev *dev,
 			     u16 offset, u16 size, u8 *data);
+int mlx5_query_module_eeprom_by_page(struct mlx5_core_dev *dev,
+				     struct mlx5_module_eeprom_query_params *params, u8 *data);
 
 int mlx5_query_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *out);
 int mlx5_set_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *in);
-- 
cgit v1.2.3


From 4c88fa412a100f925b8ab1aa952a672895f69d35 Mon Sep 17 00:00:00 2001
From: Vladyslav Tarasiuk <vladyslavt@nvidia.com>
Date: Fri, 9 Apr 2021 11:06:37 +0300
Subject: net/mlx5: Add support for DSFP module EEPROM dumps

Allow the driver to recognise DSFP transceiver module ID and therefore
allow its EEPROM dumps using ethtool.

Signed-off-by: Vladyslav Tarasiuk <vladyslavt@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/port.c | 2 ++
 include/linux/mlx5/port.h                      | 1 +
 2 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index 522a41f8f1e2..1ef2b6a848c1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -454,6 +454,8 @@ int mlx5_query_module_eeprom_by_page(struct mlx5_core_dev *dev,
 		if (params->page > 3)
 			return -EINVAL;
 		break;
+	case MLX5_MODULE_ID_DSFP:
+		break;
 	default:
 		mlx5_core_err(dev, "Module ID not recognized: 0x%x\n", module_id);
 		return -EINVAL;
diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h
index 58d56adb9842..77ea4f9c5265 100644
--- a/include/linux/mlx5/port.h
+++ b/include/linux/mlx5/port.h
@@ -45,6 +45,7 @@ enum mlx5_module_id {
 	MLX5_MODULE_ID_QSFP             = 0xC,
 	MLX5_MODULE_ID_QSFP_PLUS        = 0xD,
 	MLX5_MODULE_ID_QSFP28           = 0x11,
+	MLX5_MODULE_ID_DSFP		= 0x1B,
 };
 
 enum mlx5_an_status {
-- 
cgit v1.2.3


From d740513f05a24b1a46722325974223980f068728 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Fri, 9 Apr 2021 11:06:40 +0300
Subject: phy: sfp: add netlink SFP support to generic SFP code

The new netlink API for reading SFP data requires a new op to be
implemented. The idea of the new netlink SFP code is that userspace is
responsible for parsing the EEPROM data and requesting pages, rather
than having the kernel decide which pages are interesting and return
them. This allows greater flexibility for newer formats.

Currently the generic SFP code only supports simple SFPs. Allow i2c
addresses 0x50 and 0x51 to be accessed; page and bank must always be
0. This interface will later be extended when, for example, QSFP
support is added.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Vladyslav Tarasiuk <vladyslavt@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/sfp-bus.c | 20 ++++++++++++++++++++
 drivers/net/phy/sfp.c     | 25 +++++++++++++++++++++++++
 drivers/net/phy/sfp.h     |  3 +++
 include/linux/sfp.h       | 10 ++++++++++
 4 files changed, 58 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c
index 2e11176c6b94..e61de66e973b 100644
--- a/drivers/net/phy/sfp-bus.c
+++ b/drivers/net/phy/sfp-bus.c
@@ -555,6 +555,26 @@ int sfp_get_module_eeprom(struct sfp_bus *bus, struct ethtool_eeprom *ee,
 }
 EXPORT_SYMBOL_GPL(sfp_get_module_eeprom);
 
+/**
+ * sfp_get_module_eeprom_by_page() - Read a page from the SFP module EEPROM
+ * @bus: a pointer to the &struct sfp_bus structure for the sfp module
+ * @page: a &struct ethtool_module_eeprom
+ * @extack: extack for reporting problems
+ *
+ * Read an EEPROM page as specified by the supplied @page. See the
+ * documentation for &struct ethtool_module_eeprom for the page to be read.
+ *
+ * Returns the number of bytes read or a negative errno number. More error
+ * information might be provided via extack
+ */
+int sfp_get_module_eeprom_by_page(struct sfp_bus *bus,
+				  const struct ethtool_module_eeprom *page,
+				  struct netlink_ext_ack *extack)
+{
+	return bus->socket_ops->module_eeprom_by_page(bus->sfp, page, extack);
+}
+EXPORT_SYMBOL_GPL(sfp_get_module_eeprom_by_page);
+
 /**
  * sfp_upstream_start() - Inform the SFP that the network device is up
  * @bus: a pointer to the &struct sfp_bus structure for the sfp module
diff --git a/drivers/net/phy/sfp.c b/drivers/net/phy/sfp.c
index 7998acc689b7..37f722c763d7 100644
--- a/drivers/net/phy/sfp.c
+++ b/drivers/net/phy/sfp.c
@@ -2330,6 +2330,30 @@ static int sfp_module_eeprom(struct sfp *sfp, struct ethtool_eeprom *ee,
 	return 0;
 }
 
+static int sfp_module_eeprom_by_page(struct sfp *sfp,
+				     const struct ethtool_module_eeprom *page,
+				     struct netlink_ext_ack *extack)
+{
+	if (page->bank) {
+		NL_SET_ERR_MSG(extack, "Banks not supported");
+		return -EOPNOTSUPP;
+	}
+
+	if (page->page) {
+		NL_SET_ERR_MSG(extack, "Only page 0 supported");
+		return -EOPNOTSUPP;
+	}
+
+	if (page->i2c_address != 0x50 &&
+	    page->i2c_address != 0x51) {
+		NL_SET_ERR_MSG(extack, "Only address 0x50 and 0x51 supported");
+		return -EOPNOTSUPP;
+	}
+
+	return sfp_read(sfp, page->i2c_address == 0x51, page->offset,
+			page->data, page->length);
+};
+
 static const struct sfp_socket_ops sfp_module_ops = {
 	.attach = sfp_attach,
 	.detach = sfp_detach,
@@ -2337,6 +2361,7 @@ static const struct sfp_socket_ops sfp_module_ops = {
 	.stop = sfp_stop,
 	.module_info = sfp_module_info,
 	.module_eeprom = sfp_module_eeprom,
+	.module_eeprom_by_page = sfp_module_eeprom_by_page,
 };
 
 static void sfp_timeout(struct work_struct *work)
diff --git a/drivers/net/phy/sfp.h b/drivers/net/phy/sfp.h
index b83f70526270..27226535c72b 100644
--- a/drivers/net/phy/sfp.h
+++ b/drivers/net/phy/sfp.h
@@ -14,6 +14,9 @@ struct sfp_socket_ops {
 	int (*module_info)(struct sfp *sfp, struct ethtool_modinfo *modinfo);
 	int (*module_eeprom)(struct sfp *sfp, struct ethtool_eeprom *ee,
 			     u8 *data);
+	int (*module_eeprom_by_page)(struct sfp *sfp,
+				     const struct ethtool_module_eeprom *page,
+				     struct netlink_ext_ack *extack);
 };
 
 int sfp_add_phy(struct sfp_bus *bus, struct phy_device *phydev);
diff --git a/include/linux/sfp.h b/include/linux/sfp.h
index 38893e4dd0f0..302094b855fb 100644
--- a/include/linux/sfp.h
+++ b/include/linux/sfp.h
@@ -542,6 +542,9 @@ phy_interface_t sfp_select_interface(struct sfp_bus *bus,
 int sfp_get_module_info(struct sfp_bus *bus, struct ethtool_modinfo *modinfo);
 int sfp_get_module_eeprom(struct sfp_bus *bus, struct ethtool_eeprom *ee,
 			  u8 *data);
+int sfp_get_module_eeprom_by_page(struct sfp_bus *bus,
+				  const struct ethtool_module_eeprom *page,
+				  struct netlink_ext_ack *extack);
 void sfp_upstream_start(struct sfp_bus *bus);
 void sfp_upstream_stop(struct sfp_bus *bus);
 void sfp_bus_put(struct sfp_bus *bus);
@@ -587,6 +590,13 @@ static inline int sfp_get_module_eeprom(struct sfp_bus *bus,
 	return -EOPNOTSUPP;
 }
 
+static inline int sfp_get_module_eeprom_by_page(struct sfp_bus *bus,
+						const struct ethtool_module_eeprom *page,
+						struct netlink_ext_ack *extack)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline void sfp_upstream_start(struct sfp_bus *bus)
 {
 }
-- 
cgit v1.2.3


From c7aab1a7c52b82d9afd7e03c398eb03dc2aa0507 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 1 Apr 2021 19:53:29 -0600
Subject: task_work: add helper for more targeted task_work canceling

The only exported helper we have right now is task_work_cancel(), which
cancels any task_work from a given task where func matches the queued
work item. This is a bit too coarse for some use cases. Add a
task_work_cancel_match() that allows individual work items to be
targeted by more than just the callback function used.

task_work_cancel() can be trivially implemented on top of that, hence do
so.

Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/task_work.h |  2 ++
 kernel/task_work.c        | 35 ++++++++++++++++++++++++++++-------
 2 files changed, 30 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/task_work.h b/include/linux/task_work.h
index 0d848a1e9e62..5b8a93f288bb 100644
--- a/include/linux/task_work.h
+++ b/include/linux/task_work.h
@@ -22,6 +22,8 @@ enum task_work_notify_mode {
 int task_work_add(struct task_struct *task, struct callback_head *twork,
 			enum task_work_notify_mode mode);
 
+struct callback_head *task_work_cancel_match(struct task_struct *task,
+	bool (*match)(struct callback_head *, void *data), void *data);
 struct callback_head *task_work_cancel(struct task_struct *, task_work_func_t);
 void task_work_run(void);
 
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 9cde961875c0..e9316198c64b 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -59,18 +59,17 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
 }
 
 /**
- * task_work_cancel - cancel a pending work added by task_work_add()
+ * task_work_cancel_match - cancel a pending work added by task_work_add()
  * @task: the task which should execute the work
- * @func: identifies the work to remove
- *
- * Find the last queued pending work with ->func == @func and remove
- * it from queue.
+ * @match: match function to call
  *
  * RETURNS:
  * The found work or NULL if not found.
  */
 struct callback_head *
-task_work_cancel(struct task_struct *task, task_work_func_t func)
+task_work_cancel_match(struct task_struct *task,
+		       bool (*match)(struct callback_head *, void *data),
+		       void *data)
 {
 	struct callback_head **pprev = &task->task_works;
 	struct callback_head *work;
@@ -86,7 +85,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
 	 */
 	raw_spin_lock_irqsave(&task->pi_lock, flags);
 	while ((work = READ_ONCE(*pprev))) {
-		if (work->func != func)
+		if (!match(work, data))
 			pprev = &work->next;
 		else if (cmpxchg(pprev, work, work->next) == work)
 			break;
@@ -96,6 +95,28 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
 	return work;
 }
 
+static bool task_work_func_match(struct callback_head *cb, void *data)
+{
+	return cb->func == data;
+}
+
+/**
+ * task_work_cancel - cancel a pending work added by task_work_add()
+ * @task: the task which should execute the work
+ * @func: identifies the work to remove
+ *
+ * Find the last queued pending work with ->func == @func and remove
+ * it from queue.
+ *
+ * RETURNS:
+ * The found work or NULL if not found.
+ */
+struct callback_head *
+task_work_cancel(struct task_struct *task, task_work_func_t func)
+{
+	return task_work_cancel_match(task, task_work_func_match, func);
+}
+
 /**
  * task_work_run - execute the works added by task_work_add()
  *
-- 
cgit v1.2.3


From 3f48cf18f886c97a7e775af10696bfed9ddcff31 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sun, 11 Apr 2021 01:46:27 +0100
Subject: io_uring: unify files and task cancel

Now __io_uring_task_cancel() and __io_uring_files_cancel() are very
similar and mostly differ by how we count requests. Merge them into
__io_uring_cancel() and let tctx_inflight() handle the counting.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/1a5986a97df4dc1378f3fe0ca1eb483dbcf42112.1618101759.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c            | 56 ++++++++++++------------------------------------
 include/linux/io_uring.h | 12 +++++------
 2 files changed, 19 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 99d617c7eae9..8e2a0f9acba8 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8920,13 +8920,10 @@ static void io_uring_clean_tctx(struct io_uring_task *tctx)
 	}
 }
 
-static s64 tctx_inflight_tracked(struct io_uring_task *tctx)
-{
-	return atomic_read(&tctx->inflight_tracked);
-}
-
-static s64 tctx_inflight(struct io_uring_task *tctx)
+static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
 {
+	if (tracked)
+		return atomic_read(&tctx->inflight_tracked);
 	return percpu_counter_sum(&tctx->inflight);
 }
 
@@ -8993,7 +8990,7 @@ static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx)
 	atomic_inc(&tctx->in_idle);
 	do {
 		/* read completions before cancelations */
-		inflight = tctx_inflight(tctx);
+		inflight = tctx_inflight(tctx, false);
 		if (!inflight)
 			break;
 		io_uring_try_cancel_requests(ctx, current, NULL);
@@ -9004,43 +9001,18 @@ static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx)
 		 * avoids a race where a completion comes in before we did
 		 * prepare_to_wait().
 		 */
-		if (inflight == tctx_inflight(tctx))
+		if (inflight == tctx_inflight(tctx, false))
 			schedule();
 		finish_wait(&tctx->wait, &wait);
 	} while (1);
 	atomic_dec(&tctx->in_idle);
 }
 
-void __io_uring_files_cancel(struct files_struct *files)
-{
-	struct io_uring_task *tctx = current->io_uring;
-	DEFINE_WAIT(wait);
-	s64 inflight;
-
-	/* make sure overflow events are dropped */
-	atomic_inc(&tctx->in_idle);
-	do {
-		/* read completions before cancelations */
-		inflight = tctx_inflight_tracked(tctx);
-		if (!inflight)
-			break;
-		io_uring_try_cancel(files);
-
-		prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
-		if (inflight == tctx_inflight_tracked(tctx))
-			schedule();
-		finish_wait(&tctx->wait, &wait);
-	} while (1);
-	atomic_dec(&tctx->in_idle);
-
-	io_uring_clean_tctx(tctx);
-}
-
 /*
  * Find any io_uring fd that this task has registered or done IO on, and cancel
  * requests.
  */
-void __io_uring_task_cancel(void)
+void __io_uring_cancel(struct files_struct *files)
 {
 	struct io_uring_task *tctx = current->io_uring;
 	DEFINE_WAIT(wait);
@@ -9048,15 +9020,14 @@ void __io_uring_task_cancel(void)
 
 	/* make sure overflow events are dropped */
 	atomic_inc(&tctx->in_idle);
-	__io_uring_files_cancel(NULL);
+	io_uring_try_cancel(files);
 
 	do {
 		/* read completions before cancelations */
-		inflight = tctx_inflight(tctx);
+		inflight = tctx_inflight(tctx, !!files);
 		if (!inflight)
 			break;
-		io_uring_try_cancel(NULL);
-
+		io_uring_try_cancel(files);
 		prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
 
 		/*
@@ -9064,16 +9035,17 @@ void __io_uring_task_cancel(void)
 		 * avoids a race where a completion comes in before we did
 		 * prepare_to_wait().
 		 */
-		if (inflight == tctx_inflight(tctx))
+		if (inflight == tctx_inflight(tctx, !!files))
 			schedule();
 		finish_wait(&tctx->wait, &wait);
 	} while (1);
-
 	atomic_dec(&tctx->in_idle);
 
 	io_uring_clean_tctx(tctx);
-	/* all current's requests should be gone, we can kill tctx */
-	__io_uring_free(current);
+	if (!files) {
+		/* for exec all current's requests should be gone, kill tctx */
+		__io_uring_free(current);
+	}
 }
 
 static void *io_uring_validate_mmap_request(struct file *file,
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 79cde9906be0..04b650bcbbe5 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -7,19 +7,17 @@
 
 #if defined(CONFIG_IO_URING)
 struct sock *io_uring_get_socket(struct file *file);
-void __io_uring_task_cancel(void);
-void __io_uring_files_cancel(struct files_struct *files);
+void __io_uring_cancel(struct files_struct *files);
 void __io_uring_free(struct task_struct *tsk);
 
-static inline void io_uring_task_cancel(void)
+static inline void io_uring_files_cancel(struct files_struct *files)
 {
 	if (current->io_uring)
-		__io_uring_task_cancel();
+		__io_uring_cancel(files);
 }
-static inline void io_uring_files_cancel(struct files_struct *files)
+static inline void io_uring_task_cancel(void)
 {
-	if (current->io_uring)
-		__io_uring_files_cancel(files);
+	return io_uring_files_cancel(NULL);
 }
 static inline void io_uring_free(struct task_struct *tsk)
 {
-- 
cgit v1.2.3


From 907d52310024fae6632aabfc7e833decaf185e5f Mon Sep 17 00:00:00 2001
From: Jeffle Xu <jefflexu@linux.alibaba.com>
Date: Thu, 1 Apr 2021 10:19:25 +0800
Subject: block: add queue_to_disk() to get gendisk from request_queue

Sometimes we need to get the corresponding gendisk from request_queue.

It is preferred that block drivers store private data in
gendisk->private_data rather than request_queue->queuedata, e.g. see:
commit c4a59c4e5db3 ("dm: stop using ->queuedata").

So if only request_queue is given, we need to get its corresponding
gendisk to get the private data stored in that gendisk.

Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jeffle Xu <jefflexu@linux.alibaba.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h       | 2 ++
 include/trace/events/kyber.h | 6 +++---
 2 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d5d320da51f8..30d2090583ad 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -689,6 +689,8 @@ static inline bool blk_account_rq(struct request *rq)
 	dma_map_page_attrs(dev, (bv)->bv_page, (bv)->bv_offset, (bv)->bv_len, \
 	(dir), (attrs))
 
+#define queue_to_disk(q)	(dev_to_disk(kobj_to_dev((q)->kobj.parent)))
+
 static inline bool queue_is_mq(struct request_queue *q)
 {
 	return q->mq_ops;
diff --git a/include/trace/events/kyber.h b/include/trace/events/kyber.h
index c0e7d24ca256..f9802562edf6 100644
--- a/include/trace/events/kyber.h
+++ b/include/trace/events/kyber.h
@@ -30,7 +30,7 @@ TRACE_EVENT(kyber_latency,
 	),
 
 	TP_fast_assign(
-		__entry->dev		= disk_devt(dev_to_disk(kobj_to_dev(q->kobj.parent)));
+		__entry->dev		= disk_devt(queue_to_disk(q));
 		strlcpy(__entry->domain, domain, sizeof(__entry->domain));
 		strlcpy(__entry->type, type, sizeof(__entry->type));
 		__entry->percentile	= percentile;
@@ -59,7 +59,7 @@ TRACE_EVENT(kyber_adjust,
 	),
 
 	TP_fast_assign(
-		__entry->dev		= disk_devt(dev_to_disk(kobj_to_dev(q->kobj.parent)));
+		__entry->dev		= disk_devt(queue_to_disk(q));
 		strlcpy(__entry->domain, domain, sizeof(__entry->domain));
 		__entry->depth		= depth;
 	),
@@ -81,7 +81,7 @@ TRACE_EVENT(kyber_throttled,
 	),
 
 	TP_fast_assign(
-		__entry->dev		= disk_devt(dev_to_disk(kobj_to_dev(q->kobj.parent)));
+		__entry->dev		= disk_devt(queue_to_disk(q));
 		strlcpy(__entry->domain, domain, sizeof(__entry->domain));
 	),
 
-- 
cgit v1.2.3


From 4c5b479975212065ef39786e115fde42847e95a9 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@redhat.com>
Date: Wed, 7 Apr 2021 14:36:42 +0200
Subject: vfs: add fileattr ops

There's a substantial amount of boilerplate in filesystems handling
FS_IOC_[GS]ETFLAGS/ FS_IOC_FS[GS]ETXATTR ioctls.

Also due to userspace buffers being involved in the ioctl API this is
difficult to stack, as shown by overlayfs issues related to these ioctls.

Introduce a new internal API named "fileattr" (fsxattr can be confused with
xattr, xflags is inappropriate, since this is more than just flags).

There's significant overlap between flags and xflags and this API handles
the conversions automatically, so filesystems may choose which one to use.

In ->fileattr_get() a hint is provided to the filesystem whether flags or
xattr are being requested by userspace, but in this series this hint is
ignored by all filesystems, since generating all the attributes is cheap.

If a filesystem doesn't implement the fileattr API, just fall back to
f_op->ioctl().  When all filesystems are converted, the fallback can be
removed.

32bit compat ioctls are now handled by the generic code as well.

Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
 Documentation/filesystems/locking.rst |  11 +-
 Documentation/filesystems/vfs.rst     |  15 ++
 fs/ioctl.c                            | 325 ++++++++++++++++++++++++++++++++++
 include/linux/fileattr.h              |  59 ++++++
 include/linux/fs.h                    |   4 +
 5 files changed, 411 insertions(+), 3 deletions(-)
 create mode 100644 include/linux/fileattr.h

(limited to 'include/linux')

diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
index b7dcc86c92a4..729ce492b662 100644
--- a/Documentation/filesystems/locking.rst
+++ b/Documentation/filesystems/locking.rst
@@ -80,13 +80,16 @@ prototypes::
 				struct file *, unsigned open_flag,
 				umode_t create_mode);
 	int (*tmpfile) (struct inode *, struct dentry *, umode_t);
+	int (*fileattr_set)(struct user_namespace *mnt_userns,
+			    struct dentry *dentry, struct fileattr *fa);
+	int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa);
 
 locking rules:
 	all may block
 
-============	=============================================
+=============	=============================================
 ops		i_rwsem(inode)
-============	=============================================
+=============	=============================================
 lookup:		shared
 create:		exclusive
 link:		exclusive (both)
@@ -107,7 +110,9 @@ fiemap:		no
 update_time:	no
 atomic_open:	shared (exclusive if O_CREAT is set in open flags)
 tmpfile:	no
-============	=============================================
+fileattr_get:	no or exclusive
+fileattr_set:	exclusive
+=============	=============================================
 
 
 	Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_rwsem
diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
index 2049bbf5e388..14c31eced416 100644
--- a/Documentation/filesystems/vfs.rst
+++ b/Documentation/filesystems/vfs.rst
@@ -441,6 +441,9 @@ As of kernel 2.6.22, the following members are defined:
 				   unsigned open_flag, umode_t create_mode);
 		int (*tmpfile) (struct user_namespace *, struct inode *, struct dentry *, umode_t);
 	        int (*set_acl)(struct user_namespace *, struct inode *, struct posix_acl *, int);
+		int (*fileattr_set)(struct user_namespace *mnt_userns,
+				    struct dentry *dentry, struct fileattr *fa);
+		int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa);
 	};
 
 Again, all methods are called without any locks being held, unless
@@ -588,6 +591,18 @@ otherwise noted.
 	atomically creating, opening and unlinking a file in given
 	directory.
 
+``fileattr_get``
+	called on ioctl(FS_IOC_GETFLAGS) and ioctl(FS_IOC_FSGETXATTR) to
+	retrieve miscellaneous file flags and attributes.  Also called
+	before the relevant SET operation to check what is being changed
+	(in this case with i_rwsem locked exclusive).  If unset, then
+	fall back to f_op->ioctl().
+
+``fileattr_set``
+	called on ioctl(FS_IOC_SETFLAGS) and ioctl(FS_IOC_FSSETXATTR) to
+	change miscellaneous file flags and attributes.  Callers hold
+	i_rwsem exclusive.  If unset, then fall back to f_op->ioctl().
+
 
 The Address Space Object
 ========================
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 4e6cc0a7d69c..1e2204fa9963 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -19,6 +19,9 @@
 #include <linux/falloc.h>
 #include <linux/sched/signal.h>
 #include <linux/fiemap.h>
+#include <linux/mount.h>
+#include <linux/fscrypt.h>
+#include <linux/fileattr.h>
 
 #include "internal.h"
 
@@ -657,6 +660,307 @@ out:
 	return ret;
 }
 
+/**
+ * fileattr_fill_xflags - initialize fileattr with xflags
+ * @fa:		fileattr pointer
+ * @xflags:	FS_XFLAG_* flags
+ *
+ * Set ->fsx_xflags, ->fsx_valid and ->flags (translated xflags).  All
+ * other fields are zeroed.
+ */
+void fileattr_fill_xflags(struct fileattr *fa, u32 xflags)
+{
+	memset(fa, 0, sizeof(*fa));
+	fa->fsx_valid = true;
+	fa->fsx_xflags = xflags;
+	if (fa->fsx_xflags & FS_XFLAG_IMMUTABLE)
+		fa->flags |= FS_IMMUTABLE_FL;
+	if (fa->fsx_xflags & FS_XFLAG_APPEND)
+		fa->flags |= FS_APPEND_FL;
+	if (fa->fsx_xflags & FS_XFLAG_SYNC)
+		fa->flags |= FS_SYNC_FL;
+	if (fa->fsx_xflags & FS_XFLAG_NOATIME)
+		fa->flags |= FS_NOATIME_FL;
+	if (fa->fsx_xflags & FS_XFLAG_NODUMP)
+		fa->flags |= FS_NODUMP_FL;
+	if (fa->fsx_xflags & FS_XFLAG_DAX)
+		fa->flags |= FS_DAX_FL;
+	if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
+		fa->flags |= FS_PROJINHERIT_FL;
+}
+EXPORT_SYMBOL(fileattr_fill_xflags);
+
+/**
+ * fileattr_fill_flags - initialize fileattr with flags
+ * @fa:		fileattr pointer
+ * @flags:	FS_*_FL flags
+ *
+ * Set ->flags, ->flags_valid and ->fsx_xflags (translated flags).
+ * All other fields are zeroed.
+ */
+void fileattr_fill_flags(struct fileattr *fa, u32 flags)
+{
+	memset(fa, 0, sizeof(*fa));
+	fa->flags_valid = true;
+	fa->flags = flags;
+	if (fa->flags & FS_SYNC_FL)
+		fa->fsx_xflags |= FS_XFLAG_SYNC;
+	if (fa->flags & FS_IMMUTABLE_FL)
+		fa->fsx_xflags |= FS_XFLAG_IMMUTABLE;
+	if (fa->flags & FS_APPEND_FL)
+		fa->fsx_xflags |= FS_XFLAG_APPEND;
+	if (fa->flags & FS_NODUMP_FL)
+		fa->fsx_xflags |= FS_XFLAG_NODUMP;
+	if (fa->flags & FS_NOATIME_FL)
+		fa->fsx_xflags |= FS_XFLAG_NOATIME;
+	if (fa->flags & FS_DAX_FL)
+		fa->fsx_xflags |= FS_XFLAG_DAX;
+	if (fa->flags & FS_PROJINHERIT_FL)
+		fa->fsx_xflags |= FS_XFLAG_PROJINHERIT;
+}
+EXPORT_SYMBOL(fileattr_fill_flags);
+
+/**
+ * vfs_fileattr_get - retrieve miscellaneous file attributes
+ * @dentry:	the object to retrieve from
+ * @fa:		fileattr pointer
+ *
+ * Call i_op->fileattr_get() callback, if exists.
+ *
+ * Return: 0 on success, or a negative error on failure.
+ */
+int vfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+{
+	struct inode *inode = d_inode(dentry);
+
+	if (!inode->i_op->fileattr_get)
+		return -ENOIOCTLCMD;
+
+	return inode->i_op->fileattr_get(dentry, fa);
+}
+EXPORT_SYMBOL(vfs_fileattr_get);
+
+/**
+ * copy_fsxattr_to_user - copy fsxattr to userspace.
+ * @fa:		fileattr pointer
+ * @ufa:	fsxattr user pointer
+ *
+ * Return: 0 on success, or -EFAULT on failure.
+ */
+int copy_fsxattr_to_user(const struct fileattr *fa, struct fsxattr __user *ufa)
+{
+	struct fsxattr xfa;
+
+	memset(&xfa, 0, sizeof(xfa));
+	xfa.fsx_xflags = fa->fsx_xflags;
+	xfa.fsx_extsize = fa->fsx_extsize;
+	xfa.fsx_nextents = fa->fsx_nextents;
+	xfa.fsx_projid = fa->fsx_projid;
+	xfa.fsx_cowextsize = fa->fsx_cowextsize;
+
+	if (copy_to_user(ufa, &xfa, sizeof(xfa)))
+		return -EFAULT;
+
+	return 0;
+}
+EXPORT_SYMBOL(copy_fsxattr_to_user);
+
+static int copy_fsxattr_from_user(struct fileattr *fa,
+				  struct fsxattr __user *ufa)
+{
+	struct fsxattr xfa;
+
+	if (copy_from_user(&xfa, ufa, sizeof(xfa)))
+		return -EFAULT;
+
+	fileattr_fill_xflags(fa, xfa.fsx_xflags);
+	fa->fsx_extsize = xfa.fsx_extsize;
+	fa->fsx_nextents = xfa.fsx_nextents;
+	fa->fsx_projid = xfa.fsx_projid;
+	fa->fsx_cowextsize = xfa.fsx_cowextsize;
+
+	return 0;
+}
+
+/*
+ * Generic function to check FS_IOC_FSSETXATTR/FS_IOC_SETFLAGS values and reject
+ * any invalid configurations.
+ *
+ * Note: must be called with inode lock held.
+ */
+static int fileattr_set_prepare(struct inode *inode,
+			      const struct fileattr *old_ma,
+			      struct fileattr *fa)
+{
+	int err;
+
+	/*
+	 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
+	 * the relevant capability.
+	 */
+	if ((fa->flags ^ old_ma->flags) & (FS_APPEND_FL | FS_IMMUTABLE_FL) &&
+	    !capable(CAP_LINUX_IMMUTABLE))
+		return -EPERM;
+
+	err = fscrypt_prepare_setflags(inode, old_ma->flags, fa->flags);
+	if (err)
+		return err;
+
+	/*
+	 * Project Quota ID state is only allowed to change from within the init
+	 * namespace. Enforce that restriction only if we are trying to change
+	 * the quota ID state. Everything else is allowed in user namespaces.
+	 */
+	if (current_user_ns() != &init_user_ns) {
+		if (old_ma->fsx_projid != fa->fsx_projid)
+			return -EINVAL;
+		if ((old_ma->fsx_xflags ^ fa->fsx_xflags) &
+				FS_XFLAG_PROJINHERIT)
+			return -EINVAL;
+	}
+
+	/* Check extent size hints. */
+	if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(inode->i_mode))
+		return -EINVAL;
+
+	if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) &&
+			!S_ISDIR(inode->i_mode))
+		return -EINVAL;
+
+	if ((fa->fsx_xflags & FS_XFLAG_COWEXTSIZE) &&
+	    !S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
+		return -EINVAL;
+
+	/*
+	 * It is only valid to set the DAX flag on regular files and
+	 * directories on filesystems.
+	 */
+	if ((fa->fsx_xflags & FS_XFLAG_DAX) &&
+	    !(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
+		return -EINVAL;
+
+	/* Extent size hints of zero turn off the flags. */
+	if (fa->fsx_extsize == 0)
+		fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT);
+	if (fa->fsx_cowextsize == 0)
+		fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;
+
+	return 0;
+}
+
+/**
+ * vfs_fileattr_set - change miscellaneous file attributes
+ * @mnt_userns:	user namespace of the mount
+ * @dentry:	the object to change
+ * @fa:		fileattr pointer
+ *
+ * After verifying permissions, call i_op->fileattr_set() callback, if
+ * exists.
+ *
+ * Verifying attributes involves retrieving current attributes with
+ * i_op->fileattr_get(), this also allows initializing attributes that have
+ * not been set by the caller to current values.  Inode lock is held
+ * throughout to prevent racing with another instance.
+ *
+ * Return: 0 on success, or a negative error on failure.
+ */
+int vfs_fileattr_set(struct user_namespace *mnt_userns, struct dentry *dentry,
+		     struct fileattr *fa)
+{
+	struct inode *inode = d_inode(dentry);
+	struct fileattr old_ma = {};
+	int err;
+
+	if (!inode->i_op->fileattr_set)
+		return -ENOIOCTLCMD;
+
+	if (!inode_owner_or_capable(mnt_userns, inode))
+		return -EPERM;
+
+	inode_lock(inode);
+	err = vfs_fileattr_get(dentry, &old_ma);
+	if (!err) {
+		/* initialize missing bits from old_ma */
+		if (fa->flags_valid) {
+			fa->fsx_xflags |= old_ma.fsx_xflags & ~FS_XFLAG_COMMON;
+			fa->fsx_extsize = old_ma.fsx_extsize;
+			fa->fsx_nextents = old_ma.fsx_nextents;
+			fa->fsx_projid = old_ma.fsx_projid;
+			fa->fsx_cowextsize = old_ma.fsx_cowextsize;
+		} else {
+			fa->flags |= old_ma.flags & ~FS_COMMON_FL;
+		}
+		err = fileattr_set_prepare(inode, &old_ma, fa);
+		if (!err)
+			err = inode->i_op->fileattr_set(mnt_userns, dentry, fa);
+	}
+	inode_unlock(inode);
+
+	return err;
+}
+EXPORT_SYMBOL(vfs_fileattr_set);
+
+static int ioctl_getflags(struct file *file, unsigned int __user *argp)
+{
+	struct fileattr fa = { .flags_valid = true }; /* hint only */
+	int err;
+
+	err = vfs_fileattr_get(file->f_path.dentry, &fa);
+	if (!err)
+		err = put_user(fa.flags, argp);
+	return err;
+}
+
+static int ioctl_setflags(struct file *file, unsigned int __user *argp)
+{
+	struct user_namespace *mnt_userns = file_mnt_user_ns(file);
+	struct dentry *dentry = file->f_path.dentry;
+	struct fileattr fa;
+	unsigned int flags;
+	int err;
+
+	err = get_user(flags, argp);
+	if (!err) {
+		err = mnt_want_write_file(file);
+		if (!err) {
+			fileattr_fill_flags(&fa, flags);
+			err = vfs_fileattr_set(mnt_userns, dentry, &fa);
+			mnt_drop_write_file(file);
+		}
+	}
+	return err;
+}
+
+static int ioctl_fsgetxattr(struct file *file, void __user *argp)
+{
+	struct fileattr fa = { .fsx_valid = true }; /* hint only */
+	int err;
+
+	err = vfs_fileattr_get(file->f_path.dentry, &fa);
+	if (!err)
+		err = copy_fsxattr_to_user(&fa, argp);
+
+	return err;
+}
+
+static int ioctl_fssetxattr(struct file *file, void __user *argp)
+{
+	struct user_namespace *mnt_userns = file_mnt_user_ns(file);
+	struct dentry *dentry = file->f_path.dentry;
+	struct fileattr fa;
+	int err;
+
+	err = copy_fsxattr_from_user(&fa, argp);
+	if (!err) {
+		err = mnt_want_write_file(file);
+		if (!err) {
+			err = vfs_fileattr_set(mnt_userns, dentry, &fa);
+			mnt_drop_write_file(file);
+		}
+	}
+	return err;
+}
+
 /*
  * do_vfs_ioctl() is not for drivers and not intended to be EXPORT_SYMBOL()'d.
  * It's just a simple helper for sys_ioctl and compat_sys_ioctl.
@@ -727,6 +1031,18 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd,
 		return put_user(i_size_read(inode) - filp->f_pos,
 				(int __user *)argp);
 
+	case FS_IOC_GETFLAGS:
+		return ioctl_getflags(filp, argp);
+
+	case FS_IOC_SETFLAGS:
+		return ioctl_setflags(filp, argp);
+
+	case FS_IOC_FSGETXATTR:
+		return ioctl_fsgetxattr(filp, argp);
+
+	case FS_IOC_FSSETXATTR:
+		return ioctl_fssetxattr(filp, argp);
+
 	default:
 		if (S_ISREG(inode->i_mode))
 			return file_ioctl(filp, cmd, argp);
@@ -827,6 +1143,15 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
 		break;
 #endif
 
+	/*
+	 * These access 32-bit values anyway so no further handling is
+	 * necessary.
+	 */
+	case FS_IOC32_GETFLAGS:
+	case FS_IOC32_SETFLAGS:
+		cmd = (cmd == FS_IOC32_GETFLAGS) ?
+			FS_IOC_GETFLAGS : FS_IOC_SETFLAGS;
+		fallthrough;
 	/*
 	 * everything else in do_vfs_ioctl() takes either a compatible
 	 * pointer argument or no argument -- call it with a modified
diff --git a/include/linux/fileattr.h b/include/linux/fileattr.h
new file mode 100644
index 000000000000..9e37e063ac69
--- /dev/null
+++ b/include/linux/fileattr.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _LINUX_FILEATTR_H
+#define _LINUX_FILEATTR_H
+
+/* Flags shared between flags/xflags */
+#define FS_COMMON_FL \
+	(FS_SYNC_FL | FS_IMMUTABLE_FL | FS_APPEND_FL | \
+	 FS_NODUMP_FL |	FS_NOATIME_FL | FS_DAX_FL | \
+	 FS_PROJINHERIT_FL)
+
+#define FS_XFLAG_COMMON \
+	(FS_XFLAG_SYNC | FS_XFLAG_IMMUTABLE | FS_XFLAG_APPEND | \
+	 FS_XFLAG_NODUMP | FS_XFLAG_NOATIME | FS_XFLAG_DAX | \
+	 FS_XFLAG_PROJINHERIT)
+
+/*
+ * Merged interface for miscellaneous file attributes.  'flags' originates from
+ * ext* and 'fsx_xflags' from xfs.  There's some overlap between the two, which
+ * is handled by the VFS helpers, so filesystems are free to implement just one
+ * or both of these sub-interfaces.
+ */
+struct fileattr {
+	u32	flags;		/* flags (FS_IOC_GETFLAGS/FS_IOC_SETFLAGS) */
+	/* struct fsxattr: */
+	u32	fsx_xflags;	/* xflags field value (get/set) */
+	u32	fsx_extsize;	/* extsize field value (get/set)*/
+	u32	fsx_nextents;	/* nextents field value (get)	*/
+	u32	fsx_projid;	/* project identifier (get/set) */
+	u32	fsx_cowextsize;	/* CoW extsize field value (get/set)*/
+	/* selectors: */
+	bool	flags_valid:1;
+	bool	fsx_valid:1;
+};
+
+int copy_fsxattr_to_user(const struct fileattr *fa, struct fsxattr __user *ufa);
+
+void fileattr_fill_xflags(struct fileattr *fa, u32 xflags);
+void fileattr_fill_flags(struct fileattr *fa, u32 flags);
+
+/**
+ * fileattr_has_fsx - check for extended flags/attributes
+ * @fa:		fileattr pointer
+ *
+ * Return: true if any attributes are present that are not represented in
+ * ->flags.
+ */
+static inline bool fileattr_has_fsx(const struct fileattr *fa)
+{
+	return fa->fsx_valid &&
+		((fa->fsx_xflags & ~FS_XFLAG_COMMON) || fa->fsx_extsize != 0 ||
+		 fa->fsx_projid != 0 ||	fa->fsx_cowextsize != 0);
+}
+
+int vfs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int vfs_fileattr_set(struct user_namespace *mnt_userns, struct dentry *dentry,
+		     struct fileattr *fa);
+
+#endif /* _LINUX_FILEATTR_H */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ec8f3ddf4a6a..156b78f42a28 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -70,6 +70,7 @@ struct fsverity_info;
 struct fsverity_operations;
 struct fs_context;
 struct fs_parameter_spec;
+struct fileattr;
 
 extern void __init inode_init(void);
 extern void __init inode_init_early(void);
@@ -1963,6 +1964,9 @@ struct inode_operations {
 			struct dentry *, umode_t);
 	int (*set_acl)(struct user_namespace *, struct inode *,
 		       struct posix_acl *, int);
+	int (*fileattr_set)(struct user_namespace *mnt_userns,
+			    struct dentry *dentry, struct fileattr *fa);
+	int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa);
 } ____cacheline_aligned;
 
 static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio,
-- 
cgit v1.2.3


From 51db776a430edd7477a779be0dc5c6fef4a05884 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@redhat.com>
Date: Wed, 7 Apr 2021 14:36:44 +0200
Subject: vfs: remove unused ioctl helpers

Remove vfs_ioc_setflags_prepare(), vfs_ioc_fssetxattr_check() and
simple_fill_fsxattr(), which are no longer used.

Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/inode.c         | 87 ------------------------------------------------------
 include/linux/fs.h | 12 --------
 2 files changed, 99 deletions(-)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index a047ab306f9a..ae526fd9c0a4 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -12,7 +12,6 @@
 #include <linux/security.h>
 #include <linux/cdev.h>
 #include <linux/memblock.h>
-#include <linux/fscrypt.h>
 #include <linux/fsnotify.h>
 #include <linux/mount.h>
 #include <linux/posix_acl.h>
@@ -2314,89 +2313,3 @@ struct timespec64 current_time(struct inode *inode)
 	return timestamp_truncate(now, inode);
 }
 EXPORT_SYMBOL(current_time);
-
-/*
- * Generic function to check FS_IOC_SETFLAGS values and reject any invalid
- * configurations.
- *
- * Note: the caller should be holding i_mutex, or else be sure that they have
- * exclusive access to the inode structure.
- */
-int vfs_ioc_setflags_prepare(struct inode *inode, unsigned int oldflags,
-			     unsigned int flags)
-{
-	/*
-	 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
-	 * the relevant capability.
-	 *
-	 * This test looks nicer. Thanks to Pauline Middelink
-	 */
-	if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL) &&
-	    !capable(CAP_LINUX_IMMUTABLE))
-		return -EPERM;
-
-	return fscrypt_prepare_setflags(inode, oldflags, flags);
-}
-EXPORT_SYMBOL(vfs_ioc_setflags_prepare);
-
-/*
- * Generic function to check FS_IOC_FSSETXATTR values and reject any invalid
- * configurations.
- *
- * Note: the caller should be holding i_mutex, or else be sure that they have
- * exclusive access to the inode structure.
- */
-int vfs_ioc_fssetxattr_check(struct inode *inode, const struct fsxattr *old_fa,
-			     struct fsxattr *fa)
-{
-	/*
-	 * Can't modify an immutable/append-only file unless we have
-	 * appropriate permission.
-	 */
-	if ((old_fa->fsx_xflags ^ fa->fsx_xflags) &
-			(FS_XFLAG_IMMUTABLE | FS_XFLAG_APPEND) &&
-	    !capable(CAP_LINUX_IMMUTABLE))
-		return -EPERM;
-
-	/*
-	 * Project Quota ID state is only allowed to change from within the init
-	 * namespace. Enforce that restriction only if we are trying to change
-	 * the quota ID state. Everything else is allowed in user namespaces.
-	 */
-	if (current_user_ns() != &init_user_ns) {
-		if (old_fa->fsx_projid != fa->fsx_projid)
-			return -EINVAL;
-		if ((old_fa->fsx_xflags ^ fa->fsx_xflags) &
-				FS_XFLAG_PROJINHERIT)
-			return -EINVAL;
-	}
-
-	/* Check extent size hints. */
-	if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(inode->i_mode))
-		return -EINVAL;
-
-	if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) &&
-			!S_ISDIR(inode->i_mode))
-		return -EINVAL;
-
-	if ((fa->fsx_xflags & FS_XFLAG_COWEXTSIZE) &&
-	    !S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
-		return -EINVAL;
-
-	/*
-	 * It is only valid to set the DAX flag on regular files and
-	 * directories on filesystems.
-	 */
-	if ((fa->fsx_xflags & FS_XFLAG_DAX) &&
-	    !(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
-		return -EINVAL;
-
-	/* Extent size hints of zero turn off the flags. */
-	if (fa->fsx_extsize == 0)
-		fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT);
-	if (fa->fsx_cowextsize == 0)
-		fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;
-
-	return 0;
-}
-EXPORT_SYMBOL(vfs_ioc_fssetxattr_check);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 156b78f42a28..820fdc62ac30 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3571,18 +3571,6 @@ extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len,
 extern int generic_fadvise(struct file *file, loff_t offset, loff_t len,
 			   int advice);
 
-int vfs_ioc_setflags_prepare(struct inode *inode, unsigned int oldflags,
-			     unsigned int flags);
-
-int vfs_ioc_fssetxattr_check(struct inode *inode, const struct fsxattr *old_fa,
-			     struct fsxattr *fa);
-
-static inline void simple_fill_fsxattr(struct fsxattr *fa, __u32 xflags)
-{
-	memset(fa, 0, sizeof(*fa));
-	fa->fsx_xflags = xflags;
-}
-
 /*
  * Flush file data before changing attributes.  Caller must hold any locks
  * required to prevent further writes to this file until we're done setting
-- 
cgit v1.2.3


From 6f822e1b5d9dda3d20e87365de138046e3baa03a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 12 Apr 2021 15:46:57 +0200
Subject: block: remove zero_fill_bio_iter

zero_fill_bio_iter is only used to implement zero_fill_bio, so
remove the indirection.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20210412134658.2623190-1-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c         | 6 +++---
 include/linux/bio.h | 7 +------
 2 files changed, 4 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/block/bio.c b/block/bio.c
index 26b7f721cda8..0fecb80872c2 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -493,20 +493,20 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs)
 }
 EXPORT_SYMBOL(bio_kmalloc);
 
-void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start)
+void zero_fill_bio(struct bio *bio)
 {
 	unsigned long flags;
 	struct bio_vec bv;
 	struct bvec_iter iter;
 
-	__bio_for_each_segment(bv, bio, iter, start) {
+	bio_for_each_segment(bv, bio, iter) {
 		char *data = bvec_kmap_irq(&bv, &flags);
 		memset(data, 0, bv.bv_len);
 		flush_dcache_page(bv.bv_page);
 		bvec_kunmap_irq(data, &flags);
 	}
 }
-EXPORT_SYMBOL(zero_fill_bio_iter);
+EXPORT_SYMBOL(zero_fill_bio);
 
 /**
  * bio_truncate - truncate the bio to small size of @new_size
diff --git a/include/linux/bio.h b/include/linux/bio.h
index d0246c92a6e8..a8021d79d45d 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -485,14 +485,9 @@ extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
 extern void bio_copy_data(struct bio *dst, struct bio *src);
 extern void bio_list_copy_data(struct bio *dst, struct bio *src);
 extern void bio_free_pages(struct bio *bio);
-void zero_fill_bio_iter(struct bio *bio, struct bvec_iter iter);
 void bio_truncate(struct bio *bio, unsigned new_size);
 void guard_bio_eod(struct bio *bio);
-
-static inline void zero_fill_bio(struct bio *bio)
-{
-	zero_fill_bio_iter(bio, bio->bi_iter);
-}
+void zero_fill_bio(struct bio *bio);
 
 extern const char *bio_devname(struct bio *bio, char *buffer);
 
-- 
cgit v1.2.3


From 5f03414d4098b5718f1a5e99b43b9d9cb6f3612a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 12 Apr 2021 15:46:58 +0200
Subject: block: move bio_list_copy_data to pktcdvd

bio_list_copy_data is only used by pktcdvd, so move it there.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20210412134658.2623190-2-hch@lst.de
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c             | 37 -------------------------------------
 drivers/block/pktcdvd.c | 36 ++++++++++++++++++++++++++++++++++++
 include/linux/bio.h     |  1 -
 3 files changed, 36 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/block/bio.c b/block/bio.c
index 0fecb80872c2..303298996afe 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1221,43 +1221,6 @@ void bio_copy_data(struct bio *dst, struct bio *src)
 }
 EXPORT_SYMBOL(bio_copy_data);
 
-/**
- * bio_list_copy_data - copy contents of data buffers from one chain of bios to
- * another
- * @src: source bio list
- * @dst: destination bio list
- *
- * Stops when it reaches the end of either the @src list or @dst list - that is,
- * copies min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of
- * bios).
- */
-void bio_list_copy_data(struct bio *dst, struct bio *src)
-{
-	struct bvec_iter src_iter = src->bi_iter;
-	struct bvec_iter dst_iter = dst->bi_iter;
-
-	while (1) {
-		if (!src_iter.bi_size) {
-			src = src->bi_next;
-			if (!src)
-				break;
-
-			src_iter = src->bi_iter;
-		}
-
-		if (!dst_iter.bi_size) {
-			dst = dst->bi_next;
-			if (!dst)
-				break;
-
-			dst_iter = dst->bi_iter;
-		}
-
-		bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
-	}
-}
-EXPORT_SYMBOL(bio_list_copy_data);
-
 void bio_free_pages(struct bio *bio)
 {
 	struct bio_vec *bvec;
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index fc4b0f1aa86d..bd3556585122 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -1199,6 +1199,42 @@ try_next_bio:
 	return 1;
 }
 
+/**
+ * bio_list_copy_data - copy contents of data buffers from one chain of bios to
+ * another
+ * @src: source bio list
+ * @dst: destination bio list
+ *
+ * Stops when it reaches the end of either the @src list or @dst list - that is,
+ * copies min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of
+ * bios).
+ */
+static void bio_list_copy_data(struct bio *dst, struct bio *src)
+{
+	struct bvec_iter src_iter = src->bi_iter;
+	struct bvec_iter dst_iter = dst->bi_iter;
+
+	while (1) {
+		if (!src_iter.bi_size) {
+			src = src->bi_next;
+			if (!src)
+				break;
+
+			src_iter = src->bi_iter;
+		}
+
+		if (!dst_iter.bi_size) {
+			dst = dst->bi_next;
+			if (!dst)
+				break;
+
+			dst_iter = dst->bi_iter;
+		}
+
+		bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
+	}
+}
+
 /*
  * Assemble a bio to write one packet and queue the bio for processing
  * by the underlying block device.
diff --git a/include/linux/bio.h b/include/linux/bio.h
index a8021d79d45d..a0b4cfdf62a4 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -483,7 +483,6 @@ extern void bio_check_pages_dirty(struct bio *bio);
 extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
 			       struct bio *src, struct bvec_iter *src_iter);
 extern void bio_copy_data(struct bio *dst, struct bio *src);
-extern void bio_list_copy_data(struct bio *dst, struct bio *src);
 extern void bio_free_pages(struct bio *bio);
 void bio_truncate(struct bio *bio, unsigned new_size);
 void guard_bio_eod(struct bio *bio);
-- 
cgit v1.2.3


From 51e0158a54321a48d260e95998393934bb0de52c Mon Sep 17 00:00:00 2001
From: Cong Wang <cong.wang@bytedance.com>
Date: Tue, 6 Apr 2021 20:21:11 -0700
Subject: skmsg: Pass psock pointer to ->psock_update_sk_prot()

Using sk_psock() to retrieve the psock pointer from a sock requires the
RCU read lock, but in both cases we already have the psock pointer
before calling ->psock_update_sk_prot(), so we can just pass it in
instead of calling sk_psock() again.

Fixes: 8a59f9d1e3d4 ("sock: Introduce sk->sk_prot->psock_update_sk_prot()")
Reported-by: syzbot+320a3bc8d80f478c37e4@syzkaller.appspotmail.com
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: syzbot+320a3bc8d80f478c37e4@syzkaller.appspotmail.com
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20210407032111.33398-1-xiyou.wangcong@gmail.com
---
 include/linux/skmsg.h | 5 +++--
 include/net/sock.h    | 5 ++++-
 include/net/tcp.h     | 2 +-
 include/net/udp.h     | 2 +-
 net/core/sock_map.c   | 2 +-
 net/ipv4/tcp_bpf.c    | 3 +--
 net/ipv4/udp_bpf.c    | 3 +--
 7 files changed, 12 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index f78e90a04a69..e2fb0a5a101e 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -99,7 +99,8 @@ struct sk_psock {
 	void (*saved_close)(struct sock *sk, long timeout);
 	void (*saved_write_space)(struct sock *sk);
 	void (*saved_data_ready)(struct sock *sk);
-	int  (*psock_update_sk_prot)(struct sock *sk, bool restore);
+	int  (*psock_update_sk_prot)(struct sock *sk, struct sk_psock *psock,
+				     bool restore);
 	struct proto			*sk_proto;
 	struct mutex			work_mutex;
 	struct sk_psock_work_state	work_state;
@@ -405,7 +406,7 @@ static inline void sk_psock_restore_proto(struct sock *sk,
 {
 	sk->sk_prot->unhash = psock->saved_unhash;
 	if (psock->psock_update_sk_prot)
-		psock->psock_update_sk_prot(sk, true);
+		psock->psock_update_sk_prot(sk, psock, true);
 }
 
 static inline void sk_psock_set_state(struct sk_psock *psock,
diff --git a/include/net/sock.h b/include/net/sock.h
index 8b4155e756c2..c4bbdcd83f4d 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1114,6 +1114,7 @@ struct inet_hashinfo;
 struct raw_hashinfo;
 struct smc_hashinfo;
 struct module;
+struct sk_psock;
 
 /*
  * caches using SLAB_TYPESAFE_BY_RCU should let .next pointer from nulls nodes
@@ -1185,7 +1186,9 @@ struct proto {
 	void			(*rehash)(struct sock *sk);
 	int			(*get_port)(struct sock *sk, unsigned short snum);
 #ifdef CONFIG_BPF_SYSCALL
-	int			(*psock_update_sk_prot)(struct sock *sk, bool restore);
+	int			(*psock_update_sk_prot)(struct sock *sk,
+							struct sk_psock *psock,
+							bool restore);
 #endif
 
 	/* Keeping track of sockets in use */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index eaea43afcc97..d05193cb0d99 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2215,7 +2215,7 @@ struct sk_psock;
 
 #ifdef CONFIG_BPF_SYSCALL
 struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock);
-int tcp_bpf_update_proto(struct sock *sk, bool restore);
+int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore);
 void tcp_bpf_clone(const struct sock *sk, struct sock *newsk);
 #endif /* CONFIG_BPF_SYSCALL */
 
diff --git a/include/net/udp.h b/include/net/udp.h
index f55aaeef7e91..360df454356c 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -543,7 +543,7 @@ static inline void udp_post_segment_fix_csum(struct sk_buff *skb)
 #ifdef CONFIG_BPF_SYSCALL
 struct sk_psock;
 struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock);
-int udp_bpf_update_proto(struct sock *sk, bool restore);
+int udp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore);
 #endif
 
 #endif	/* _UDP_H */
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 3d190d22b0d8..f473c51cbc4b 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -188,7 +188,7 @@ static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock)
 	if (!sk->sk_prot->psock_update_sk_prot)
 		return -EINVAL;
 	psock->psock_update_sk_prot = sk->sk_prot->psock_update_sk_prot;
-	return sk->sk_prot->psock_update_sk_prot(sk, false);
+	return sk->sk_prot->psock_update_sk_prot(sk, psock, false);
 }
 
 static struct sk_psock *sock_map_psock_get_checked(struct sock *sk)
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 3d622a0d0753..4930bc8ab47e 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -499,9 +499,8 @@ static int tcp_bpf_assert_proto_ops(struct proto *ops)
 	       ops->sendpage == tcp_sendpage ? 0 : -ENOTSUPP;
 }
 
-int tcp_bpf_update_proto(struct sock *sk, bool restore)
+int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
 {
-	struct sk_psock *psock = sk_psock(sk);
 	int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
 	int config = psock->progs.msg_parser   ? TCP_BPF_TX   : TCP_BPF_BASE;
 
diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c
index 4a7e38c5d842..954c4591a6fd 100644
--- a/net/ipv4/udp_bpf.c
+++ b/net/ipv4/udp_bpf.c
@@ -103,10 +103,9 @@ static int __init udp_bpf_v4_build_proto(void)
 }
 core_initcall(udp_bpf_v4_build_proto);
 
-int udp_bpf_update_proto(struct sock *sk, bool restore)
+int udp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
 {
 	int family = sk->sk_family == AF_INET ? UDP_BPF_IPV4 : UDP_BPF_IPV6;
-	struct sk_psock *psock = sk_psock(sk);
 
 	if (restore) {
 		sk->sk_write_space = psock->saved_write_space;
-- 
cgit v1.2.3


From c2ef2f50ad0ccf5460bf4824bc6669240b6c7936 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Tue, 6 Apr 2021 16:40:40 -0300
Subject: vfio/mdev: Remove kobj from mdev_parent_ops->create()

The kobj here is a type-erased version of mdev_type, which is already
stored in the struct mdev_device being passed in. It was only ever used to
compute the type_group_id, which is now extracted directly from the mdev.

Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Message-Id: <17-v2-d36939638fc6+d54-vfio2_jgg@nvidia.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/gpu/drm/i915/gvt/kvmgt.c  | 2 +-
 drivers/s390/cio/vfio_ccw_ops.c   | 2 +-
 drivers/s390/crypto/vfio_ap_ops.c | 2 +-
 drivers/vfio/mdev/mdev_core.c     | 2 +-
 include/linux/mdev.h              | 3 +--
 samples/vfio-mdev/mbochs.c        | 2 +-
 samples/vfio-mdev/mdpy.c          | 2 +-
 samples/vfio-mdev/mtty.c          | 2 +-
 8 files changed, 8 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c
index 16e1e4a38aa1..6bf176e8426e 100644
--- a/drivers/gpu/drm/i915/gvt/kvmgt.c
+++ b/drivers/gpu/drm/i915/gvt/kvmgt.c
@@ -689,7 +689,7 @@ static void kvmgt_put_vfio_device(void *vgpu)
 	vfio_device_put(vdev->vfio_device);
 }
 
-static int intel_vgpu_create(struct kobject *kobj, struct mdev_device *mdev)
+static int intel_vgpu_create(struct mdev_device *mdev)
 {
 	struct intel_vgpu *vgpu = NULL;
 	struct intel_vgpu_type *type;
diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c
index 767ac41686fe..10407cf67583 100644
--- a/drivers/s390/cio/vfio_ccw_ops.c
+++ b/drivers/s390/cio/vfio_ccw_ops.c
@@ -110,7 +110,7 @@ static struct attribute_group *mdev_type_groups[] = {
 	NULL,
 };
 
-static int vfio_ccw_mdev_create(struct kobject *kobj, struct mdev_device *mdev)
+static int vfio_ccw_mdev_create(struct mdev_device *mdev)
 {
 	struct vfio_ccw_private *private =
 		dev_get_drvdata(mdev_parent_dev(mdev));
diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c
index 1ffdd411201c..d319152dd484 100644
--- a/drivers/s390/crypto/vfio_ap_ops.c
+++ b/drivers/s390/crypto/vfio_ap_ops.c
@@ -322,7 +322,7 @@ static void vfio_ap_matrix_init(struct ap_config_info *info,
 	matrix->adm_max = info->apxa ? info->Nd : 15;
 }
 
-static int vfio_ap_mdev_create(struct kobject *kobj, struct mdev_device *mdev)
+static int vfio_ap_mdev_create(struct mdev_device *mdev)
 {
 	struct ap_matrix_mdev *matrix_mdev;
 
diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c
index 5ae06f951a09..10eff33ce1f2 100644
--- a/drivers/vfio/mdev/mdev_core.c
+++ b/drivers/vfio/mdev/mdev_core.c
@@ -286,7 +286,7 @@ int mdev_device_create(struct mdev_type *type, const guid_t *uuid)
 		goto out_put_device;
 	}
 
-	ret = parent->ops->create(&type->kobj, mdev);
+	ret = parent->ops->create(mdev);
 	if (ret)
 		goto out_unlock;
 
diff --git a/include/linux/mdev.h b/include/linux/mdev.h
index 41e919365223..c3a800051d61 100644
--- a/include/linux/mdev.h
+++ b/include/linux/mdev.h
@@ -61,7 +61,6 @@ unsigned int mtype_get_type_group_id(struct kobject *mtype_kobj);
  * @create:		Called to allocate basic resources in parent device's
  *			driver for a particular mediated device. It is
  *			mandatory to provide create ops.
- *			@kobj: kobject of type for which 'create' is called.
  *			@mdev: mdev_device structure on of mediated device
  *			      that is being created
  *			Returns integer: success (0) or error (< 0)
@@ -107,7 +106,7 @@ struct mdev_parent_ops {
 	const struct attribute_group **mdev_attr_groups;
 	struct attribute_group **supported_type_groups;
 
-	int     (*create)(struct kobject *kobj, struct mdev_device *mdev);
+	int     (*create)(struct mdev_device *mdev);
 	int     (*remove)(struct mdev_device *mdev);
 	int     (*open)(struct mdev_device *mdev);
 	void    (*release)(struct mdev_device *mdev);
diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c
index a1af30df10a2..ac4d0dc24907 100644
--- a/samples/vfio-mdev/mbochs.c
+++ b/samples/vfio-mdev/mbochs.c
@@ -506,7 +506,7 @@ static int mbochs_reset(struct mdev_device *mdev)
 	return 0;
 }
 
-static int mbochs_create(struct kobject *kobj, struct mdev_device *mdev)
+static int mbochs_create(struct mdev_device *mdev)
 {
 	const struct mbochs_type *type =
 		&mbochs_types[mdev_get_type_group_id(mdev)];
diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c
index 08c15f9f06a8..da88fd7dd423 100644
--- a/samples/vfio-mdev/mdpy.c
+++ b/samples/vfio-mdev/mdpy.c
@@ -216,7 +216,7 @@ static int mdpy_reset(struct mdev_device *mdev)
 	return 0;
 }
 
-static int mdpy_create(struct kobject *kobj, struct mdev_device *mdev)
+static int mdpy_create(struct mdev_device *mdev)
 {
 	const struct mdpy_type *type =
 		&mdpy_types[mdev_get_type_group_id(mdev)];
diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c
index 191a587a8d5a..f2e36c06ac6a 100644
--- a/samples/vfio-mdev/mtty.c
+++ b/samples/vfio-mdev/mtty.c
@@ -708,7 +708,7 @@ accessfailed:
 	return ret;
 }
 
-static int mtty_create(struct kobject *kobj, struct mdev_device *mdev)
+static int mtty_create(struct mdev_device *mdev)
 {
 	struct mdev_state *mdev_state;
 	int nr_ports = mdev_get_type_group_id(mdev) + 1;
-- 
cgit v1.2.3


From 9169cff168ff262b4b78597f542e23843d0c494a Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Tue, 6 Apr 2021 16:40:41 -0300
Subject: vfio/mdev: Correct the function signatures for the
 mdev_type_attributes

The driver core standard is to pass in the properly typed object, the
properly typed attribute and the buffer data. It stems from the root
kobject method:

  ssize_t (*show)(struct kobject *kobj, struct kobj_attribute *attr,..)

Each subclass of kobject should provide their own function with the same
signature but more specific types, eg struct device uses:

  ssize_t (*show)(struct device *dev, struct device_attribute *attr,..)

In this case the existing signature is:

  ssize_t (*show)(struct kobject *kobj, struct device *dev,..)

Where kobj is a 'struct mdev_type *' and dev is 'mdev_type->parent->dev'.

Change the mdev_type related sysfs attribute functions to:

  ssize_t (*show)(struct mdev_type *mtype, struct mdev_type_attribute *attr,..)

in order to restore type safety and match the driver core standard.

There are no current users of 'attr', but if it is ever needed it would be
hard to add in retroactively, so do it now.

Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Message-Id: <18-v2-d36939638fc6+d54-vfio2_jgg@nvidia.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/gpu/drm/i915/gvt/gvt.c    | 21 +++++++++++----------
 drivers/s390/cio/vfio_ccw_ops.c   | 15 +++++++++------
 drivers/s390/crypto/vfio_ap_ops.c | 12 +++++++-----
 drivers/vfio/mdev/mdev_core.c     | 14 ++++++++++++--
 drivers/vfio/mdev/mdev_sysfs.c    | 11 ++++++-----
 include/linux/mdev.h              | 11 +++++++----
 samples/vfio-mdev/mbochs.c        | 26 +++++++++++++++-----------
 samples/vfio-mdev/mdpy.c          | 24 ++++++++++++++----------
 samples/vfio-mdev/mtty.c          | 18 +++++++++---------
 9 files changed, 90 insertions(+), 62 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpu/drm/i915/gvt/gvt.c b/drivers/gpu/drm/i915/gvt/gvt.c
index 4b47a18e9dfa..3703814a669b 100644
--- a/drivers/gpu/drm/i915/gvt/gvt.c
+++ b/drivers/gpu/drm/i915/gvt/gvt.c
@@ -54,14 +54,15 @@ intel_gvt_find_vgpu_type(struct intel_gvt *gvt, unsigned int type_group_id)
 	return &gvt->types[type_group_id];
 }
 
-static ssize_t available_instances_show(struct kobject *kobj,
-					struct device *dev, char *buf)
+static ssize_t available_instances_show(struct mdev_type *mtype,
+					struct mdev_type_attribute *attr,
+					char *buf)
 {
 	struct intel_vgpu_type *type;
 	unsigned int num = 0;
-	void *gvt = kdev_to_i915(dev)->gvt;
+	void *gvt = kdev_to_i915(mtype_get_parent_dev(mtype))->gvt;
 
-	type = intel_gvt_find_vgpu_type(gvt, mtype_get_type_group_id(kobj));
+	type = intel_gvt_find_vgpu_type(gvt, mtype_get_type_group_id(mtype));
 	if (!type)
 		num = 0;
 	else
@@ -70,19 +71,19 @@ static ssize_t available_instances_show(struct kobject *kobj,
 	return sprintf(buf, "%u\n", num);
 }
 
-static ssize_t device_api_show(struct kobject *kobj, struct device *dev,
-		char *buf)
+static ssize_t device_api_show(struct mdev_type *mtype,
+			       struct mdev_type_attribute *attr, char *buf)
 {
 	return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING);
 }
 
-static ssize_t description_show(struct kobject *kobj, struct device *dev,
-		char *buf)
+static ssize_t description_show(struct mdev_type *mtype,
+				struct mdev_type_attribute *attr, char *buf)
 {
 	struct intel_vgpu_type *type;
-	void *gvt = kdev_to_i915(dev)->gvt;
+	void *gvt = kdev_to_i915(mtype_get_parent_dev(mtype))->gvt;
 
-	type = intel_gvt_find_vgpu_type(gvt, mtype_get_type_group_id(kobj));
+	type = intel_gvt_find_vgpu_type(gvt, mtype_get_type_group_id(mtype));
 	if (!type)
 		return 0;
 
diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c
index 10407cf67583..491a64c61fff 100644
--- a/drivers/s390/cio/vfio_ccw_ops.c
+++ b/drivers/s390/cio/vfio_ccw_ops.c
@@ -71,23 +71,26 @@ static int vfio_ccw_mdev_notifier(struct notifier_block *nb,
 	return NOTIFY_DONE;
 }
 
-static ssize_t name_show(struct kobject *kobj, struct device *dev, char *buf)
+static ssize_t name_show(struct mdev_type *mtype,
+			 struct mdev_type_attribute *attr, char *buf)
 {
 	return sprintf(buf, "I/O subchannel (Non-QDIO)\n");
 }
 static MDEV_TYPE_ATTR_RO(name);
 
-static ssize_t device_api_show(struct kobject *kobj, struct device *dev,
-			       char *buf)
+static ssize_t device_api_show(struct mdev_type *mtype,
+			       struct mdev_type_attribute *attr, char *buf)
 {
 	return sprintf(buf, "%s\n", VFIO_DEVICE_API_CCW_STRING);
 }
 static MDEV_TYPE_ATTR_RO(device_api);
 
-static ssize_t available_instances_show(struct kobject *kobj,
-					struct device *dev, char *buf)
+static ssize_t available_instances_show(struct mdev_type *mtype,
+					struct mdev_type_attribute *attr,
+					char *buf)
 {
-	struct vfio_ccw_private *private = dev_get_drvdata(dev);
+	struct vfio_ccw_private *private =
+		dev_get_drvdata(mtype_get_parent_dev(mtype));
 
 	return sprintf(buf, "%d\n", atomic_read(&private->avail));
 }
diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c
index d319152dd484..0ce00c9311d3 100644
--- a/drivers/s390/crypto/vfio_ap_ops.c
+++ b/drivers/s390/crypto/vfio_ap_ops.c
@@ -366,15 +366,17 @@ static int vfio_ap_mdev_remove(struct mdev_device *mdev)
 	return 0;
 }
 
-static ssize_t name_show(struct kobject *kobj, struct device *dev, char *buf)
+static ssize_t name_show(struct mdev_type *mtype,
+			 struct mdev_type_attribute *attr, char *buf)
 {
 	return sprintf(buf, "%s\n", VFIO_AP_MDEV_NAME_HWVIRT);
 }
 
 static MDEV_TYPE_ATTR_RO(name);
 
-static ssize_t available_instances_show(struct kobject *kobj,
-					struct device *dev, char *buf)
+static ssize_t available_instances_show(struct mdev_type *mtype,
+					struct mdev_type_attribute *attr,
+					char *buf)
 {
 	return sprintf(buf, "%d\n",
 		       atomic_read(&matrix_dev->available_instances));
@@ -382,8 +384,8 @@ static ssize_t available_instances_show(struct kobject *kobj,
 
 static MDEV_TYPE_ATTR_RO(available_instances);
 
-static ssize_t device_api_show(struct kobject *kobj, struct device *dev,
-			       char *buf)
+static ssize_t device_api_show(struct mdev_type *mtype,
+			       struct mdev_type_attribute *attr, char *buf)
 {
 	return sprintf(buf, "%s\n", VFIO_DEVICE_API_AP_STRING);
 }
diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c
index 10eff33ce1f2..2a85d6fcb7dd 100644
--- a/drivers/vfio/mdev/mdev_core.c
+++ b/drivers/vfio/mdev/mdev_core.c
@@ -47,12 +47,22 @@ EXPORT_SYMBOL(mdev_get_type_group_id);
  * Used in mdev_type_attribute sysfs functions to return the index in the
  * supported_type_groups that the sysfs is called from.
  */
-unsigned int mtype_get_type_group_id(struct kobject *mtype_kobj)
+unsigned int mtype_get_type_group_id(struct mdev_type *mtype)
 {
-	return container_of(mtype_kobj, struct mdev_type, kobj)->type_group_id;
+	return mtype->type_group_id;
 }
 EXPORT_SYMBOL(mtype_get_type_group_id);
 
+/*
+ * Used in mdev_type_attribute sysfs functions to return the parent struct
+ * device
+ */
+struct device *mtype_get_parent_dev(struct mdev_type *mtype)
+{
+	return mtype->parent->dev;
+}
+EXPORT_SYMBOL(mtype_get_parent_dev);
+
 /* Should be called holding parent_list_lock */
 static struct mdev_parent *__find_parent_device(struct device *dev)
 {
diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c
index 712fbc78b12e..f5cf1931c54e 100644
--- a/drivers/vfio/mdev/mdev_sysfs.c
+++ b/drivers/vfio/mdev/mdev_sysfs.c
@@ -26,7 +26,7 @@ static ssize_t mdev_type_attr_show(struct kobject *kobj,
 	ssize_t ret = -EIO;
 
 	if (attr->show)
-		ret = attr->show(kobj, type->parent->dev, buf);
+		ret = attr->show(type, attr, buf);
 	return ret;
 }
 
@@ -39,7 +39,7 @@ static ssize_t mdev_type_attr_store(struct kobject *kobj,
 	ssize_t ret = -EIO;
 
 	if (attr->store)
-		ret = attr->store(&type->kobj, type->parent->dev, buf, count);
+		ret = attr->store(type, attr, buf, count);
 	return ret;
 }
 
@@ -48,8 +48,9 @@ static const struct sysfs_ops mdev_type_sysfs_ops = {
 	.store = mdev_type_attr_store,
 };
 
-static ssize_t create_store(struct kobject *kobj, struct device *dev,
-			    const char *buf, size_t count)
+static ssize_t create_store(struct mdev_type *mtype,
+			    struct mdev_type_attribute *attr, const char *buf,
+			    size_t count)
 {
 	char *str;
 	guid_t uuid;
@@ -67,7 +68,7 @@ static ssize_t create_store(struct kobject *kobj, struct device *dev,
 	if (ret)
 		return ret;
 
-	ret = mdev_device_create(to_mdev_type(kobj), &uuid);
+	ret = mdev_device_create(mtype, &uuid);
 	if (ret)
 		return ret;
 
diff --git a/include/linux/mdev.h b/include/linux/mdev.h
index c3a800051d61..1fb34ea394ad 100644
--- a/include/linux/mdev.h
+++ b/include/linux/mdev.h
@@ -47,7 +47,8 @@ static inline struct device *mdev_get_iommu_device(struct mdev_device *mdev)
 }
 
 unsigned int mdev_get_type_group_id(struct mdev_device *mdev);
-unsigned int mtype_get_type_group_id(struct kobject *mtype_kobj);
+unsigned int mtype_get_type_group_id(struct mdev_type *mtype);
+struct device *mtype_get_parent_dev(struct mdev_type *mtype);
 
 /**
  * struct mdev_parent_ops - Structure to be registered for each parent device to
@@ -123,9 +124,11 @@ struct mdev_parent_ops {
 /* interface for exporting mdev supported type attributes */
 struct mdev_type_attribute {
 	struct attribute attr;
-	ssize_t (*show)(struct kobject *kobj, struct device *dev, char *buf);
-	ssize_t (*store)(struct kobject *kobj, struct device *dev,
-			 const char *buf, size_t count);
+	ssize_t (*show)(struct mdev_type *mtype,
+			struct mdev_type_attribute *attr, char *buf);
+	ssize_t (*store)(struct mdev_type *mtype,
+			 struct mdev_type_attribute *attr, const char *buf,
+			 size_t count);
 };
 
 #define MDEV_TYPE_ATTR(_name, _mode, _show, _store)		\
diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c
index ac4d0dc24907..861c76914e76 100644
--- a/samples/vfio-mdev/mbochs.c
+++ b/samples/vfio-mdev/mbochs.c
@@ -1330,37 +1330,41 @@ static const struct attribute_group *mdev_dev_groups[] = {
 	NULL,
 };
 
-static ssize_t
-name_show(struct kobject *kobj, struct device *dev, char *buf)
+static ssize_t name_show(struct mdev_type *mtype,
+			 struct mdev_type_attribute *attr, char *buf)
 {
-	return sprintf(buf, "%s\n", kobj->name);
+	const struct mbochs_type *type =
+		&mbochs_types[mtype_get_type_group_id(mtype)];
+
+	return sprintf(buf, "%s\n", type->name);
 }
 static MDEV_TYPE_ATTR_RO(name);
 
-static ssize_t
-description_show(struct kobject *kobj, struct device *dev, char *buf)
+static ssize_t description_show(struct mdev_type *mtype,
+				struct mdev_type_attribute *attr, char *buf)
 {
 	const struct mbochs_type *type =
-		&mbochs_types[mtype_get_type_group_id(kobj)];
+		&mbochs_types[mtype_get_type_group_id(mtype)];
 
 	return sprintf(buf, "virtual display, %d MB video memory\n",
 		       type ? type->mbytes  : 0);
 }
 static MDEV_TYPE_ATTR_RO(description);
 
-static ssize_t
-available_instances_show(struct kobject *kobj, struct device *dev, char *buf)
+static ssize_t available_instances_show(struct mdev_type *mtype,
+					struct mdev_type_attribute *attr,
+					char *buf)
 {
 	const struct mbochs_type *type =
-		&mbochs_types[mtype_get_type_group_id(kobj)];
+		&mbochs_types[mtype_get_type_group_id(mtype)];
 	int count = (max_mbytes - mbochs_used_mbytes) / type->mbytes;
 
 	return sprintf(buf, "%d\n", count);
 }
 static MDEV_TYPE_ATTR_RO(available_instances);
 
-static ssize_t device_api_show(struct kobject *kobj, struct device *dev,
-			       char *buf)
+static ssize_t device_api_show(struct mdev_type *mtype,
+			       struct mdev_type_attribute *attr, char *buf)
 {
 	return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING);
 }
diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c
index da88fd7dd423..885b88ea20e2 100644
--- a/samples/vfio-mdev/mdpy.c
+++ b/samples/vfio-mdev/mdpy.c
@@ -652,18 +652,21 @@ static const struct attribute_group *mdev_dev_groups[] = {
 	NULL,
 };
 
-static ssize_t
-name_show(struct kobject *kobj, struct device *dev, char *buf)
+static ssize_t name_show(struct mdev_type *mtype,
+			 struct mdev_type_attribute *attr, char *buf)
 {
-	return sprintf(buf, "%s\n", kobj->name);
+	const struct mdpy_type *type =
+		&mdpy_types[mtype_get_type_group_id(mtype)];
+
+	return sprintf(buf, "%s\n", type->name);
 }
 static MDEV_TYPE_ATTR_RO(name);
 
-static ssize_t
-description_show(struct kobject *kobj, struct device *dev, char *buf)
+static ssize_t description_show(struct mdev_type *mtype,
+				struct mdev_type_attribute *attr, char *buf)
 {
 	const struct mdpy_type *type =
-		&mdpy_types[mtype_get_type_group_id(kobj)];
+		&mdpy_types[mtype_get_type_group_id(mtype)];
 
 	return sprintf(buf, "virtual display, %dx%d framebuffer\n",
 		       type ? type->width  : 0,
@@ -671,15 +674,16 @@ description_show(struct kobject *kobj, struct device *dev, char *buf)
 }
 static MDEV_TYPE_ATTR_RO(description);
 
-static ssize_t
-available_instances_show(struct kobject *kobj, struct device *dev, char *buf)
+static ssize_t available_instances_show(struct mdev_type *mtype,
+					struct mdev_type_attribute *attr,
+					char *buf)
 {
 	return sprintf(buf, "%d\n", max_devices - mdpy_count);
 }
 static MDEV_TYPE_ATTR_RO(available_instances);
 
-static ssize_t device_api_show(struct kobject *kobj, struct device *dev,
-			       char *buf)
+static ssize_t device_api_show(struct mdev_type *mtype,
+			       struct mdev_type_attribute *attr, char *buf)
 {
 	return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING);
 }
diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c
index f2e36c06ac6a..b9b24be4abda 100644
--- a/samples/vfio-mdev/mtty.c
+++ b/samples/vfio-mdev/mtty.c
@@ -1292,23 +1292,24 @@ static const struct attribute_group *mdev_dev_groups[] = {
 	NULL,
 };
 
-static ssize_t
-name_show(struct kobject *kobj, struct device *dev, char *buf)
+static ssize_t name_show(struct mdev_type *mtype,
+			 struct mdev_type_attribute *attr, char *buf)
 {
 	static const char *name_str[2] = { "Single port serial",
 					   "Dual port serial" };
 
 	return sysfs_emit(buf, "%s\n",
-			  name_str[mtype_get_type_group_id(kobj)]);
+			  name_str[mtype_get_type_group_id(mtype)]);
 }
 
 static MDEV_TYPE_ATTR_RO(name);
 
-static ssize_t
-available_instances_show(struct kobject *kobj, struct device *dev, char *buf)
+static ssize_t available_instances_show(struct mdev_type *mtype,
+					struct mdev_type_attribute *attr,
+					char *buf)
 {
 	struct mdev_state *mds;
-	unsigned int ports = mtype_get_type_group_id(kobj) + 1;
+	unsigned int ports = mtype_get_type_group_id(mtype) + 1;
 	int used = 0;
 
 	list_for_each_entry(mds, &mdev_devices_list, next)
@@ -1319,9 +1320,8 @@ available_instances_show(struct kobject *kobj, struct device *dev, char *buf)
 
 static MDEV_TYPE_ATTR_RO(available_instances);
 
-
-static ssize_t device_api_show(struct kobject *kobj, struct device *dev,
-			       char *buf)
+static ssize_t device_api_show(struct mdev_type *mtype,
+			       struct mdev_type_attribute *attr, char *buf)
 {
 	return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING);
 }
-- 
cgit v1.2.3


From 332d1a0373be32a3a3c152756bca45ff4f4e11b5 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Thu, 25 Mar 2021 18:15:36 -0400
Subject: NFS: nfs4_bitmask_adjust() must not change the server global bitmasks

As currently set, the calls to nfs4_bitmask_adjust() will end up
overwriting the contents of the nfs_server cache_consistency_bitmask
field.
The intention here should be to modify a private copy of that mask in
the close/delegreturn/write arguments.

Fixes: 76bd5c016ef4 ("NFSv4: make cache consistency bitmask dynamic")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs4proc.c       | 56 ++++++++++++++++++++++++++++++-------------------
 include/linux/nfs_xdr.h | 11 +++++++---
 2 files changed, 42 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index bca1726bcd55..1cf98c40e3b2 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -108,9 +108,10 @@ static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *,
 static int nfs41_free_stateid(struct nfs_server *, const nfs4_stateid *,
 		const struct cred *, bool);
 #endif
-static void nfs4_bitmask_adjust(__u32 *bitmask, struct inode *inode,
-		struct nfs_server *server,
-		struct nfs4_label *label);
+static void nfs4_bitmask_set(__u32 bitmask[NFS4_BITMASK_SZ],
+			     const __u32 *src, struct inode *inode,
+			     struct nfs_server *server,
+			     struct nfs4_label *label);
 
 #ifdef CONFIG_NFS_V4_SECURITY_LABEL
 static inline struct nfs4_label *
@@ -3591,6 +3592,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
 	struct nfs4_closedata *calldata = data;
 	struct nfs4_state *state = calldata->state;
 	struct inode *inode = calldata->inode;
+	struct nfs_server *server = NFS_SERVER(inode);
 	struct pnfs_layout_hdr *lo;
 	bool is_rdonly, is_wronly, is_rdwr;
 	int call_close = 0;
@@ -3647,8 +3649,10 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
 	if (calldata->arg.fmode == 0 || calldata->arg.fmode == FMODE_READ) {
 		/* Close-to-open cache consistency revalidation */
 		if (!nfs4_have_delegation(inode, FMODE_READ)) {
-			calldata->arg.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
-			nfs4_bitmask_adjust(calldata->arg.bitmask, inode, NFS_SERVER(inode), NULL);
+			nfs4_bitmask_set(calldata->arg.bitmask_store,
+					 server->cache_consistency_bitmask,
+					 inode, server, NULL);
+			calldata->arg.bitmask = calldata->arg.bitmask_store;
 		} else
 			calldata->arg.bitmask = NULL;
 	}
@@ -5416,19 +5420,17 @@ bool nfs4_write_need_cache_consistency_data(struct nfs_pgio_header *hdr)
 	return nfs4_have_delegation(hdr->inode, FMODE_READ) == 0;
 }
 
-static void nfs4_bitmask_adjust(__u32 *bitmask, struct inode *inode,
-				struct nfs_server *server,
-				struct nfs4_label *label)
+static void nfs4_bitmask_set(__u32 bitmask[NFS4_BITMASK_SZ], const __u32 *src,
+			     struct inode *inode, struct nfs_server *server,
+			     struct nfs4_label *label)
 {
-
 	unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity);
+	unsigned int i;
 
-	if ((cache_validity & NFS_INO_INVALID_DATA) ||
-		(cache_validity & NFS_INO_REVAL_PAGECACHE) ||
-		(cache_validity & NFS_INO_REVAL_FORCED) ||
-		(cache_validity & NFS_INO_INVALID_OTHER))
-		nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, label), inode);
+	memcpy(bitmask, src, sizeof(*bitmask) * NFS4_BITMASK_SZ);
 
+	if (cache_validity & (NFS_INO_INVALID_CHANGE | NFS_INO_REVAL_PAGECACHE))
+		bitmask[0] |= FATTR4_WORD0_CHANGE;
 	if (cache_validity & NFS_INO_INVALID_ATIME)
 		bitmask[1] |= FATTR4_WORD1_TIME_ACCESS;
 	if (cache_validity & NFS_INO_INVALID_OTHER)
@@ -5437,16 +5439,22 @@ static void nfs4_bitmask_adjust(__u32 *bitmask, struct inode *inode,
 				FATTR4_WORD1_NUMLINKS;
 	if (label && label->len && cache_validity & NFS_INO_INVALID_LABEL)
 		bitmask[2] |= FATTR4_WORD2_SECURITY_LABEL;
-	if (cache_validity & NFS_INO_INVALID_CHANGE)
-		bitmask[0] |= FATTR4_WORD0_CHANGE;
 	if (cache_validity & NFS_INO_INVALID_CTIME)
 		bitmask[1] |= FATTR4_WORD1_TIME_METADATA;
 	if (cache_validity & NFS_INO_INVALID_MTIME)
 		bitmask[1] |= FATTR4_WORD1_TIME_MODIFY;
-	if (cache_validity & NFS_INO_INVALID_SIZE)
-		bitmask[0] |= FATTR4_WORD0_SIZE;
 	if (cache_validity & NFS_INO_INVALID_BLOCKS)
 		bitmask[1] |= FATTR4_WORD1_SPACE_USED;
+
+	if (nfs4_have_delegation(inode, FMODE_READ) &&
+	    !(cache_validity & NFS_INO_REVAL_FORCED))
+		bitmask[0] &= ~FATTR4_WORD0_SIZE;
+	else if (cache_validity &
+		 (NFS_INO_INVALID_SIZE | NFS_INO_REVAL_PAGECACHE))
+		bitmask[0] |= FATTR4_WORD0_SIZE;
+
+	for (i = 0; i < NFS4_BITMASK_SZ; i++)
+		bitmask[i] &= server->attr_bitmask[i];
 }
 
 static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr,
@@ -5459,8 +5467,10 @@ static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr,
 		hdr->args.bitmask = NULL;
 		hdr->res.fattr = NULL;
 	} else {
-		hdr->args.bitmask = server->cache_consistency_bitmask;
-		nfs4_bitmask_adjust(hdr->args.bitmask, hdr->inode, server, NULL);
+		nfs4_bitmask_set(hdr->args.bitmask_store,
+				 server->cache_consistency_bitmask,
+				 hdr->inode, server, NULL);
+		hdr->args.bitmask = hdr->args.bitmask_store;
 	}
 
 	if (!hdr->pgio_done_cb)
@@ -6502,8 +6512,10 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred,
 
 	data->args.fhandle = &data->fh;
 	data->args.stateid = &data->stateid;
-	data->args.bitmask = server->cache_consistency_bitmask;
-	nfs4_bitmask_adjust(data->args.bitmask, inode, server, NULL);
+	nfs4_bitmask_set(data->args.bitmask_store,
+			 server->cache_consistency_bitmask, inode, server,
+			 NULL);
+	data->args.bitmask = data->args.bitmask_store;
 	nfs_copy_fh(&data->fh, NFS_FH(inode));
 	nfs4_stateid_copy(&data->stateid, stateid);
 	data->res.fattr = &data->fattr;
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 3327239fa2f9..cc29dee508f7 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -15,6 +15,8 @@
 #define NFS_DEF_FILE_IO_SIZE	(4096U)
 #define NFS_MIN_FILE_IO_SIZE	(1024U)
 
+#define NFS_BITMASK_SZ		3
+
 struct nfs4_string {
 	unsigned int len;
 	char *data;
@@ -525,7 +527,8 @@ struct nfs_closeargs {
 	struct nfs_seqid *	seqid;
 	fmode_t			fmode;
 	u32			share_access;
-	u32 *			bitmask;
+	const u32 *		bitmask;
+	u32			bitmask_store[NFS_BITMASK_SZ];
 	struct nfs4_layoutreturn_args *lr_args;
 };
 
@@ -608,7 +611,8 @@ struct nfs4_delegreturnargs {
 	struct nfs4_sequence_args	seq_args;
 	const struct nfs_fh *fhandle;
 	const nfs4_stateid *stateid;
-	u32 * bitmask;
+	const u32 *bitmask;
+	u32 bitmask_store[NFS_BITMASK_SZ];
 	struct nfs4_layoutreturn_args *lr_args;
 };
 
@@ -648,7 +652,8 @@ struct nfs_pgio_args {
 	union {
 		unsigned int		replen;			/* used by read */
 		struct {
-			u32 *			bitmask;	/* used by write */
+			const u32 *		bitmask;	/* used by write */
+			u32 bitmask_store[NFS_BITMASK_SZ];	/* used by write */
 			enum nfs3_stable_how	stable;		/* used by write */
 		};
 	};
-- 
cgit v1.2.3


From 1fe976d308acb6374c899a4ee8025a0a016e453e Mon Sep 17 00:00:00 2001
From: Pali Rohár <pali@kernel.org>
Date: Mon, 12 Apr 2021 18:57:39 +0200
Subject: net: phy: marvell: fix detection of PHY on Topaz switches
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since commit fee2d546414d ("net: phy: marvell: mv88e6390 temperature
sensor reading"), Linux reports the temperature of Topaz hwmon as
constant -75°C.

This is because switches from the Topaz family (88E6141 / 88E6341) have
the address of the temperature sensor register different from Peridot.

This address is instead compatible with 88E1510 PHYs, as was used for
Topaz before the above mentioned commit.

Create a new mapping table between switch family and PHY ID for families
which don't have a model number. And define PHY IDs for Topaz and Peridot
families.

Create a new PHY ID and a new PHY driver for Topaz's internal PHY.
The only difference from Peridot's PHY driver is the HWMON probing
method.

Prior to this change, Topaz's internal PHY is detected by the kernel as:

  PHY [...] driver [Marvell 88E6390] (irq=63)

And afterwards as:

  PHY [...] driver [Marvell 88E6341 Family] (irq=63)

Signed-off-by: Pali Rohár <pali@kernel.org>
BugLink: https://github.com/globalscaletechnologies/linux/issues/1
Fixes: fee2d546414d ("net: phy: marvell: mv88e6390 temperature sensor reading")
Reviewed-by: Marek Behún <kabel@kernel.org>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mv88e6xxx/chip.c | 30 +++++++++++++-----------------
 drivers/net/phy/marvell.c        | 32 +++++++++++++++++++++++++++++---
 include/linux/marvell_phy.h      |  5 +++--
 3 files changed, 45 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 903d619e08ed..e08bf9377140 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -3026,10 +3026,17 @@ out_resources:
 	return err;
 }
 
+/* prod_id for switch families which do not have a PHY model number */
+static const u16 family_prod_id_table[] = {
+	[MV88E6XXX_FAMILY_6341] = MV88E6XXX_PORT_SWITCH_ID_PROD_6341,
+	[MV88E6XXX_FAMILY_6390] = MV88E6XXX_PORT_SWITCH_ID_PROD_6390,
+};
+
 static int mv88e6xxx_mdio_read(struct mii_bus *bus, int phy, int reg)
 {
 	struct mv88e6xxx_mdio_bus *mdio_bus = bus->priv;
 	struct mv88e6xxx_chip *chip = mdio_bus->chip;
+	u16 prod_id;
 	u16 val;
 	int err;
 
@@ -3040,23 +3047,12 @@ static int mv88e6xxx_mdio_read(struct mii_bus *bus, int phy, int reg)
 	err = chip->info->ops->phy_read(chip, bus, phy, reg, &val);
 	mv88e6xxx_reg_unlock(chip);
 
-	if (reg == MII_PHYSID2) {
-		/* Some internal PHYs don't have a model number. */
-		if (chip->info->family != MV88E6XXX_FAMILY_6165)
-			/* Then there is the 6165 family. It gets is
-			 * PHYs correct. But it can also have two
-			 * SERDES interfaces in the PHY address
-			 * space. And these don't have a model
-			 * number. But they are not PHYs, so we don't
-			 * want to give them something a PHY driver
-			 * will recognise.
-			 *
-			 * Use the mv88e6390 family model number
-			 * instead, for anything which really could be
-			 * a PHY,
-			 */
-			if (!(val & 0x3f0))
-				val |= MV88E6XXX_PORT_SWITCH_ID_PROD_6390 >> 4;
+	/* Some internal PHYs don't have a model number. */
+	if (reg == MII_PHYSID2 && !(val & 0x3f0) &&
+	    chip->info->family < ARRAY_SIZE(family_prod_id_table)) {
+		prod_id = family_prod_id_table[chip->info->family];
+		if (prod_id)
+			val |= prod_id >> 4;
 	}
 
 	return err ? err : val;
diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c
index e26a5d663f8a..8018ddf7f316 100644
--- a/drivers/net/phy/marvell.c
+++ b/drivers/net/phy/marvell.c
@@ -3021,9 +3021,34 @@ static struct phy_driver marvell_drivers[] = {
 		.get_stats = marvell_get_stats,
 	},
 	{
-		.phy_id = MARVELL_PHY_ID_88E6390,
+		.phy_id = MARVELL_PHY_ID_88E6341_FAMILY,
 		.phy_id_mask = MARVELL_PHY_ID_MASK,
-		.name = "Marvell 88E6390",
+		.name = "Marvell 88E6341 Family",
+		/* PHY_GBIT_FEATURES */
+		.flags = PHY_POLL_CABLE_TEST,
+		.probe = m88e1510_probe,
+		.config_init = marvell_config_init,
+		.config_aneg = m88e6390_config_aneg,
+		.read_status = marvell_read_status,
+		.config_intr = marvell_config_intr,
+		.handle_interrupt = marvell_handle_interrupt,
+		.resume = genphy_resume,
+		.suspend = genphy_suspend,
+		.read_page = marvell_read_page,
+		.write_page = marvell_write_page,
+		.get_sset_count = marvell_get_sset_count,
+		.get_strings = marvell_get_strings,
+		.get_stats = marvell_get_stats,
+		.get_tunable = m88e1540_get_tunable,
+		.set_tunable = m88e1540_set_tunable,
+		.cable_test_start = marvell_vct7_cable_test_start,
+		.cable_test_tdr_start = marvell_vct5_cable_test_tdr_start,
+		.cable_test_get_status = marvell_vct7_cable_test_get_status,
+	},
+	{
+		.phy_id = MARVELL_PHY_ID_88E6390_FAMILY,
+		.phy_id_mask = MARVELL_PHY_ID_MASK,
+		.name = "Marvell 88E6390 Family",
 		/* PHY_GBIT_FEATURES */
 		.flags = PHY_POLL_CABLE_TEST,
 		.probe = m88e6390_probe,
@@ -3107,7 +3132,8 @@ static struct mdio_device_id __maybe_unused marvell_tbl[] = {
 	{ MARVELL_PHY_ID_88E1540, MARVELL_PHY_ID_MASK },
 	{ MARVELL_PHY_ID_88E1545, MARVELL_PHY_ID_MASK },
 	{ MARVELL_PHY_ID_88E3016, MARVELL_PHY_ID_MASK },
-	{ MARVELL_PHY_ID_88E6390, MARVELL_PHY_ID_MASK },
+	{ MARVELL_PHY_ID_88E6341_FAMILY, MARVELL_PHY_ID_MASK },
+	{ MARVELL_PHY_ID_88E6390_FAMILY, MARVELL_PHY_ID_MASK },
 	{ MARVELL_PHY_ID_88E1340S, MARVELL_PHY_ID_MASK },
 	{ MARVELL_PHY_ID_88E1548P, MARVELL_PHY_ID_MASK },
 	{ }
diff --git a/include/linux/marvell_phy.h b/include/linux/marvell_phy.h
index 52b1610eae68..c544b70dfbd2 100644
--- a/include/linux/marvell_phy.h
+++ b/include/linux/marvell_phy.h
@@ -28,11 +28,12 @@
 /* Marvel 88E1111 in Finisar SFP module with modified PHY ID */
 #define MARVELL_PHY_ID_88E1111_FINISAR	0x01ff0cc0
 
-/* The MV88e6390 Ethernet switch contains embedded PHYs. These PHYs do
+/* These Ethernet switch families contain embedded PHYs, but they do
  * not have a model ID. So the switch driver traps reads to the ID2
  * register and returns the switch family ID
  */
-#define MARVELL_PHY_ID_88E6390		0x01410f90
+#define MARVELL_PHY_ID_88E6341_FAMILY	0x01410f41
+#define MARVELL_PHY_ID_88E6390_FAMILY	0x01410f90
 
 #define MARVELL_PHY_FAMILY_ID(id)	((id) >> 4)
 
-- 
cgit v1.2.3


From 1f3208b2d6975f31b9c7c6bf174b84fe9c97492f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Thu, 25 Mar 2021 11:04:34 -0400
Subject: NFS: Add a cache validity flag argument to nfs_revalidate_inode()

Add an argument to nfs_revalidate_inode() to allow callers to specify
which attributes they need to check for validity.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/dir.c           |  2 +-
 fs/nfs/export.c        |  6 +-----
 fs/nfs/inode.c         | 25 +++++++------------------
 fs/nfs/nfs3acl.c       |  2 +-
 fs/nfs/nfs4proc.c      |  6 +++---
 include/linux/nfs_fs.h |  2 +-
 6 files changed, 14 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 0cd7c59a6601..e924d65c125e 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -3006,7 +3006,7 @@ out_notsup:
 	if (mask & MAY_NOT_BLOCK)
 		return -ECHILD;
 
-	res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+	res = nfs_revalidate_inode(inode, NFS_INO_INVALID_OTHER);
 	if (res == 0)
 		res = generic_permission(&init_user_ns, inode, mask);
 	goto out;
diff --git a/fs/nfs/export.c b/fs/nfs/export.c
index b347e3ce0cc8..37a1a88df771 100644
--- a/fs/nfs/export.c
+++ b/fs/nfs/export.c
@@ -169,11 +169,7 @@ out:
 
 static u64 nfs_fetch_iversion(struct inode *inode)
 {
-	struct nfs_server *server = NFS_SERVER(inode);
-
-	if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_CHANGE |
-						   NFS_INO_REVAL_PAGECACHE))
-		__nfs_revalidate_inode(server, inode);
+	nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE);
 	return inode_peek_iversion_raw(inode);
 }
 
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index d34da63202cc..b9aac408f03a 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -802,16 +802,6 @@ static void nfs_readdirplus_parent_cache_hit(struct dentry *dentry)
 	dput(parent);
 }
 
-static bool nfs_need_revalidate_inode(struct inode *inode)
-{
-	if (NFS_I(inode)->cache_validity &
-			(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL))
-		return true;
-	if (nfs_attribute_cache_expired(inode))
-		return true;
-	return false;
-}
-
 static u32 nfs_get_valid_attrmask(struct inode *inode)
 {
 	unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity);
@@ -1004,7 +994,6 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
 {
 	struct nfs_inode *nfsi;
 	struct inode *inode;
-	struct nfs_server *server;
 
 	if (!(ctx->mode & FMODE_WRITE))
 		return;
@@ -1020,10 +1009,10 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
 		return;
 	if (!list_empty(&nfsi->open_files))
 		return;
-	server = NFS_SERVER(inode);
-	if (server->flags & NFS_MOUNT_NOCTO)
+	if (NFS_SERVER(inode)->flags & NFS_MOUNT_NOCTO)
 		return;
-	nfs_revalidate_inode(server, inode);
+	nfs_revalidate_inode(inode,
+			     NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE);
 }
 EXPORT_SYMBOL_GPL(nfs_close_context);
 
@@ -1278,16 +1267,16 @@ int nfs_attribute_cache_expired(struct inode *inode)
 
 /**
  * nfs_revalidate_inode - Revalidate the inode attributes
- * @server: pointer to nfs_server struct
  * @inode: pointer to inode struct
+ * @flags: cache flags to check
  *
  * Updates inode attribute information by retrieving the data from the server.
  */
-int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
+int nfs_revalidate_inode(struct inode *inode, unsigned long flags)
 {
-	if (!nfs_need_revalidate_inode(inode))
+	if (!nfs_check_cache_invalid(inode, flags))
 		return NFS_STALE(inode) ? -ESTALE : 0;
-	return __nfs_revalidate_inode(server, inode);
+	return __nfs_revalidate_inode(NFS_SERVER(inode), inode);
 }
 EXPORT_SYMBOL_GPL(nfs_revalidate_inode);
 
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index bb386a691e69..9ec560aa4a50 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -65,7 +65,7 @@ struct posix_acl *nfs3_get_acl(struct inode *inode, int type)
 	if (!nfs_server_capable(inode, NFS_CAP_ACLS))
 		return ERR_PTR(-EOPNOTSUPP);
 
-	status = nfs_revalidate_inode(server, inode);
+	status = nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE);
 	if (status < 0)
 		return ERR_PTR(status);
 
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 1cf98c40e3b2..6b990fe5bc1f 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5868,7 +5868,7 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen)
 
 	if (!nfs4_server_supports_acls(server))
 		return -EOPNOTSUPP;
-	ret = nfs_revalidate_inode(server, inode);
+	ret = nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE);
 	if (ret < 0)
 		return ret;
 	if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL)
@@ -7619,7 +7619,7 @@ static int nfs4_xattr_get_nfs4_user(const struct xattr_handler *handler,
 			return -EACCES;
 	}
 
-	ret = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+	ret = nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE);
 	if (ret)
 		return ret;
 
@@ -7650,7 +7650,7 @@ nfs4_listxattr_nfs4_user(struct inode *inode, char *list, size_t list_len)
 			return 0;
 	}
 
-	ret = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+	ret = nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE);
 	if (ret)
 		return ret;
 
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index eadaabd18dc7..624ffd47a9d4 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -386,7 +386,7 @@ extern void nfs_access_set_mask(struct nfs_access_entry *, u32);
 extern int nfs_permission(struct user_namespace *, struct inode *, int);
 extern int nfs_open(struct inode *, struct file *);
 extern int nfs_attribute_cache_expired(struct inode *inode);
-extern int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode);
+extern int nfs_revalidate_inode(struct inode *inode, unsigned long flags);
 extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *);
 extern int nfs_clear_invalid_mapping(struct address_space *mapping);
 extern bool nfs_mapping_need_revalidate_inode(struct inode *inode);
-- 
cgit v1.2.3


From fabf2b341502e894001d70f91309dd6f3785e2dc Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Thu, 25 Mar 2021 13:14:42 -0400
Subject: NFS: Separate tracking of file nlinks cache validity from the
 mode/uid/gid

Rename can cause us to revalidate the access cache, so let's track the
nlinks separately from the mode/uid/gid.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/dir.c           |  2 +-
 fs/nfs/inode.c         | 15 ++++++++++-----
 fs/nfs/nfs4proc.c      | 13 +++++++------
 fs/nfs/nfstrace.h      |  4 +++-
 include/linux/nfs_fs.h |  2 ++
 5 files changed, 23 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e924d65c125e..f748d2294261 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1711,7 +1711,7 @@ static void nfs_drop_nlink(struct inode *inode)
 	NFS_I(inode)->attr_gencount = nfs_inc_attr_generation_counter();
 	nfs_set_cache_invalid(
 		inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME |
-			       NFS_INO_INVALID_OTHER | NFS_INO_REVAL_FORCED);
+			       NFS_INO_INVALID_NLINK | NFS_INO_REVAL_FORCED);
 	spin_unlock(&inode->i_lock);
 }
 
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 3d18e66a4b8f..7bf9138330f2 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -538,7 +538,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
 		if (fattr->valid & NFS_ATTR_FATTR_NLINK)
 			set_nlink(inode, fattr->nlink);
 		else if (nfs_server_capable(inode, NFS_CAP_NLINK))
-			nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
+			nfs_set_cache_invalid(inode, NFS_INO_INVALID_NLINK);
 		if (fattr->valid & NFS_ATTR_FATTR_OWNER)
 			inode->i_uid = fattr->uid;
 		else if (nfs_server_capable(inode, NFS_CAP_OWNER))
@@ -801,8 +801,10 @@ static u32 nfs_get_valid_attrmask(struct inode *inode)
 		reply_mask |= STATX_MTIME;
 	if (!(cache_validity & NFS_INO_INVALID_SIZE))
 		reply_mask |= STATX_SIZE;
+	if (!(cache_validity & NFS_INO_INVALID_NLINK))
+		reply_mask |= STATX_NLINK;
 	if (!(cache_validity & NFS_INO_INVALID_OTHER))
-		reply_mask |= STATX_UID | STATX_GID | STATX_MODE | STATX_NLINK;
+		reply_mask |= STATX_UID | STATX_GID | STATX_MODE;
 	if (!(cache_validity & NFS_INO_INVALID_BLOCKS))
 		reply_mask |= STATX_BLOCKS;
 	return reply_mask;
@@ -868,7 +870,9 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
 		do_update |= cache_validity & NFS_INO_INVALID_MTIME;
 	if (request_mask & STATX_SIZE)
 		do_update |= cache_validity & NFS_INO_INVALID_SIZE;
-	if (request_mask & (STATX_UID | STATX_GID | STATX_MODE | STATX_NLINK))
+	if (request_mask & STATX_NLINK)
+		do_update |= cache_validity & NFS_INO_INVALID_NLINK;
+	if (request_mask & (STATX_UID | STATX_GID | STATX_MODE))
 		do_update |= cache_validity & NFS_INO_INVALID_OTHER;
 	if (request_mask & STATX_BLOCKS)
 		do_update |= cache_validity & NFS_INO_INVALID_BLOCKS;
@@ -1518,7 +1522,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
 
 	/* Has the link count changed? */
 	if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink)
-		invalid |= NFS_INO_INVALID_OTHER;
+		invalid |= NFS_INO_INVALID_NLINK;
 
 	ts = inode->i_atime;
 	if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec64_equal(&ts, &fattr->atime))
@@ -1942,6 +1946,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 					| NFS_INO_INVALID_MTIME
 					| NFS_INO_INVALID_SIZE
 					| NFS_INO_INVALID_BLOCKS
+					| NFS_INO_INVALID_NLINK
 					| NFS_INO_INVALID_OTHER;
 				if (S_ISDIR(inode->i_mode))
 					nfs_force_lookup_revalidate(inode);
@@ -2074,7 +2079,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 		}
 	} else if (server->caps & NFS_CAP_NLINK) {
 		nfsi->cache_validity |= save_cache_validity &
-				(NFS_INO_INVALID_OTHER
+				(NFS_INO_INVALID_NLINK
 				| NFS_INO_REVAL_FORCED);
 		cache_revalidated = false;
 	}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8dc595ce40ca..a74c1c3c4192 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1167,14 +1167,14 @@ int nfs4_call_sync(struct rpc_clnt *clnt,
 static void
 nfs4_inc_nlink_locked(struct inode *inode)
 {
-	nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
+	nfs_set_cache_invalid(inode, NFS_INO_INVALID_NLINK);
 	inc_nlink(inode);
 }
 
 static void
 nfs4_dec_nlink_locked(struct inode *inode)
 {
-	nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
+	nfs_set_cache_invalid(inode, NFS_INO_INVALID_NLINK);
 	drop_nlink(inode);
 }
 
@@ -4717,11 +4717,11 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
 			/* Note: If we moved a directory, nlink will change */
 			nfs4_update_changeattr(old_dir, &res->old_cinfo,
 					res->old_fattr->time_start,
-					NFS_INO_INVALID_OTHER |
+					NFS_INO_INVALID_NLINK |
 					    NFS_INO_INVALID_DATA);
 			nfs4_update_changeattr(new_dir, &res->new_cinfo,
 					res->new_fattr->time_start,
-					NFS_INO_INVALID_OTHER |
+					NFS_INO_INVALID_NLINK |
 					    NFS_INO_INVALID_DATA);
 		} else
 			nfs4_update_changeattr(old_dir, &res->old_cinfo,
@@ -5433,8 +5433,9 @@ static void nfs4_bitmask_set(__u32 bitmask[NFS4_BITMASK_SZ], const __u32 *src,
 		bitmask[1] |= FATTR4_WORD1_TIME_ACCESS;
 	if (cache_validity & NFS_INO_INVALID_OTHER)
 		bitmask[1] |= FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER |
-				FATTR4_WORD1_OWNER_GROUP |
-				FATTR4_WORD1_NUMLINKS;
+				FATTR4_WORD1_OWNER_GROUP;
+	if (cache_validity & NFS_INO_INVALID_NLINK)
+		bitmask[1] |= FATTR4_WORD1_NUMLINKS;
 	if (label && label->len && cache_validity & NFS_INO_INVALID_LABEL)
 		bitmask[2] |= FATTR4_WORD2_SECURITY_LABEL;
 	if (cache_validity & NFS_INO_INVALID_CTIME)
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index cdba6eebe3cb..a0ebc53160dd 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -48,6 +48,7 @@ TRACE_DEFINE_ENUM(NFS_INO_INVALID_OTHER);
 TRACE_DEFINE_ENUM(NFS_INO_DATA_INVAL_DEFER);
 TRACE_DEFINE_ENUM(NFS_INO_INVALID_BLOCKS);
 TRACE_DEFINE_ENUM(NFS_INO_INVALID_XATTR);
+TRACE_DEFINE_ENUM(NFS_INO_INVALID_NLINK);
 
 #define nfs_show_cache_validity(v) \
 	__print_flags(v, "|", \
@@ -65,7 +66,8 @@ TRACE_DEFINE_ENUM(NFS_INO_INVALID_XATTR);
 			{ NFS_INO_INVALID_OTHER, "INVALID_OTHER" }, \
 			{ NFS_INO_DATA_INVAL_DEFER, "DATA_INVAL_DEFER" }, \
 			{ NFS_INO_INVALID_BLOCKS, "INVALID_BLOCKS" }, \
-			{ NFS_INO_INVALID_XATTR, "INVALID_XATTR" })
+			{ NFS_INO_INVALID_XATTR, "INVALID_XATTR" }, \
+			{ NFS_INO_INVALID_NLINK, "INVALID_NLINK" })
 
 TRACE_DEFINE_ENUM(NFS_INO_ADVISE_RDPLUS);
 TRACE_DEFINE_ENUM(NFS_INO_STALE);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 624ffd47a9d4..41165b988dfb 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -246,11 +246,13 @@ struct nfs4_copy_state {
 				BIT(13)		/* Deferred cache invalidation */
 #define NFS_INO_INVALID_BLOCKS	BIT(14)         /* cached blocks are invalid */
 #define NFS_INO_INVALID_XATTR	BIT(15)		/* xattrs are invalid */
+#define NFS_INO_INVALID_NLINK	BIT(16)		/* cached nlinks is invalid */
 
 #define NFS_INO_INVALID_ATTR	(NFS_INO_INVALID_CHANGE \
 		| NFS_INO_INVALID_CTIME \
 		| NFS_INO_INVALID_MTIME \
 		| NFS_INO_INVALID_SIZE \
+		| NFS_INO_INVALID_NLINK \
 		| NFS_INO_INVALID_OTHER)	/* inode metadata is invalid */
 
 /*
-- 
cgit v1.2.3


From 720869eb19f3161980d6d4631d3df7e8c5355993 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Tue, 13 Apr 2021 09:41:16 -0400
Subject: NFS: Separate tracking of file mode cache validity from the uid/gid

chown()/chgrp() and chmod() are separate operations, and in addition,
there are mode operations that are performed automatically by the
server. So let's track mode validity separately from the file ownership
validity.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/dir.c           |  5 +++--
 fs/nfs/inode.c         | 18 ++++++++++++------
 fs/nfs/nfs4proc.c      | 14 +++++++++-----
 fs/nfs/nfstrace.h      |  4 +++-
 fs/nfs/write.c         |  2 +-
 include/linux/nfs_fs.h |  2 ++
 6 files changed, 30 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index f748d2294261..d2835d211a73 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2948,7 +2948,7 @@ static int nfs_execute_ok(struct inode *inode, int mask)
 
 	if (S_ISDIR(inode->i_mode))
 		return 0;
-	if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_OTHER)) {
+	if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_MODE)) {
 		if (mask & MAY_NOT_BLOCK)
 			return -ECHILD;
 		ret = __nfs_revalidate_inode(server, inode);
@@ -3006,7 +3006,8 @@ out_notsup:
 	if (mask & MAY_NOT_BLOCK)
 		return -ECHILD;
 
-	res = nfs_revalidate_inode(inode, NFS_INO_INVALID_OTHER);
+	res = nfs_revalidate_inode(inode, NFS_INO_INVALID_MODE |
+						  NFS_INO_INVALID_OTHER);
 	if (res == 0)
 		res = generic_permission(&init_user_ns, inode, mask);
 	goto out;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 7bf9138330f2..81e3e140e923 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -199,7 +199,8 @@ void nfs_set_cache_invalid(struct inode *inode, unsigned long flags)
 
 	if (have_delegation) {
 		if (!(flags & NFS_INO_REVAL_FORCED))
-			flags &= ~NFS_INO_INVALID_OTHER;
+			flags &= ~(NFS_INO_INVALID_MODE |
+				   NFS_INO_INVALID_OTHER);
 		flags &= ~(NFS_INO_INVALID_CHANGE
 				| NFS_INO_INVALID_SIZE
 				| NFS_INO_INVALID_XATTR);
@@ -472,7 +473,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
 		nfsi->cache_validity = 0;
 		if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0
 				&& nfs_server_capable(inode, NFS_CAP_MODE))
-			nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
+			nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE);
 		/* Why so? Because we want revalidate for devices/FIFOs, and
 		 * that's precisely what we have in nfs_file_inode_operations.
 		 */
@@ -803,8 +804,10 @@ static u32 nfs_get_valid_attrmask(struct inode *inode)
 		reply_mask |= STATX_SIZE;
 	if (!(cache_validity & NFS_INO_INVALID_NLINK))
 		reply_mask |= STATX_NLINK;
+	if (!(cache_validity & NFS_INO_INVALID_MODE))
+		reply_mask |= STATX_MODE;
 	if (!(cache_validity & NFS_INO_INVALID_OTHER))
-		reply_mask |= STATX_UID | STATX_GID | STATX_MODE;
+		reply_mask |= STATX_UID | STATX_GID;
 	if (!(cache_validity & NFS_INO_INVALID_BLOCKS))
 		reply_mask |= STATX_BLOCKS;
 	return reply_mask;
@@ -872,7 +875,9 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
 		do_update |= cache_validity & NFS_INO_INVALID_SIZE;
 	if (request_mask & STATX_NLINK)
 		do_update |= cache_validity & NFS_INO_INVALID_NLINK;
-	if (request_mask & (STATX_UID | STATX_GID | STATX_MODE))
+	if (request_mask & STATX_MODE)
+		do_update |= cache_validity & NFS_INO_INVALID_MODE;
+	if (request_mask & (STATX_UID | STATX_GID))
 		do_update |= cache_validity & NFS_INO_INVALID_OTHER;
 	if (request_mask & STATX_BLOCKS)
 		do_update |= cache_validity & NFS_INO_INVALID_BLOCKS;
@@ -1510,7 +1515,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
 	if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO))
 		invalid |= NFS_INO_INVALID_ACCESS
 			| NFS_INO_INVALID_ACL
-			| NFS_INO_INVALID_OTHER;
+			| NFS_INO_INVALID_MODE;
 	if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && !uid_eq(inode->i_uid, fattr->uid))
 		invalid |= NFS_INO_INVALID_ACCESS
 			| NFS_INO_INVALID_ACL
@@ -1947,6 +1952,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 					| NFS_INO_INVALID_SIZE
 					| NFS_INO_INVALID_BLOCKS
 					| NFS_INO_INVALID_NLINK
+					| NFS_INO_INVALID_MODE
 					| NFS_INO_INVALID_OTHER;
 				if (S_ISDIR(inode->i_mode))
 					nfs_force_lookup_revalidate(inode);
@@ -2037,7 +2043,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 		}
 	} else if (server->caps & NFS_CAP_MODE) {
 		nfsi->cache_validity |= save_cache_validity &
-				(NFS_INO_INVALID_OTHER
+				(NFS_INO_INVALID_MODE
 				| NFS_INO_REVAL_FORCED);
 		cache_revalidated = false;
 	}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index a74c1c3c4192..bc90f2a12d5d 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -302,9 +302,10 @@ static void nfs4_bitmap_copy_adjust(__u32 *dst, const __u32 *src,
 	if (!(cache_validity & NFS_INO_INVALID_CHANGE))
 		dst[0] &= ~FATTR4_WORD0_CHANGE;
 
+	if (!(cache_validity & NFS_INO_INVALID_MODE))
+		dst[1] &= ~FATTR4_WORD1_MODE;
 	if (!(cache_validity & NFS_INO_INVALID_OTHER))
-		dst[1] &= ~(FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER |
-			    FATTR4_WORD1_OWNER_GROUP);
+		dst[1] &= ~(FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP);
 }
 
 static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dentry,
@@ -3344,7 +3345,9 @@ static int nfs4_do_setattr(struct inode *inode, const struct cred *cred,
 	unsigned long adjust_flags = NFS_INO_INVALID_CHANGE;
 	int err;
 
-	if (sattr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID))
+	if (sattr->ia_valid & (ATTR_MODE | ATTR_KILL_SUID | ATTR_KILL_SGID))
+		adjust_flags |= NFS_INO_INVALID_MODE;
+	if (sattr->ia_valid & (ATTR_UID | ATTR_GID))
 		adjust_flags |= NFS_INO_INVALID_OTHER;
 
 	do {
@@ -5431,9 +5434,10 @@ static void nfs4_bitmask_set(__u32 bitmask[NFS4_BITMASK_SZ], const __u32 *src,
 		bitmask[0] |= FATTR4_WORD0_CHANGE;
 	if (cache_validity & NFS_INO_INVALID_ATIME)
 		bitmask[1] |= FATTR4_WORD1_TIME_ACCESS;
+	if (cache_validity & NFS_INO_INVALID_MODE)
+		bitmask[1] |= FATTR4_WORD1_MODE;
 	if (cache_validity & NFS_INO_INVALID_OTHER)
-		bitmask[1] |= FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER |
-				FATTR4_WORD1_OWNER_GROUP;
+		bitmask[1] |= FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP;
 	if (cache_validity & NFS_INO_INVALID_NLINK)
 		bitmask[1] |= FATTR4_WORD1_NUMLINKS;
 	if (label && label->len && cache_validity & NFS_INO_INVALID_LABEL)
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index a0ebc53160dd..41a161cd31f6 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -49,6 +49,7 @@ TRACE_DEFINE_ENUM(NFS_INO_DATA_INVAL_DEFER);
 TRACE_DEFINE_ENUM(NFS_INO_INVALID_BLOCKS);
 TRACE_DEFINE_ENUM(NFS_INO_INVALID_XATTR);
 TRACE_DEFINE_ENUM(NFS_INO_INVALID_NLINK);
+TRACE_DEFINE_ENUM(NFS_INO_INVALID_MODE);
 
 #define nfs_show_cache_validity(v) \
 	__print_flags(v, "|", \
@@ -67,7 +68,8 @@ TRACE_DEFINE_ENUM(NFS_INO_INVALID_NLINK);
 			{ NFS_INO_DATA_INVAL_DEFER, "DATA_INVAL_DEFER" }, \
 			{ NFS_INO_INVALID_BLOCKS, "INVALID_BLOCKS" }, \
 			{ NFS_INO_INVALID_XATTR, "INVALID_XATTR" }, \
-			{ NFS_INO_INVALID_NLINK, "INVALID_NLINK" })
+			{ NFS_INO_INVALID_NLINK, "INVALID_NLINK" }, \
+			{ NFS_INO_INVALID_MODE, "INVALID_MODE" })
 
 TRACE_DEFINE_ENUM(NFS_INO_ADVISE_RDPLUS);
 TRACE_DEFINE_ENUM(NFS_INO_STALE);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 7a39b3d424da..61d1174935b6 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1604,7 +1604,7 @@ static int nfs_writeback_done(struct rpc_task *task,
 	/* Deal with the suid/sgid bit corner case */
 	if (nfs_should_remove_suid(inode)) {
 		spin_lock(&inode->i_lock);
-		nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
+		nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE);
 		spin_unlock(&inode->i_lock);
 	}
 	return 0;
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 41165b988dfb..ffba254d2098 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -247,12 +247,14 @@ struct nfs4_copy_state {
 #define NFS_INO_INVALID_BLOCKS	BIT(14)         /* cached blocks are invalid */
 #define NFS_INO_INVALID_XATTR	BIT(15)		/* xattrs are invalid */
 #define NFS_INO_INVALID_NLINK	BIT(16)		/* cached nlinks is invalid */
+#define NFS_INO_INVALID_MODE	BIT(17)		/* cached mode is invalid */
 
 #define NFS_INO_INVALID_ATTR	(NFS_INO_INVALID_CHANGE \
 		| NFS_INO_INVALID_CTIME \
 		| NFS_INO_INVALID_MTIME \
 		| NFS_INO_INVALID_SIZE \
 		| NFS_INO_INVALID_NLINK \
+		| NFS_INO_INVALID_MODE \
 		| NFS_INO_INVALID_OTHER)	/* inode metadata is invalid */
 
 /*
-- 
cgit v1.2.3


From 7f08a3359a3c1e39c2a118fbbe583d8c8db14ace Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Fri, 26 Mar 2021 09:50:19 -0400
Subject: NFSv4: Add support for the NFSv4.2 "change_attr_type" attribute

The change_attr_type allows the server to provide a description of how
the change attribute will behave. This again will allow the client to
optimise its behaviour w.r.t. attribute revalidation.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/client.c           |  3 +++
 fs/nfs/nfs3xdr.c          |  1 +
 fs/nfs/nfs4proc.c         |  1 +
 fs/nfs/nfs4xdr.c          | 32 ++++++++++++++++++++++++++++++++
 fs/nfs/proc.c             |  1 +
 include/linux/nfs4.h      |  9 +++++++++
 include/linux/nfs_fs_sb.h |  3 +++
 include/linux/nfs_xdr.h   |  2 ++
 8 files changed, 52 insertions(+)

(limited to 'include/linux')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 399a8eb15397..2aeb4e52a4f1 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -792,6 +792,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
 	server->maxfilesize = fsinfo->maxfilesize;
 
 	server->time_delta = fsinfo->time_delta;
+	server->change_attr_type = fsinfo->change_attr_type;
 
 	server->clone_blksize = fsinfo->clone_blksize;
 	/* We're airborne Set socket buffersize */
@@ -933,6 +934,8 @@ struct nfs_server *nfs_alloc_server(void)
 		return NULL;
 	}
 
+	server->change_attr_type = NFS4_CHANGE_TYPE_IS_UNDEFINED;
+
 	ida_init(&server->openowner_id);
 	ida_init(&server->lockowner_id);
 	pnfs_init_server(server);
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index ed1c83738c30..83ad62c81fc7 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -2227,6 +2227,7 @@ static int decode_fsinfo3resok(struct xdr_stream *xdr,
 
 	/* ignore properties */
 	result->lease_time = 0;
+	result->change_attr_type = NFS4_CHANGE_TYPE_IS_TIME_METADATA;
 	return 0;
 }
 
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index bc90f2a12d5d..6992c88a25e7 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -264,6 +264,7 @@ const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE
 			| FATTR4_WORD1_FS_LAYOUT_TYPES,
 			FATTR4_WORD2_LAYOUT_BLKSIZE
 			| FATTR4_WORD2_CLONE_BLKSIZE
+			| FATTR4_WORD2_CHANGE_ATTR_TYPE
 			| FATTR4_WORD2_XATTR_SUPPORT
 };
 
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index d8a1911dd39e..edac4718dec1 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -153,6 +153,7 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
 				 5 /* fs layout types */ + \
 				 1 /* layout blksize */ + \
 				 1 /* clone blksize */ + \
+				 1 /* change attr type */ + \
 				 1 /* xattr support */)
 #define encode_renew_maxsz	(op_encode_hdr_maxsz + 3)
 #define decode_renew_maxsz	(op_decode_hdr_maxsz)
@@ -4846,6 +4847,32 @@ static int decode_attr_clone_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
 	return 0;
 }
 
+static int decode_attr_change_attr_type(struct xdr_stream *xdr,
+					uint32_t *bitmap,
+					enum nfs4_change_attr_type *res)
+{
+	u32 tmp = NFS4_CHANGE_TYPE_IS_UNDEFINED;
+
+	dprintk("%s: bitmap is %x\n", __func__, bitmap[2]);
+	if (bitmap[2] & FATTR4_WORD2_CHANGE_ATTR_TYPE) {
+		if (xdr_stream_decode_u32(xdr, &tmp))
+			return -EIO;
+		bitmap[2] &= ~FATTR4_WORD2_CHANGE_ATTR_TYPE;
+	}
+
+	switch(tmp) {
+	case NFS4_CHANGE_TYPE_IS_MONOTONIC_INCR:
+	case NFS4_CHANGE_TYPE_IS_VERSION_COUNTER:
+	case NFS4_CHANGE_TYPE_IS_VERSION_COUNTER_NOPNFS:
+	case NFS4_CHANGE_TYPE_IS_TIME_METADATA:
+		*res = tmp;
+		break;
+	default:
+		*res = NFS4_CHANGE_TYPE_IS_UNDEFINED;
+	}
+	return 0;
+}
+
 static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
 {
 	unsigned int savep;
@@ -4894,6 +4921,11 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
 	if (status)
 		goto xdr_error;
 
+	status = decode_attr_change_attr_type(xdr, bitmap,
+					      &fsinfo->change_attr_type);
+	if (status)
+		goto xdr_error;
+
 	status = decode_attr_xattrsupport(xdr, bitmap,
 					  &fsinfo->xattr_support);
 	if (status)
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 73ab7c59d3a7..ea19dbf12301 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -91,6 +91,7 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
 	info->dtpref = fsinfo.tsize;
 	info->maxfilesize = 0x7FFFFFFF;
 	info->lease_time = 0;
+	info->change_attr_type = NFS4_CHANGE_TYPE_IS_TIME_METADATA;
 	return 0;
 }
 
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 5b4c67c91f56..15004c469807 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -452,6 +452,7 @@ enum lock_type4 {
 #define FATTR4_WORD2_LAYOUT_BLKSIZE     (1UL << 1)
 #define FATTR4_WORD2_MDSTHRESHOLD       (1UL << 4)
 #define FATTR4_WORD2_CLONE_BLKSIZE	(1UL << 13)
+#define FATTR4_WORD2_CHANGE_ATTR_TYPE	(1UL << 15)
 #define FATTR4_WORD2_SECURITY_LABEL     (1UL << 16)
 #define FATTR4_WORD2_MODE_UMASK		(1UL << 17)
 #define FATTR4_WORD2_XATTR_SUPPORT	(1UL << 18)
@@ -709,6 +710,14 @@ struct nl4_server {
 	} u;
 };
 
+enum nfs4_change_attr_type {
+	NFS4_CHANGE_TYPE_IS_MONOTONIC_INCR = 0,
+	NFS4_CHANGE_TYPE_IS_VERSION_COUNTER = 1,
+	NFS4_CHANGE_TYPE_IS_VERSION_COUNTER_NOPNFS = 2,
+	NFS4_CHANGE_TYPE_IS_TIME_METADATA = 3,
+	NFS4_CHANGE_TYPE_IS_UNDEFINED = 4,
+};
+
 /*
  * Options for setxattr. These match the flags for setxattr(2).
  */
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 6f76b32a0238..fbcdfd9f7a7f 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -180,6 +180,9 @@ struct nfs_server {
 #define NFS_OPTION_FSCACHE	0x00000001	/* - local caching enabled */
 #define NFS_OPTION_MIGRATION	0x00000002	/* - NFSv4 migration enabled */
 
+	enum nfs4_change_attr_type
+				change_attr_type;/* Description of change attribute */
+
 	struct nfs_fsid		fsid;
 	__u64			maxfilesize;	/* maximum file size */
 	struct timespec64	time_delta;	/* smallest time granularity */
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index cc29dee508f7..717ecc87c9e7 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -152,6 +152,8 @@ struct nfs_fsinfo {
 	__u32			layouttype[NFS_MAX_LAYOUT_TYPES]; /* supported pnfs layout driver */
 	__u32			blksize; /* preferred pnfs io block size */
 	__u32			clone_blksize; /* granularity of a CLONE operation */
+	enum nfs4_change_attr_type
+				change_attr_type; /* Info about change attr */
 	__u32			xattr_support; /* User xattrs supported */
 };
 
-- 
cgit v1.2.3


From 655cdafdec1105d0552aa19ffb5ffef7aead1548 Mon Sep 17 00:00:00 2001
From: Zhang Yunkai <zhang.yunkai@zte.com.cn>
Date: Tue, 13 Apr 2021 10:52:56 +0000
Subject: lightnvm: remove duplicate include in lightnvm.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

'linux/blkdev.h' and 'uapi/linux/lightnvm.h' are included twice in
'lightnvm.h'. They are also included on the 5th and 7th lines.

Signed-off-by: Zhang Yunkai <zhang.yunkai@zte.com.cn>
Signed-off-by: Matias Bjørling <matias.bjorling@wdc.com>
Link: https://lore.kernel.org/r/20210413105257.159260-4-matias.bjorling@wdc.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/lightnvm.h      | 2 --
 include/uapi/linux/lightnvm.h | 1 -
 2 files changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 1db223710b28..0908abda9c1b 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -112,10 +112,8 @@ struct nvm_dev_ops {
 
 #ifdef CONFIG_NVM
 
-#include <linux/blkdev.h>
 #include <linux/file.h>
 #include <linux/dmapool.h>
-#include <uapi/linux/lightnvm.h>
 
 enum {
 	/* HW Responsibilities */
diff --git a/include/uapi/linux/lightnvm.h b/include/uapi/linux/lightnvm.h
index ead2e72e5c88..2745afd9b8fa 100644
--- a/include/uapi/linux/lightnvm.h
+++ b/include/uapi/linux/lightnvm.h
@@ -22,7 +22,6 @@
 
 #ifdef __KERNEL__
 #include <linux/const.h>
-#include <linux/ioctl.h>
 #else /* __KERNEL__ */
 #include <stdio.h>
 #include <sys/ioctl.h>
-- 
cgit v1.2.3


From 63f9c44bca5e10fb1fd86aee7e38039ed98f95cc Mon Sep 17 00:00:00 2001
From: Maor Gottlieb <maorg@nvidia.com>
Date: Sun, 11 Apr 2021 15:29:18 +0300
Subject: net/mlx5: Add MEMIC operations related bits

Add the MEMIC operations bits and structures to the mlx5_ifc file.

Signed-off-by: Maor Gottlieb <maorg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 42 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 41 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 432290b58a0b..47241ebfcf7d 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -133,6 +133,7 @@ enum {
 	MLX5_CMD_OP_PAGE_FAULT_RESUME             = 0x204,
 	MLX5_CMD_OP_ALLOC_MEMIC                   = 0x205,
 	MLX5_CMD_OP_DEALLOC_MEMIC                 = 0x206,
+	MLX5_CMD_OP_MODIFY_MEMIC                  = 0x207,
 	MLX5_CMD_OP_CREATE_EQ                     = 0x301,
 	MLX5_CMD_OP_DESTROY_EQ                    = 0x302,
 	MLX5_CMD_OP_QUERY_EQ                      = 0x303,
@@ -1017,7 +1018,11 @@ struct mlx5_ifc_device_mem_cap_bits {
 
 	u8         header_modify_sw_icm_start_address[0x40];
 
-	u8         reserved_at_180[0x680];
+	u8         reserved_at_180[0x80];
+
+	u8         memic_operations[0x20];
+
+	u8         reserved_at_220[0x5e0];
 };
 
 struct mlx5_ifc_device_event_cap_bits {
@@ -10417,6 +10422,41 @@ struct mlx5_ifc_destroy_vport_lag_in_bits {
 	u8         reserved_at_40[0x40];
 };
 
+enum {
+	MLX5_MODIFY_MEMIC_OP_MOD_ALLOC,
+	MLX5_MODIFY_MEMIC_OP_MOD_DEALLOC,
+};
+
+struct mlx5_ifc_modify_memic_in_bits {
+	u8         opcode[0x10];
+	u8         uid[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_at_40[0x20];
+
+	u8         reserved_at_60[0x18];
+	u8         memic_operation_type[0x8];
+
+	u8         memic_start_addr[0x40];
+
+	u8         reserved_at_c0[0x140];
+};
+
+struct mlx5_ifc_modify_memic_out_bits {
+	u8         status[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_at_40[0x40];
+
+	u8         memic_operation_addr[0x40];
+
+	u8         reserved_at_c0[0x140];
+};
+
 struct mlx5_ifc_alloc_memic_in_bits {
 	u8         opcode[0x10];
 	u8         reserved_at_10[0x10];
-- 
cgit v1.2.3


From 83216e3988cd196183542937c9bd58b279f946af Mon Sep 17 00:00:00 2001
From: Michael Walle <michael@walle.cc>
Date: Mon, 12 Apr 2021 19:47:17 +0200
Subject: of: net: pass the dst buffer to of_get_mac_address()

of_get_mac_address() returns a "const void*" pointer to a MAC address.
Lately, support to fetch the MAC address by an NVMEM provider was added.
But this will only work with platform devices. It will not work with
PCI devices (e.g. of an integrated root complex) and esp. not with DSA
ports.

There is an of_* variant of the nvmem binding which works without
devices. The returned data of a nvmem_cell_read() has to be freed after
use. On the other hand, the return value of of_get_mac_address() points
to some static data without a lifetime. The trick for now was to allocate
a device-resource-managed buffer which is then returned. This will only
work if we have an actual device.

Change it so that the caller of of_get_mac_address() has to supply a
buffer to which the MAC address is written. Unfortunately, this will
touch all drivers which use of_get_mac_address().

Usually the code looks like:

  const char *addr;
  addr = of_get_mac_address(np);
  if (!IS_ERR(addr))
    ether_addr_copy(ndev->dev_addr, addr);

This can then be simply rewritten as:

  of_get_mac_address(np, ndev->dev_addr);

Sometimes is_valid_ether_addr() is used to test the MAC address.
of_get_mac_address() already makes sure that it only returns a valid MAC
address. Thus we can just test its return code. But we have to be
careful if there are still other sources for the MAC address before the
of_get_mac_address(). In this case we have to keep the
is_valid_ether_addr() call.

The following coccinelle patch was used to convert common cases to the
new style. Afterwards, I've manually gone over the drivers and fixed the
return code variable: either adding a new one or, if one was already
available, using that. Mansour Moufid, thanks for that coccinelle patch!

<spml>
@a@
identifier x;
expression y, z;
@@
- x = of_get_mac_address(y);
+ x = of_get_mac_address(y, z);
  <...
- ether_addr_copy(z, x);
  ...>

@@
identifier a.x;
@@
- if (<+... x ...+>) {}

@@
identifier a.x;
@@
  if (<+... x ...+>) {
      ...
  }
- else {}

@@
identifier a.x;
expression e;
@@
- if (<+... x ...+>@e)
-     {}
- else
+ if (!(e))
      {...}

@@
expression x, y, z;
@@
- x = of_get_mac_address(y, z);
+ of_get_mac_address(y, z);
  ... when != x
</spml>

All drivers, except drivers/net/ethernet/aeroflex/greth.c, were
compile-time tested.

Suggested-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Michael Walle <michael@walle.cc>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/arm/mach-mvebu/kirkwood.c                     |  3 +-
 arch/powerpc/sysdev/tsi108_dev.c                   |  5 +-
 drivers/net/ethernet/aeroflex/greth.c              |  6 +--
 drivers/net/ethernet/allwinner/sun4i-emac.c        | 10 ++--
 drivers/net/ethernet/altera/altera_tse_main.c      |  7 +--
 drivers/net/ethernet/arc/emac_main.c               |  8 +--
 drivers/net/ethernet/atheros/ag71xx.c              |  7 +--
 drivers/net/ethernet/broadcom/bcm4908_enet.c       |  7 +--
 drivers/net/ethernet/broadcom/bcmsysport.c         |  7 +--
 drivers/net/ethernet/broadcom/bgmac-bcma.c         | 10 ++--
 drivers/net/ethernet/broadcom/bgmac-platform.c     | 11 ++--
 drivers/net/ethernet/cadence/macb_main.c           | 11 ++--
 drivers/net/ethernet/cavium/octeon/octeon_mgmt.c   |  8 +--
 drivers/net/ethernet/cavium/thunder/thunder_bgx.c  |  5 +-
 drivers/net/ethernet/davicom/dm9000.c              | 10 ++--
 drivers/net/ethernet/ethoc.c                       |  6 +--
 drivers/net/ethernet/ezchip/nps_enet.c             |  7 +--
 drivers/net/ethernet/freescale/fec_main.c          |  7 +--
 drivers/net/ethernet/freescale/fec_mpc52xx.c       |  7 +--
 drivers/net/ethernet/freescale/fman/mac.c          |  9 ++--
 .../net/ethernet/freescale/fs_enet/fs_enet-main.c  |  5 +-
 drivers/net/ethernet/freescale/gianfar.c           |  8 +--
 drivers/net/ethernet/freescale/ucc_geth.c          |  5 +-
 drivers/net/ethernet/hisilicon/hisi_femac.c        |  7 +--
 drivers/net/ethernet/hisilicon/hix5hd2_gmac.c      |  7 +--
 drivers/net/ethernet/lantiq_xrx200.c               |  7 +--
 drivers/net/ethernet/marvell/mv643xx_eth.c         |  5 +-
 drivers/net/ethernet/marvell/mvneta.c              |  6 +--
 .../net/ethernet/marvell/prestera/prestera_main.c  | 11 ++--
 drivers/net/ethernet/marvell/pxa168_eth.c          |  9 +---
 drivers/net/ethernet/marvell/sky2.c                |  8 ++-
 drivers/net/ethernet/mediatek/mtk_eth_soc.c        | 11 ++--
 drivers/net/ethernet/micrel/ks8851_common.c        |  7 ++-
 drivers/net/ethernet/microchip/lan743x_main.c      |  5 +-
 drivers/net/ethernet/nxp/lpc_eth.c                 |  4 +-
 drivers/net/ethernet/qualcomm/qca_spi.c            | 10 ++--
 drivers/net/ethernet/qualcomm/qca_uart.c           |  9 +---
 drivers/net/ethernet/renesas/ravb_main.c           | 12 +++--
 drivers/net/ethernet/renesas/sh_eth.c              |  5 +-
 .../net/ethernet/samsung/sxgbe/sxgbe_platform.c    | 13 ++---
 drivers/net/ethernet/socionext/sni_ave.c           | 10 ++--
 .../net/ethernet/stmicro/stmmac/dwmac-anarion.c    |  2 +-
 .../ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c    |  2 +-
 .../net/ethernet/stmicro/stmmac/dwmac-generic.c    |  2 +-
 drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c    |  2 +-
 .../net/ethernet/stmicro/stmmac/dwmac-intel-plat.c |  2 +-
 .../net/ethernet/stmicro/stmmac/dwmac-ipq806x.c    |  2 +-
 .../net/ethernet/stmicro/stmmac/dwmac-lpc18xx.c    |  2 +-
 .../net/ethernet/stmicro/stmmac/dwmac-mediatek.c   |  2 +-
 drivers/net/ethernet/stmicro/stmmac/dwmac-meson.c  |  2 +-
 .../net/ethernet/stmicro/stmmac/dwmac-meson8b.c    |  2 +-
 drivers/net/ethernet/stmicro/stmmac/dwmac-oxnas.c  |  2 +-
 .../ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c    |  2 +-
 drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c     |  2 +-
 .../net/ethernet/stmicro/stmmac/dwmac-socfpga.c    |  2 +-
 drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c    |  2 +-
 drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c  |  2 +-
 drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c  |  2 +-
 drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c  |  2 +-
 .../net/ethernet/stmicro/stmmac/dwmac-visconti.c   |  2 +-
 drivers/net/ethernet/stmicro/stmmac/stmmac.h       |  2 +-
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  |  2 +-
 .../net/ethernet/stmicro/stmmac/stmmac_platform.c  | 14 ++---
 .../net/ethernet/stmicro/stmmac/stmmac_platform.h  |  2 +-
 drivers/net/ethernet/ti/am65-cpsw-nuss.c           | 19 ++++---
 drivers/net/ethernet/ti/cpsw.c                     |  7 +--
 drivers/net/ethernet/ti/cpsw_new.c                 |  7 +--
 drivers/net/ethernet/ti/davinci_emac.c             |  8 +--
 drivers/net/ethernet/ti/netcp_core.c               |  7 +--
 drivers/net/ethernet/wiznet/w5100-spi.c            |  8 ++-
 drivers/net/ethernet/wiznet/w5100.c                |  2 +-
 drivers/net/ethernet/xilinx/ll_temac_main.c        |  8 +--
 drivers/net/ethernet/xilinx/xilinx_axienet_main.c  | 15 +++---
 drivers/net/ethernet/xilinx/xilinx_emaclite.c      |  8 +--
 drivers/net/wireless/ath/ath9k/init.c              |  5 +-
 drivers/net/wireless/mediatek/mt76/eeprom.c        |  9 +---
 drivers/net/wireless/ralink/rt2x00/rt2x00dev.c     |  6 +--
 drivers/of/of_net.c                                | 60 ++++++++++------------
 drivers/staging/octeon/ethernet.c                  | 10 ++--
 drivers/staging/wfx/main.c                         |  7 ++-
 include/linux/of_net.h                             |  6 +--
 include/net/dsa.h                                  |  2 +-
 net/dsa/dsa2.c                                     |  2 +-
 net/dsa/slave.c                                    |  2 +-
 net/ethernet/eth.c                                 | 11 ++--
 85 files changed, 218 insertions(+), 364 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-mvebu/kirkwood.c b/arch/arm/mach-mvebu/kirkwood.c
index ceaad6d5927e..06b1706595f4 100644
--- a/arch/arm/mach-mvebu/kirkwood.c
+++ b/arch/arm/mach-mvebu/kirkwood.c
@@ -84,6 +84,7 @@ static void __init kirkwood_dt_eth_fixup(void)
 		struct device_node *pnp = of_get_parent(np);
 		struct clk *clk;
 		struct property *pmac;
+		u8 tmpmac[ETH_ALEN];
 		void __iomem *io;
 		u8 *macaddr;
 		u32 reg;
@@ -93,7 +94,7 @@ static void __init kirkwood_dt_eth_fixup(void)
 
 		/* skip disabled nodes or nodes with valid MAC address*/
 		if (!of_device_is_available(pnp) ||
-		    !IS_ERR(of_get_mac_address(np)))
+		    !of_get_mac_address(np, tmpmac))
 			goto eth_fixup_skip;
 
 		clk = of_clk_get(pnp, 0);
diff --git a/arch/powerpc/sysdev/tsi108_dev.c b/arch/powerpc/sysdev/tsi108_dev.c
index 0baec82510b9..4c4a6efd5e5f 100644
--- a/arch/powerpc/sysdev/tsi108_dev.c
+++ b/arch/powerpc/sysdev/tsi108_dev.c
@@ -73,7 +73,6 @@ static int __init tsi108_eth_of_init(void)
 		struct device_node *phy, *mdio;
 		hw_info tsi_eth_data;
 		const unsigned int *phy_id;
-		const void *mac_addr;
 		const phandle *ph;
 
 		memset(r, 0, sizeof(r));
@@ -101,9 +100,7 @@ static int __init tsi108_eth_of_init(void)
 			goto err;
 		}
 
-		mac_addr = of_get_mac_address(np);
-		if (!IS_ERR(mac_addr))
-			ether_addr_copy(tsi_eth_data.mac_addr, mac_addr);
+		of_get_mac_address(np, tsi_eth_data.mac_addr);
 
 		ph = of_get_property(np, "mdio-handle", NULL);
 		mdio = of_find_node_by_phandle(*ph);
diff --git a/drivers/net/ethernet/aeroflex/greth.c b/drivers/net/ethernet/aeroflex/greth.c
index 9c5891bbfe61..d77fafbc1530 100644
--- a/drivers/net/ethernet/aeroflex/greth.c
+++ b/drivers/net/ethernet/aeroflex/greth.c
@@ -1449,10 +1449,10 @@ static int greth_of_probe(struct platform_device *ofdev)
 			break;
 	}
 	if (i == 6) {
-		const u8 *addr;
+		u8 addr[ETH_ALEN];
 
-		addr = of_get_mac_address(ofdev->dev.of_node);
-		if (!IS_ERR(addr)) {
+		err = of_get_mac_address(ofdev->dev.of_node, addr);
+		if (!err) {
 			for (i = 0; i < 6; i++)
 				macaddr[i] = (unsigned int) addr[i];
 		} else {
diff --git a/drivers/net/ethernet/allwinner/sun4i-emac.c b/drivers/net/ethernet/allwinner/sun4i-emac.c
index 5ed80d9a6b9f..f99ae317c188 100644
--- a/drivers/net/ethernet/allwinner/sun4i-emac.c
+++ b/drivers/net/ethernet/allwinner/sun4i-emac.c
@@ -790,7 +790,6 @@ static int emac_probe(struct platform_device *pdev)
 	struct emac_board_info *db;
 	struct net_device *ndev;
 	int ret = 0;
-	const char *mac_addr;
 
 	ndev = alloc_etherdev(sizeof(struct emac_board_info));
 	if (!ndev) {
@@ -853,12 +852,9 @@ static int emac_probe(struct platform_device *pdev)
 	}
 
 	/* Read MAC-address from DT */
-	mac_addr = of_get_mac_address(np);
-	if (!IS_ERR(mac_addr))
-		ether_addr_copy(ndev->dev_addr, mac_addr);
-
-	/* Check if the MAC address is valid, if not get a random one */
-	if (!is_valid_ether_addr(ndev->dev_addr)) {
+	ret = of_get_mac_address(np, ndev->dev_addr);
+	if (ret) {
+		/* if the MAC address is invalid get a random one */
 		eth_hw_addr_random(ndev);
 		dev_warn(&pdev->dev, "using random MAC address %pM\n",
 			 ndev->dev_addr);
diff --git a/drivers/net/ethernet/altera/altera_tse_main.c b/drivers/net/ethernet/altera/altera_tse_main.c
index 907125abef2c..1c00d719e5d7 100644
--- a/drivers/net/ethernet/altera/altera_tse_main.c
+++ b/drivers/net/ethernet/altera/altera_tse_main.c
@@ -1351,7 +1351,6 @@ static int altera_tse_probe(struct platform_device *pdev)
 	struct resource *control_port;
 	struct resource *dma_res;
 	struct altera_tse_private *priv;
-	const unsigned char *macaddr;
 	void __iomem *descmap;
 	const struct of_device_id *of_id = NULL;
 
@@ -1525,10 +1524,8 @@ static int altera_tse_probe(struct platform_device *pdev)
 	priv->rx_dma_buf_sz = ALTERA_RXDMABUFFER_SIZE;
 
 	/* get default MAC address from device tree */
-	macaddr = of_get_mac_address(pdev->dev.of_node);
-	if (!IS_ERR(macaddr))
-		ether_addr_copy(ndev->dev_addr, macaddr);
-	else
+	ret = of_get_mac_address(pdev->dev.of_node, ndev->dev_addr);
+	if (ret)
 		eth_hw_addr_random(ndev);
 
 	/* get phy addr and create mdio */
diff --git a/drivers/net/ethernet/arc/emac_main.c b/drivers/net/ethernet/arc/emac_main.c
index b56a9e2aecd9..67b8113a2b53 100644
--- a/drivers/net/ethernet/arc/emac_main.c
+++ b/drivers/net/ethernet/arc/emac_main.c
@@ -857,7 +857,6 @@ int arc_emac_probe(struct net_device *ndev, int interface)
 	struct device_node *phy_node;
 	struct phy_device *phydev = NULL;
 	struct arc_emac_priv *priv;
-	const char *mac_addr;
 	unsigned int id, clock_frequency, irq;
 	int err;
 
@@ -942,11 +941,8 @@ int arc_emac_probe(struct net_device *ndev, int interface)
 	}
 
 	/* Get MAC address from device tree */
-	mac_addr = of_get_mac_address(dev->of_node);
-
-	if (!IS_ERR(mac_addr))
-		ether_addr_copy(ndev->dev_addr, mac_addr);
-	else
+	err = of_get_mac_address(dev->of_node, ndev->dev_addr);
+	if (err)
 		eth_hw_addr_random(ndev);
 
 	arc_emac_set_address_internal(ndev);
diff --git a/drivers/net/ethernet/atheros/ag71xx.c b/drivers/net/ethernet/atheros/ag71xx.c
index 7352f98123c7..3a23b92ebfe3 100644
--- a/drivers/net/ethernet/atheros/ag71xx.c
+++ b/drivers/net/ethernet/atheros/ag71xx.c
@@ -1856,7 +1856,6 @@ static int ag71xx_probe(struct platform_device *pdev)
 	const struct ag71xx_dcfg *dcfg;
 	struct net_device *ndev;
 	struct resource *res;
-	const void *mac_addr;
 	int tx_size, err, i;
 	struct ag71xx *ag;
 
@@ -1957,10 +1956,8 @@ static int ag71xx_probe(struct platform_device *pdev)
 	ag->stop_desc->ctrl = 0;
 	ag->stop_desc->next = (u32)ag->stop_desc_dma;
 
-	mac_addr = of_get_mac_address(np);
-	if (!IS_ERR(mac_addr))
-		memcpy(ndev->dev_addr, mac_addr, ETH_ALEN);
-	if (IS_ERR(mac_addr) || !is_valid_ether_addr(ndev->dev_addr)) {
+	err = of_get_mac_address(np, ndev->dev_addr);
+	if (err) {
 		netif_err(ag, probe, ndev, "invalid MAC address, using random address\n");
 		eth_random_addr(ndev->dev_addr);
 	}
diff --git a/drivers/net/ethernet/broadcom/bcm4908_enet.c b/drivers/net/ethernet/broadcom/bcm4908_enet.c
index b7afac5c7ca7..60d908507f51 100644
--- a/drivers/net/ethernet/broadcom/bcm4908_enet.c
+++ b/drivers/net/ethernet/broadcom/bcm4908_enet.c
@@ -686,7 +686,6 @@ static int bcm4908_enet_probe(struct platform_device *pdev)
 	struct device *dev = &pdev->dev;
 	struct net_device *netdev;
 	struct bcm4908_enet *enet;
-	const u8 *mac;
 	int err;
 
 	netdev = devm_alloc_etherdev(dev, sizeof(*enet));
@@ -716,10 +715,8 @@ static int bcm4908_enet_probe(struct platform_device *pdev)
 		return err;
 
 	SET_NETDEV_DEV(netdev, &pdev->dev);
-	mac = of_get_mac_address(dev->of_node);
-	if (!IS_ERR(mac))
-		ether_addr_copy(netdev->dev_addr, mac);
-	else
+	err = of_get_mac_address(dev->of_node, netdev->dev_addr);
+	if (err)
 		eth_hw_addr_random(netdev);
 	netdev->netdev_ops = &bcm4908_enet_netdev_ops;
 	netdev->min_mtu = ETH_ZLEN;
diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c
index 777bbf6d2586..d9f0f0df8f7b 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.c
+++ b/drivers/net/ethernet/broadcom/bcmsysport.c
@@ -2457,7 +2457,6 @@ static int bcm_sysport_probe(struct platform_device *pdev)
 	struct bcm_sysport_priv *priv;
 	struct device_node *dn;
 	struct net_device *dev;
-	const void *macaddr;
 	u32 txq, rxq;
 	int ret;
 
@@ -2552,12 +2551,10 @@ static int bcm_sysport_probe(struct platform_device *pdev)
 	}
 
 	/* Initialize netdevice members */
-	macaddr = of_get_mac_address(dn);
-	if (IS_ERR(macaddr)) {
+	ret = of_get_mac_address(dn, dev->dev_addr);
+	if (ret) {
 		dev_warn(&pdev->dev, "using random Ethernet MAC\n");
 		eth_hw_addr_random(dev);
-	} else {
-		ether_addr_copy(dev->dev_addr, macaddr);
 	}
 
 	SET_NETDEV_DEV(dev, &pdev->dev);
diff --git a/drivers/net/ethernet/broadcom/bgmac-bcma.c b/drivers/net/ethernet/broadcom/bgmac-bcma.c
index a5fd161ab5ee..85fa0ab7201c 100644
--- a/drivers/net/ethernet/broadcom/bgmac-bcma.c
+++ b/drivers/net/ethernet/broadcom/bgmac-bcma.c
@@ -115,7 +115,7 @@ static int bgmac_probe(struct bcma_device *core)
 	struct ssb_sprom *sprom = &core->bus->sprom;
 	struct mii_bus *mii_bus;
 	struct bgmac *bgmac;
-	const u8 *mac = NULL;
+	const u8 *mac;
 	int err;
 
 	bgmac = bgmac_alloc(&core->dev);
@@ -128,11 +128,10 @@ static int bgmac_probe(struct bcma_device *core)
 
 	bcma_set_drvdata(core, bgmac);
 
-	if (bgmac->dev->of_node)
-		mac = of_get_mac_address(bgmac->dev->of_node);
+	err = of_get_mac_address(bgmac->dev->of_node, bgmac->net_dev->dev_addr);
 
 	/* If no MAC address assigned via device tree, check SPROM */
-	if (IS_ERR_OR_NULL(mac)) {
+	if (err) {
 		switch (core->core_unit) {
 		case 0:
 			mac = sprom->et0mac;
@@ -149,10 +148,9 @@ static int bgmac_probe(struct bcma_device *core)
 			err = -ENOTSUPP;
 			goto err;
 		}
+		ether_addr_copy(bgmac->net_dev->dev_addr, mac);
 	}
 
-	ether_addr_copy(bgmac->net_dev->dev_addr, mac);
-
 	/* On BCM4706 we need common core to access PHY */
 	if (core->id.id == BCMA_CORE_4706_MAC_GBIT &&
 	    !core->bus->drv_gmac_cmn.core) {
diff --git a/drivers/net/ethernet/broadcom/bgmac-platform.c b/drivers/net/ethernet/broadcom/bgmac-platform.c
index f37f1c58f368..9834b77cf4b6 100644
--- a/drivers/net/ethernet/broadcom/bgmac-platform.c
+++ b/drivers/net/ethernet/broadcom/bgmac-platform.c
@@ -173,7 +173,7 @@ static int bgmac_probe(struct platform_device *pdev)
 	struct device_node *np = pdev->dev.of_node;
 	struct bgmac *bgmac;
 	struct resource *regs;
-	const u8 *mac_addr;
+	int ret;
 
 	bgmac = bgmac_alloc(&pdev->dev);
 	if (!bgmac)
@@ -192,11 +192,10 @@ static int bgmac_probe(struct platform_device *pdev)
 	bgmac->dev = &pdev->dev;
 	bgmac->dma_dev = &pdev->dev;
 
-	mac_addr = of_get_mac_address(np);
-	if (!IS_ERR(mac_addr))
-		ether_addr_copy(bgmac->net_dev->dev_addr, mac_addr);
-	else
-		dev_warn(&pdev->dev, "MAC address not present in device tree\n");
+	ret = of_get_mac_address(np, bgmac->net_dev->dev_addr);
+	if (ret)
+		dev_warn(&pdev->dev,
+			 "MAC address not present in device tree\n");
 
 	bgmac->irq = platform_get_irq(pdev, 0);
 	if (bgmac->irq < 0)
diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
index ffd56a23f8b0..d6bde1748a22 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -4649,7 +4649,6 @@ static int macb_probe(struct platform_device *pdev)
 	struct net_device *dev;
 	struct resource *regs;
 	void __iomem *mem;
-	const char *mac;
 	struct macb *bp;
 	int err, val;
 
@@ -4764,15 +4763,11 @@ static int macb_probe(struct platform_device *pdev)
 	if (bp->caps & MACB_CAPS_NEEDS_RSTONUBR)
 		bp->rx_intr_mask |= MACB_BIT(RXUBR);
 
-	mac = of_get_mac_address(np);
-	if (PTR_ERR(mac) == -EPROBE_DEFER) {
-		err = -EPROBE_DEFER;
+	err = of_get_mac_address(np, bp->dev->dev_addr);
+	if (err == -EPROBE_DEFER)
 		goto err_out_free_netdev;
-	} else if (!IS_ERR_OR_NULL(mac)) {
-		ether_addr_copy(bp->dev->dev_addr, mac);
-	} else {
+	else if (err)
 		macb_get_hwaddr(bp);
-	}
 
 	err = of_get_phy_mode(np, &interface);
 	if (err)
diff --git a/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c b/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c
index ecffebd513be..48ff6fb0eed9 100644
--- a/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c
+++ b/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c
@@ -1385,7 +1385,6 @@ static int octeon_mgmt_probe(struct platform_device *pdev)
 	struct net_device *netdev;
 	struct octeon_mgmt *p;
 	const __be32 *data;
-	const u8 *mac;
 	struct resource *res_mix;
 	struct resource *res_agl;
 	struct resource *res_agl_prt_ctl;
@@ -1502,11 +1501,8 @@ static int octeon_mgmt_probe(struct platform_device *pdev)
 	netdev->min_mtu = 64 - OCTEON_MGMT_RX_HEADROOM;
 	netdev->max_mtu = 16383 - OCTEON_MGMT_RX_HEADROOM - VLAN_HLEN;
 
-	mac = of_get_mac_address(pdev->dev.of_node);
-
-	if (!IS_ERR(mac))
-		ether_addr_copy(netdev->dev_addr, mac);
-	else
+	result = of_get_mac_address(pdev->dev.of_node, netdev->dev_addr);
+	if (result)
 		eth_hw_addr_random(netdev);
 
 	p->phy_np = of_parse_phandle(pdev->dev.of_node, "phy-handle", 0);
diff --git a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c
index 8ff28ed04b7f..0c783aadf393 100644
--- a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c
+++ b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c
@@ -1474,7 +1474,6 @@ static int bgx_init_of_phy(struct bgx *bgx)
 	device_for_each_child_node(&bgx->pdev->dev, fwn) {
 		struct phy_device *pd;
 		struct device_node *phy_np;
-		const char *mac;
 
 		/* Should always be an OF node.  But if it is not, we
 		 * cannot handle it, so exit the loop.
@@ -1483,9 +1482,7 @@ static int bgx_init_of_phy(struct bgx *bgx)
 		if (!node)
 			break;
 
-		mac = of_get_mac_address(node);
-		if (!IS_ERR(mac))
-			ether_addr_copy(bgx->lmac[lmac].mac, mac);
+		of_get_mac_address(node, bgx->lmac[lmac].mac);
 
 		SET_NETDEV_DEV(&bgx->lmac[lmac].netdev, &bgx->pdev->dev);
 		bgx->lmac[lmac].lmacid = lmac;
diff --git a/drivers/net/ethernet/davicom/dm9000.c b/drivers/net/ethernet/davicom/dm9000.c
index 252adfa5d837..2374c51bf2b2 100644
--- a/drivers/net/ethernet/davicom/dm9000.c
+++ b/drivers/net/ethernet/davicom/dm9000.c
@@ -1385,7 +1385,7 @@ static struct dm9000_plat_data *dm9000_parse_dt(struct device *dev)
 {
 	struct dm9000_plat_data *pdata;
 	struct device_node *np = dev->of_node;
-	const void *mac_addr;
+	int ret;
 
 	if (!IS_ENABLED(CONFIG_OF) || !np)
 		return ERR_PTR(-ENXIO);
@@ -1399,11 +1399,9 @@ static struct dm9000_plat_data *dm9000_parse_dt(struct device *dev)
 	if (of_find_property(np, "davicom,no-eeprom", NULL))
 		pdata->flags |= DM9000_PLATF_NO_EEPROM;
 
-	mac_addr = of_get_mac_address(np);
-	if (!IS_ERR(mac_addr))
-		ether_addr_copy(pdata->dev_addr, mac_addr);
-	else if (PTR_ERR(mac_addr) == -EPROBE_DEFER)
-		return ERR_CAST(mac_addr);
+	ret = of_get_mac_address(np, pdata->dev_addr);
+	if (ret == -EPROBE_DEFER)
+		return ERR_PTR(ret);
 
 	return pdata;
 }
diff --git a/drivers/net/ethernet/ethoc.c b/drivers/net/ethernet/ethoc.c
index 3d9b0b161e24..e1b43b07755b 100644
--- a/drivers/net/ethernet/ethoc.c
+++ b/drivers/net/ethernet/ethoc.c
@@ -1151,11 +1151,7 @@ static int ethoc_probe(struct platform_device *pdev)
 		ether_addr_copy(netdev->dev_addr, pdata->hwaddr);
 		priv->phy_id = pdata->phy_id;
 	} else {
-		const void *mac;
-
-		mac = of_get_mac_address(pdev->dev.of_node);
-		if (!IS_ERR(mac))
-			ether_addr_copy(netdev->dev_addr, mac);
+		of_get_mac_address(pdev->dev.of_node, netdev->dev_addr);
 		priv->phy_id = -1;
 	}
 
diff --git a/drivers/net/ethernet/ezchip/nps_enet.c b/drivers/net/ethernet/ezchip/nps_enet.c
index 815fb62c4b02..e3954d8835e7 100644
--- a/drivers/net/ethernet/ezchip/nps_enet.c
+++ b/drivers/net/ethernet/ezchip/nps_enet.c
@@ -575,7 +575,6 @@ static s32 nps_enet_probe(struct platform_device *pdev)
 	struct net_device *ndev;
 	struct nps_enet_priv *priv;
 	s32 err = 0;
-	const char *mac_addr;
 
 	if (!dev->of_node)
 		return -ENODEV;
@@ -602,10 +601,8 @@ static s32 nps_enet_probe(struct platform_device *pdev)
 	dev_dbg(dev, "Registers base address is 0x%p\n", priv->regs_base);
 
 	/* set kernel MAC address to dev */
-	mac_addr = of_get_mac_address(dev->of_node);
-	if (!IS_ERR(mac_addr))
-		ether_addr_copy(ndev->dev_addr, mac_addr);
-	else
+	err = of_get_mac_address(dev->of_node, ndev->dev_addr);
+	if (err)
 		eth_hw_addr_random(ndev);
 
 	/* Get IRQ number */
diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index 70aea9c274fe..aecc111fbe73 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -1665,6 +1665,7 @@ static void fec_get_mac(struct net_device *ndev)
 {
 	struct fec_enet_private *fep = netdev_priv(ndev);
 	unsigned char *iap, tmpaddr[ETH_ALEN];
+	int ret;
 
 	/*
 	 * try to get mac address in following order:
@@ -1680,9 +1681,9 @@ static void fec_get_mac(struct net_device *ndev)
 	if (!is_valid_ether_addr(iap)) {
 		struct device_node *np = fep->pdev->dev.of_node;
 		if (np) {
-			const char *mac = of_get_mac_address(np);
-			if (!IS_ERR(mac))
-				iap = (unsigned char *) mac;
+			ret = of_get_mac_address(np, tmpaddr);
+			if (!ret)
+				iap = tmpaddr;
 		}
 	}
 
diff --git a/drivers/net/ethernet/freescale/fec_mpc52xx.c b/drivers/net/ethernet/freescale/fec_mpc52xx.c
index b3bad429e03b..02c47658a215 100644
--- a/drivers/net/ethernet/freescale/fec_mpc52xx.c
+++ b/drivers/net/ethernet/freescale/fec_mpc52xx.c
@@ -813,7 +813,6 @@ static int mpc52xx_fec_probe(struct platform_device *op)
 	const u32 *prop;
 	int prop_size;
 	struct device_node *np = op->dev.of_node;
-	const char *mac_addr;
 
 	phys_addr_t rx_fifo;
 	phys_addr_t tx_fifo;
@@ -891,10 +890,8 @@ static int mpc52xx_fec_probe(struct platform_device *op)
 	 *
 	 * First try to read MAC address from DT
 	 */
-	mac_addr = of_get_mac_address(np);
-	if (!IS_ERR(mac_addr)) {
-		ether_addr_copy(ndev->dev_addr, mac_addr);
-	} else {
+	rv = of_get_mac_address(np, ndev->dev_addr);
+	if (rv) {
 		struct mpc52xx_fec __iomem *fec = priv->fec;
 
 		/*
diff --git a/drivers/net/ethernet/freescale/fman/mac.c b/drivers/net/ethernet/freescale/fman/mac.c
index 901749a7a318..46ecb42f2ef8 100644
--- a/drivers/net/ethernet/freescale/fman/mac.c
+++ b/drivers/net/ethernet/freescale/fman/mac.c
@@ -605,7 +605,6 @@ static int mac_probe(struct platform_device *_of_dev)
 	struct platform_device	*of_dev;
 	struct resource		 res;
 	struct mac_priv_s	*priv;
-	const u8		*mac_addr;
 	u32			 val;
 	u8			fman_id;
 	phy_interface_t          phy_if;
@@ -723,11 +722,9 @@ static int mac_probe(struct platform_device *_of_dev)
 	priv->cell_index = (u8)val;
 
 	/* Get the MAC address */
-	mac_addr = of_get_mac_address(mac_node);
-	if (IS_ERR(mac_addr))
+	err = of_get_mac_address(mac_node, mac_dev->addr);
+	if (err)
 		dev_warn(dev, "of_get_mac_address(%pOF) failed\n", mac_node);
-	else
-		ether_addr_copy(mac_dev->addr, mac_addr);
 
 	/* Get the port handles */
 	nph = of_count_phandle_with_args(mac_node, "fsl,fman-ports", NULL);
@@ -853,7 +850,7 @@ static int mac_probe(struct platform_device *_of_dev)
 	if (err < 0)
 		dev_err(dev, "fman_set_mac_active_pause() = %d\n", err);
 
-	if (!IS_ERR(mac_addr))
+	if (!is_zero_ether_addr(mac_dev->addr))
 		dev_info(dev, "FMan MAC address: %pM\n", mac_dev->addr);
 
 	priv->eth_dev = dpaa_eth_add_device(fman_id, mac_dev);
diff --git a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c
index 78e008b81374..6ee325ad35c5 100644
--- a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c
+++ b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c
@@ -918,7 +918,6 @@ static int fs_enet_probe(struct platform_device *ofdev)
 	const u32 *data;
 	struct clk *clk;
 	int err;
-	const u8 *mac_addr;
 	const char *phy_connection_type;
 	int privsize, len, ret = -ENODEV;
 
@@ -1006,9 +1005,7 @@ static int fs_enet_probe(struct platform_device *ofdev)
 	spin_lock_init(&fep->lock);
 	spin_lock_init(&fep->tx_lock);
 
-	mac_addr = of_get_mac_address(ofdev->dev.of_node);
-	if (!IS_ERR(mac_addr))
-		ether_addr_copy(ndev->dev_addr, mac_addr);
+	of_get_mac_address(ofdev->dev.of_node, ndev->dev_addr);
 
 	ret = fep->ops->allocate_bd(ndev);
 	if (ret)
diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c
index 3ec4d9fddd52..339f9567ef9d 100644
--- a/drivers/net/ethernet/freescale/gianfar.c
+++ b/drivers/net/ethernet/freescale/gianfar.c
@@ -640,7 +640,6 @@ static phy_interface_t gfar_get_interface(struct net_device *dev)
 static int gfar_of_init(struct platform_device *ofdev, struct net_device **pdev)
 {
 	const char *model;
-	const void *mac_addr;
 	int err = 0, i;
 	phy_interface_t interface;
 	struct net_device *dev = NULL;
@@ -782,11 +781,8 @@ static int gfar_of_init(struct platform_device *ofdev, struct net_device **pdev)
 	if (stash_len || stash_idx)
 		priv->device_flags |= FSL_GIANFAR_DEV_HAS_BUF_STASHING;
 
-	mac_addr = of_get_mac_address(np);
-
-	if (!IS_ERR(mac_addr)) {
-		ether_addr_copy(dev->dev_addr, mac_addr);
-	} else {
+	err = of_get_mac_address(np, dev->dev_addr);
+	if (err) {
 		eth_hw_addr_random(dev);
 		dev_info(&ofdev->dev, "Using random MAC address: %pM\n", dev->dev_addr);
 	}
diff --git a/drivers/net/ethernet/freescale/ucc_geth.c b/drivers/net/ethernet/freescale/ucc_geth.c
index ef4e2febeb5b..e0936510fa34 100644
--- a/drivers/net/ethernet/freescale/ucc_geth.c
+++ b/drivers/net/ethernet/freescale/ucc_geth.c
@@ -3562,7 +3562,6 @@ static int ucc_geth_probe(struct platform_device* ofdev)
 	struct resource res;
 	int err, ucc_num, max_speed = 0;
 	const unsigned int *prop;
-	const void *mac_addr;
 	phy_interface_t phy_interface;
 	static const int enet_to_speed[] = {
 		SPEED_10, SPEED_10, SPEED_10,
@@ -3733,9 +3732,7 @@ static int ucc_geth_probe(struct platform_device* ofdev)
 		goto err_free_netdev;
 	}
 
-	mac_addr = of_get_mac_address(np);
-	if (!IS_ERR(mac_addr))
-		ether_addr_copy(dev->dev_addr, mac_addr);
+	of_get_mac_address(np, dev->dev_addr);
 
 	ugeth->ug_info = ug_info;
 	ugeth->dev = device;
diff --git a/drivers/net/ethernet/hisilicon/hisi_femac.c b/drivers/net/ethernet/hisilicon/hisi_femac.c
index 57c3bc4f7089..3c4db4a6b431 100644
--- a/drivers/net/ethernet/hisilicon/hisi_femac.c
+++ b/drivers/net/ethernet/hisilicon/hisi_femac.c
@@ -772,7 +772,6 @@ static int hisi_femac_drv_probe(struct platform_device *pdev)
 	struct net_device *ndev;
 	struct hisi_femac_priv *priv;
 	struct phy_device *phy;
-	const char *mac_addr;
 	int ret;
 
 	ndev = alloc_etherdev(sizeof(*priv));
@@ -842,10 +841,8 @@ static int hisi_femac_drv_probe(struct platform_device *pdev)
 			   (unsigned long)phy->phy_id,
 			   phy_modes(phy->interface));
 
-	mac_addr = of_get_mac_address(node);
-	if (!IS_ERR(mac_addr))
-		ether_addr_copy(ndev->dev_addr, mac_addr);
-	if (!is_valid_ether_addr(ndev->dev_addr)) {
+	ret = of_get_mac_address(node, ndev->dev_addr);
+	if (ret) {
 		eth_hw_addr_random(ndev);
 		dev_warn(dev, "using random MAC address %pM\n",
 			 ndev->dev_addr);
diff --git a/drivers/net/ethernet/hisilicon/hix5hd2_gmac.c b/drivers/net/ethernet/hisilicon/hix5hd2_gmac.c
index 8b2bf85039f1..c1aae0fca5e9 100644
--- a/drivers/net/ethernet/hisilicon/hix5hd2_gmac.c
+++ b/drivers/net/ethernet/hisilicon/hix5hd2_gmac.c
@@ -1098,7 +1098,6 @@ static int hix5hd2_dev_probe(struct platform_device *pdev)
 	struct net_device *ndev;
 	struct hix5hd2_priv *priv;
 	struct mii_bus *bus;
-	const char *mac_addr;
 	int ret;
 
 	ndev = alloc_etherdev(sizeof(struct hix5hd2_priv));
@@ -1220,10 +1219,8 @@ static int hix5hd2_dev_probe(struct platform_device *pdev)
 		goto out_phy_node;
 	}
 
-	mac_addr = of_get_mac_address(node);
-	if (!IS_ERR(mac_addr))
-		ether_addr_copy(ndev->dev_addr, mac_addr);
-	if (!is_valid_ether_addr(ndev->dev_addr)) {
+	ret = of_get_mac_address(node, ndev->dev_addr);
+	if (ret) {
 		eth_hw_addr_random(ndev);
 		netdev_warn(ndev, "using random MAC address %pM\n",
 			    ndev->dev_addr);
diff --git a/drivers/net/ethernet/lantiq_xrx200.c b/drivers/net/ethernet/lantiq_xrx200.c
index 0f8ef8f1232c..41c2ad210bc9 100644
--- a/drivers/net/ethernet/lantiq_xrx200.c
+++ b/drivers/net/ethernet/lantiq_xrx200.c
@@ -435,7 +435,6 @@ static int xrx200_probe(struct platform_device *pdev)
 	struct resource *res;
 	struct xrx200_priv *priv;
 	struct net_device *net_dev;
-	const u8 *mac;
 	int err;
 
 	/* alloc the network device */
@@ -477,10 +476,8 @@ static int xrx200_probe(struct platform_device *pdev)
 		return PTR_ERR(priv->clk);
 	}
 
-	mac = of_get_mac_address(np);
-	if (!IS_ERR(mac))
-		ether_addr_copy(net_dev->dev_addr, mac);
-	else
+	err = of_get_mac_address(np, net_dev->dev_addr);
+	if (err)
 		eth_hw_addr_random(net_dev);
 
 	/* bring up the dma engine and IP core */
diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c
index ca1681aa951a..d207bfcaf31d 100644
--- a/drivers/net/ethernet/marvell/mv643xx_eth.c
+++ b/drivers/net/ethernet/marvell/mv643xx_eth.c
@@ -2702,7 +2702,6 @@ static int mv643xx_eth_shared_of_add_port(struct platform_device *pdev,
 	struct platform_device *ppdev;
 	struct mv643xx_eth_platform_data ppd;
 	struct resource res;
-	const char *mac_addr;
 	int ret;
 	int dev_num = 0;
 
@@ -2733,9 +2732,7 @@ static int mv643xx_eth_shared_of_add_port(struct platform_device *pdev,
 		return -EINVAL;
 	}
 
-	mac_addr = of_get_mac_address(pnp);
-	if (!IS_ERR(mac_addr))
-		ether_addr_copy(ppd.mac_addr, mac_addr);
+	of_get_mac_address(pnp, ppd.mac_addr);
 
 	mv643xx_eth_property(pnp, "tx-queue-size", ppd.tx_queue_size);
 	mv643xx_eth_property(pnp, "tx-sram-addr", ppd.tx_sram_addr);
diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index f20dfd1d7a6b..7d5cd9bc6c99 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -5141,7 +5141,6 @@ static int mvneta_probe(struct platform_device *pdev)
 	struct net_device *dev;
 	struct phylink *phylink;
 	struct phy *comphy;
-	const char *dt_mac_addr;
 	char hw_mac_addr[ETH_ALEN];
 	phy_interface_t phy_mode;
 	const char *mac_from;
@@ -5237,10 +5236,9 @@ static int mvneta_probe(struct platform_device *pdev)
 		goto err_free_ports;
 	}
 
-	dt_mac_addr = of_get_mac_address(dn);
-	if (!IS_ERR(dt_mac_addr)) {
+	err = of_get_mac_address(dn, dev->dev_addr);
+	if (!err) {
 		mac_from = "device tree";
-		ether_addr_copy(dev->dev_addr, dt_mac_addr);
 	} else {
 		mvneta_get_mac_addr(pp, hw_mac_addr);
 		if (is_valid_ether_addr(hw_mac_addr)) {
diff --git a/drivers/net/ethernet/marvell/prestera/prestera_main.c b/drivers/net/ethernet/marvell/prestera/prestera_main.c
index 25dd903a3e92..f08c420a5803 100644
--- a/drivers/net/ethernet/marvell/prestera/prestera_main.c
+++ b/drivers/net/ethernet/marvell/prestera/prestera_main.c
@@ -456,20 +456,17 @@ static int prestera_switch_set_base_mac_addr(struct prestera_switch *sw)
 {
 	struct device_node *base_mac_np;
 	struct device_node *np;
-	const char *base_mac;
+	int ret;
 
 	np = of_find_compatible_node(NULL, NULL, "marvell,prestera");
 	base_mac_np = of_parse_phandle(np, "base-mac-provider", 0);
 
-	base_mac = of_get_mac_address(base_mac_np);
-	of_node_put(base_mac_np);
-	if (!IS_ERR(base_mac))
-		ether_addr_copy(sw->base_mac, base_mac);
-
-	if (!is_valid_ether_addr(sw->base_mac)) {
+	ret = of_get_mac_address(base_mac_np, sw->base_mac);
+	if (ret) {
 		eth_random_addr(sw->base_mac);
 		dev_info(prestera_dev(sw), "using random base mac address\n");
 	}
+	of_node_put(base_mac_np);
 
 	return prestera_hw_switch_mac_set(sw, sw->base_mac);
 }
diff --git a/drivers/net/ethernet/marvell/pxa168_eth.c b/drivers/net/ethernet/marvell/pxa168_eth.c
index 3712e1786091..e967867828d8 100644
--- a/drivers/net/ethernet/marvell/pxa168_eth.c
+++ b/drivers/net/ethernet/marvell/pxa168_eth.c
@@ -1392,7 +1392,6 @@ static int pxa168_eth_probe(struct platform_device *pdev)
 	struct resource *res;
 	struct clk *clk;
 	struct device_node *np;
-	const unsigned char *mac_addr = NULL;
 	int err;
 
 	printk(KERN_NOTICE "PXA168 10/100 Ethernet Driver\n");
@@ -1435,12 +1434,8 @@ static int pxa168_eth_probe(struct platform_device *pdev)
 
 	INIT_WORK(&pep->tx_timeout_task, pxa168_eth_tx_timeout_task);
 
-	if (pdev->dev.of_node)
-		mac_addr = of_get_mac_address(pdev->dev.of_node);
-
-	if (!IS_ERR_OR_NULL(mac_addr)) {
-		ether_addr_copy(dev->dev_addr, mac_addr);
-	} else {
+	err = of_get_mac_address(pdev->dev.of_node, dev->dev_addr);
+	if (err) {
 		/* try reading the mac address, if set by the bootloader */
 		pxa168_eth_get_mac_address(dev, dev->dev_addr);
 		if (!is_valid_ether_addr(dev->dev_addr)) {
diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c
index 68c154d715d6..222c32367b2c 100644
--- a/drivers/net/ethernet/marvell/sky2.c
+++ b/drivers/net/ethernet/marvell/sky2.c
@@ -4728,7 +4728,7 @@ static struct net_device *sky2_init_netdev(struct sky2_hw *hw, unsigned port,
 {
 	struct sky2_port *sky2;
 	struct net_device *dev = alloc_etherdev(sizeof(*sky2));
-	const void *iap;
+	int ret;
 
 	if (!dev)
 		return NULL;
@@ -4798,10 +4798,8 @@ static struct net_device *sky2_init_netdev(struct sky2_hw *hw, unsigned port,
 	 * 1) from device tree data
 	 * 2) from internal registers set by bootloader
 	 */
-	iap = of_get_mac_address(hw->pdev->dev.of_node);
-	if (!IS_ERR(iap))
-		ether_addr_copy(dev->dev_addr, iap);
-	else
+	ret = of_get_mac_address(hw->pdev->dev.of_node, dev->dev_addr);
+	if (ret)
 		memcpy_fromio(dev->dev_addr, hw->regs + B2_MAC_1 + port * 8,
 			      ETH_ALEN);
 
diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
index 810def064f11..6b00c12c6c43 100644
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -2484,14 +2484,11 @@ static int __init mtk_init(struct net_device *dev)
 {
 	struct mtk_mac *mac = netdev_priv(dev);
 	struct mtk_eth *eth = mac->hw;
-	const char *mac_addr;
-
-	mac_addr = of_get_mac_address(mac->of_node);
-	if (!IS_ERR(mac_addr))
-		ether_addr_copy(dev->dev_addr, mac_addr);
+	int ret;
 
-	/* If the mac address is invalid, use random mac address  */
-	if (!is_valid_ether_addr(dev->dev_addr)) {
+	ret = of_get_mac_address(mac->of_node, dev->dev_addr);
+	if (ret) {
+		/* If the mac address is invalid, use random mac address */
 		eth_hw_addr_random(dev);
 		dev_err(eth->dev, "generated random MAC address %pM\n",
 			dev->dev_addr);
diff --git a/drivers/net/ethernet/micrel/ks8851_common.c b/drivers/net/ethernet/micrel/ks8851_common.c
index 2feed6ce19d3..13eef6e9bd2d 100644
--- a/drivers/net/ethernet/micrel/ks8851_common.c
+++ b/drivers/net/ethernet/micrel/ks8851_common.c
@@ -193,11 +193,10 @@ static void ks8851_read_mac_addr(struct net_device *dev)
 static void ks8851_init_mac(struct ks8851_net *ks, struct device_node *np)
 {
 	struct net_device *dev = ks->netdev;
-	const u8 *mac_addr;
+	int ret;
 
-	mac_addr = of_get_mac_address(np);
-	if (!IS_ERR(mac_addr)) {
-		ether_addr_copy(dev->dev_addr, mac_addr);
+	ret = of_get_mac_address(np, dev->dev_addr);
+	if (!ret) {
 		ks8851_write_mac_addr(dev);
 		return;
 	}
diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c
index 11a1dc4c436d..dae10328c6cf 100644
--- a/drivers/net/ethernet/microchip/lan743x_main.c
+++ b/drivers/net/ethernet/microchip/lan743x_main.c
@@ -2771,7 +2771,6 @@ static int lan743x_pcidev_probe(struct pci_dev *pdev,
 {
 	struct lan743x_adapter *adapter = NULL;
 	struct net_device *netdev = NULL;
-	const void *mac_addr;
 	int ret = -ENODEV;
 
 	netdev = devm_alloc_etherdev(&pdev->dev,
@@ -2788,9 +2787,7 @@ static int lan743x_pcidev_probe(struct pci_dev *pdev,
 			      NETIF_MSG_IFDOWN | NETIF_MSG_TX_QUEUED;
 	netdev->max_mtu = LAN743X_MAX_FRAME_SIZE;
 
-	mac_addr = of_get_mac_address(pdev->dev.of_node);
-	if (!IS_ERR(mac_addr))
-		ether_addr_copy(adapter->mac_address, mac_addr);
+	of_get_mac_address(pdev->dev.of_node, adapter->mac_address);
 
 	ret = lan743x_pci_init(adapter, pdev);
 	if (ret)
diff --git a/drivers/net/ethernet/nxp/lpc_eth.c b/drivers/net/ethernet/nxp/lpc_eth.c
index e72fd33a214c..64c6842bd452 100644
--- a/drivers/net/ethernet/nxp/lpc_eth.c
+++ b/drivers/net/ethernet/nxp/lpc_eth.c
@@ -1350,9 +1350,7 @@ static int lpc_eth_drv_probe(struct platform_device *pdev)
 	__lpc_get_mac(pldat, ndev->dev_addr);
 
 	if (!is_valid_ether_addr(ndev->dev_addr)) {
-		const char *macaddr = of_get_mac_address(np);
-		if (!IS_ERR(macaddr))
-			ether_addr_copy(ndev->dev_addr, macaddr);
+		of_get_mac_address(np, ndev->dev_addr);
 	}
 	if (!is_valid_ether_addr(ndev->dev_addr))
 		eth_hw_addr_random(ndev);
diff --git a/drivers/net/ethernet/qualcomm/qca_spi.c b/drivers/net/ethernet/qualcomm/qca_spi.c
index 5a3b65a6eb4f..ab9b02574a15 100644
--- a/drivers/net/ethernet/qualcomm/qca_spi.c
+++ b/drivers/net/ethernet/qualcomm/qca_spi.c
@@ -885,7 +885,7 @@ qca_spi_probe(struct spi_device *spi)
 	struct net_device *qcaspi_devs = NULL;
 	u8 legacy_mode = 0;
 	u16 signature;
-	const char *mac;
+	int ret;
 
 	if (!spi->dev.of_node) {
 		dev_err(&spi->dev, "Missing device tree\n");
@@ -962,12 +962,8 @@ qca_spi_probe(struct spi_device *spi)
 
 	spi_set_drvdata(spi, qcaspi_devs);
 
-	mac = of_get_mac_address(spi->dev.of_node);
-
-	if (!IS_ERR(mac))
-		ether_addr_copy(qca->net_dev->dev_addr, mac);
-
-	if (!is_valid_ether_addr(qca->net_dev->dev_addr)) {
+	ret = of_get_mac_address(spi->dev.of_node, qca->net_dev->dev_addr);
+	if (ret) {
 		eth_hw_addr_random(qca->net_dev);
 		dev_info(&spi->dev, "Using random MAC address: %pM\n",
 			 qca->net_dev->dev_addr);
diff --git a/drivers/net/ethernet/qualcomm/qca_uart.c b/drivers/net/ethernet/qualcomm/qca_uart.c
index 362b4f5c162c..bcdeca7b3366 100644
--- a/drivers/net/ethernet/qualcomm/qca_uart.c
+++ b/drivers/net/ethernet/qualcomm/qca_uart.c
@@ -323,7 +323,6 @@ static int qca_uart_probe(struct serdev_device *serdev)
 {
 	struct net_device *qcauart_dev = alloc_etherdev(sizeof(struct qcauart));
 	struct qcauart *qca;
-	const char *mac;
 	u32 speed = 115200;
 	int ret;
 
@@ -348,12 +347,8 @@ static int qca_uart_probe(struct serdev_device *serdev)
 
 	of_property_read_u32(serdev->dev.of_node, "current-speed", &speed);
 
-	mac = of_get_mac_address(serdev->dev.of_node);
-
-	if (!IS_ERR(mac))
-		ether_addr_copy(qca->net_dev->dev_addr, mac);
-
-	if (!is_valid_ether_addr(qca->net_dev->dev_addr)) {
+	ret = of_get_mac_address(serdev->dev.of_node, qca->net_dev->dev_addr);
+	if (ret) {
 		eth_hw_addr_random(qca->net_dev);
 		dev_info(&serdev->dev, "Using random MAC address: %pM\n",
 			 qca->net_dev->dev_addr);
diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c
index 1409ae986aa2..8c84c40ab9a0 100644
--- a/drivers/net/ethernet/renesas/ravb_main.c
+++ b/drivers/net/ethernet/renesas/ravb_main.c
@@ -109,11 +109,13 @@ static void ravb_set_buffer_align(struct sk_buff *skb)
  * Ethernet AVB device doesn't have ROM for MAC address.
  * This function gets the MAC address that was used by a bootloader.
  */
-static void ravb_read_mac_address(struct net_device *ndev, const u8 *mac)
+static void ravb_read_mac_address(struct device_node *np,
+				  struct net_device *ndev)
 {
-	if (!IS_ERR(mac)) {
-		ether_addr_copy(ndev->dev_addr, mac);
-	} else {
+	int ret;
+
+	ret = of_get_mac_address(np, ndev->dev_addr);
+	if (ret) {
 		u32 mahr = ravb_read(ndev, MAHR);
 		u32 malr = ravb_read(ndev, MALR);
 
@@ -2207,7 +2209,7 @@ static int ravb_probe(struct platform_device *pdev)
 	priv->msg_enable = RAVB_DEF_MSG_ENABLE;
 
 	/* Read and set MAC address */
-	ravb_read_mac_address(ndev, of_get_mac_address(np));
+	ravb_read_mac_address(np, ndev);
 	if (!is_valid_ether_addr(ndev->dev_addr)) {
 		dev_warn(&pdev->dev,
 			 "no valid MAC address supplied, using a random one\n");
diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c
index ebedb1a11132..c5b154868c1f 100644
--- a/drivers/net/ethernet/renesas/sh_eth.c
+++ b/drivers/net/ethernet/renesas/sh_eth.c
@@ -3170,7 +3170,6 @@ static struct sh_eth_plat_data *sh_eth_parse_dt(struct device *dev)
 	struct device_node *np = dev->of_node;
 	struct sh_eth_plat_data *pdata;
 	phy_interface_t interface;
-	const char *mac_addr;
 	int ret;
 
 	pdata = devm_kzalloc(dev, sizeof(*pdata), GFP_KERNEL);
@@ -3182,9 +3181,7 @@ static struct sh_eth_plat_data *sh_eth_parse_dt(struct device *dev)
 		return NULL;
 	pdata->phy_interface = interface;
 
-	mac_addr = of_get_mac_address(np);
-	if (!IS_ERR(mac_addr))
-		ether_addr_copy(pdata->mac_addr, mac_addr);
+	of_get_mac_address(np, pdata->mac_addr);
 
 	pdata->no_ether_link =
 		of_property_read_bool(np, "renesas,no-ether-link");
diff --git a/drivers/net/ethernet/samsung/sxgbe/sxgbe_platform.c b/drivers/net/ethernet/samsung/sxgbe/sxgbe_platform.c
index 33f79402850d..4639ed9438a3 100644
--- a/drivers/net/ethernet/samsung/sxgbe/sxgbe_platform.c
+++ b/drivers/net/ethernet/samsung/sxgbe/sxgbe_platform.c
@@ -25,8 +25,7 @@
 
 #ifdef CONFIG_OF
 static int sxgbe_probe_config_dt(struct platform_device *pdev,
-				 struct sxgbe_plat_data *plat,
-				 const char **mac)
+				 struct sxgbe_plat_data *plat)
 {
 	struct device_node *np = pdev->dev.of_node;
 	struct sxgbe_dma_cfg *dma_cfg;
@@ -35,7 +34,6 @@ static int sxgbe_probe_config_dt(struct platform_device *pdev,
 	if (!np)
 		return -ENODEV;
 
-	*mac = of_get_mac_address(np);
 	err = of_get_phy_mode(np, &plat->interface);
 	if (err && err != -ENODEV)
 		return err;
@@ -63,8 +61,7 @@ static int sxgbe_probe_config_dt(struct platform_device *pdev,
 }
 #else
 static int sxgbe_probe_config_dt(struct platform_device *pdev,
-				 struct sxgbe_plat_data *plat,
-				 const char **mac)
+				 struct sxgbe_plat_data *plat)
 {
 	return -ENOSYS;
 }
@@ -85,7 +82,6 @@ static int sxgbe_platform_probe(struct platform_device *pdev)
 	void __iomem *addr;
 	struct sxgbe_priv_data *priv = NULL;
 	struct sxgbe_plat_data *plat_dat = NULL;
-	const char *mac = NULL;
 	struct net_device *ndev = platform_get_drvdata(pdev);
 	struct device_node *node = dev->of_node;
 
@@ -101,7 +97,7 @@ static int sxgbe_platform_probe(struct platform_device *pdev)
 		if (!plat_dat)
 			return  -ENOMEM;
 
-		ret = sxgbe_probe_config_dt(pdev, plat_dat, &mac);
+		ret = sxgbe_probe_config_dt(pdev, plat_dat);
 		if (ret) {
 			pr_err("%s: main dt probe failed\n", __func__);
 			return ret;
@@ -122,8 +118,7 @@ static int sxgbe_platform_probe(struct platform_device *pdev)
 	}
 
 	/* Get MAC address if available (DT) */
-	if (!IS_ERR_OR_NULL(mac))
-		ether_addr_copy(priv->dev->dev_addr, mac);
+	of_get_mac_address(node, priv->dev->dev_addr);
 
 	/* Get the TX/RX IRQ numbers */
 	for (i = 0, chan = 1; i < SXGBE_TX_QUEUES; i++) {
diff --git a/drivers/net/ethernet/socionext/sni_ave.c b/drivers/net/ethernet/socionext/sni_ave.c
index 501b9c7aba56..fcbb4bb31408 100644
--- a/drivers/net/ethernet/socionext/sni_ave.c
+++ b/drivers/net/ethernet/socionext/sni_ave.c
@@ -1559,7 +1559,6 @@ static int ave_probe(struct platform_device *pdev)
 	struct ave_private *priv;
 	struct net_device *ndev;
 	struct device_node *np;
-	const void *mac_addr;
 	void __iomem *base;
 	const char *name;
 	int i, irq, ret;
@@ -1600,12 +1599,9 @@ static int ave_probe(struct platform_device *pdev)
 
 	ndev->max_mtu = AVE_MAX_ETHFRAME - (ETH_HLEN + ETH_FCS_LEN);
 
-	mac_addr = of_get_mac_address(np);
-	if (!IS_ERR(mac_addr))
-		ether_addr_copy(ndev->dev_addr, mac_addr);
-
-	/* if the mac address is invalid, use random mac address */
-	if (!is_valid_ether_addr(ndev->dev_addr)) {
+	ret = of_get_mac_address(np, ndev->dev_addr);
+	if (ret) {
+		/* if the mac address is invalid, use random mac address */
 		eth_hw_addr_random(ndev);
 		dev_warn(dev, "Using random MAC address: %pM\n",
 			 ndev->dev_addr);
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-anarion.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-anarion.c
index 08c76636c164..dfbaea06d108 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-anarion.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-anarion.c
@@ -115,7 +115,7 @@ static int anarion_dwmac_probe(struct platform_device *pdev)
 	if (IS_ERR(gmac))
 		return PTR_ERR(gmac);
 
-	plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac);
+	plat_dat = stmmac_probe_config_dt(pdev, stmmac_res.mac);
 	if (IS_ERR(plat_dat))
 		return PTR_ERR(plat_dat);
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c
index 27254b27d7ed..bc91fd867dcd 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c
@@ -438,7 +438,7 @@ static int dwc_eth_dwmac_probe(struct platform_device *pdev)
 	if (IS_ERR(stmmac_res.addr))
 		return PTR_ERR(stmmac_res.addr);
 
-	plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac);
+	plat_dat = stmmac_probe_config_dt(pdev, stmmac_res.mac);
 	if (IS_ERR(plat_dat))
 		return PTR_ERR(plat_dat);
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-generic.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-generic.c
index fad503820e04..fbfda55b4c52 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-generic.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-generic.c
@@ -27,7 +27,7 @@ static int dwmac_generic_probe(struct platform_device *pdev)
 		return ret;
 
 	if (pdev->dev.of_node) {
-		plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac);
+		plat_dat = stmmac_probe_config_dt(pdev, stmmac_res.mac);
 		if (IS_ERR(plat_dat)) {
 			dev_err(&pdev->dev, "dt configuration failed\n");
 			return PTR_ERR(plat_dat);
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c
index c1a361305a5a..84651207a1de 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c
@@ -231,7 +231,7 @@ static int imx_dwmac_probe(struct platform_device *pdev)
 	if (!dwmac)
 		return -ENOMEM;
 
-	plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac);
+	plat_dat = stmmac_probe_config_dt(pdev, stmmac_res.mac);
 	if (IS_ERR(plat_dat))
 		return PTR_ERR(plat_dat);
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel-plat.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel-plat.c
index 6c19fcc76c6f..06d287f104be 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel-plat.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel-plat.c
@@ -85,7 +85,7 @@ static int intel_eth_plat_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
-	plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac);
+	plat_dat = stmmac_probe_config_dt(pdev, stmmac_res.mac);
 	if (IS_ERR(plat_dat)) {
 		dev_err(&pdev->dev, "dt configuration failed\n");
 		return PTR_ERR(plat_dat);
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c
index 749585fe6fc9..28dd0ed85a82 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c
@@ -255,7 +255,7 @@ static int ipq806x_gmac_probe(struct platform_device *pdev)
 	if (val)
 		return val;
 
-	plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac);
+	plat_dat = stmmac_probe_config_dt(pdev, stmmac_res.mac);
 	if (IS_ERR(plat_dat))
 		return PTR_ERR(plat_dat);
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-lpc18xx.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-lpc18xx.c
index 3d3f43d91b98..9d77c647badd 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-lpc18xx.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-lpc18xx.c
@@ -37,7 +37,7 @@ static int lpc18xx_dwmac_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
-	plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac);
+	plat_dat = stmmac_probe_config_dt(pdev, stmmac_res.mac);
 	if (IS_ERR(plat_dat))
 		return PTR_ERR(plat_dat);
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c
index 9e4b83832938..58c0feaa8131 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c
@@ -407,7 +407,7 @@ static int mediatek_dwmac_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
-	plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac);
+	plat_dat = stmmac_probe_config_dt(pdev, stmmac_res.mac);
 	if (IS_ERR(plat_dat))
 		return PTR_ERR(plat_dat);
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson.c
index bbc16b5a410a..16fb66a0ca72 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson.c
@@ -52,7 +52,7 @@ static int meson6_dwmac_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
-	plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac);
+	plat_dat = stmmac_probe_config_dt(pdev, stmmac_res.mac);
 	if (IS_ERR(plat_dat))
 		return PTR_ERR(plat_dat);
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c
index 848e5c37746b..c7a6588d9398 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c
@@ -398,7 +398,7 @@ static int meson8b_dwmac_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
-	plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac);
+	plat_dat = stmmac_probe_config_dt(pdev, stmmac_res.mac);
 	if (IS_ERR(plat_dat))
 		return PTR_ERR(plat_dat);
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-oxnas.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-oxnas.c
index 8551ea878ba5..adfeb8d3293d 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-oxnas.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-oxnas.c
@@ -118,7 +118,7 @@ static int oxnas_dwmac_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
-	plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac);
+	plat_dat = stmmac_probe_config_dt(pdev, stmmac_res.mac);
 	if (IS_ERR(plat_dat))
 		return PTR_ERR(plat_dat);
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c
index a674b7d6b49a..84382fc5cc4d 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c
@@ -461,7 +461,7 @@ static int qcom_ethqos_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
-	plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac);
+	plat_dat = stmmac_probe_config_dt(pdev, stmmac_res.mac);
 	if (IS_ERR(plat_dat)) {
 		dev_err(&pdev->dev, "dt configuration failed\n");
 		return PTR_ERR(plat_dat);
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c
index 6ef30252bfe0..8d28a536e1bb 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c
@@ -1396,7 +1396,7 @@ static int rk_gmac_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
-	plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac);
+	plat_dat = stmmac_probe_config_dt(pdev, stmmac_res.mac);
 	if (IS_ERR(plat_dat))
 		return PTR_ERR(plat_dat);
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c
index 70d41783329d..85208128f135 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c
@@ -398,7 +398,7 @@ static int socfpga_dwmac_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
-	plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac);
+	plat_dat = stmmac_probe_config_dt(pdev, stmmac_res.mac);
 	if (IS_ERR(plat_dat))
 		return PTR_ERR(plat_dat);
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c
index e1b63df6f96f..710d7435733e 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c
@@ -325,7 +325,7 @@ static int sti_dwmac_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
-	plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac);
+	plat_dat = stmmac_probe_config_dt(pdev, stmmac_res.mac);
 	if (IS_ERR(plat_dat))
 		return PTR_ERR(plat_dat);
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c
index 5d4df4c5254e..2b38a499a404 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c
@@ -371,7 +371,7 @@ static int stm32_dwmac_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
-	plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac);
+	plat_dat = stmmac_probe_config_dt(pdev, stmmac_res.mac);
 	if (IS_ERR(plat_dat))
 		return PTR_ERR(plat_dat);
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
index 19e7ec30af4c..4422baeed3d8 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
@@ -1221,7 +1221,7 @@ static int sun8i_dwmac_probe(struct platform_device *pdev)
 	if (ret)
 		return -EINVAL;
 
-	plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac);
+	plat_dat = stmmac_probe_config_dt(pdev, stmmac_res.mac);
 	if (IS_ERR(plat_dat))
 		return PTR_ERR(plat_dat);
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c
index 0e1ca2cba3c7..527077c98ebc 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c
@@ -108,7 +108,7 @@ static int sun7i_gmac_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
-	plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac);
+	plat_dat = stmmac_probe_config_dt(pdev, stmmac_res.mac);
 	if (IS_ERR(plat_dat))
 		return PTR_ERR(plat_dat);
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c
index d23be45a64e5..d046e33b8a29 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c
@@ -208,7 +208,7 @@ static int visconti_eth_dwmac_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
-	plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac);
+	plat_dat = stmmac_probe_config_dt(pdev, stmmac_res.mac);
 	if (IS_ERR(plat_dat))
 		return PTR_ERR(plat_dat);
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
index c49debb62b05..8b4ff9c189a1 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
@@ -26,7 +26,7 @@
 
 struct stmmac_resources {
 	void __iomem *addr;
-	const char *mac;
+	u8 mac[ETH_ALEN];
 	int wol_irq;
 	int lpi_irq;
 	int irq;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 77285646c5fc..328aeb2cd276 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -6016,7 +6016,7 @@ int stmmac_dvr_probe(struct device *device,
 	for (i = 0; i < MTL_MAX_TX_QUEUES; i++)
 		priv->tx_irq[i] = res->tx_irq[i];
 
-	if (!IS_ERR_OR_NULL(res->mac))
+	if (!is_zero_ether_addr(res->mac))
 		memcpy(priv->dev->dev_addr, res->mac, ETH_ALEN);
 
 	dev_set_drvdata(device, priv->dev);
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
index 5a1e018884e6..1e17a23d9118 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
@@ -394,7 +394,7 @@ static int stmmac_of_get_mac_mode(struct device_node *np)
  * set some private fields that will be used by the main at runtime.
  */
 struct plat_stmmacenet_data *
-stmmac_probe_config_dt(struct platform_device *pdev, const char **mac)
+stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac)
 {
 	struct device_node *np = pdev->dev.of_node;
 	struct plat_stmmacenet_data *plat;
@@ -406,12 +406,12 @@ stmmac_probe_config_dt(struct platform_device *pdev, const char **mac)
 	if (!plat)
 		return ERR_PTR(-ENOMEM);
 
-	*mac = of_get_mac_address(np);
-	if (IS_ERR(*mac)) {
-		if (PTR_ERR(*mac) == -EPROBE_DEFER)
-			return ERR_CAST(*mac);
+	rc = of_get_mac_address(np, mac);
+	if (rc) {
+		if (rc == -EPROBE_DEFER)
+			return ERR_PTR(rc);
 
-		*mac = NULL;
+		eth_zero_addr(mac);
 	}
 
 	plat->phy_interface = device_get_phy_mode(&pdev->dev);
@@ -627,7 +627,7 @@ void stmmac_remove_config_dt(struct platform_device *pdev,
 }
 #else
 struct plat_stmmacenet_data *
-stmmac_probe_config_dt(struct platform_device *pdev, const char **mac)
+stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac)
 {
 	return ERR_PTR(-EINVAL);
 }
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.h b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.h
index 3a4663b7b460..3fff3f59d73d 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.h
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.h
@@ -12,7 +12,7 @@
 #include "stmmac.h"
 
 struct plat_stmmacenet_data *
-stmmac_probe_config_dt(struct platform_device *pdev, const char **mac);
+stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac);
 void stmmac_remove_config_dt(struct platform_device *pdev,
 			     struct plat_stmmacenet_data *plat);
 
diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
index 638d7b03be4b..6a67b026df0b 100644
--- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c
+++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
@@ -1824,7 +1824,6 @@ static int am65_cpsw_nuss_init_slave_ports(struct am65_cpsw_common *common)
 
 	for_each_child_of_node(node, port_np) {
 		struct am65_cpsw_port *port;
-		const void *mac_addr;
 		u32 port_id;
 
 		/* it is not a slave port node, continue */
@@ -1903,15 +1902,15 @@ static int am65_cpsw_nuss_init_slave_ports(struct am65_cpsw_common *common)
 			return ret;
 		}
 
-		mac_addr = of_get_mac_address(port_np);
-		if (!IS_ERR(mac_addr)) {
-			ether_addr_copy(port->slave.mac_addr, mac_addr);
-		} else if (am65_cpsw_am654_get_efuse_macid(port_np,
-							   port->port_id,
-							   port->slave.mac_addr) ||
-			   !is_valid_ether_addr(port->slave.mac_addr)) {
-			random_ether_addr(port->slave.mac_addr);
-			dev_err(dev, "Use random MAC address\n");
+		ret = of_get_mac_address(port_np, port->slave.mac_addr);
+		if (ret) {
+			am65_cpsw_am654_get_efuse_macid(port_np,
+							port->port_id,
+							port->slave.mac_addr);
+			if (!is_valid_ether_addr(port->slave.mac_addr)) {
+				random_ether_addr(port->slave.mac_addr);
+				dev_err(dev, "Use random MAC address\n");
+			}
 		}
 	}
 	of_node_put(node);
diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 074702af3dc6..c0cd7de88316 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -1296,7 +1296,6 @@ static int cpsw_probe_dt(struct cpsw_platform_data *data,
 
 	for_each_available_child_of_node(node, slave_node) {
 		struct cpsw_slave_data *slave_data = data->slave_data + i;
-		const void *mac_addr = NULL;
 		int lenp;
 		const __be32 *parp;
 
@@ -1368,10 +1367,8 @@ static int cpsw_probe_dt(struct cpsw_platform_data *data,
 		}
 
 no_phy_slave:
-		mac_addr = of_get_mac_address(slave_node);
-		if (!IS_ERR(mac_addr)) {
-			ether_addr_copy(slave_data->mac_addr, mac_addr);
-		} else {
+		ret = of_get_mac_address(slave_node, slave_data->mac_addr);
+		if (ret) {
 			ret = ti_cm_get_macid(&pdev->dev, i,
 					      slave_data->mac_addr);
 			if (ret)
diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c
index 0751f77de2c7..69b7a4e0220a 100644
--- a/drivers/net/ethernet/ti/cpsw_new.c
+++ b/drivers/net/ethernet/ti/cpsw_new.c
@@ -1257,7 +1257,6 @@ static int cpsw_probe_dt(struct cpsw_common *cpsw)
 
 	for_each_child_of_node(tmp_node, port_np) {
 		struct cpsw_slave_data *slave_data;
-		const void *mac_addr;
 		u32 port_id;
 
 		ret = of_property_read_u32(port_np, "reg", &port_id);
@@ -1316,10 +1315,8 @@ static int cpsw_probe_dt(struct cpsw_common *cpsw)
 			goto err_node_put;
 		}
 
-		mac_addr = of_get_mac_address(port_np);
-		if (!IS_ERR(mac_addr)) {
-			ether_addr_copy(slave_data->mac_addr, mac_addr);
-		} else {
+		ret = of_get_mac_address(port_np, slave_data->mac_addr);
+		if (ret) {
 			ret = ti_cm_get_macid(dev, port_id - 1,
 					      slave_data->mac_addr);
 			if (ret)
diff --git a/drivers/net/ethernet/ti/davinci_emac.c b/drivers/net/ethernet/ti/davinci_emac.c
index c7031e1960d4..14e7da7d302f 100644
--- a/drivers/net/ethernet/ti/davinci_emac.c
+++ b/drivers/net/ethernet/ti/davinci_emac.c
@@ -1687,7 +1687,6 @@ davinci_emac_of_get_pdata(struct platform_device *pdev, struct emac_priv *priv)
 	const struct of_device_id *match;
 	const struct emac_platform_data *auxdata;
 	struct emac_platform_data *pdata = NULL;
-	const u8 *mac_addr;
 
 	if (!IS_ENABLED(CONFIG_OF) || !pdev->dev.of_node)
 		return dev_get_platdata(&pdev->dev);
@@ -1699,11 +1698,8 @@ davinci_emac_of_get_pdata(struct platform_device *pdev, struct emac_priv *priv)
 	np = pdev->dev.of_node;
 	pdata->version = EMAC_VERSION_2;
 
-	if (!is_valid_ether_addr(pdata->mac_addr)) {
-		mac_addr = of_get_mac_address(np);
-		if (!IS_ERR(mac_addr))
-			ether_addr_copy(pdata->mac_addr, mac_addr);
-	}
+	if (!is_valid_ether_addr(pdata->mac_addr))
+		of_get_mac_address(np, pdata->mac_addr);
 
 	of_property_read_u32(np, "ti,davinci-ctrl-reg-offset",
 			     &pdata->ctrl_reg_offset);
diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c
index d7a144b4a09f..9030e619e543 100644
--- a/drivers/net/ethernet/ti/netcp_core.c
+++ b/drivers/net/ethernet/ti/netcp_core.c
@@ -1966,7 +1966,6 @@ static int netcp_create_interface(struct netcp_device *netcp_device,
 	struct resource res;
 	void __iomem *efuse = NULL;
 	u32 efuse_mac = 0;
-	const void *mac_addr;
 	u8 efuse_mac_addr[6];
 	u32 temp[2];
 	int ret = 0;
@@ -2036,10 +2035,8 @@ static int netcp_create_interface(struct netcp_device *netcp_device,
 		devm_iounmap(dev, efuse);
 		devm_release_mem_region(dev, res.start, size);
 	} else {
-		mac_addr = of_get_mac_address(node_interface);
-		if (!IS_ERR(mac_addr))
-			ether_addr_copy(ndev->dev_addr, mac_addr);
-		else
+		ret = of_get_mac_address(node_interface, ndev->dev_addr);
+		if (ret)
 			eth_random_addr(ndev->dev_addr);
 	}
 
diff --git a/drivers/net/ethernet/wiznet/w5100-spi.c b/drivers/net/ethernet/wiznet/w5100-spi.c
index 2b4126d2427d..2b84848dc26a 100644
--- a/drivers/net/ethernet/wiznet/w5100-spi.c
+++ b/drivers/net/ethernet/wiznet/w5100-spi.c
@@ -423,8 +423,14 @@ static int w5100_spi_probe(struct spi_device *spi)
 	const struct of_device_id *of_id;
 	const struct w5100_ops *ops;
 	kernel_ulong_t driver_data;
+	const void *mac = NULL;
+	u8 tmpmac[ETH_ALEN];
 	int priv_size;
-	const void *mac = of_get_mac_address(spi->dev.of_node);
+	int ret;
+
+	ret = of_get_mac_address(spi->dev.of_node, tmpmac);
+	if (!ret)
+		mac = tmpmac;
 
 	if (spi->dev.of_node) {
 		of_id = of_match_device(w5100_of_match, &spi->dev);
diff --git a/drivers/net/ethernet/wiznet/w5100.c b/drivers/net/ethernet/wiznet/w5100.c
index c0d181a7f83a..ec5db481c9cd 100644
--- a/drivers/net/ethernet/wiznet/w5100.c
+++ b/drivers/net/ethernet/wiznet/w5100.c
@@ -1157,7 +1157,7 @@ int w5100_probe(struct device *dev, const struct w5100_ops *ops,
 	INIT_WORK(&priv->setrx_work, w5100_setrx_work);
 	INIT_WORK(&priv->restart_work, w5100_restart_work);
 
-	if (!IS_ERR_OR_NULL(mac_addr))
+	if (mac_addr)
 		memcpy(ndev->dev_addr, mac_addr, ETH_ALEN);
 	else
 		eth_hw_addr_random(ndev);
diff --git a/drivers/net/ethernet/xilinx/ll_temac_main.c b/drivers/net/ethernet/xilinx/ll_temac_main.c
index 030185301014..a1f5f07f4ca9 100644
--- a/drivers/net/ethernet/xilinx/ll_temac_main.c
+++ b/drivers/net/ethernet/xilinx/ll_temac_main.c
@@ -438,7 +438,7 @@ static void temac_do_set_mac_address(struct net_device *ndev)
 
 static int temac_init_mac_address(struct net_device *ndev, const void *address)
 {
-	ether_addr_copy(ndev->dev_addr, address);
+	memcpy(ndev->dev_addr, address, ETH_ALEN);
 	if (!is_valid_ether_addr(ndev->dev_addr))
 		eth_hw_addr_random(ndev);
 	temac_do_set_mac_address(ndev);
@@ -1351,7 +1351,7 @@ static int temac_probe(struct platform_device *pdev)
 	struct device_node *temac_np = dev_of_node(&pdev->dev), *dma_np;
 	struct temac_local *lp;
 	struct net_device *ndev;
-	const void *addr;
+	u8 addr[ETH_ALEN];
 	__be32 *p;
 	bool little_endian;
 	int rc = 0;
@@ -1542,8 +1542,8 @@ static int temac_probe(struct platform_device *pdev)
 
 	if (temac_np) {
 		/* Retrieve the MAC address */
-		addr = of_get_mac_address(temac_np);
-		if (IS_ERR(addr)) {
+		rc = of_get_mac_address(temac_np, addr);
+		if (rc) {
 			dev_err(&pdev->dev, "could not find MAC address\n");
 			return -ENODEV;
 		}
diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
index feb1aa4ec927..b508c9453f40 100644
--- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
+++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
@@ -1835,8 +1835,8 @@ static int axienet_probe(struct platform_device *pdev)
 	struct device_node *np;
 	struct axienet_local *lp;
 	struct net_device *ndev;
-	const void *mac_addr;
 	struct resource *ethres;
+	u8 mac_addr[ETH_ALEN];
 	int addr_width = 32;
 	u32 value;
 
@@ -2062,13 +2062,14 @@ static int axienet_probe(struct platform_device *pdev)
 		dev_info(&pdev->dev, "Ethernet core IRQ not defined\n");
 
 	/* Retrieve the MAC address */
-	mac_addr = of_get_mac_address(pdev->dev.of_node);
-	if (IS_ERR(mac_addr)) {
-		dev_warn(&pdev->dev, "could not find MAC address property: %ld\n",
-			 PTR_ERR(mac_addr));
-		mac_addr = NULL;
+	ret = of_get_mac_address(pdev->dev.of_node, mac_addr);
+	if (!ret) {
+		axienet_set_mac_address(ndev, mac_addr);
+	} else {
+		dev_warn(&pdev->dev, "could not find MAC address property: %d\n",
+			 ret);
+		axienet_set_mac_address(ndev, NULL);
 	}
-	axienet_set_mac_address(ndev, mac_addr);
 
 	lp->coalesce_count_rx = XAXIDMA_DFT_RX_THRESHOLD;
 	lp->coalesce_count_tx = XAXIDMA_DFT_TX_THRESHOLD;
diff --git a/drivers/net/ethernet/xilinx/xilinx_emaclite.c b/drivers/net/ethernet/xilinx/xilinx_emaclite.c
index 007840d4a807..d9d58a7dabee 100644
--- a/drivers/net/ethernet/xilinx/xilinx_emaclite.c
+++ b/drivers/net/ethernet/xilinx/xilinx_emaclite.c
@@ -1115,7 +1115,6 @@ static int xemaclite_of_probe(struct platform_device *ofdev)
 	struct net_device *ndev = NULL;
 	struct net_local *lp = NULL;
 	struct device *dev = &ofdev->dev;
-	const void *mac_address;
 
 	int rc = 0;
 
@@ -1157,12 +1156,9 @@ static int xemaclite_of_probe(struct platform_device *ofdev)
 	lp->next_rx_buf_to_use = 0x0;
 	lp->tx_ping_pong = get_bool(ofdev, "xlnx,tx-ping-pong");
 	lp->rx_ping_pong = get_bool(ofdev, "xlnx,rx-ping-pong");
-	mac_address = of_get_mac_address(ofdev->dev.of_node);
 
-	if (!IS_ERR(mac_address)) {
-		/* Set the MAC address. */
-		ether_addr_copy(ndev->dev_addr, mac_address);
-	} else {
+	rc = of_get_mac_address(ofdev->dev.of_node, ndev->dev_addr);
+	if (rc) {
 		dev_warn(dev, "No MAC address found, using random\n");
 		eth_hw_addr_random(ndev);
 	}
diff --git a/drivers/net/wireless/ath/ath9k/init.c b/drivers/net/wireless/ath/ath9k/init.c
index 01f9c26f9bf3..e9a36dd7144f 100644
--- a/drivers/net/wireless/ath/ath9k/init.c
+++ b/drivers/net/wireless/ath/ath9k/init.c
@@ -617,7 +617,6 @@ static int ath9k_of_init(struct ath_softc *sc)
 	struct ath_hw *ah = sc->sc_ah;
 	struct ath_common *common = ath9k_hw_common(ah);
 	enum ath_bus_type bus_type = common->bus_ops->ath_bus_type;
-	const char *mac;
 	char eeprom_name[100];
 	int ret;
 
@@ -640,9 +639,7 @@ static int ath9k_of_init(struct ath_softc *sc)
 		ah->ah_flags |= AH_NO_EEP_SWAP;
 	}
 
-	mac = of_get_mac_address(np);
-	if (!IS_ERR(mac))
-		ether_addr_copy(common->macaddr, mac);
+	of_get_mac_address(np, common->macaddr);
 
 	return 0;
 }
diff --git a/drivers/net/wireless/mediatek/mt76/eeprom.c b/drivers/net/wireless/mediatek/mt76/eeprom.c
index 665b54c5c8ae..6d895738222a 100644
--- a/drivers/net/wireless/mediatek/mt76/eeprom.c
+++ b/drivers/net/wireless/mediatek/mt76/eeprom.c
@@ -91,16 +91,9 @@ void
 mt76_eeprom_override(struct mt76_phy *phy)
 {
 	struct mt76_dev *dev = phy->dev;
-
-#ifdef CONFIG_OF
 	struct device_node *np = dev->dev->of_node;
-	const u8 *mac = NULL;
 
-	if (np)
-		mac = of_get_mac_address(np);
-	if (!IS_ERR_OR_NULL(mac))
-		ether_addr_copy(phy->macaddr, mac);
-#endif
+	of_get_mac_address(np, phy->macaddr);
 
 	if (!is_valid_ether_addr(phy->macaddr)) {
 		eth_random_addr(phy->macaddr);
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2x00dev.c b/drivers/net/wireless/ralink/rt2x00/rt2x00dev.c
index 61a4f1ad31e2..e95c101c2711 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2x00dev.c
+++ b/drivers/net/wireless/ralink/rt2x00/rt2x00dev.c
@@ -989,11 +989,7 @@ static void rt2x00lib_rate(struct ieee80211_rate *entry,
 
 void rt2x00lib_set_mac_address(struct rt2x00_dev *rt2x00dev, u8 *eeprom_mac_addr)
 {
-	const char *mac_addr;
-
-	mac_addr = of_get_mac_address(rt2x00dev->dev->of_node);
-	if (!IS_ERR(mac_addr))
-		ether_addr_copy(eeprom_mac_addr, mac_addr);
+	of_get_mac_address(rt2x00dev->dev->of_node, eeprom_mac_addr);
 
 	if (!is_valid_ether_addr(eeprom_mac_addr)) {
 		eth_random_addr(eeprom_mac_addr);
diff --git a/drivers/of/of_net.c b/drivers/of/of_net.c
index bc0a27de69d4..cb77b774bf76 100644
--- a/drivers/of/of_net.c
+++ b/drivers/of/of_net.c
@@ -45,42 +45,35 @@ int of_get_phy_mode(struct device_node *np, phy_interface_t *interface)
 }
 EXPORT_SYMBOL_GPL(of_get_phy_mode);
 
-static const void *of_get_mac_addr(struct device_node *np, const char *name)
+static int of_get_mac_addr(struct device_node *np, const char *name, u8 *addr)
 {
 	struct property *pp = of_find_property(np, name, NULL);
 
-	if (pp && pp->length == ETH_ALEN && is_valid_ether_addr(pp->value))
-		return pp->value;
-	return NULL;
+	if (pp && pp->length == ETH_ALEN && is_valid_ether_addr(pp->value)) {
+		memcpy(addr, pp->value, ETH_ALEN);
+		return 0;
+	}
+	return -ENODEV;
 }
 
-static const void *of_get_mac_addr_nvmem(struct device_node *np)
+static int of_get_mac_addr_nvmem(struct device_node *np, u8 *addr)
 {
-	int ret;
-	const void *mac;
-	u8 nvmem_mac[ETH_ALEN];
 	struct platform_device *pdev = of_find_device_by_node(np);
+	int ret;
 
 	if (!pdev)
-		return ERR_PTR(-ENODEV);
+		return -ENODEV;
 
-	ret = nvmem_get_mac_address(&pdev->dev, &nvmem_mac);
-	if (ret) {
-		put_device(&pdev->dev);
-		return ERR_PTR(ret);
-	}
-
-	mac = devm_kmemdup(&pdev->dev, nvmem_mac, ETH_ALEN, GFP_KERNEL);
+	ret = nvmem_get_mac_address(&pdev->dev, addr);
 	put_device(&pdev->dev);
-	if (!mac)
-		return ERR_PTR(-ENOMEM);
 
-	return mac;
+	return ret;
 }
 
 /**
  * of_get_mac_address()
  * @np:		Caller's Device Node
+ * @addr:	Pointer to a six-byte array for the result
  *
  * Search the device tree for the best MAC address to use.  'mac-address' is
  * checked first, because that is supposed to contain to "most recent" MAC
@@ -101,24 +94,27 @@ static const void *of_get_mac_addr_nvmem(struct device_node *np)
  * this case, the real MAC is in 'local-mac-address', and 'mac-address' exists
  * but is all zeros.
  *
- * Return: Will be a valid pointer on success and ERR_PTR in case of error.
+ * Return: 0 on success and errno in case of error.
 */
-const void *of_get_mac_address(struct device_node *np)
+int of_get_mac_address(struct device_node *np, u8 *addr)
 {
-	const void *addr;
+	int ret;
+
+	if (!np)
+		return -ENODEV;
 
-	addr = of_get_mac_addr(np, "mac-address");
-	if (addr)
-		return addr;
+	ret = of_get_mac_addr(np, "mac-address", addr);
+	if (!ret)
+		return 0;
 
-	addr = of_get_mac_addr(np, "local-mac-address");
-	if (addr)
-		return addr;
+	ret = of_get_mac_addr(np, "local-mac-address", addr);
+	if (!ret)
+		return 0;
 
-	addr = of_get_mac_addr(np, "address");
-	if (addr)
-		return addr;
+	ret = of_get_mac_addr(np, "address", addr);
+	if (!ret)
+		return 0;
 
-	return of_get_mac_addr_nvmem(np);
+	return of_get_mac_addr_nvmem(np, addr);
 }
 EXPORT_SYMBOL(of_get_mac_address);
diff --git a/drivers/staging/octeon/ethernet.c b/drivers/staging/octeon/ethernet.c
index 5dea6e96ec90..da7c2cd8ebb8 100644
--- a/drivers/staging/octeon/ethernet.c
+++ b/drivers/staging/octeon/ethernet.c
@@ -407,14 +407,10 @@ static int cvm_oct_common_set_mac_address(struct net_device *dev, void *addr)
 int cvm_oct_common_init(struct net_device *dev)
 {
 	struct octeon_ethernet *priv = netdev_priv(dev);
-	const u8 *mac = NULL;
+	int ret;
 
-	if (priv->of_node)
-		mac = of_get_mac_address(priv->of_node);
-
-	if (!IS_ERR_OR_NULL(mac))
-		ether_addr_copy(dev->dev_addr, mac);
-	else
+	ret = of_get_mac_address(priv->of_node, dev->dev_addr);
+	if (ret)
 		eth_hw_addr_random(dev);
 
 	/*
diff --git a/drivers/staging/wfx/main.c b/drivers/staging/wfx/main.c
index e7bc1988124a..4b9fdf99981b 100644
--- a/drivers/staging/wfx/main.c
+++ b/drivers/staging/wfx/main.c
@@ -334,7 +334,6 @@ int wfx_probe(struct wfx_dev *wdev)
 {
 	int i;
 	int err;
-	const void *macaddr;
 	struct gpio_desc *gpio_saved;
 
 	// During first part of boot, gpio_wakeup cannot yet been used. So
@@ -423,9 +422,9 @@ int wfx_probe(struct wfx_dev *wdev)
 
 	for (i = 0; i < ARRAY_SIZE(wdev->addresses); i++) {
 		eth_zero_addr(wdev->addresses[i].addr);
-		macaddr = of_get_mac_address(wdev->dev->of_node);
-		if (!IS_ERR_OR_NULL(macaddr)) {
-			ether_addr_copy(wdev->addresses[i].addr, macaddr);
+		err = of_get_mac_address(wdev->dev->of_node,
+					 wdev->addresses[i].addr);
+		if (!err) {
 			wdev->addresses[i].addr[ETH_ALEN - 1] += i;
 		} else {
 			ether_addr_copy(wdev->addresses[i].addr,
diff --git a/include/linux/of_net.h b/include/linux/of_net.h
index 71bbfcf3adcd..daef3b0d9270 100644
--- a/include/linux/of_net.h
+++ b/include/linux/of_net.h
@@ -13,7 +13,7 @@
 
 struct net_device;
 extern int of_get_phy_mode(struct device_node *np, phy_interface_t *interface);
-extern const void *of_get_mac_address(struct device_node *np);
+extern int of_get_mac_address(struct device_node *np, u8 *mac);
 extern struct net_device *of_find_net_device_by_node(struct device_node *np);
 #else
 static inline int of_get_phy_mode(struct device_node *np,
@@ -22,9 +22,9 @@ static inline int of_get_phy_mode(struct device_node *np,
 	return -ENODEV;
 }
 
-static inline const void *of_get_mac_address(struct device_node *np)
+static inline int of_get_mac_address(struct device_node *np, u8 *mac)
 {
-	return ERR_PTR(-ENODEV);
+	return -ENODEV;
 }
 
 static inline struct net_device *of_find_net_device_by_node(struct device_node *np)
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 57b2c49f72f4..1259b0f40684 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -260,7 +260,7 @@ struct dsa_port {
 	unsigned int		index;
 	const char		*name;
 	struct dsa_port		*cpu_dp;
-	const char		*mac;
+	u8			mac[ETH_ALEN];
 	struct device_node	*dn;
 	unsigned int		ageing_time;
 	bool			vlan_filtering;
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 3c3e56a1f34d..d7c22e3a1fbf 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -392,7 +392,7 @@ static int dsa_port_setup(struct dsa_port *dp)
 
 		break;
 	case DSA_PORT_TYPE_USER:
-		dp->mac = of_get_mac_address(dp->dn);
+		of_get_mac_address(dp->dn, dp->mac);
 		err = dsa_slave_create(dp);
 		if (err)
 			break;
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 995e0e16f295..9300cb66e500 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -1896,7 +1896,7 @@ int dsa_slave_create(struct dsa_port *port)
 	slave_dev->hw_features |= NETIF_F_HW_TC;
 	slave_dev->features |= NETIF_F_LLTX;
 	slave_dev->ethtool_ops = &dsa_slave_ethtool_ops;
-	if (!IS_ERR_OR_NULL(port->mac))
+	if (!is_zero_ether_addr(port->mac))
 		ether_addr_copy(slave_dev->dev_addr, port->mac);
 	else
 		eth_hw_addr_inherit(slave_dev, master);
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 933b427122be..9cce612e8976 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -511,13 +511,14 @@ unsigned char * __weak arch_get_platform_mac_address(void)
 
 int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr)
 {
-	const unsigned char *addr = NULL;
+	unsigned char *addr;
+	int ret;
 
-	if (dev->of_node)
-		addr = of_get_mac_address(dev->of_node);
-	if (IS_ERR_OR_NULL(addr))
-		addr = arch_get_platform_mac_address();
+	ret = of_get_mac_address(dev->of_node, mac_addr);
+	if (!ret)
+		return 0;
 
+	addr = arch_get_platform_mac_address();
 	if (!addr)
 		return -ENODEV;
 
-- 
cgit v1.2.3


From 441e8c66b23e027c00ccebd70df9fd933918eefe Mon Sep 17 00:00:00 2001
From: Toke Høiland-Jørgensen <toke@redhat.com>
Date: Tue, 13 Apr 2021 11:16:06 +0200
Subject: bpf: Return target info when a tracing bpf_link is queried
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is currently no way to discover the target of a tracing program
attachment after the fact. Add this information to bpf_link_info and return
it when querying the bpf_link fd.

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210413091607.58945-1-toke@redhat.com
---
 include/linux/bpf_verifier.h   | 9 +++++++++
 include/uapi/linux/bpf.h       | 2 ++
 kernel/bpf/syscall.c           | 3 +++
 tools/include/uapi/linux/bpf.h | 2 ++
 4 files changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 51c2ffa3d901..6023a1367853 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -487,6 +487,15 @@ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog,
 		return ((u64)btf_obj_id(btf) << 32) | 0x80000000 | btf_id;
 }
 
+/* unpack the IDs from the key as constructed above */
+static inline void bpf_trampoline_unpack_key(u64 key, u32 *obj_id, u32 *btf_id)
+{
+	if (obj_id)
+		*obj_id = key >> 32;
+	if (btf_id)
+		*btf_id = key & 0x7FFFFFFF;
+}
+
 int bpf_check_attach_target(struct bpf_verifier_log *log,
 			    const struct bpf_prog *prog,
 			    const struct bpf_prog *tgt_prog,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 85c924bc21b1..df164a44bb41 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5416,6 +5416,8 @@ struct bpf_link_info {
 		} raw_tracepoint;
 		struct {
 			__u32 attach_type;
+			__u32 target_obj_id; /* prog_id for PROG_EXT, otherwise btf object id */
+			__u32 target_btf_id; /* BTF type id inside the object */
 		} tracing;
 		struct {
 			__u64 cgroup_id;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 6428634da57e..fd495190115e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2551,6 +2551,9 @@ static int bpf_tracing_link_fill_link_info(const struct bpf_link *link,
 		container_of(link, struct bpf_tracing_link, link);
 
 	info->tracing.attach_type = tr_link->attach_type;
+	bpf_trampoline_unpack_key(tr_link->trampoline->key,
+				  &info->tracing.target_obj_id,
+				  &info->tracing.target_btf_id);
 
 	return 0;
 }
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 85c924bc21b1..df164a44bb41 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5416,6 +5416,8 @@ struct bpf_link_info {
 		} raw_tracepoint;
 		struct {
 			__u32 attach_type;
+			__u32 target_obj_id; /* prog_id for PROG_EXT, otherwise btf object id */
+			__u32 target_btf_id; /* BTF type id inside the object */
 		} tracing;
 		struct {
 			__u64 cgroup_id;
-- 
cgit v1.2.3


From 316d0d92fbc9b926bda8ce7ccc109de0dccb4d92 Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Date: Mon, 5 Apr 2021 14:40:44 +0300
Subject: mfd: bd718x7: simplify by cleaning unnecessary device data

Most ROHM PMIC sub-devices only use the regmap pointer from
parent device. They can obtain this by dev_get_regmap() so in
most cases the MFD device does not need to allocate and populate
the driver data. Simplify drivers by removing this.

The BD70528 still needs access to the watchdog mutex, so keep
rohm_regmap_dev in use on BD70528 RTC and WDG drivers for now.

Signed-off-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/rohm-bd718x7.c       | 43 +++++++++++++++-------------------------
 include/linux/mfd/rohm-bd718x7.h | 13 ------------
 2 files changed, 16 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/rohm-bd718x7.c b/drivers/mfd/rohm-bd718x7.c
index c32c1b6c98fa..bfd81f78beae 100644
--- a/drivers/mfd/rohm-bd718x7.c
+++ b/drivers/mfd/rohm-bd718x7.c
@@ -91,9 +91,9 @@ static const struct regmap_config bd718xx_regmap_config = {
 	.cache_type = REGCACHE_RBTREE,
 };
 
-static int bd718xx_init_press_duration(struct bd718xx *bd718xx)
+static int bd718xx_init_press_duration(struct regmap *regmap,
+				       struct device *dev)
 {
-	struct device* dev = bd718xx->chip.dev;
 	u32 short_press_ms, long_press_ms;
 	u32 short_press_value, long_press_value;
 	int ret;
@@ -102,8 +102,7 @@ static int bd718xx_init_press_duration(struct bd718xx *bd718xx)
 				   &short_press_ms);
 	if (!ret) {
 		short_press_value = min(15u, (short_press_ms + 250) / 500);
-		ret = regmap_update_bits(bd718xx->chip.regmap,
-					 BD718XX_REG_PWRONCONFIG0,
+		ret = regmap_update_bits(regmap, BD718XX_REG_PWRONCONFIG0,
 					 BD718XX_PWRBTN_PRESS_DURATION_MASK,
 					 short_press_value);
 		if (ret) {
@@ -116,8 +115,7 @@ static int bd718xx_init_press_duration(struct bd718xx *bd718xx)
 				   &long_press_ms);
 	if (!ret) {
 		long_press_value = min(15u, (long_press_ms + 500) / 1000);
-		ret = regmap_update_bits(bd718xx->chip.regmap,
-					 BD718XX_REG_PWRONCONFIG1,
+		ret = regmap_update_bits(regmap, BD718XX_REG_PWRONCONFIG1,
 					 BD718XX_PWRBTN_PRESS_DURATION_MASK,
 					 long_press_value);
 		if (ret) {
@@ -132,7 +130,8 @@ static int bd718xx_init_press_duration(struct bd718xx *bd718xx)
 static int bd718xx_i2c_probe(struct i2c_client *i2c,
 			    const struct i2c_device_id *id)
 {
-	struct bd718xx *bd718xx;
+	struct regmap *regmap;
+	struct regmap_irq_chip_data *irq_data;
 	int ret;
 	unsigned int chip_type;
 	struct mfd_cell *mfd;
@@ -142,13 +141,6 @@ static int bd718xx_i2c_probe(struct i2c_client *i2c,
 		dev_err(&i2c->dev, "No IRQ configured\n");
 		return -EINVAL;
 	}
-
-	bd718xx = devm_kzalloc(&i2c->dev, sizeof(struct bd718xx), GFP_KERNEL);
-
-	if (!bd718xx)
-		return -ENOMEM;
-
-	bd718xx->chip_irq = i2c->irq;
 	chip_type = (unsigned int)(uintptr_t)
 		    of_device_get_match_data(&i2c->dev);
 	switch (chip_type) {
@@ -164,29 +156,26 @@ static int bd718xx_i2c_probe(struct i2c_client *i2c,
 		dev_err(&i2c->dev, "Unknown device type");
 		return -EINVAL;
 	}
-	bd718xx->chip.dev = &i2c->dev;
-	dev_set_drvdata(&i2c->dev, bd718xx);
 
-	bd718xx->chip.regmap = devm_regmap_init_i2c(i2c,
-						    &bd718xx_regmap_config);
-	if (IS_ERR(bd718xx->chip.regmap)) {
+	regmap = devm_regmap_init_i2c(i2c, &bd718xx_regmap_config);
+	if (IS_ERR(regmap)) {
 		dev_err(&i2c->dev, "regmap initialization failed\n");
-		return PTR_ERR(bd718xx->chip.regmap);
+		return PTR_ERR(regmap);
 	}
 
-	ret = devm_regmap_add_irq_chip(&i2c->dev, bd718xx->chip.regmap,
-				       bd718xx->chip_irq, IRQF_ONESHOT, 0,
-				       &bd718xx_irq_chip, &bd718xx->irq_data);
+	ret = devm_regmap_add_irq_chip(&i2c->dev, regmap, i2c->irq,
+				       IRQF_ONESHOT, 0, &bd718xx_irq_chip,
+				       &irq_data);
 	if (ret) {
 		dev_err(&i2c->dev, "Failed to add irq_chip\n");
 		return ret;
 	}
 
-	ret = bd718xx_init_press_duration(bd718xx);
+	ret = bd718xx_init_press_duration(regmap, &i2c->dev);
 	if (ret)
 		return ret;
 
-	ret = regmap_irq_get_virq(bd718xx->irq_data, BD718XX_INT_PWRBTN_S);
+	ret = regmap_irq_get_virq(irq_data, BD718XX_INT_PWRBTN_S);
 
 	if (ret < 0) {
 		dev_err(&i2c->dev, "Failed to get the IRQ\n");
@@ -195,9 +184,9 @@ static int bd718xx_i2c_probe(struct i2c_client *i2c,
 
 	button.irq = ret;
 
-	ret = devm_mfd_add_devices(bd718xx->chip.dev, PLATFORM_DEVID_AUTO,
+	ret = devm_mfd_add_devices(&i2c->dev, PLATFORM_DEVID_AUTO,
 				   mfd, cells, NULL, 0,
-				   regmap_irq_get_domain(bd718xx->irq_data));
+				   regmap_irq_get_domain(irq_data));
 	if (ret)
 		dev_err(&i2c->dev, "Failed to create subdevices\n");
 
diff --git a/include/linux/mfd/rohm-bd718x7.h b/include/linux/mfd/rohm-bd718x7.h
index bee2474a8f9f..df2918198d37 100644
--- a/include/linux/mfd/rohm-bd718x7.h
+++ b/include/linux/mfd/rohm-bd718x7.h
@@ -310,17 +310,4 @@ enum {
 	BD718XX_PWRBTN_LONG_PRESS_15S
 };
 
-struct bd718xx {
-	/*
-	 * Please keep this as the first member here as some
-	 * drivers (clk) supporting more than one chip may only know this
-	 * generic struct 'struct rohm_regmap_dev' and assume it is
-	 * the first chunk of parent device's private data.
-	 */
-	struct rohm_regmap_dev chip;
-
-	int chip_irq;
-	struct regmap_irq_chip_data *irq_data;
-};
-
 #endif /* __LINUX_MFD_BD718XX_H__ */
-- 
cgit v1.2.3


From 488b205e57181a56f3503cb97240d32798d3d3bd Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Date: Mon, 5 Apr 2021 14:42:13 +0300
Subject: mfd: Add ROHM BD71815 ID

Add chip ID for the ROHM BD71815 PMIC so that drivers can identify
this IC.

Signed-off-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/rohm-generic.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/rohm-generic.h b/include/linux/mfd/rohm-generic.h
index 66f673c35303..e107b4769101 100644
--- a/include/linux/mfd/rohm-generic.h
+++ b/include/linux/mfd/rohm-generic.h
@@ -11,6 +11,7 @@ enum rohm_chip_type {
 	ROHM_CHIP_TYPE_BD71837 = 0,
 	ROHM_CHIP_TYPE_BD71847,
 	ROHM_CHIP_TYPE_BD70528,
+	ROHM_CHIP_TYPE_BD71815,
 	ROHM_CHIP_TYPE_BD71828,
 	ROHM_CHIP_TYPE_BD9571,
 	ROHM_CHIP_TYPE_BD9574,
-- 
cgit v1.2.3


From 06b2f5f5ecf3e3a3b74b42057b654a5dba7e63bb Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Date: Mon, 5 Apr 2021 14:42:34 +0300
Subject: mfd: Sort ROHM chip ID list for better readability

Sort the ID list so it is easier to see which ICs are present.

Signed-off-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Suggested-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/rohm-generic.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/rohm-generic.h b/include/linux/mfd/rohm-generic.h
index e107b4769101..9e2880e06950 100644
--- a/include/linux/mfd/rohm-generic.h
+++ b/include/linux/mfd/rohm-generic.h
@@ -8,13 +8,13 @@
 #include <linux/regulator/driver.h>
 
 enum rohm_chip_type {
-	ROHM_CHIP_TYPE_BD71837 = 0,
-	ROHM_CHIP_TYPE_BD71847,
+	ROHM_CHIP_TYPE_BD9571,
+	ROHM_CHIP_TYPE_BD9574,
 	ROHM_CHIP_TYPE_BD70528,
 	ROHM_CHIP_TYPE_BD71815,
 	ROHM_CHIP_TYPE_BD71828,
-	ROHM_CHIP_TYPE_BD9571,
-	ROHM_CHIP_TYPE_BD9574,
+	ROHM_CHIP_TYPE_BD71837,
+	ROHM_CHIP_TYPE_BD71847,
 	ROHM_CHIP_TYPE_AMOUNT
 };
 
-- 
cgit v1.2.3


From 4dcdcfd5abb34d3139669fcd830b756d45678c47 Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Date: Mon, 5 Apr 2021 14:42:51 +0300
Subject: mfd: Support for ROHM BD71815 PMIC core

Add core support for ROHM BD71815 Power Management IC.

The IC integrates regulators, a battery charger with a coulomb counter,
a real-time clock (RTC), clock gate and general-purpose outputs (GPO).

Signed-off-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/Kconfig              |  15 +-
 drivers/mfd/rohm-bd71828.c       | 486 ++++++++++++++++++++++++---------
 include/linux/mfd/rohm-bd71815.h | 562 +++++++++++++++++++++++++++++++++++++++
 include/linux/mfd/rohm-bd71828.h |   3 +
 4 files changed, 933 insertions(+), 133 deletions(-)
 create mode 100644 include/linux/mfd/rohm-bd71815.h

(limited to 'include/linux')

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index b74efa469e90..60d9ae559f0a 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -1975,19 +1975,20 @@ config MFD_ROHM_BD70528
 	  charger.
 
 config MFD_ROHM_BD71828
-	tristate "ROHM BD71828 Power Management IC"
+	tristate "ROHM BD71828 and BD71815 Power Management IC"
 	depends on I2C=y
 	depends on OF
 	select REGMAP_I2C
 	select REGMAP_IRQ
 	select MFD_CORE
 	help
-	  Select this option to get support for the ROHM BD71828 Power
-	  Management IC. BD71828GW is a single-chip power management IC for
-	  battery-powered portable devices. The IC integrates 7 buck
-	  converters, 7 LDOs, and a 1500 mA single-cell linear charger.
-	  Also included is a Coulomb counter, a real-time clock (RTC), and
-	  a 32.768 kHz clock gate.
+	  Select this option to get support for the ROHM BD71828 and BD71815
+	  Power Management ICs. BD71828GW and BD71815AGW are single-chip power
+	  management ICs mainly for battery-powered portable devices.
+	  The BD71828 integrates 7 buck converters and 7 LDOs. The BD71815
+	  has 5 bucks, 7 LDOs, and a boost for driving LEDs. Both ICs provide
+	  also a single-cell linear charger, a Coulomb counter, a real-time
+	  clock (RTC), GPIOs and a 32.768 kHz clock gate.
 
 config MFD_STM32_LPTIMER
 	tristate "Support for STM32 Low-Power Timer"
diff --git a/drivers/mfd/rohm-bd71828.c b/drivers/mfd/rohm-bd71828.c
index 210261d026f2..714d9fcbf07b 100644
--- a/drivers/mfd/rohm-bd71828.c
+++ b/drivers/mfd/rohm-bd71828.c
@@ -2,7 +2,7 @@
 //
 // Copyright (C) 2019 ROHM Semiconductors
 //
-// ROHM BD71828 PMIC driver
+// ROHM BD71828/BD71815 PMIC driver
 
 #include <linux/gpio_keys.h>
 #include <linux/i2c.h>
@@ -11,7 +11,9 @@
 #include <linux/ioport.h>
 #include <linux/irq.h>
 #include <linux/mfd/core.h>
+#include <linux/mfd/rohm-bd71815.h>
 #include <linux/mfd/rohm-bd71828.h>
+#include <linux/mfd/rohm-generic.h>
 #include <linux/module.h>
 #include <linux/of_device.h>
 #include <linux/regmap.h>
@@ -29,12 +31,84 @@ static struct gpio_keys_platform_data bd71828_powerkey_data = {
 	.name = "bd71828-pwrkey",
 };
 
-static const struct resource rtc_irqs[] = {
+static const struct resource bd71815_rtc_irqs[] = {
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_RTC0, "bd71815-rtc-alm-0"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_RTC1, "bd71815-rtc-alm-1"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_RTC2, "bd71815-rtc-alm-2"),
+};
+
+static const struct resource bd71828_rtc_irqs[] = {
 	DEFINE_RES_IRQ_NAMED(BD71828_INT_RTC0, "bd71828-rtc-alm-0"),
 	DEFINE_RES_IRQ_NAMED(BD71828_INT_RTC1, "bd71828-rtc-alm-1"),
 	DEFINE_RES_IRQ_NAMED(BD71828_INT_RTC2, "bd71828-rtc-alm-2"),
 };
 
+static struct resource bd71815_power_irqs[] = {
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_DCIN_RMV, "bd71815-dcin-rmv"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_CLPS_OUT, "bd71815-clps-out"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_CLPS_IN, "bd71815-clps-in"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_DCIN_OVP_RES, "bd71815-dcin-ovp-res"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_DCIN_OVP_DET, "bd71815-dcin-ovp-det"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_DCIN_MON_RES, "bd71815-dcin-mon-res"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_DCIN_MON_DET, "bd71815-dcin-mon-det"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_VSYS_UV_RES, "bd71815-vsys-uv-res"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_VSYS_UV_DET, "bd71815-vsys-uv-det"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_VSYS_LOW_RES, "bd71815-vsys-low-res"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_VSYS_LOW_DET, "bd71815-vsys-low-det"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_VSYS_MON_RES, "bd71815-vsys-mon-res"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_VSYS_MON_DET, "bd71815-vsys-mon-det"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_CHG_WDG_TEMP, "bd71815-chg-wdg-temp"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_CHG_WDG_TIME, "bd71815-chg-wdg"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_CHG_RECHARGE_RES, "bd71815-rechg-res"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_CHG_RECHARGE_DET, "bd71815-rechg-det"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_CHG_RANGED_TEMP_TRANSITION, "bd71815-ranged-temp-transit"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_CHG_STATE_TRANSITION, "bd71815-chg-state-change"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_BAT_TEMP_NORMAL, "bd71815-bat-temp-normal"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_BAT_TEMP_ERANGE, "bd71815-bat-temp-erange"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_BAT_REMOVED, "bd71815-bat-rmv"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_BAT_DETECTED, "bd71815-bat-det"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_THERM_REMOVED, "bd71815-therm-rmv"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_THERM_DETECTED, "bd71815-therm-det"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_BAT_DEAD, "bd71815-bat-dead"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_BAT_SHORTC_RES, "bd71815-bat-short-res"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_BAT_SHORTC_DET, "bd71815-bat-short-det"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_BAT_LOW_VOLT_RES, "bd71815-bat-low-res"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_BAT_LOW_VOLT_DET, "bd71815-bat-low-det"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_BAT_OVER_VOLT_RES, "bd71815-bat-over-res"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_BAT_OVER_VOLT_DET, "bd71815-bat-over-det"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_BAT_MON_RES, "bd71815-bat-mon-res"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_BAT_MON_DET, "bd71815-bat-mon-det"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_BAT_CC_MON1, "bd71815-bat-cc-mon1"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_BAT_CC_MON2, "bd71815-bat-cc-mon2"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_BAT_CC_MON3, "bd71815-bat-cc-mon3"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_BAT_OVER_CURR_1_RES, "bd71815-bat-oc1-res"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_BAT_OVER_CURR_1_DET, "bd71815-bat-oc1-det"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_BAT_OVER_CURR_2_RES, "bd71815-bat-oc2-res"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_BAT_OVER_CURR_2_DET, "bd71815-bat-oc2-det"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_BAT_OVER_CURR_3_RES, "bd71815-bat-oc3-res"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_BAT_OVER_CURR_3_DET, "bd71815-bat-oc3-det"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_TEMP_BAT_LOW_RES, "bd71815-bat-low-res"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_TEMP_BAT_LOW_DET, "bd71815-bat-low-det"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_TEMP_BAT_HI_RES, "bd71815-bat-hi-res"),
+	DEFINE_RES_IRQ_NAMED(BD71815_INT_TEMP_BAT_HI_DET, "bd71815-bat-hi-det"),
+};
+
+static struct mfd_cell bd71815_mfd_cells[] = {
+	{ .name = "bd71815-pmic", },
+	{ .name = "bd71815-clk", },
+	{ .name = "bd71815-gpo", },
+	{
+		.name = "bd71815-power",
+		.num_resources = ARRAY_SIZE(bd71815_power_irqs),
+		.resources = &bd71815_power_irqs[0],
+	},
+	{
+		.name = "bd71815-rtc",
+		.num_resources = ARRAY_SIZE(bd71815_rtc_irqs),
+		.resources = &bd71815_rtc_irqs[0],
+	},
+};
+
 static struct mfd_cell bd71828_mfd_cells[] = {
 	{ .name = "bd71828-pmic", },
 	{ .name = "bd71828-gpio", },
@@ -47,8 +121,8 @@ static struct mfd_cell bd71828_mfd_cells[] = {
 	{ .name = "bd71827-power", },
 	{
 		.name = "bd71828-rtc",
-		.resources = rtc_irqs,
-		.num_resources = ARRAY_SIZE(rtc_irqs),
+		.resources = bd71828_rtc_irqs,
+		.num_resources = ARRAY_SIZE(bd71828_rtc_irqs),
 	}, {
 		.name = "gpio-keys",
 		.platform_data = &bd71828_powerkey_data,
@@ -56,7 +130,35 @@ static struct mfd_cell bd71828_mfd_cells[] = {
 	},
 };
 
-static const struct regmap_range volatile_ranges[] = {
+static const struct regmap_range bd71815_volatile_ranges[] = {
+	{
+		.range_min = BD71815_REG_SEC,
+		.range_max = BD71815_REG_YEAR,
+	}, {
+		.range_min = BD71815_REG_CONF,
+		.range_max = BD71815_REG_BAT_TEMP,
+	}, {
+		.range_min = BD71815_REG_VM_IBAT_U,
+		.range_max = BD71815_REG_CC_CTRL,
+	}, {
+		.range_min = BD71815_REG_CC_STAT,
+		.range_max = BD71815_REG_CC_CURCD_L,
+	}, {
+		.range_min = BD71815_REG_VM_BTMP_MON,
+		.range_max = BD71815_REG_VM_BTMP_MON,
+	}, {
+		.range_min = BD71815_REG_INT_STAT,
+		.range_max = BD71815_REG_INT_UPDATE,
+	}, {
+		.range_min = BD71815_REG_VM_VSYS_U,
+		.range_max = BD71815_REG_REX_CTRL_1,
+	}, {
+		.range_min = BD71815_REG_FULL_CCNTD_3,
+		.range_max = BD71815_REG_CCNTD_CHG_2,
+	},
+};
+
+static const struct regmap_range bd71828_volatile_ranges[] = {
 	{
 		.range_min = BD71828_REG_PS_CTRL_1,
 		.range_max = BD71828_REG_PS_CTRL_1,
@@ -80,15 +182,28 @@ static const struct regmap_range volatile_ranges[] = {
 	},
 };
 
-static const struct regmap_access_table volatile_regs = {
-	.yes_ranges = &volatile_ranges[0],
-	.n_yes_ranges = ARRAY_SIZE(volatile_ranges),
+static const struct regmap_access_table bd71815_volatile_regs = {
+	.yes_ranges = &bd71815_volatile_ranges[0],
+	.n_yes_ranges = ARRAY_SIZE(bd71815_volatile_ranges),
+};
+
+static const struct regmap_access_table bd71828_volatile_regs = {
+	.yes_ranges = &bd71828_volatile_ranges[0],
+	.n_yes_ranges = ARRAY_SIZE(bd71828_volatile_ranges),
+};
+
+static const struct regmap_config bd71815_regmap = {
+	.reg_bits = 8,
+	.val_bits = 8,
+	.volatile_table = &bd71815_volatile_regs,
+	.max_register = BD71815_MAX_REGISTER - 1,
+	.cache_type = REGCACHE_RBTREE,
 };
 
-static struct regmap_config bd71828_regmap = {
+static const struct regmap_config bd71828_regmap = {
 	.reg_bits = 8,
 	.val_bits = 8,
-	.volatile_table = &volatile_regs,
+	.volatile_table = &bd71828_volatile_regs,
 	.max_register = BD71828_MAX_REGISTER,
 	.cache_type = REGCACHE_RBTREE,
 };
@@ -96,7 +211,7 @@ static struct regmap_config bd71828_regmap = {
 /*
  * Mapping of main IRQ register bits to sub-IRQ register offsets so that we can
  * access corect sub-IRQ registers based on bits that are set in main IRQ
- * register.
+ * register. BD71815 and BD71828 have the same sub-register-block offsets.
  */
 
 static unsigned int bit0_offsets[] = {11};		/* RTC IRQ */
@@ -108,7 +223,7 @@ static unsigned int bit5_offsets[] = {3};		/* VSYS IRQ */
 static unsigned int bit6_offsets[] = {1, 2};		/* DCIN IRQ */
 static unsigned int bit7_offsets[] = {0};		/* BUCK IRQ */
 
-static struct regmap_irq_sub_irq_map bd71828_sub_irq_offsets[] = {
+static struct regmap_irq_sub_irq_map bd718xx_sub_irq_offsets[] = {
 	REGMAP_IRQ_MAIN_REG_OFFSET(bit0_offsets),
 	REGMAP_IRQ_MAIN_REG_OFFSET(bit1_offsets),
 	REGMAP_IRQ_MAIN_REG_OFFSET(bit2_offsets),
@@ -119,6 +234,88 @@ static struct regmap_irq_sub_irq_map bd71828_sub_irq_offsets[] = {
 	REGMAP_IRQ_MAIN_REG_OFFSET(bit7_offsets),
 };
 
+static const struct regmap_irq bd71815_irqs[] = {
+	REGMAP_IRQ_REG(BD71815_INT_BUCK1_OCP, 0, BD71815_INT_BUCK1_OCP_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_BUCK2_OCP, 0, BD71815_INT_BUCK2_OCP_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_BUCK3_OCP, 0, BD71815_INT_BUCK3_OCP_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_BUCK4_OCP, 0, BD71815_INT_BUCK4_OCP_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_BUCK5_OCP, 0, BD71815_INT_BUCK5_OCP_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_LED_OVP, 0, BD71815_INT_LED_OVP_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_LED_OCP, 0, BD71815_INT_LED_OCP_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_LED_SCP, 0, BD71815_INT_LED_SCP_MASK),
+	/* DCIN1 interrupts */
+	REGMAP_IRQ_REG(BD71815_INT_DCIN_RMV, 1, BD71815_INT_DCIN_RMV_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_CLPS_OUT, 1, BD71815_INT_CLPS_OUT_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_CLPS_IN, 1, BD71815_INT_CLPS_IN_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_DCIN_OVP_RES, 1, BD71815_INT_DCIN_OVP_RES_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_DCIN_OVP_DET, 1, BD71815_INT_DCIN_OVP_DET_MASK),
+	/* DCIN2 interrupts */
+	REGMAP_IRQ_REG(BD71815_INT_DCIN_MON_RES, 2, BD71815_INT_DCIN_MON_RES_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_DCIN_MON_DET, 2, BD71815_INT_DCIN_MON_DET_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_WDOG, 2, BD71815_INT_WDOG_MASK),
+	/* Vsys */
+	REGMAP_IRQ_REG(BD71815_INT_VSYS_UV_RES, 3, BD71815_INT_VSYS_UV_RES_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_VSYS_UV_DET, 3, BD71815_INT_VSYS_UV_DET_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_VSYS_LOW_RES, 3, BD71815_INT_VSYS_LOW_RES_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_VSYS_LOW_DET, 3, BD71815_INT_VSYS_LOW_DET_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_VSYS_MON_RES, 3, BD71815_INT_VSYS_MON_RES_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_VSYS_MON_DET, 3, BD71815_INT_VSYS_MON_DET_MASK),
+	/* Charger */
+	REGMAP_IRQ_REG(BD71815_INT_CHG_WDG_TEMP, 4, BD71815_INT_CHG_WDG_TEMP_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_CHG_WDG_TIME, 4, BD71815_INT_CHG_WDG_TIME_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_CHG_RECHARGE_RES, 4, BD71815_INT_CHG_RECHARGE_RES_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_CHG_RECHARGE_DET, 4, BD71815_INT_CHG_RECHARGE_DET_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_CHG_RANGED_TEMP_TRANSITION, 4,
+		       BD71815_INT_CHG_RANGED_TEMP_TRANSITION_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_CHG_STATE_TRANSITION, 4, BD71815_INT_CHG_STATE_TRANSITION_MASK),
+	/* Battery */
+	REGMAP_IRQ_REG(BD71815_INT_BAT_TEMP_NORMAL, 5, BD71815_INT_BAT_TEMP_NORMAL_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_BAT_TEMP_ERANGE, 5, BD71815_INT_BAT_TEMP_ERANGE_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_BAT_REMOVED, 5, BD71815_INT_BAT_REMOVED_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_BAT_DETECTED, 5, BD71815_INT_BAT_DETECTED_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_THERM_REMOVED, 5, BD71815_INT_THERM_REMOVED_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_THERM_DETECTED, 5, BD71815_INT_THERM_DETECTED_MASK),
+	/* Battery Mon 1 */
+	REGMAP_IRQ_REG(BD71815_INT_BAT_DEAD, 6, BD71815_INT_BAT_DEAD_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_BAT_SHORTC_RES, 6, BD71815_INT_BAT_SHORTC_RES_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_BAT_SHORTC_DET, 6, BD71815_INT_BAT_SHORTC_DET_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_BAT_LOW_VOLT_RES, 6, BD71815_INT_BAT_LOW_VOLT_RES_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_BAT_LOW_VOLT_DET, 6, BD71815_INT_BAT_LOW_VOLT_DET_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_BAT_OVER_VOLT_RES, 6, BD71815_INT_BAT_OVER_VOLT_RES_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_BAT_OVER_VOLT_DET, 6, BD71815_INT_BAT_OVER_VOLT_DET_MASK),
+	/* Battery Mon 2 */
+	REGMAP_IRQ_REG(BD71815_INT_BAT_MON_RES, 7, BD71815_INT_BAT_MON_RES_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_BAT_MON_DET, 7, BD71815_INT_BAT_MON_DET_MASK),
+	/* Battery Mon 3 (Coulomb counter) */
+	REGMAP_IRQ_REG(BD71815_INT_BAT_CC_MON1, 8, BD71815_INT_BAT_CC_MON1_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_BAT_CC_MON2, 8, BD71815_INT_BAT_CC_MON2_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_BAT_CC_MON3, 8, BD71815_INT_BAT_CC_MON3_MASK),
+	/* Battery Mon 4 */
+	REGMAP_IRQ_REG(BD71815_INT_BAT_OVER_CURR_1_RES, 9, BD71815_INT_BAT_OVER_CURR_1_RES_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_BAT_OVER_CURR_1_DET, 9, BD71815_INT_BAT_OVER_CURR_1_DET_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_BAT_OVER_CURR_2_RES, 9, BD71815_INT_BAT_OVER_CURR_2_RES_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_BAT_OVER_CURR_2_DET, 9, BD71815_INT_BAT_OVER_CURR_2_DET_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_BAT_OVER_CURR_3_RES, 9, BD71815_INT_BAT_OVER_CURR_3_RES_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_BAT_OVER_CURR_3_DET, 9, BD71815_INT_BAT_OVER_CURR_3_DET_MASK),
+	/* Temperature */
+	REGMAP_IRQ_REG(BD71815_INT_TEMP_BAT_LOW_RES, 10, BD71815_INT_TEMP_BAT_LOW_RES_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_TEMP_BAT_LOW_DET, 10, BD71815_INT_TEMP_BAT_LOW_DET_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_TEMP_BAT_HI_RES, 10, BD71815_INT_TEMP_BAT_HI_RES_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_TEMP_BAT_HI_DET, 10, BD71815_INT_TEMP_BAT_HI_DET_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_TEMP_CHIP_OVER_125_RES, 10,
+		       BD71815_INT_TEMP_CHIP_OVER_125_RES_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_TEMP_CHIP_OVER_125_DET, 10,
+		       BD71815_INT_TEMP_CHIP_OVER_125_DET_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_TEMP_CHIP_OVER_VF_RES, 10,
+		       BD71815_INT_TEMP_CHIP_OVER_VF_RES_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_TEMP_CHIP_OVER_VF_DET, 10,
+		       BD71815_INT_TEMP_CHIP_OVER_VF_DET_MASK),
+	/* RTC Alarm */
+	REGMAP_IRQ_REG(BD71815_INT_RTC0, 11, BD71815_INT_RTC0_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_RTC1, 11, BD71815_INT_RTC1_MASK),
+	REGMAP_IRQ_REG(BD71815_INT_RTC2, 11, BD71815_INT_RTC2_MASK),
+};
+
 static struct regmap_irq bd71828_irqs[] = {
 	REGMAP_IRQ_REG(BD71828_INT_BUCK1_OCP, 0, BD71828_INT_BUCK1_OCP_MASK),
 	REGMAP_IRQ_REG(BD71828_INT_BUCK2_OCP, 0, BD71828_INT_BUCK2_OCP_MASK),
@@ -134,10 +331,8 @@ static struct regmap_irq bd71828_irqs[] = {
 	REGMAP_IRQ_REG(BD71828_INT_CLPS_OUT, 1, BD71828_INT_CLPS_OUT_MASK),
 	REGMAP_IRQ_REG(BD71828_INT_CLPS_IN, 1, BD71828_INT_CLPS_IN_MASK),
 	/* DCIN2 interrupts */
-	REGMAP_IRQ_REG(BD71828_INT_DCIN_MON_RES, 2,
-		       BD71828_INT_DCIN_MON_RES_MASK),
-	REGMAP_IRQ_REG(BD71828_INT_DCIN_MON_DET, 2,
-		       BD71828_INT_DCIN_MON_DET_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_DCIN_MON_RES, 2, BD71828_INT_DCIN_MON_RES_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_DCIN_MON_DET, 2, BD71828_INT_DCIN_MON_DET_MASK),
 	REGMAP_IRQ_REG(BD71828_INT_LONGPUSH, 2, BD71828_INT_LONGPUSH_MASK),
 	REGMAP_IRQ_REG(BD71828_INT_MIDPUSH, 2, BD71828_INT_MIDPUSH_MASK),
 	REGMAP_IRQ_REG(BD71828_INT_SHORTPUSH, 2, BD71828_INT_SHORTPUSH_MASK),
@@ -145,102 +340,59 @@ static struct regmap_irq bd71828_irqs[] = {
 	REGMAP_IRQ_REG(BD71828_INT_WDOG, 2, BD71828_INT_WDOG_MASK),
 	REGMAP_IRQ_REG(BD71828_INT_SWRESET, 2, BD71828_INT_SWRESET_MASK),
 	/* Vsys */
-	REGMAP_IRQ_REG(BD71828_INT_VSYS_UV_RES, 3,
-		       BD71828_INT_VSYS_UV_RES_MASK),
-	REGMAP_IRQ_REG(BD71828_INT_VSYS_UV_DET, 3,
-		       BD71828_INT_VSYS_UV_DET_MASK),
-	REGMAP_IRQ_REG(BD71828_INT_VSYS_LOW_RES, 3,
-		       BD71828_INT_VSYS_LOW_RES_MASK),
-	REGMAP_IRQ_REG(BD71828_INT_VSYS_LOW_DET, 3,
-		       BD71828_INT_VSYS_LOW_DET_MASK),
-	REGMAP_IRQ_REG(BD71828_INT_VSYS_HALL_IN, 3,
-		       BD71828_INT_VSYS_HALL_IN_MASK),
-	REGMAP_IRQ_REG(BD71828_INT_VSYS_HALL_TOGGLE, 3,
-		       BD71828_INT_VSYS_HALL_TOGGLE_MASK),
-	REGMAP_IRQ_REG(BD71828_INT_VSYS_MON_RES, 3,
-		       BD71828_INT_VSYS_MON_RES_MASK),
-	REGMAP_IRQ_REG(BD71828_INT_VSYS_MON_DET, 3,
-		       BD71828_INT_VSYS_MON_DET_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_VSYS_UV_RES, 3, BD71828_INT_VSYS_UV_RES_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_VSYS_UV_DET, 3, BD71828_INT_VSYS_UV_DET_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_VSYS_LOW_RES, 3, BD71828_INT_VSYS_LOW_RES_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_VSYS_LOW_DET, 3, BD71828_INT_VSYS_LOW_DET_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_VSYS_HALL_IN, 3, BD71828_INT_VSYS_HALL_IN_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_VSYS_HALL_TOGGLE, 3, BD71828_INT_VSYS_HALL_TOGGLE_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_VSYS_MON_RES, 3, BD71828_INT_VSYS_MON_RES_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_VSYS_MON_DET, 3, BD71828_INT_VSYS_MON_DET_MASK),
 	/* Charger */
-	REGMAP_IRQ_REG(BD71828_INT_CHG_DCIN_ILIM, 4,
-		       BD71828_INT_CHG_DCIN_ILIM_MASK),
-	REGMAP_IRQ_REG(BD71828_INT_CHG_TOPOFF_TO_DONE, 4,
-		       BD71828_INT_CHG_TOPOFF_TO_DONE_MASK),
-	REGMAP_IRQ_REG(BD71828_INT_CHG_WDG_TEMP, 4,
-		       BD71828_INT_CHG_WDG_TEMP_MASK),
-	REGMAP_IRQ_REG(BD71828_INT_CHG_WDG_TIME, 4,
-		       BD71828_INT_CHG_WDG_TIME_MASK),
-	REGMAP_IRQ_REG(BD71828_INT_CHG_RECHARGE_RES, 4,
-		       BD71828_INT_CHG_RECHARGE_RES_MASK),
-	REGMAP_IRQ_REG(BD71828_INT_CHG_RECHARGE_DET, 4,
-		       BD71828_INT_CHG_RECHARGE_DET_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_CHG_DCIN_ILIM, 4, BD71828_INT_CHG_DCIN_ILIM_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_CHG_TOPOFF_TO_DONE, 4, BD71828_INT_CHG_TOPOFF_TO_DONE_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_CHG_WDG_TEMP, 4, BD71828_INT_CHG_WDG_TEMP_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_CHG_WDG_TIME, 4, BD71828_INT_CHG_WDG_TIME_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_CHG_RECHARGE_RES, 4, BD71828_INT_CHG_RECHARGE_RES_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_CHG_RECHARGE_DET, 4, BD71828_INT_CHG_RECHARGE_DET_MASK),
 	REGMAP_IRQ_REG(BD71828_INT_CHG_RANGED_TEMP_TRANSITION, 4,
 		       BD71828_INT_CHG_RANGED_TEMP_TRANSITION_MASK),
-	REGMAP_IRQ_REG(BD71828_INT_CHG_STATE_TRANSITION, 4,
-		       BD71828_INT_CHG_STATE_TRANSITION_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_CHG_STATE_TRANSITION, 4, BD71828_INT_CHG_STATE_TRANSITION_MASK),
 	/* Battery */
-	REGMAP_IRQ_REG(BD71828_INT_BAT_TEMP_NORMAL, 5,
-		       BD71828_INT_BAT_TEMP_NORMAL_MASK),
-	REGMAP_IRQ_REG(BD71828_INT_BAT_TEMP_ERANGE, 5,
-		       BD71828_INT_BAT_TEMP_ERANGE_MASK),
-	REGMAP_IRQ_REG(BD71828_INT_BAT_TEMP_WARN, 5,
-		       BD71828_INT_BAT_TEMP_WARN_MASK),
-	REGMAP_IRQ_REG(BD71828_INT_BAT_REMOVED, 5,
-		       BD71828_INT_BAT_REMOVED_MASK),
-	REGMAP_IRQ_REG(BD71828_INT_BAT_DETECTED, 5,
-		       BD71828_INT_BAT_DETECTED_MASK),
-	REGMAP_IRQ_REG(BD71828_INT_THERM_REMOVED, 5,
-		       BD71828_INT_THERM_REMOVED_MASK),
-	REGMAP_IRQ_REG(BD71828_INT_THERM_DETECTED, 5,
-		       BD71828_INT_THERM_DETECTED_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_BAT_TEMP_NORMAL, 5, BD71828_INT_BAT_TEMP_NORMAL_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_BAT_TEMP_ERANGE, 5, BD71828_INT_BAT_TEMP_ERANGE_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_BAT_TEMP_WARN, 5, BD71828_INT_BAT_TEMP_WARN_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_BAT_REMOVED, 5, BD71828_INT_BAT_REMOVED_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_BAT_DETECTED, 5, BD71828_INT_BAT_DETECTED_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_THERM_REMOVED, 5, BD71828_INT_THERM_REMOVED_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_THERM_DETECTED, 5, BD71828_INT_THERM_DETECTED_MASK),
 	/* Battery Mon 1 */
 	REGMAP_IRQ_REG(BD71828_INT_BAT_DEAD, 6, BD71828_INT_BAT_DEAD_MASK),
-	REGMAP_IRQ_REG(BD71828_INT_BAT_SHORTC_RES, 6,
-		       BD71828_INT_BAT_SHORTC_RES_MASK),
-	REGMAP_IRQ_REG(BD71828_INT_BAT_SHORTC_DET, 6,
-		       BD71828_INT_BAT_SHORTC_DET_MASK),
-	REGMAP_IRQ_REG(BD71828_INT_BAT_LOW_VOLT_RES, 6,
-		       BD71828_INT_BAT_LOW_VOLT_RES_MASK),
-	REGMAP_IRQ_REG(BD71828_INT_BAT_LOW_VOLT_DET, 6,
-		       BD71828_INT_BAT_LOW_VOLT_DET_MASK),
-	REGMAP_IRQ_REG(BD71828_INT_BAT_OVER_VOLT_RES, 6,
-		       BD71828_INT_BAT_OVER_VOLT_RES_MASK),
-	REGMAP_IRQ_REG(BD71828_INT_BAT_OVER_VOLT_DET, 6,
-		       BD71828_INT_BAT_OVER_VOLT_DET_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_BAT_SHORTC_RES, 6, BD71828_INT_BAT_SHORTC_RES_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_BAT_SHORTC_DET, 6, BD71828_INT_BAT_SHORTC_DET_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_BAT_LOW_VOLT_RES, 6, BD71828_INT_BAT_LOW_VOLT_RES_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_BAT_LOW_VOLT_DET, 6, BD71828_INT_BAT_LOW_VOLT_DET_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_BAT_OVER_VOLT_RES, 6, BD71828_INT_BAT_OVER_VOLT_RES_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_BAT_OVER_VOLT_DET, 6, BD71828_INT_BAT_OVER_VOLT_DET_MASK),
 	/* Battery Mon 2 */
-	REGMAP_IRQ_REG(BD71828_INT_BAT_MON_RES, 7,
-		       BD71828_INT_BAT_MON_RES_MASK),
-	REGMAP_IRQ_REG(BD71828_INT_BAT_MON_DET, 7,
-		       BD71828_INT_BAT_MON_DET_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_BAT_MON_RES, 7, BD71828_INT_BAT_MON_RES_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_BAT_MON_DET, 7, BD71828_INT_BAT_MON_DET_MASK),
 	/* Battery Mon 3 (Coulomb counter) */
-	REGMAP_IRQ_REG(BD71828_INT_BAT_CC_MON1, 8,
-		       BD71828_INT_BAT_CC_MON1_MASK),
-	REGMAP_IRQ_REG(BD71828_INT_BAT_CC_MON2, 8,
-		       BD71828_INT_BAT_CC_MON2_MASK),
-	REGMAP_IRQ_REG(BD71828_INT_BAT_CC_MON3, 8,
-		       BD71828_INT_BAT_CC_MON3_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_BAT_CC_MON1, 8, BD71828_INT_BAT_CC_MON1_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_BAT_CC_MON2, 8, BD71828_INT_BAT_CC_MON2_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_BAT_CC_MON3, 8, BD71828_INT_BAT_CC_MON3_MASK),
 	/* Battery Mon 4 */
-	REGMAP_IRQ_REG(BD71828_INT_BAT_OVER_CURR_1_RES, 9,
-		       BD71828_INT_BAT_OVER_CURR_1_RES_MASK),
-	REGMAP_IRQ_REG(BD71828_INT_BAT_OVER_CURR_1_DET, 9,
-		       BD71828_INT_BAT_OVER_CURR_1_DET_MASK),
-	REGMAP_IRQ_REG(BD71828_INT_BAT_OVER_CURR_2_RES, 9,
-		       BD71828_INT_BAT_OVER_CURR_2_RES_MASK),
-	REGMAP_IRQ_REG(BD71828_INT_BAT_OVER_CURR_2_DET, 9,
-		       BD71828_INT_BAT_OVER_CURR_2_DET_MASK),
-	REGMAP_IRQ_REG(BD71828_INT_BAT_OVER_CURR_3_RES, 9,
-		       BD71828_INT_BAT_OVER_CURR_3_RES_MASK),
-	REGMAP_IRQ_REG(BD71828_INT_BAT_OVER_CURR_3_DET, 9,
-		       BD71828_INT_BAT_OVER_CURR_3_DET_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_BAT_OVER_CURR_1_RES, 9, BD71828_INT_BAT_OVER_CURR_1_RES_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_BAT_OVER_CURR_1_DET, 9, BD71828_INT_BAT_OVER_CURR_1_DET_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_BAT_OVER_CURR_2_RES, 9, BD71828_INT_BAT_OVER_CURR_2_RES_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_BAT_OVER_CURR_2_DET, 9, BD71828_INT_BAT_OVER_CURR_2_DET_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_BAT_OVER_CURR_3_RES, 9, BD71828_INT_BAT_OVER_CURR_3_RES_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_BAT_OVER_CURR_3_DET, 9, BD71828_INT_BAT_OVER_CURR_3_DET_MASK),
 	/* Temperature */
-	REGMAP_IRQ_REG(BD71828_INT_TEMP_BAT_LOW_RES, 10,
-		       BD71828_INT_TEMP_BAT_LOW_RES_MASK),
-	REGMAP_IRQ_REG(BD71828_INT_TEMP_BAT_LOW_DET, 10,
-		       BD71828_INT_TEMP_BAT_LOW_DET_MASK),
-	REGMAP_IRQ_REG(BD71828_INT_TEMP_BAT_HI_RES, 10,
-		       BD71828_INT_TEMP_BAT_HI_RES_MASK),
-	REGMAP_IRQ_REG(BD71828_INT_TEMP_BAT_HI_DET, 10,
-		       BD71828_INT_TEMP_BAT_HI_DET_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_TEMP_BAT_LOW_RES, 10, BD71828_INT_TEMP_BAT_LOW_RES_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_TEMP_BAT_LOW_DET, 10, BD71828_INT_TEMP_BAT_LOW_DET_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_TEMP_BAT_HI_RES, 10, BD71828_INT_TEMP_BAT_HI_RES_MASK),
+	REGMAP_IRQ_REG(BD71828_INT_TEMP_BAT_HI_DET, 10, BD71828_INT_TEMP_BAT_HI_DET_MASK),
 	REGMAP_IRQ_REG(BD71828_INT_TEMP_CHIP_OVER_125_RES, 10,
 		       BD71828_INT_TEMP_CHIP_OVER_125_RES_MASK),
 	REGMAP_IRQ_REG(BD71828_INT_TEMP_CHIP_OVER_125_DET, 10,
@@ -267,57 +419,133 @@ static struct regmap_irq_chip bd71828_irq_chip = {
 	.init_ack_masked = true,
 	.num_regs = 12,
 	.num_main_regs = 1,
-	.sub_reg_offsets = &bd71828_sub_irq_offsets[0],
+	.sub_reg_offsets = &bd718xx_sub_irq_offsets[0],
+	.num_main_status_bits = 8,
+	.irq_reg_stride = 1,
+};
+
+static struct regmap_irq_chip bd71815_irq_chip = {
+	.name = "bd71815_irq",
+	.main_status = BD71815_REG_INT_STAT,
+	.irqs = &bd71815_irqs[0],
+	.num_irqs = ARRAY_SIZE(bd71815_irqs),
+	.status_base = BD71815_REG_INT_STAT_01,
+	.mask_base = BD71815_REG_INT_EN_01,
+	.ack_base = BD71815_REG_INT_STAT_01,
+	.mask_invert = true,
+	.init_ack_masked = true,
+	.num_regs = 12,
+	.num_main_regs = 1,
+	.sub_reg_offsets = &bd718xx_sub_irq_offsets[0],
 	.num_main_status_bits = 8,
 	.irq_reg_stride = 1,
 };
 
+static int set_clk_mode(struct device *dev, struct regmap *regmap,
+			int clkmode_reg)
+{
+	int ret;
+	unsigned int open_drain;
+
+	ret = of_property_read_u32(dev->of_node, "rohm,clkout-open-drain", &open_drain);
+	if (ret) {
+		if (ret == -EINVAL)	/* property not present: leave the mode as is */
+			return 0;
+		return ret;
+	}
+	if (open_drain > 1) {	/* only 0 (CMOS) and 1 (open drain) are accepted */
+		dev_err(dev, "bad clk32kout mode configuration\n");
+		return -EINVAL;
+	}
+
+	if (open_drain)
+		return regmap_update_bits(regmap, clkmode_reg, OUT32K_MODE,
+					  OUT32K_MODE_OPEN_DRAIN);
+
+	return regmap_update_bits(regmap, clkmode_reg, OUT32K_MODE,
+				  OUT32K_MODE_CMOS);
+}
+
 static int bd71828_i2c_probe(struct i2c_client *i2c)
 {
-	struct rohm_regmap_dev *chip;
 	struct regmap_irq_chip_data *irq_data;
 	int ret;
+	struct regmap *regmap;
+	const struct regmap_config *regmap_config;
+	struct regmap_irq_chip *irqchip;
+	unsigned int chip_type;
+	struct mfd_cell *mfd;
+	int cells;
+	int button_irq;
+	int clkmode_reg;
 
 	if (!i2c->irq) {
 		dev_err(&i2c->dev, "No IRQ configured\n");
 		return -EINVAL;
 	}
 
-	chip = devm_kzalloc(&i2c->dev, sizeof(*chip), GFP_KERNEL);
-	if (!chip)
-		return -ENOMEM;
-
-	dev_set_drvdata(&i2c->dev, chip);
+	chip_type = (unsigned int)(uintptr_t)
+		    of_device_get_match_data(&i2c->dev);
+	switch (chip_type) {
+	case ROHM_CHIP_TYPE_BD71828:
+		mfd = bd71828_mfd_cells;
+		cells = ARRAY_SIZE(bd71828_mfd_cells);
+		regmap_config = &bd71828_regmap;
+		irqchip = &bd71828_irq_chip;
+		clkmode_reg = BD71828_REG_OUT32K;
+		button_irq = BD71828_INT_SHORTPUSH;
+		break;
+	case ROHM_CHIP_TYPE_BD71815:
+		mfd = bd71815_mfd_cells;
+		cells = ARRAY_SIZE(bd71815_mfd_cells);
+		regmap_config = &bd71815_regmap;
+		irqchip = &bd71815_irq_chip;
+		clkmode_reg = BD71815_REG_OUT32K;
+		/*
+		 * If BD71817 support is needed we should be able to handle it
+		 * with proper DT configs + BD71815 drivers + power-button.
+		 * BD71815 data-sheet does not list the power-button IRQ so we
+		 * don't use it.
+		 */
+		button_irq = 0;
+		break;
+	default:
+		dev_err(&i2c->dev, "Unknown device type");
+		return -EINVAL;
+	}
 
-	chip->regmap = devm_regmap_init_i2c(i2c, &bd71828_regmap);
-	if (IS_ERR(chip->regmap)) {
+	regmap = devm_regmap_init_i2c(i2c, regmap_config);
+	if (IS_ERR(regmap)) {
 		dev_err(&i2c->dev, "Failed to initialize Regmap\n");
-		return PTR_ERR(chip->regmap);
+		return PTR_ERR(regmap);
 	}
 
-	ret = devm_regmap_add_irq_chip(&i2c->dev, chip->regmap,
-				       i2c->irq, IRQF_ONESHOT, 0,
-				       &bd71828_irq_chip, &irq_data);
+	ret = devm_regmap_add_irq_chip(&i2c->dev, regmap, i2c->irq,
+				       IRQF_ONESHOT, 0, irqchip, &irq_data);
 	if (ret) {
 		dev_err(&i2c->dev, "Failed to add IRQ chip\n");
 		return ret;
 	}
 
 	dev_dbg(&i2c->dev, "Registered %d IRQs for chip\n",
-		bd71828_irq_chip.num_irqs);
+		irqchip->num_irqs);
 
-	ret = regmap_irq_get_virq(irq_data, BD71828_INT_SHORTPUSH);
-	if (ret < 0) {
-		dev_err(&i2c->dev, "Failed to get the power-key IRQ\n");
-		return ret;
+	if (button_irq) {
+		ret = regmap_irq_get_virq(irq_data, button_irq);
+		if (ret < 0) {
+			dev_err(&i2c->dev, "Failed to get the power-key IRQ\n");
+			return ret;
+		}
+
+		button.irq = ret;
 	}
 
-	button.irq = ret;
+	ret = set_clk_mode(&i2c->dev, regmap, clkmode_reg);
+	if (ret)
+		return ret;
 
-	ret = devm_mfd_add_devices(&i2c->dev, PLATFORM_DEVID_AUTO,
-				   bd71828_mfd_cells,
-				   ARRAY_SIZE(bd71828_mfd_cells), NULL, 0,
-				   regmap_irq_get_domain(irq_data));
+	ret = devm_mfd_add_devices(&i2c->dev, PLATFORM_DEVID_AUTO, mfd, cells,
+				   NULL, 0, regmap_irq_get_domain(irq_data));
 	if (ret)
 		dev_err(&i2c->dev, "Failed to create subdevices\n");
 
@@ -325,7 +553,13 @@ static int bd71828_i2c_probe(struct i2c_client *i2c)
 }
 
 static const struct of_device_id bd71828_of_match[] = {
-	{ .compatible = "rohm,bd71828", },
+	{
+		.compatible = "rohm,bd71828",
+		.data = (void *)ROHM_CHIP_TYPE_BD71828,
+	}, {
+		.compatible = "rohm,bd71815",
+		.data = (void *)ROHM_CHIP_TYPE_BD71815,
+	 },
 	{ },
 };
 MODULE_DEVICE_TABLE(of, bd71828_of_match);
diff --git a/include/linux/mfd/rohm-bd71815.h b/include/linux/mfd/rohm-bd71815.h
new file mode 100644
index 000000000000..ec6d9612bebe
--- /dev/null
+++ b/include/linux/mfd/rohm-bd71815.h
@@ -0,0 +1,562 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright 2021 ROHM Semiconductors.
+ *
+ * Author: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
+ *
+ * Copyright 2014 Embest Technology Co. Ltd. Inc.
+ *
+ * Author: yanglsh@embest-tech.com
+ */
+
+#ifndef _MFD_BD71815_H
+#define _MFD_BD71815_H
+
+#include <linux/regmap.h>
+
+enum {
+	BD71815_BUCK1	=	0,
+	BD71815_BUCK2,
+	BD71815_BUCK3,
+	BD71815_BUCK4,
+	BD71815_BUCK5,
+	/* General Purpose */
+	BD71815_LDO1,
+	BD71815_LDO2,
+	BD71815_LDO3,
+	/* LDOs for SD Card and SD Card Interface */
+	BD71815_LDO4,
+	BD71815_LDO5,
+	/* LDO for DDR Reference Voltage */
+	BD71815_LDODVREF,
+	/* LDO for Low-Power State Retention */
+	BD71815_LDOLPSR,
+	BD71815_WLED,
+	BD71815_REGULATOR_CNT,
+};
+
+#define BD71815_SUPPLY_STATE_ENABLED    0x1
+
+enum {
+	BD71815_REG_DEVICE		= 0,
+	BD71815_REG_PWRCTRL,
+	BD71815_REG_BUCK1_MODE,
+	BD71815_REG_BUCK2_MODE,
+	BD71815_REG_BUCK3_MODE,
+	BD71815_REG_BUCK4_MODE,
+	BD71815_REG_BUCK5_MODE,
+	BD71815_REG_BUCK1_VOLT_H,
+	BD71815_REG_BUCK1_VOLT_L,
+	BD71815_REG_BUCK2_VOLT_H,
+	BD71815_REG_BUCK2_VOLT_L,
+	BD71815_REG_BUCK3_VOLT,
+	BD71815_REG_BUCK4_VOLT,
+	BD71815_REG_BUCK5_VOLT,
+	BD71815_REG_LED_CTRL,
+	BD71815_REG_LED_DIMM,
+	BD71815_REG_LDO_MODE1,
+	BD71815_REG_LDO_MODE2,
+	BD71815_REG_LDO_MODE3,
+	BD71815_REG_LDO_MODE4,
+	BD71815_REG_LDO1_VOLT,
+	BD71815_REG_LDO2_VOLT,
+	BD71815_REG_LDO3_VOLT,
+	BD71815_REG_LDO4_VOLT,
+	BD71815_REG_LDO5_VOLT_H,
+	BD71815_REG_LDO5_VOLT_L,
+	BD71815_REG_BUCK_PD_DIS,
+	BD71815_REG_LDO_PD_DIS,
+	BD71815_REG_GPO,
+	BD71815_REG_OUT32K,
+	BD71815_REG_SEC,
+	BD71815_REG_MIN,
+	BD71815_REG_HOUR,
+	BD71815_REG_WEEK,
+	BD71815_REG_DAY,
+	BD71815_REG_MONTH,
+	BD71815_REG_YEAR,
+	BD71815_REG_ALM0_SEC,
+
+	BD71815_REG_ALM1_SEC		= 0x2C,
+
+	BD71815_REG_ALM0_MASK		= 0x33,
+	BD71815_REG_ALM1_MASK,
+	BD71815_REG_ALM2,
+	BD71815_REG_TRIM,
+	BD71815_REG_CONF,
+	BD71815_REG_SYS_INIT,
+	BD71815_REG_CHG_STATE,
+	BD71815_REG_CHG_LAST_STATE,
+	BD71815_REG_BAT_STAT,
+	BD71815_REG_DCIN_STAT,
+	BD71815_REG_VSYS_STAT,
+	BD71815_REG_CHG_STAT,
+	BD71815_REG_CHG_WDT_STAT,
+	BD71815_REG_BAT_TEMP,
+	BD71815_REG_IGNORE_0,
+	BD71815_REG_INHIBIT_0,
+	BD71815_REG_DCIN_CLPS,
+	BD71815_REG_VSYS_REG,
+	BD71815_REG_VSYS_MAX,
+	BD71815_REG_VSYS_MIN,
+	BD71815_REG_CHG_SET1,
+	BD71815_REG_CHG_SET2,
+	BD71815_REG_CHG_WDT_PRE,
+	BD71815_REG_CHG_WDT_FST,
+	BD71815_REG_CHG_IPRE,
+	BD71815_REG_CHG_IFST,
+	BD71815_REG_CHG_IFST_TERM,
+	BD71815_REG_CHG_VPRE,
+	BD71815_REG_CHG_VBAT_1,
+	BD71815_REG_CHG_VBAT_2,
+	BD71815_REG_CHG_VBAT_3,
+	BD71815_REG_CHG_LED_1,
+	BD71815_REG_VF_TH,
+	BD71815_REG_BAT_SET_1,
+	BD71815_REG_BAT_SET_2,
+	BD71815_REG_BAT_SET_3,
+	BD71815_REG_ALM_VBAT_TH_U,
+	BD71815_REG_ALM_VBAT_TH_L,
+	BD71815_REG_ALM_DCIN_TH,
+	BD71815_REG_ALM_VSYS_TH,
+	BD71815_REG_VM_IBAT_U,
+	BD71815_REG_VM_IBAT_L,
+	BD71815_REG_VM_VBAT_U,
+	BD71815_REG_VM_VBAT_L,
+	BD71815_REG_VM_BTMP,
+	BD71815_REG_VM_VTH,
+	BD71815_REG_VM_DCIN_U,
+	BD71815_REG_VM_DCIN_L,
+	BD71815_REG_VM_VSYS,
+	BD71815_REG_VM_VF,
+	BD71815_REG_VM_OCI_PRE_U,
+	BD71815_REG_VM_OCI_PRE_L,
+	BD71815_REG_VM_OCV_PRE_U,
+	BD71815_REG_VM_OCV_PRE_L,
+	BD71815_REG_VM_OCI_PST_U,
+	BD71815_REG_VM_OCI_PST_L,
+	BD71815_REG_VM_OCV_PST_U,
+	BD71815_REG_VM_OCV_PST_L,
+	BD71815_REG_VM_SA_VBAT_U,
+	BD71815_REG_VM_SA_VBAT_L,
+	BD71815_REG_VM_SA_IBAT_U,
+	BD71815_REG_VM_SA_IBAT_L,
+	BD71815_REG_CC_CTRL,
+	BD71815_REG_CC_BATCAP1_TH_U,
+	BD71815_REG_CC_BATCAP1_TH_L,
+	BD71815_REG_CC_BATCAP2_TH_U,
+	BD71815_REG_CC_BATCAP2_TH_L,
+	BD71815_REG_CC_BATCAP3_TH_U,
+	BD71815_REG_CC_BATCAP3_TH_L,
+	BD71815_REG_CC_STAT,
+	BD71815_REG_CC_CCNTD_3,
+	BD71815_REG_CC_CCNTD_2,
+	BD71815_REG_CC_CCNTD_1,
+	BD71815_REG_CC_CCNTD_0,
+	BD71815_REG_CC_CURCD_U,
+	BD71815_REG_CC_CURCD_L,
+	BD71815_REG_VM_OCUR_THR_1,
+	BD71815_REG_VM_OCUR_DUR_1,
+	BD71815_REG_VM_OCUR_THR_2,
+	BD71815_REG_VM_OCUR_DUR_2,
+	BD71815_REG_VM_OCUR_THR_3,
+	BD71815_REG_VM_OCUR_DUR_3,
+	BD71815_REG_VM_OCUR_MON,
+	BD71815_REG_VM_BTMP_OV_THR,
+	BD71815_REG_VM_BTMP_OV_DUR,
+	BD71815_REG_VM_BTMP_LO_THR,
+	BD71815_REG_VM_BTMP_LO_DUR,
+	BD71815_REG_VM_BTMP_MON,
+	BD71815_REG_INT_EN_01,
+
+	BD71815_REG_INT_EN_11		= 0x95,
+	BD71815_REG_INT_EN_12,
+	BD71815_REG_INT_STAT,
+	BD71815_REG_INT_STAT_01,
+	BD71815_REG_INT_STAT_02,
+	BD71815_REG_INT_STAT_03,
+	BD71815_REG_INT_STAT_04,
+	BD71815_REG_INT_STAT_05,
+	BD71815_REG_INT_STAT_06,
+	BD71815_REG_INT_STAT_07,
+	BD71815_REG_INT_STAT_08,
+	BD71815_REG_INT_STAT_09,
+	BD71815_REG_INT_STAT_10,
+	BD71815_REG_INT_STAT_11,
+	BD71815_REG_INT_STAT_12,
+	BD71815_REG_INT_UPDATE,
+
+	BD71815_REG_VM_VSYS_U		= 0xC0,
+	BD71815_REG_VM_VSYS_L,
+	BD71815_REG_VM_SA_VSYS_U,
+	BD71815_REG_VM_SA_VSYS_L,
+
+	BD71815_REG_VM_SA_IBAT_MIN_U	= 0xD0,
+	BD71815_REG_VM_SA_IBAT_MIN_L,
+	BD71815_REG_VM_SA_IBAT_MAX_U,
+	BD71815_REG_VM_SA_IBAT_MAX_L,
+	BD71815_REG_VM_SA_VBAT_MIN_U,
+	BD71815_REG_VM_SA_VBAT_MIN_L,
+	BD71815_REG_VM_SA_VBAT_MAX_U,
+	BD71815_REG_VM_SA_VBAT_MAX_L,
+	BD71815_REG_VM_SA_VSYS_MIN_U,
+	BD71815_REG_VM_SA_VSYS_MIN_L,
+	BD71815_REG_VM_SA_VSYS_MAX_U,
+	BD71815_REG_VM_SA_VSYS_MAX_L,
+	BD71815_REG_VM_SA_MINMAX_CLR,
+
+	BD71815_REG_REX_CCNTD_3		= 0xE0,
+	BD71815_REG_REX_CCNTD_2,
+	BD71815_REG_REX_CCNTD_1,
+	BD71815_REG_REX_CCNTD_0,
+	BD71815_REG_REX_SA_VBAT_U,
+	BD71815_REG_REX_SA_VBAT_L,
+	BD71815_REG_REX_CTRL_1,
+	BD71815_REG_REX_CTRL_2,
+	BD71815_REG_FULL_CCNTD_3,
+	BD71815_REG_FULL_CCNTD_2,
+	BD71815_REG_FULL_CCNTD_1,
+	BD71815_REG_FULL_CCNTD_0,
+	BD71815_REG_FULL_CTRL,
+
+	BD71815_REG_CCNTD_CHG_3		= 0xF0,
+	BD71815_REG_CCNTD_CHG_2,
+
+	BD71815_REG_TEST_MODE		= 0xFE,
+	BD71815_MAX_REGISTER,
+};
+
+/* BD71815_REG_BUCK1_MODE bits */
+#define BD71815_BUCK_RAMPRATE_MASK		0xC0
+#define BD71815_BUCK_RAMPRATE_10P00MV		0x0
+#define BD71815_BUCK_RAMPRATE_5P00MV		0x01
+#define BD71815_BUCK_RAMPRATE_2P50MV		0x02
+#define BD71815_BUCK_RAMPRATE_1P25MV		0x03
+
+#define BD71815_BUCK_PWM_FIXED			BIT(4)
+#define BD71815_BUCK_SNVS_ON			BIT(3)
+#define BD71815_BUCK_RUN_ON			BIT(2)
+#define BD71815_BUCK_LPSR_ON			BIT(1)
+#define BD71815_BUCK_SUSP_ON			BIT(0)
+
+/* BD71815_REG_BUCK1_VOLT_H bits */
+#define BD71815_BUCK_DVSSEL			BIT(7)
+#define BD71815_BUCK_STBY_DVS			BIT(6)
+#define BD71815_VOLT_MASK			0x3F
+#define BD71815_BUCK1_H_DEFAULT			0x14
+#define BD71815_BUCK1_L_DEFAULT			0x14
+
+/* BD71815_REG_BUCK2_VOLT_H bits */
+#define BD71815_BUCK2_H_DEFAULT			0x14
+#define BD71815_BUCK2_L_DEFAULT			0x14
+
+/* WLED output */
+/* current register mask */
+#define LED_DIMM_MASK				0x3f
+/* LED enable bits at LED_CTRL reg */
+#define LED_CHGDONE_EN				BIT(4)
+#define LED_RUN_ON				BIT(2)
+#define LED_LPSR_ON				BIT(1)
+#define LED_SUSP_ON				BIT(0)
+
+/* BD71815_REG_LDO1_CTRL bits */
+#define LDO1_EN					BIT(0)
+#define LDO2_EN					BIT(1)
+#define LDO3_EN					BIT(2)
+#define DVREF_EN				BIT(3)
+#define VOSNVS_SW_EN				BIT(4)
+
+/* LDO_MODE1_register */
+#define LDO1_SNVS_ON				BIT(7)
+#define LDO1_RUN_ON				BIT(6)
+#define LDO1_LPSR_ON				BIT(5)
+#define LDO1_SUSP_ON				BIT(4)
+/* set => register control, unset => GPIO control */
+#define LDO4_MODE_MASK				BIT(3)
+#define LDO4_MODE_I2C				BIT(3)
+#define LDO4_MODE_GPIO				0
+/* set => register control, unset => start when DCIN connected */
+#define LDO3_MODE_MASK				BIT(2)
+#define LDO3_MODE_I2C				BIT(2)
+#define LDO3_MODE_DCIN				0
+
+/* LDO_MODE2 register */
+#define LDO3_SNVS_ON				BIT(7)
+#define LDO3_RUN_ON				BIT(6)
+#define LDO3_LPSR_ON				BIT(5)
+#define LDO3_SUSP_ON				BIT(4)
+#define LDO2_SNVS_ON				BIT(3)
+#define LDO2_RUN_ON				BIT(2)
+#define LDO2_LPSR_ON				BIT(1)
+#define LDO2_SUSP_ON				BIT(0)
+
+
+/* LDO_MODE3 register */
+#define LDO5_SNVS_ON				BIT(7)
+#define LDO5_RUN_ON				BIT(6)
+#define LDO5_LPSR_ON				BIT(5)
+#define LDO5_SUSP_ON				BIT(4)
+#define LDO4_SNVS_ON				BIT(3)
+#define LDO4_RUN_ON				BIT(2)
+#define LDO4_LPSR_ON				BIT(1)
+#define LDO4_SUSP_ON				BIT(0)
+
+/* LDO_MODE4 register */
+#define DVREF_SNVS_ON				BIT(7)
+#define DVREF_RUN_ON				BIT(6)
+#define DVREF_LPSR_ON				BIT(5)
+#define DVREF_SUSP_ON				BIT(4)
+#define LDO_LPSR_SNVS_ON			BIT(3)
+#define LDO_LPSR_RUN_ON				BIT(2)
+#define LDO_LPSR_LPSR_ON			BIT(1)
+#define LDO_LPSR_SUSP_ON			BIT(0)
+
+/* BD71815_REG_OUT32K bits */
+#define OUT32K_EN				BIT(0)
+#define OUT32K_MODE				BIT(1)
+#define OUT32K_MODE_CMOS			BIT(1)
+#define OUT32K_MODE_OPEN_DRAIN			0
+
+/* BD71815_REG_BAT_STAT bits */
+#define BAT_DET					BIT(5)
+#define BAT_DET_OFFSET				5
+#define BAT_DET_DONE				BIT(4)
+#define VBAT_OV					BIT(3)
+#define DBAT_DET				BIT(0)
+
+/* BD71815_REG_VBUS_STAT bits */
+#define VBUS_DET				BIT(0)
+
+#define BD71815_REG_RTC_START			BD71815_REG_SEC
+#define BD71815_REG_RTC_ALM_START		BD71815_REG_ALM0_SEC
+
+/* BD71815_REG_ALM0_MASK bits */
+#define A0_ONESEC				BIT(7)
+
+/* BD71815_REG_INT_EN_00 bits */
+#define ALMALE					BIT(0)
+
+/* BD71815_REG_INT_STAT_03 bits */
+#define DCIN_MON_DET				BIT(1)
+#define DCIN_MON_RES				BIT(0)
+#define POWERON_LONG				BIT(2)
+#define POWERON_MID				BIT(3)
+#define POWERON_SHORT				BIT(4)
+#define POWERON_PRESS				BIT(5)
+
+/* BD71815_REG_INT_STAT_08 bits */
+#define VBAT_MON_DET				BIT(1)
+#define VBAT_MON_RES				BIT(0)
+
+/* BD71815_REG_INT_STAT_11 bits */
+#define	INT_STAT_11_VF_DET			BIT(7)
+#define	INT_STAT_11_VF_RES			BIT(6)
+#define	INT_STAT_11_VF125_DET			BIT(5)
+#define	INT_STAT_11_VF125_RES			BIT(4)
+#define	INT_STAT_11_OVTMP_DET			BIT(3)
+#define	INT_STAT_11_OVTMP_RES			BIT(2)
+#define	INT_STAT_11_LOTMP_DET			BIT(1)
+#define	INT_STAT_11_LOTMP_RES			BIT(0)
+
+#define VBAT_MON_DET				BIT(1)
+#define VBAT_MON_RES				BIT(0)
+
+/* BD71815_REG_PWRCTRL bits */
+#define RESTARTEN				BIT(0)
+
+/* BD71815_REG_GPO bits */
+#define READY_FORCE_LOW				BIT(2)
+#define BD71815_GPIO_DRIVE_MASK			BIT(4)
+#define BD71815_GPIO_OPEN_DRAIN			0
+#define BD71815_GPIO_CMOS			BIT(4)
+
+/* BD71815 interrupt masks */
+enum {
+	BD71815_INT_EN_01_BUCKAST_MASK	=	0x0F,
+	BD71815_INT_EN_02_DCINAST_MASK	=	0x3E,
+	BD71815_INT_EN_03_DCINAST_MASK	=	0x3F,
+	BD71815_INT_EN_04_VSYSAST_MASK	=	0xCF,
+	BD71815_INT_EN_05_CHGAST_MASK	=	0xFC,
+	BD71815_INT_EN_06_BATAST_MASK	=	0xF3,
+	BD71815_INT_EN_07_BMONAST_MASK	=	0xFE,
+	BD71815_INT_EN_08_BMONAST_MASK	=	0x03,
+	BD71815_INT_EN_09_BMONAST_MASK	=	0x07,
+	BD71815_INT_EN_10_BMONAST_MASK	=	0x3F,
+	BD71815_INT_EN_11_TMPAST_MASK	=	0xFF,
+	BD71815_INT_EN_12_ALMAST_MASK	=	0x07,
+};
+/* BD71815 interrupt irqs */
+enum {
+	/* BUCK reg interrupts */
+	BD71815_INT_BUCK1_OCP,
+	BD71815_INT_BUCK2_OCP,
+	BD71815_INT_BUCK3_OCP,
+	BD71815_INT_BUCK4_OCP,
+	BD71815_INT_BUCK5_OCP,
+	BD71815_INT_LED_OVP,
+	BD71815_INT_LED_OCP,
+	BD71815_INT_LED_SCP,
+	/* DCIN1 interrupts */
+	BD71815_INT_DCIN_RMV,
+	BD71815_INT_CLPS_OUT,
+	BD71815_INT_CLPS_IN,
+	BD71815_INT_DCIN_OVP_RES,
+	BD71815_INT_DCIN_OVP_DET,
+	/* DCIN2 interrupts */
+	BD71815_INT_DCIN_MON_RES,
+	BD71815_INT_DCIN_MON_DET,
+	BD71815_INT_WDOG,
+	/* Vsys INT_STAT_04 */
+	BD71815_INT_VSYS_UV_RES,
+	BD71815_INT_VSYS_UV_DET,
+	BD71815_INT_VSYS_LOW_RES,
+	BD71815_INT_VSYS_LOW_DET,
+	BD71815_INT_VSYS_MON_RES,
+	BD71815_INT_VSYS_MON_DET,
+	/* Charger INT_STAT_05 */
+	BD71815_INT_CHG_WDG_TEMP,
+	BD71815_INT_CHG_WDG_TIME,
+	BD71815_INT_CHG_RECHARGE_RES,
+	BD71815_INT_CHG_RECHARGE_DET,
+	BD71815_INT_CHG_RANGED_TEMP_TRANSITION,
+	BD71815_INT_CHG_STATE_TRANSITION,
+	/* Battery  INT_STAT_06 */
+	BD71815_INT_BAT_TEMP_NORMAL,
+	BD71815_INT_BAT_TEMP_ERANGE,
+	BD71815_INT_BAT_REMOVED,
+	BD71815_INT_BAT_DETECTED,
+	BD71815_INT_THERM_REMOVED,
+	BD71815_INT_THERM_DETECTED,
+	/* Battery Mon 1 INT_STAT_07 */
+	BD71815_INT_BAT_DEAD,
+	BD71815_INT_BAT_SHORTC_RES,
+	BD71815_INT_BAT_SHORTC_DET,
+	BD71815_INT_BAT_LOW_VOLT_RES,
+	BD71815_INT_BAT_LOW_VOLT_DET,
+	BD71815_INT_BAT_OVER_VOLT_RES,
+	BD71815_INT_BAT_OVER_VOLT_DET,
+	/* Battery Mon 2 INT_STAT_08 */
+	BD71815_INT_BAT_MON_RES,
+	BD71815_INT_BAT_MON_DET,
+	/* Battery Mon 3 (Coulomb counter) INT_STAT_09 */
+	BD71815_INT_BAT_CC_MON1,
+	BD71815_INT_BAT_CC_MON2,
+	BD71815_INT_BAT_CC_MON3,
+	/* Battery Mon 4 INT_STAT_10 */
+	BD71815_INT_BAT_OVER_CURR_1_RES,
+	BD71815_INT_BAT_OVER_CURR_1_DET,
+	BD71815_INT_BAT_OVER_CURR_2_RES,
+	BD71815_INT_BAT_OVER_CURR_2_DET,
+	BD71815_INT_BAT_OVER_CURR_3_RES,
+	BD71815_INT_BAT_OVER_CURR_3_DET,
+	/* Temperature INT_STAT_11 */
+	BD71815_INT_TEMP_BAT_LOW_RES,
+	BD71815_INT_TEMP_BAT_LOW_DET,
+	BD71815_INT_TEMP_BAT_HI_RES,
+	BD71815_INT_TEMP_BAT_HI_DET,
+	BD71815_INT_TEMP_CHIP_OVER_125_RES,
+	BD71815_INT_TEMP_CHIP_OVER_125_DET,
+	BD71815_INT_TEMP_CHIP_OVER_VF_RES,
+	BD71815_INT_TEMP_CHIP_OVER_VF_DET,
+	/* RTC Alarm INT_STAT_12 */
+	BD71815_INT_RTC0,
+	BD71815_INT_RTC1,
+	BD71815_INT_RTC2,
+};
+
+#define BD71815_INT_BUCK1_OCP_MASK			BIT(0)
+#define BD71815_INT_BUCK2_OCP_MASK			BIT(1)
+#define BD71815_INT_BUCK3_OCP_MASK			BIT(2)
+#define BD71815_INT_BUCK4_OCP_MASK			BIT(3)
+#define BD71815_INT_BUCK5_OCP_MASK			BIT(4)
+#define BD71815_INT_LED_OVP_MASK			BIT(5)
+#define BD71815_INT_LED_OCP_MASK			BIT(6)
+#define BD71815_INT_LED_SCP_MASK			BIT(7)
+
+#define BD71815_INT_DCIN_RMV_MASK			BIT(1)
+#define BD71815_INT_CLPS_OUT_MASK			BIT(2)
+#define BD71815_INT_CLPS_IN_MASK			BIT(3)
+#define BD71815_INT_DCIN_OVP_RES_MASK			BIT(4)
+#define BD71815_INT_DCIN_OVP_DET_MASK			BIT(5)
+
+#define BD71815_INT_DCIN_MON_RES_MASK			BIT(0)
+#define BD71815_INT_DCIN_MON_DET_MASK			BIT(1)
+#define BD71815_INT_WDOG_MASK				BIT(6)
+
+#define BD71815_INT_VSYS_UV_RES_MASK			BIT(0)
+#define BD71815_INT_VSYS_UV_DET_MASK			BIT(1)
+#define BD71815_INT_VSYS_LOW_RES_MASK			BIT(2)
+#define BD71815_INT_VSYS_LOW_DET_MASK			BIT(3)
+#define BD71815_INT_VSYS_MON_RES_MASK			BIT(6)
+#define BD71815_INT_VSYS_MON_DET_MASK			BIT(7)
+
+#define BD71815_INT_CHG_WDG_TEMP_MASK			BIT(2)
+#define BD71815_INT_CHG_WDG_TIME_MASK			BIT(3)
+#define BD71815_INT_CHG_RECHARGE_RES_MASK		BIT(4)
+#define BD71815_INT_CHG_RECHARGE_DET_MASK		BIT(5)
+#define BD71815_INT_CHG_RANGED_TEMP_TRANSITION_MASK	BIT(6)
+#define BD71815_INT_CHG_STATE_TRANSITION_MASK		BIT(7)
+
+#define BD71815_INT_BAT_TEMP_NORMAL_MASK		BIT(0)
+#define BD71815_INT_BAT_TEMP_ERANGE_MASK		BIT(1)
+#define BD71815_INT_BAT_REMOVED_MASK			BIT(4)
+#define BD71815_INT_BAT_DETECTED_MASK			BIT(5)
+#define BD71815_INT_THERM_REMOVED_MASK			BIT(6)
+#define BD71815_INT_THERM_DETECTED_MASK			BIT(7)
+
+#define BD71815_INT_BAT_DEAD_MASK			BIT(1)
+#define BD71815_INT_BAT_SHORTC_RES_MASK			BIT(2)
+#define BD71815_INT_BAT_SHORTC_DET_MASK			BIT(3)
+#define BD71815_INT_BAT_LOW_VOLT_RES_MASK		BIT(4)
+#define BD71815_INT_BAT_LOW_VOLT_DET_MASK		BIT(5)
+#define BD71815_INT_BAT_OVER_VOLT_RES_MASK		BIT(6)
+#define BD71815_INT_BAT_OVER_VOLT_DET_MASK		BIT(7)
+
+#define BD71815_INT_BAT_MON_RES_MASK			BIT(0)
+#define BD71815_INT_BAT_MON_DET_MASK			BIT(1)
+
+#define BD71815_INT_BAT_CC_MON1_MASK			BIT(0)
+#define BD71815_INT_BAT_CC_MON2_MASK			BIT(1)
+#define BD71815_INT_BAT_CC_MON3_MASK			BIT(2)
+
+#define BD71815_INT_BAT_OVER_CURR_1_RES_MASK		BIT(0)
+#define BD71815_INT_BAT_OVER_CURR_1_DET_MASK		BIT(1)
+#define BD71815_INT_BAT_OVER_CURR_2_RES_MASK		BIT(2)
+#define BD71815_INT_BAT_OVER_CURR_2_DET_MASK		BIT(3)
+#define BD71815_INT_BAT_OVER_CURR_3_RES_MASK		BIT(4)
+#define BD71815_INT_BAT_OVER_CURR_3_DET_MASK		BIT(5)
+
+#define BD71815_INT_TEMP_BAT_LOW_RES_MASK		BIT(0)
+#define BD71815_INT_TEMP_BAT_LOW_DET_MASK		BIT(1)
+#define BD71815_INT_TEMP_BAT_HI_RES_MASK		BIT(2)
+#define BD71815_INT_TEMP_BAT_HI_DET_MASK		BIT(3)
+#define BD71815_INT_TEMP_CHIP_OVER_125_RES_MASK		BIT(4)
+#define BD71815_INT_TEMP_CHIP_OVER_125_DET_MASK		BIT(5)
+#define BD71815_INT_TEMP_CHIP_OVER_VF_RES_MASK		BIT(6)
+#define BD71815_INT_TEMP_CHIP_OVER_VF_DET_MASK		BIT(7)
+
+#define BD71815_INT_RTC0_MASK				BIT(0)
+#define BD71815_INT_RTC1_MASK				BIT(1)
+#define BD71815_INT_RTC2_MASK				BIT(2)
+
+/* BD71815_REG_CC_CTRL bits */
+#define CCNTRST						0x80
+#define CCNTENB						0x40
+#define CCCALIB						0x20
+
+/* BD71815_REG_CC_CURCD */
+#define CURDIR_Discharging				0x8000
+
+/* BD71815_REG_VM_SA_IBAT */
+#define IBAT_SA_DIR_Discharging				0x8000
+
+/* BD71815_REG_REX_CTRL_1 bits */
+#define REX_CLR						BIT(4)
+
+/* BD71815_REG_REX_CTRL_1 bits */
+#define REX_PMU_STATE_MASK				BIT(2)
+
+/* BD71815_REG_LED_CTRL bits */
+#define CHGDONE_LED_EN					BIT(4)
+
+#endif /* _MFD_BD71815_H */
diff --git a/include/linux/mfd/rohm-bd71828.h b/include/linux/mfd/rohm-bd71828.h
index 017a4c01cb31..c7ab69c87ee8 100644
--- a/include/linux/mfd/rohm-bd71828.h
+++ b/include/linux/mfd/rohm-bd71828.h
@@ -151,6 +151,9 @@ enum {
 #define BD71828_REG_GPIO_CTRL3		0x49
 #define BD71828_REG_IO_STAT		0xed
 
+/* clk */
+#define BD71828_REG_OUT32K		0x4b
+
 /* RTC */
 #define BD71828_REG_RTC_SEC		0x4c
 #define BD71828_REG_RTC_MINUTE		0x4d
-- 
cgit v1.2.3


From 80a71170646df80914a7290a197aca1e6116a49d Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Date: Mon, 5 Apr 2021 14:43:44 +0300
Subject: regulator: rohm-regulator: Support SNVS HW state.

The ROHM BD71815 supports setting voltage levels/regulator status
for HW-states "RUN", "SUSPEND", "LPSR" and "SNVS". Add DT parsing
helper also for SNVS state.

Signed-off-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Acked-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/regulator/rohm-regulator.c | 6 ++++++
 include/linux/mfd/rohm-generic.h   | 6 +++++-
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/regulator/rohm-regulator.c b/drivers/regulator/rohm-regulator.c
index 63aabb8c7786..6e0d9c08ec1c 100644
--- a/drivers/regulator/rohm-regulator.c
+++ b/drivers/regulator/rohm-regulator.c
@@ -95,6 +95,12 @@ int rohm_regulator_set_dvs_levels(const struct rohm_dvs_config *dvs,
 				mask = dvs->lpsr_mask;
 				omask = dvs->lpsr_on_mask;
 				break;
+			case ROHM_DVS_LEVEL_SNVS:
+				prop = "rohm,dvs-snvs-voltage";
+				reg = dvs->snvs_reg;
+				mask = dvs->snvs_mask;
+				omask = dvs->snvs_on_mask;
+				break;
 			default:
 				return -EINVAL;
 			}
diff --git a/include/linux/mfd/rohm-generic.h b/include/linux/mfd/rohm-generic.h
index 9e2880e06950..a9144284cf6d 100644
--- a/include/linux/mfd/rohm-generic.h
+++ b/include/linux/mfd/rohm-generic.h
@@ -27,7 +27,8 @@ struct rohm_regmap_dev {
 #define ROHM_DVS_LEVEL_IDLE		BIT(1)
 #define ROHM_DVS_LEVEL_SUSPEND		BIT(2)
 #define ROHM_DVS_LEVEL_LPSR		BIT(3)
-#define ROHM_DVS_LEVEL_VALID_AMOUNT	4
+#define ROHM_DVS_LEVEL_SNVS		BIT(4)
+#define ROHM_DVS_LEVEL_VALID_AMOUNT	5
 #define ROHM_DVS_LEVEL_UNKNOWN		0
 
 /**
@@ -66,6 +67,9 @@ struct rohm_dvs_config {
 	unsigned int lpsr_reg;
 	unsigned int lpsr_mask;
 	unsigned int lpsr_on_mask;
+	unsigned int snvs_reg;
+	unsigned int snvs_mask;
+	unsigned int snvs_on_mask;
 };
 
 #if IS_ENABLED(CONFIG_REGULATOR_ROHM)
-- 
cgit v1.2.3


From b07067627cd5f1f6dc60c224b47c728f7f4b7b45 Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@HansenPartnership.com>
Date: Wed, 27 Jan 2021 11:06:13 -0800
Subject: lib: Add ASN.1 encoder

We have a need in the TPM2 trusted keys to return the ASN.1 form of the TPM
key blob so it can be operated on by tools outside of the kernel.  The
specific tools are the openssl_tpm2_engine, openconnect and the Intel
tpm2-tss-engine.  To do that, we have to be able to read and write the same
binary key format the tools use.  The current ASN.1 decoder does fine for
reading, but we need pieces of an ASN.1 encoder to write the key blob in
binary compatible form.

For backwards compatibility, the trusted key reader code will still accept
the two TPM2B quantities that it uses today, but the writer will only
output the ASN.1 form.

The current implementation only encodes the ASN.1 bits we actually need.

Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Reviewed-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Tested-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
 include/linux/asn1_encoder.h |  32 +++
 lib/Kconfig                  |   3 +
 lib/Makefile                 |   1 +
 lib/asn1_encoder.c           | 454 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 490 insertions(+)
 create mode 100644 include/linux/asn1_encoder.h
 create mode 100644 lib/asn1_encoder.c

(limited to 'include/linux')

diff --git a/include/linux/asn1_encoder.h b/include/linux/asn1_encoder.h
new file mode 100644
index 000000000000..08cd0c2ad34f
--- /dev/null
+++ b/include/linux/asn1_encoder.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef _LINUX_ASN1_ENCODER_H
+#define _LINUX_ASN1_ENCODER_H
+
+#include <linux/types.h>
+#include <linux/asn1.h>
+#include <linux/asn1_ber_bytecode.h>
+#include <linux/bug.h>
+
+#define asn1_oid_len(oid) (sizeof(oid)/sizeof(u32))
+unsigned char *
+asn1_encode_integer(unsigned char *data, const unsigned char *end_data,
+		    s64 integer);
+unsigned char *
+asn1_encode_oid(unsigned char *data, const unsigned char *end_data,
+		u32 oid[], int oid_len);
+unsigned char *
+asn1_encode_tag(unsigned char *data, const unsigned char *end_data,
+		u32 tag, const unsigned char *string, int len);
+unsigned char *
+asn1_encode_octet_string(unsigned char *data,
+			 const unsigned char *end_data,
+			 const unsigned char *string, u32 len);
+unsigned char *
+asn1_encode_sequence(unsigned char *data, const unsigned char *end_data,
+		     const unsigned char *seq, int len);
+unsigned char *
+asn1_encode_boolean(unsigned char *data, const unsigned char *end_data,
+		    bool val);
+
+#endif
diff --git a/lib/Kconfig b/lib/Kconfig
index a38cc61256f1..ac3b30697b2b 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -701,3 +701,6 @@ config GENERIC_LIB_DEVMEM_IS_ALLOWED
 config PLDMFW
 	bool
 	default n
+
+config ASN1_ENCODER
+       tristate
diff --git a/lib/Makefile b/lib/Makefile
index b5307d3eec1a..e11cfc18b6c0 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -280,6 +280,7 @@ obj-$(CONFIG_INTERVAL_TREE_TEST) += interval_tree_test.o
 obj-$(CONFIG_PERCPU_TEST) += percpu_test.o
 
 obj-$(CONFIG_ASN1) += asn1_decoder.o
+obj-$(CONFIG_ASN1_ENCODER) += asn1_encoder.o
 
 obj-$(CONFIG_FONT_SUPPORT) += fonts/
 
diff --git a/lib/asn1_encoder.c b/lib/asn1_encoder.c
new file mode 100644
index 000000000000..41e71aae3ef6
--- /dev/null
+++ b/lib/asn1_encoder.c
@@ -0,0 +1,454 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Simple encoder primitives for ASN.1 BER/DER/CER
+ *
+ * Copyright (C) 2019 James.Bottomley@HansenPartnership.com
+ */
+
+#include <linux/asn1_encoder.h>
+#include <linux/bug.h>
+#include <linux/string.h>
+#include <linux/module.h>
+
+/**
+ * asn1_encode_integer() - encode positive integer to ASN.1
+ * @data:	position to begin encoding at
+ * @end_data:	end of data pointer, points one beyond last usable byte in @data
+ * @integer:	integer to be encoded
+ *
+ * This is a simplified encoder: it only currently does
+ * positive integers, but it should be simple enough to add the
+ * negative case if a use comes along.
+ */
+unsigned char *
+asn1_encode_integer(unsigned char *data, const unsigned char *end_data,
+		    s64 integer)
+{
+	int data_len = end_data - data;
+	unsigned char *d = &data[2];
+	bool found = false;
+	int i;
+
+	if (WARN(integer < 0,
+		 "BUG: integer encode only supports positive integers"))
+		return ERR_PTR(-EINVAL);
+
+	if (IS_ERR(data))
+		return data;
+
+	/* need at least 3 bytes for tag, length and integer encoding */
+	if (data_len < 3)
+		return ERR_PTR(-EINVAL);
+
+	/* remaining length at d (the start of the integer encoding) */
+	data_len -= 2;
+
+	data[0] = _tag(UNIV, PRIM, INT);
+	if (integer == 0) {
+		*d++ = 0;
+		goto out;
+	}
+
+	for (i = sizeof(integer); i > 0 ; i--) {
+		int byte = integer >> (8 * (i - 1));
+
+		if (!found && byte == 0)
+			continue;
+
+		/*
+		 * for a positive number the first byte must have bit
+		 * 7 clear in two's complement (otherwise it's a
+		 * negative number) so prepend a leading zero if
+		 * that's not the case
+		 */
+		if (!found && (byte & 0x80)) {
+			/*
+			 * no check needed here, we already know we
+			 * have len >= 1
+			 */
+			*d++ = 0;
+			data_len--;
+		}
+
+		found = true;
+		if (data_len == 0)
+			return ERR_PTR(-EINVAL);
+
+		*d++ = byte;
+		data_len--;
+	}
+
+ out:
+	data[1] = d - data - 2;
+
+	return d;
+}
+EXPORT_SYMBOL_GPL(asn1_encode_integer);
+
+/* calculate the base 128 digit values setting the top bit of the first octet */
+static int asn1_encode_oid_digit(unsigned char **_data, int *data_len, u32 oid)
+{
+	unsigned char *data = *_data;
+	int start = 7 + 7 + 7 + 7;
+	int ret = 0;
+
+	if (*data_len < 1)
+		return -EINVAL;
+
+	/* quick case */
+	if (oid == 0) {
+		*data++ = 0x80;
+		(*data_len)--;
+		goto out;
+	}
+
+	while (oid >> start == 0)
+		start -= 7;
+
+	while (start > 0 && *data_len > 0) {
+		u8 byte;
+
+		byte = oid >> start;
+		oid = oid - (byte << start);
+		start -= 7;
+		byte |= 0x80;
+		*data++ = byte;
+		(*data_len)--;
+	}
+
+	if (*data_len > 0) {
+		*data++ = oid;
+		(*data_len)--;
+	} else {
+		ret = -EINVAL;
+	}
+
+ out:
+	*_data = data;
+	return ret;
+}
+
+/**
+ * asn1_encode_oid() - encode an oid to ASN.1
+ * @data:	position to begin encoding at
+ * @end_data:	end of data pointer, points one beyond last usable byte in @data
+ * @oid:	array of oids
+ * @oid_len:	length of oid array
+ *
+ * This encodes an OID to ASN.1 when presented as an array of OID values.
+ */
+unsigned char *
+asn1_encode_oid(unsigned char *data, const unsigned char *end_data,
+		u32 oid[], int oid_len)
+{
+	int data_len = end_data - data;
+	unsigned char *d = data + 2;
+	int i, ret;
+
+	if (WARN(oid_len < 2, "OID must have at least two elements"))
+		return ERR_PTR(-EINVAL);
+
+	if (WARN(oid_len > 32, "OID is too large"))
+		return ERR_PTR(-EINVAL);
+
+	if (IS_ERR(data))
+		return data;
+
+
+	/* need at least 3 bytes for tag, length and OID encoding */
+	if (data_len < 3)
+		return ERR_PTR(-EINVAL);
+
+	data[0] = _tag(UNIV, PRIM, OID);
+	*d++ = oid[0] * 40 + oid[1];
+
+	data_len -= 3;
+
+	ret = 0;
+
+	for (i = 2; i < oid_len; i++) {
+		ret = asn1_encode_oid_digit(&d, &data_len, oid[i]);
+		if (ret < 0)
+			return ERR_PTR(ret);
+	}
+
+	data[1] = d - data - 2;
+
+	return d;
+}
+EXPORT_SYMBOL_GPL(asn1_encode_oid);
+
+/**
+ * asn1_encode_length() - encode a length to follow an ASN.1 tag
+ * @data: pointer to encode at
+ * @data_len: pointer to remaining length (adjusted by routine)
+ * @len: length to encode
+ *
+ * This routine can encode lengths up to 0xffffff using the ASN.1 rules.
+ * It will accept a negative length and place a zero length tag
+ * instead (to keep the ASN.1 valid).  This convention allows other
+ * encoder primitives to accept negative lengths as signalling that the
+ * sequence will be re-encoded when the length is known.
+ */
+static int asn1_encode_length(unsigned char **data, int *data_len, int len)
+{
+	if (*data_len < 1)
+		return -EINVAL;
+
+	if (len < 0) {
+		*((*data)++) = 0;
+		(*data_len)--;
+		return 0;
+	}
+
+	if (len <= 0x7f) {
+		*((*data)++) = len;
+		(*data_len)--;
+		return 0;
+	}
+
+	if (*data_len < 2)
+		return -EINVAL;
+
+	if (len <= 0xff) {
+		*((*data)++) = 0x81;
+		*((*data)++) = len & 0xff;
+		*data_len -= 2;
+		return 0;
+	}
+
+	if (*data_len < 3)
+		return -EINVAL;
+
+	if (len <= 0xffff) {
+		*((*data)++) = 0x82;
+		*((*data)++) = (len >> 8) & 0xff;
+		*((*data)++) = len & 0xff;
+		*data_len -= 3;
+		return 0;
+	}
+
+	if (WARN(len > 0xffffff, "ASN.1 length can't be > 0xffffff"))
+		return -EINVAL;
+
+	if (*data_len < 4)
+		return -EINVAL;
+	*((*data)++) = 0x83;
+	*((*data)++) = (len >> 16) & 0xff;
+	*((*data)++) = (len >> 8) & 0xff;
+	*((*data)++) = len & 0xff;
+	*data_len -= 4;
+
+	return 0;
+}
+
+/**
+ * asn1_encode_tag() - add a tag for optional or explicit value
+ * @data:	pointer to place tag at
+ * @end_data:	end of data pointer, points one beyond last usable byte in @data
+ * @tag:	tag to be placed
+ * @string:	the data to be tagged
+ * @len:	the length of the data to be tagged
+ *
+ * Note this currently only handles short form tags < 31.
+ *
+ * Standard usage is to pass in a @tag, @string and @len and the
+ * @string will be ASN.1 encoded with @tag and placed into @data.  If
+ * the encoding would put data past @end_data then an error is
+ * returned, otherwise a pointer to a position one beyond the encoding
+ * is returned.
+ *
+ * To encode in place pass a NULL @string and -1 for @len and the
+ * maximum allowable beginning and end of the data; all this will do
+ * is add the tag with a placeholder zero length and return a pointer
+ * to the place where the tag contents should be placed.  The
+ * data should be copied in by the calling routine which should then
+ * repeat the prior statement but now with the known length.  In order
+ * to avoid having to keep both before and after pointers, the repeat
+ * expects to be called with @data pointing to where the first encode
+ * returned it and still NULL for @string but the real length in @len.
+ */
+unsigned char *
+asn1_encode_tag(unsigned char *data, const unsigned char *end_data,
+		u32 tag, const unsigned char *string, int len)
+{
+	int data_len = end_data - data;
+	int ret;
+
+	if (WARN(tag > 30, "ASN.1 tag can't be > 30"))
+		return ERR_PTR(-EINVAL);
+
+	if (!string && WARN(len > 127,
+			    "BUG: recode tag is too big (>127)"))
+		return ERR_PTR(-EINVAL);
+
+	if (IS_ERR(data))
+		return data;
+
+	if (!string && len > 0) {
+		/*
+		 * we're recoding, so move back to the start of the
+		 * tag and install a dummy length because the real
+		 * data_len should be NULL
+		 */
+		data -= 2;
+		data_len = 2;
+	}
+
+	if (data_len < 2)
+		return ERR_PTR(-EINVAL);
+
+	*(data++) = _tagn(CONT, CONS, tag);
+	data_len--;
+	ret = asn1_encode_length(&data, &data_len, len);
+	if (ret < 0)
+		return ERR_PTR(ret);
+
+	if (!string)
+		return data;
+
+	if (data_len < len)
+		return ERR_PTR(-EINVAL);
+
+	memcpy(data, string, len);
+	data += len;
+
+	return data;
+}
+EXPORT_SYMBOL_GPL(asn1_encode_tag);
+
+/**
+ * asn1_encode_octet_string() - encode an ASN.1 OCTET STRING
+ * @data:	pointer to encode at
+ * @end_data:	end of data pointer, points one beyond last usable byte in @data
+ * @string:	string to be encoded
+ * @len:	length of string
+ *
+ * Note ASN.1 octet strings may contain zeros, so the length is obligatory.
+ */
+unsigned char *
+asn1_encode_octet_string(unsigned char *data,
+			 const unsigned char *end_data,
+			 const unsigned char *string, u32 len)
+{
+	int data_len = end_data - data;
+	int ret;
+
+	if (IS_ERR(data))
+		return data;
+
+	/* need minimum of 2 bytes for tag and length of zero length string */
+	if (data_len < 2)
+		return ERR_PTR(-EINVAL);
+
+	*(data++) = _tag(UNIV, PRIM, OTS);
+	data_len--;
+
+	ret = asn1_encode_length(&data, &data_len, len);
+	if (ret)
+		return ERR_PTR(ret);
+
+	if (data_len < len)
+		return ERR_PTR(-EINVAL);
+
+	memcpy(data, string, len);
+	data += len;
+
+	return data;
+}
+EXPORT_SYMBOL_GPL(asn1_encode_octet_string);
+
+/**
+ * asn1_encode_sequence() - wrap a byte stream in an ASN.1 SEQUENCE
+ * @data:	pointer to encode at
+ * @end_data:	end of data pointer, points one beyond last usable byte in @data
+ * @seq:	data to be encoded as a sequence
+ * @len:	length of the data to be encoded as a sequence
+ *
+ * Fill in a sequence.  To encode in place, pass NULL for @seq and -1
+ * for @len; then call again once the length is known (still with NULL
+ * for @seq). In order to avoid having to keep both before and after
+ * pointers, the repeat expects to be called with @data pointing to
+ * where the first encode placed it.
+ */
+unsigned char *
+asn1_encode_sequence(unsigned char *data, const unsigned char *end_data,
+		     const unsigned char *seq, int len)
+{
+	int data_len = end_data - data;
+	int ret;
+
+	if (!seq && WARN(len > 127,
+			 "BUG: recode sequence is too big (>127)"))
+		return ERR_PTR(-EINVAL);
+
+	if (IS_ERR(data))
+		return data;
+
+	if (!seq && len >= 0) {
+		/*
+		 * we're recoding, so move back to the start of the
+		 * sequence and install a dummy length because the
+		 * real length should be NULL
+		 */
+		data -= 2;
+		data_len = 2;
+	}
+
+	if (data_len < 2)
+		return ERR_PTR(-EINVAL);
+
+	*(data++) = _tag(UNIV, CONS, SEQ);
+	data_len--;
+
+	ret = asn1_encode_length(&data, &data_len, len);
+	if (ret)
+		return ERR_PTR(ret);
+
+	if (!seq)
+		return data;
+
+	if (data_len < len)
+		return ERR_PTR(-EINVAL);
+
+	memcpy(data, seq, len);
+	data += len;
+
+	return data;
+}
+EXPORT_SYMBOL_GPL(asn1_encode_sequence);
+
+/**
+ * asn1_encode_boolean() - encode a boolean value to ASN.1
+ * @data:	pointer to encode at
+ * @end_data:	end of data pointer, points one beyond last usable byte in @data
+ * @val:	the boolean true/false value
+ */
+unsigned char *
+asn1_encode_boolean(unsigned char *data, const unsigned char *end_data,
+		    bool val)
+{
+	int data_len = end_data - data;
+
+	if (IS_ERR(data))
+		return data;
+
+	/* booleans are 3 bytes: tag, length == 1 and value == 0 or 1 */
+	if (data_len < 3)
+		return ERR_PTR(-EINVAL);
+
+	*(data++) = _tag(UNIV, PRIM, BOOL);
+	data_len--;
+
+	asn1_encode_length(&data, &data_len, 1);
+
+	if (val)
+		*(data++) = 1;
+	else
+		*(data++) = 0;
+
+	return data;
+}
+EXPORT_SYMBOL_GPL(asn1_encode_boolean);
+
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 1c6476e9741e30be57e0b370d4405214f055607c Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@HansenPartnership.com>
Date: Wed, 27 Jan 2021 11:06:14 -0800
Subject: oid_registry: Add TCG defined OIDS for TPM keys

The TCG has defined an OID prefix "2.23.133.10.1" for the various TPM
key uses.  We've defined three of the available numbers:

2.23.133.10.1.3 TPM Loadable key.  This is an asymmetric key (Usually
		RSA2048 or Elliptic Curve) which can be imported by a
		TPM2_Load() operation.

2.23.133.10.1.4 TPM Importable Key.  This is an asymmetric key (Usually
		RSA2048 or Elliptic Curve) which can be imported by a
		TPM2_Import() operation.

Both loadable and importable keys are specific to a given TPM, the
difference is that a loadable key is wrapped with the symmetric
secret, so must have been created by the TPM itself.  An importable
key is wrapped with a DH shared secret, and may be created without
access to the TPM provided you know the public part of the parent key.

2.23.133.10.1.5 TPM Sealed Data.  This is a set of data (up to 128
		bytes) which is sealed by the TPM.  It usually
		represents a symmetric key and must be unsealed before
		use.

The ASN.1 binary key form starts off with this OID as the first element
of a sequence, giving the binary form a unique recognizable identity
marker regardless of encoding.

Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Reviewed-by: David Howells <dhowells@redhat.com>
Acked-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
 include/linux/oid_registry.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/oid_registry.h b/include/linux/oid_registry.h
index 4462ed2c18cd..d06988d1565e 100644
--- a/include/linux/oid_registry.h
+++ b/include/linux/oid_registry.h
@@ -113,6 +113,11 @@ enum OID {
 	OID_SM2_with_SM3,		/* 1.2.156.10197.1.501 */
 	OID_sm3WithRSAEncryption,	/* 1.2.156.10197.1.504 */
 
+	/* TCG defined OIDS for TPM based keys */
+	OID_TPMLoadableKey,		/* 2.23.133.10.1.3 */
+	OID_TPMImportableKey,		/* 2.23.133.10.1.4 */
+	OID_TPMSealedData,		/* 2.23.133.10.1.5 */
+
 	OID__NR
 };
 
-- 
cgit v1.2.3


From e5fb5d2c5a03e229ded1f45aa2a42f2c288689c7 Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@HansenPartnership.com>
Date: Wed, 27 Jan 2021 11:06:17 -0800
Subject: security: keys: trusted: Make sealed key properly interoperable

The current implementation appends a migratable flag to the end of a
key, meaning the format isn't exactly interoperable because the using
party needs to know to strip this extra byte.  However, all other
consumers of TPM sealed blobs expect the unseal to return exactly the
key.  Since TPM2 keys have a key property flag that corresponds to
migratable, use that flag instead and make the actual key the only
sealed quantity.  This is secure because the key properties are bound
to a hash in the private part, so if they're altered the key won't
load.

Backwards compatibility is implemented by detecting whether we're
loading a new format key or not and correctly setting migratable from
the last byte of old format keys.

Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Tested-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
 include/linux/tpm.h                       |  2 ++
 security/keys/trusted-keys/trusted_tpm2.c | 53 ++++++++++++++++++++++---------
 2 files changed, 40 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index 543aa3b1dedc..aa11fe323c56 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -305,6 +305,8 @@ struct tpm_buf {
 };
 
 enum tpm2_object_attributes {
+	TPM2_OA_FIXED_TPM		= BIT(1),
+	TPM2_OA_FIXED_PARENT		= BIT(4),
 	TPM2_OA_USER_WITH_AUTH		= BIT(6),
 };
 
diff --git a/security/keys/trusted-keys/trusted_tpm2.c b/security/keys/trusted-keys/trusted_tpm2.c
index 68249db98a4c..d225ad140960 100644
--- a/security/keys/trusted-keys/trusted_tpm2.c
+++ b/security/keys/trusted-keys/trusted_tpm2.c
@@ -231,6 +231,7 @@ int tpm2_seal_trusted(struct tpm_chip *chip,
 	int blob_len = 0;
 	struct tpm_buf buf;
 	u32 hash;
+	u32 flags;
 	int i;
 	int rc;
 
@@ -265,31 +266,32 @@ int tpm2_seal_trusted(struct tpm_chip *chip,
 			     TPM_DIGEST_SIZE);
 
 	/* sensitive */
-	tpm_buf_append_u16(&buf, 4 + options->blobauth_len + payload->key_len + 1);
+	tpm_buf_append_u16(&buf, 4 + options->blobauth_len + payload->key_len);
 
 	tpm_buf_append_u16(&buf, options->blobauth_len);
 	if (options->blobauth_len)
 		tpm_buf_append(&buf, options->blobauth, options->blobauth_len);
 
-	tpm_buf_append_u16(&buf, payload->key_len + 1);
+	tpm_buf_append_u16(&buf, payload->key_len);
 	tpm_buf_append(&buf, payload->key, payload->key_len);
-	tpm_buf_append_u8(&buf, payload->migratable);
 
 	/* public */
 	tpm_buf_append_u16(&buf, 14 + options->policydigest_len);
 	tpm_buf_append_u16(&buf, TPM_ALG_KEYEDHASH);
 	tpm_buf_append_u16(&buf, hash);
 
+	/* key properties */
+	flags = 0;
+	flags |= options->policydigest_len ? 0 : TPM2_OA_USER_WITH_AUTH;
+	flags |= payload->migratable ? (TPM2_OA_FIXED_TPM |
+					TPM2_OA_FIXED_PARENT) : 0;
+	tpm_buf_append_u32(&buf, flags);
+
 	/* policy */
-	if (options->policydigest_len) {
-		tpm_buf_append_u32(&buf, 0);
-		tpm_buf_append_u16(&buf, options->policydigest_len);
+	tpm_buf_append_u16(&buf, options->policydigest_len);
+	if (options->policydigest_len)
 		tpm_buf_append(&buf, options->policydigest,
 			       options->policydigest_len);
-	} else {
-		tpm_buf_append_u32(&buf, TPM2_OA_USER_WITH_AUTH);
-		tpm_buf_append_u16(&buf, 0);
-	}
 
 	/* public parameters */
 	tpm_buf_append_u16(&buf, TPM_ALG_NULL);
@@ -364,8 +366,9 @@ static int tpm2_load_cmd(struct tpm_chip *chip,
 	unsigned int private_len;
 	unsigned int public_len;
 	unsigned int blob_len;
-	u8 *blob;
+	u8 *blob, *pub;
 	int rc;
+	u32 attrs;
 
 	rc = tpm2_key_decode(payload, options, &blob);
 	if (rc) {
@@ -392,6 +395,16 @@ static int tpm2_load_cmd(struct tpm_chip *chip,
 	if (private_len + 2 + public_len + 2 > payload->blob_len)
 		return -E2BIG;
 
+	pub = blob + 2 + private_len + 2;
+	/* key attributes are always at offset 4 */
+	attrs = get_unaligned_be32(pub + 4);
+
+	if ((attrs & (TPM2_OA_FIXED_TPM | TPM2_OA_FIXED_PARENT)) ==
+	    (TPM2_OA_FIXED_TPM | TPM2_OA_FIXED_PARENT))
+		payload->migratable = 0;
+	else
+		payload->migratable = 1;
+
 	blob_len = private_len + public_len + 4;
 	if (blob_len > payload->blob_len)
 		return -E2BIG;
@@ -472,7 +485,7 @@ static int tpm2_unseal_cmd(struct tpm_chip *chip,
 	if (!rc) {
 		data_len = be16_to_cpup(
 			(__be16 *) &buf.data[TPM_HEADER_SIZE + 4]);
-		if (data_len < MIN_KEY_SIZE ||  data_len > MAX_KEY_SIZE + 1) {
+		if (data_len < MIN_KEY_SIZE ||  data_len > MAX_KEY_SIZE) {
 			rc = -EFAULT;
 			goto out;
 		}
@@ -483,9 +496,19 @@ static int tpm2_unseal_cmd(struct tpm_chip *chip,
 		}
 		data = &buf.data[TPM_HEADER_SIZE + 6];
 
-		memcpy(payload->key, data, data_len - 1);
-		payload->key_len = data_len - 1;
-		payload->migratable = data[data_len - 1];
+		if (payload->old_format) {
+			/* migratable flag is at the end of the key */
+			memcpy(payload->key, data, data_len - 1);
+			payload->key_len = data_len - 1;
+			payload->migratable = data[data_len - 1];
+		} else {
+			/*
+			 * migratable flag already collected from key
+			 * attributes
+			 */
+			memcpy(payload->key, data, data_len);
+			payload->key_len = data_len;
+		}
 	}
 
 out:
-- 
cgit v1.2.3


From ce62b114bbad9346641d16853c528ba01513e1b0 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 5 Mar 2018 15:01:18 -0500
Subject: NFS: Split attribute support out from the server capabilities

There are lots of attributes, and they are crowding out the bit space.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/client.c           | 15 +++++++++++---
 fs/nfs/inode.c            | 51 ++++++++++++++++++++++++++++-------------------
 fs/nfs/nfs4proc.c         | 49 +++++++++++++++++++++++----------------------
 include/linux/nfs_fs_sb.h | 11 ++--------
 4 files changed, 70 insertions(+), 56 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 2aeb4e52a4f1..cfeaadf56bf0 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -696,9 +696,18 @@ static int nfs_init_server(struct nfs_server *server,
 	/* Initialise the client representation from the mount data */
 	server->flags = ctx->flags;
 	server->options = ctx->options;
-	server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID|
-		NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP|
-		NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME;
+	server->caps |= NFS_CAP_HARDLINKS | NFS_CAP_SYMLINKS;
+
+	switch (clp->rpc_ops->version) {
+	case 2:
+		server->fattr_valid = NFS_ATTR_FATTR_V2;
+		break;
+	case 3:
+		server->fattr_valid = NFS_ATTR_FATTR_V3;
+		break;
+	default:
+		server->fattr_valid = NFS_ATTR_FATTR_V4;
+	}
 
 	if (ctx->rsize)
 		server->rsize = nfs_block_size(ctx->rsize, NULL);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 7fa914e24fc4..6d04ebb4f084 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -438,6 +438,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
 		.fattr	= fattr
 	};
 	struct inode *inode = ERR_PTR(-ENOENT);
+	u64 fattr_supported = NFS_SB(sb)->fattr_valid;
 	unsigned long hash;
 
 	nfs_attr_check_mountpoint(sb, fattr);
@@ -470,7 +471,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
 		inode->i_mode = fattr->mode;
 		nfsi->cache_validity = 0;
 		if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0
-				&& nfs_server_capable(inode, NFS_CAP_MODE))
+				&& (fattr_supported & NFS_ATTR_FATTR_MODE))
 			nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE);
 		/* Why so? Because we want revalidate for devices/FIFOs, and
 		 * that's precisely what we have in nfs_file_inode_operations.
@@ -516,15 +517,15 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
 		nfsi->attr_gencount = fattr->gencount;
 		if (fattr->valid & NFS_ATTR_FATTR_ATIME)
 			inode->i_atime = fattr->atime;
-		else if (nfs_server_capable(inode, NFS_CAP_ATIME))
+		else if (fattr_supported & NFS_ATTR_FATTR_ATIME)
 			nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME);
 		if (fattr->valid & NFS_ATTR_FATTR_MTIME)
 			inode->i_mtime = fattr->mtime;
-		else if (nfs_server_capable(inode, NFS_CAP_MTIME))
+		else if (fattr_supported & NFS_ATTR_FATTR_MTIME)
 			nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME);
 		if (fattr->valid & NFS_ATTR_FATTR_CTIME)
 			inode->i_ctime = fattr->ctime;
-		else if (nfs_server_capable(inode, NFS_CAP_CTIME))
+		else if (fattr_supported & NFS_ATTR_FATTR_CTIME)
 			nfs_set_cache_invalid(inode, NFS_INO_INVALID_CTIME);
 		if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
 			inode_set_iversion_raw(inode, fattr->change_attr);
@@ -536,26 +537,30 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
 			nfs_set_cache_invalid(inode, NFS_INO_INVALID_SIZE);
 		if (fattr->valid & NFS_ATTR_FATTR_NLINK)
 			set_nlink(inode, fattr->nlink);
-		else if (nfs_server_capable(inode, NFS_CAP_NLINK))
+		else if (fattr_supported & NFS_ATTR_FATTR_NLINK)
 			nfs_set_cache_invalid(inode, NFS_INO_INVALID_NLINK);
 		if (fattr->valid & NFS_ATTR_FATTR_OWNER)
 			inode->i_uid = fattr->uid;
-		else if (nfs_server_capable(inode, NFS_CAP_OWNER))
+		else if (fattr_supported & NFS_ATTR_FATTR_OWNER)
 			nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
 		if (fattr->valid & NFS_ATTR_FATTR_GROUP)
 			inode->i_gid = fattr->gid;
-		else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP))
+		else if (fattr_supported & NFS_ATTR_FATTR_GROUP)
 			nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
 		if (nfs_server_capable(inode, NFS_CAP_XATTR))
 			nfs_set_cache_invalid(inode, NFS_INO_INVALID_XATTR);
 		if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
 			inode->i_blocks = fattr->du.nfs2.blocks;
-		else if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
+		else if (fattr_supported & NFS_ATTR_FATTR_BLOCKS_USED &&
+			 fattr->size != 0)
+			nfs_set_cache_invalid(inode, NFS_INO_INVALID_BLOCKS);
+		if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
 			/*
 			 * report the blocks in 512byte units
 			 */
 			inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
-		} else if (fattr->size != 0)
+		} else if (fattr_supported & NFS_ATTR_FATTR_SPACE_USED &&
+			   fattr->size != 0)
 			nfs_set_cache_invalid(inode, NFS_INO_INVALID_BLOCKS);
 
 		nfs_setsecurity(inode, fattr, label);
@@ -1952,9 +1957,10 @@ EXPORT_SYMBOL_GPL(nfs_post_op_update_inode_force_wcc);
  */
 static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 {
-	struct nfs_server *server;
+	struct nfs_server *server = NFS_SERVER(inode);
 	struct nfs_inode *nfsi = NFS_I(inode);
 	loff_t cur_isize, new_isize;
+	u64 fattr_supported = server->fattr_valid;
 	unsigned long invalid = 0;
 	unsigned long now = jiffies;
 	unsigned long save_cache_validity;
@@ -1998,7 +2004,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 		goto out_err;
 	}
 
-	server = NFS_SERVER(inode);
 	/* Update the fsid? */
 	if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) &&
 			!nfs_fsid_equal(&server->fsid, &fattr->fsid) &&
@@ -2066,7 +2071,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 
 	if (fattr->valid & NFS_ATTR_FATTR_MTIME) {
 		inode->i_mtime = fattr->mtime;
-	} else if (server->caps & NFS_CAP_MTIME) {
+	} else if (fattr_supported & NFS_ATTR_FATTR_MTIME) {
 		nfsi->cache_validity |=
 			save_cache_validity & NFS_INO_INVALID_MTIME;
 		cache_revalidated = false;
@@ -2074,7 +2079,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 
 	if (fattr->valid & NFS_ATTR_FATTR_CTIME) {
 		inode->i_ctime = fattr->ctime;
-	} else if (server->caps & NFS_CAP_CTIME) {
+	} else if (fattr_supported & NFS_ATTR_FATTR_CTIME) {
 		nfsi->cache_validity |=
 			save_cache_validity & NFS_INO_INVALID_CTIME;
 		cache_revalidated = false;
@@ -2114,7 +2119,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 
 	if (fattr->valid & NFS_ATTR_FATTR_ATIME)
 		inode->i_atime = fattr->atime;
-	else if (server->caps & NFS_CAP_ATIME) {
+	else if (fattr_supported & NFS_ATTR_FATTR_ATIME) {
 		nfsi->cache_validity |=
 			save_cache_validity & NFS_INO_INVALID_ATIME;
 		cache_revalidated = false;
@@ -2129,7 +2134,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 				| NFS_INO_INVALID_ACL;
 			attr_changed = true;
 		}
-	} else if (server->caps & NFS_CAP_MODE) {
+	} else if (fattr_supported & NFS_ATTR_FATTR_MODE) {
 		nfsi->cache_validity |=
 			save_cache_validity & NFS_INO_INVALID_MODE;
 		cache_revalidated = false;
@@ -2142,7 +2147,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 			inode->i_uid = fattr->uid;
 			attr_changed = true;
 		}
-	} else if (server->caps & NFS_CAP_OWNER) {
+	} else if (fattr_supported & NFS_ATTR_FATTR_OWNER) {
 		nfsi->cache_validity |=
 			save_cache_validity & NFS_INO_INVALID_OTHER;
 		cache_revalidated = false;
@@ -2155,7 +2160,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 			inode->i_gid = fattr->gid;
 			attr_changed = true;
 		}
-	} else if (server->caps & NFS_CAP_OWNER_GROUP) {
+	} else if (fattr_supported & NFS_ATTR_FATTR_GROUP) {
 		nfsi->cache_validity |=
 			save_cache_validity & NFS_INO_INVALID_OTHER;
 		cache_revalidated = false;
@@ -2168,7 +2173,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 			set_nlink(inode, fattr->nlink);
 			attr_changed = true;
 		}
-	} else if (server->caps & NFS_CAP_NLINK) {
+	} else if (fattr_supported & NFS_ATTR_FATTR_NLINK) {
 		nfsi->cache_validity |=
 			save_cache_validity & NFS_INO_INVALID_NLINK;
 		cache_revalidated = false;
@@ -2179,9 +2184,15 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 		 * report the blocks in 512byte units
 		 */
 		inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
-	} else if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
+	} else if (fattr_supported & NFS_ATTR_FATTR_SPACE_USED) {
+		nfsi->cache_validity |=
+			save_cache_validity & NFS_INO_INVALID_BLOCKS;
+		cache_revalidated = false;
+	}
+
+	if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) {
 		inode->i_blocks = fattr->du.nfs2.blocks;
-	else {
+	} else if (fattr_supported & NFS_ATTR_FATTR_BLOCKS_USED) {
 		nfsi->cache_validity |=
 			save_cache_validity & NFS_INO_INVALID_BLOCKS;
 		cache_revalidated = false;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index bcbb057d5529..21c31aebb116 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3868,12 +3868,9 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
 			res.attr_bitmask[2] &= FATTR4_WORD2_NFS42_MASK;
 		}
 		memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask));
-		server->caps &= ~(NFS_CAP_ACLS|NFS_CAP_HARDLINKS|
-				NFS_CAP_SYMLINKS|NFS_CAP_FILEID|
-				NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|
-				NFS_CAP_OWNER_GROUP|NFS_CAP_ATIME|
-				NFS_CAP_CTIME|NFS_CAP_MTIME|
-				NFS_CAP_SECURITY_LABEL);
+		server->caps &= ~(NFS_CAP_ACLS | NFS_CAP_HARDLINKS |
+				  NFS_CAP_SYMLINKS| NFS_CAP_SECURITY_LABEL);
+		server->fattr_valid = NFS_ATTR_FATTR_V4;
 		if (res.attr_bitmask[0] & FATTR4_WORD0_ACL &&
 				res.acl_bitmask & ACL4_SUPPORT_ALLOW_ACL)
 			server->caps |= NFS_CAP_ACLS;
@@ -3881,25 +3878,29 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
 			server->caps |= NFS_CAP_HARDLINKS;
 		if (res.has_symlinks != 0)
 			server->caps |= NFS_CAP_SYMLINKS;
-		if (res.attr_bitmask[0] & FATTR4_WORD0_FILEID)
-			server->caps |= NFS_CAP_FILEID;
-		if (res.attr_bitmask[1] & FATTR4_WORD1_MODE)
-			server->caps |= NFS_CAP_MODE;
-		if (res.attr_bitmask[1] & FATTR4_WORD1_NUMLINKS)
-			server->caps |= NFS_CAP_NLINK;
-		if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER)
-			server->caps |= NFS_CAP_OWNER;
-		if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER_GROUP)
-			server->caps |= NFS_CAP_OWNER_GROUP;
-		if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_ACCESS)
-			server->caps |= NFS_CAP_ATIME;
-		if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_METADATA)
-			server->caps |= NFS_CAP_CTIME;
-		if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY)
-			server->caps |= NFS_CAP_MTIME;
+		if (!(res.attr_bitmask[0] & FATTR4_WORD0_FILEID))
+			server->fattr_valid &= ~NFS_ATTR_FATTR_FILEID;
+		if (!(res.attr_bitmask[1] & FATTR4_WORD1_MODE))
+			server->fattr_valid &= ~NFS_ATTR_FATTR_MODE;
+		if (!(res.attr_bitmask[1] & FATTR4_WORD1_NUMLINKS))
+			server->fattr_valid &= ~NFS_ATTR_FATTR_NLINK;
+		if (!(res.attr_bitmask[1] & FATTR4_WORD1_OWNER))
+			server->fattr_valid &= ~(NFS_ATTR_FATTR_OWNER |
+				NFS_ATTR_FATTR_OWNER_NAME);
+		if (!(res.attr_bitmask[1] & FATTR4_WORD1_OWNER_GROUP))
+			server->fattr_valid &= ~(NFS_ATTR_FATTR_GROUP |
+				NFS_ATTR_FATTR_GROUP_NAME);
+		if (!(res.attr_bitmask[1] & FATTR4_WORD1_SPACE_USED))
+			server->fattr_valid &= ~NFS_ATTR_FATTR_SPACE_USED;
+		if (!(res.attr_bitmask[1] & FATTR4_WORD1_TIME_ACCESS))
+			server->fattr_valid &= ~NFS_ATTR_FATTR_ATIME;
+		if (!(res.attr_bitmask[1] & FATTR4_WORD1_TIME_METADATA))
+			server->fattr_valid &= ~NFS_ATTR_FATTR_CTIME;
+		if (!(res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY))
+			server->fattr_valid &= ~NFS_ATTR_FATTR_MTIME;
 #ifdef CONFIG_NFS_V4_SECURITY_LABEL
-		if (res.attr_bitmask[2] & FATTR4_WORD2_SECURITY_LABEL)
-			server->caps |= NFS_CAP_SECURITY_LABEL;
+		if (!(res.attr_bitmask[2] & FATTR4_WORD2_SECURITY_LABEL))
+			server->fattr_valid &= ~NFS_ATTR_FATTR_V4_SECURITY_LABEL;
 #endif
 		memcpy(server->attr_bitmask_nl, res.attr_bitmask,
 				sizeof(server->attr_bitmask));
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index fbcdfd9f7a7f..d28d7a62864f 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -191,6 +191,8 @@ struct nfs_server {
 	dev_t			s_dev;		/* superblock dev numbers */
 	struct nfs_auth_info	auth_info;	/* parsed auth flavors */
 
+	__u64			fattr_valid;	/* Valid attributes */
+
 #ifdef CONFIG_NFS_FSCACHE
 	struct nfs_fscache_key	*fscache_key;	/* unique key for superblock */
 	struct fscache_cookie	*fscache;	/* superblock cookie */
@@ -267,16 +269,7 @@ struct nfs_server {
 #define NFS_CAP_SYMLINKS	(1U << 2)
 #define NFS_CAP_ACLS		(1U << 3)
 #define NFS_CAP_ATOMIC_OPEN	(1U << 4)
-/* #define NFS_CAP_CHANGE_ATTR	(1U << 5) */
 #define NFS_CAP_LGOPEN		(1U << 5)
-#define NFS_CAP_FILEID		(1U << 6)
-#define NFS_CAP_MODE		(1U << 7)
-#define NFS_CAP_NLINK		(1U << 8)
-#define NFS_CAP_OWNER		(1U << 9)
-#define NFS_CAP_OWNER_GROUP	(1U << 10)
-#define NFS_CAP_ATIME		(1U << 11)
-#define NFS_CAP_CTIME		(1U << 12)
-#define NFS_CAP_MTIME		(1U << 13)
 #define NFS_CAP_POSIX_LOCK	(1U << 14)
 #define NFS_CAP_UIDGID_NOMAP	(1U << 15)
 #define NFS_CAP_STATEID_NFSV41	(1U << 16)
-- 
cgit v1.2.3


From 123086843372bc93d26f52edfb71dbf951cd2f17 Mon Sep 17 00:00:00 2001
From: Badhri Jagan Sridharan <badhri@google.com>
Date: Tue, 13 Apr 2021 19:39:58 -0700
Subject: usb: typec: tcpm: Honour pSnkStdby requirement during negotiation

From PD Spec:
The Sink Shall transition to Sink Standby before a positive or
negative voltage transition of VBUS. During Sink Standby
the Sink Shall reduce its power draw to pSnkStdby. This allows
the Source to manage the voltage transition as well as
supply sufficient operating current to the Sink to maintain PD
operation during the transition. The Sink Shall
complete this transition to Sink Standby within tSnkStdby
after evaluating the Accept Message from the Source. The
transition when returning to Sink operation from Sink Standby
Shall be completed within tSnkNewPower. The
pSnkStdby requirement Shall only apply if the Sink power draw
is higher than this level.

The above requirement needs to be met to prevent hard resets
from port partner.

Without the patch: (5V/3A during SNK_DISCOVERY all the way through
explicit contract)
[   95.711984] CC1: 0 -> 0, CC2: 0 -> 5 [state TOGGLING, polarity 0, connected]
[   95.712007] state change TOGGLING -> SNK_ATTACH_WAIT [rev3 NONE_AMS]
[   95.712017] pending state change SNK_ATTACH_WAIT -> SNK_DEBOUNCED @ 170 ms [rev3 NONE_AMS]
[   95.837190] VBUS on
[   95.882075] state change SNK_ATTACH_WAIT -> SNK_DEBOUNCED [delayed 170 ms]
[   95.882082] state change SNK_DEBOUNCED -> SNK_ATTACHED [rev3 NONE_AMS]
[   95.882086] polarity 1
[   95.883151] set_auto_vbus_discharge_threshold mode:0 pps_active:n vbus:5000 ret:0
[   95.883441] enable vbus discharge ret:0
[   95.883445] Requesting mux state 1, usb-role 2, orientation 2
[   95.883776] state change SNK_ATTACHED -> SNK_STARTUP [rev3 NONE_AMS]
[   95.883879] pending state change SNK_STARTUP -> SNK_DISCOVERY @ 500 ms [rev3 NONE_AMS]
[   96.038960] VBUS on
[   96.383939] state change SNK_STARTUP -> SNK_DISCOVERY [delayed 500 ms]
[   96.383946] Setting voltage/current limit 5000 mV 3000 mA
[   96.383961] vbus=0 charge:=1
[   96.386044] state change SNK_DISCOVERY -> SNK_WAIT_CAPABILITIES [rev3 NONE_AMS]
[   96.386309] pending state change SNK_WAIT_CAPABILITIES -> HARD_RESET_SEND @ 450 ms [rev3 NONE_AMS]
[   96.394404] PD RX, header: 0x2161 [1]
[   96.394408]  PDO 0: type 0, 5000 mV, 3000 mA [E]
[   96.394410]  PDO 1: type 0, 9000 mV, 2000 mA []
[   96.394412] state change SNK_WAIT_CAPABILITIES -> SNK_NEGOTIATE_CAPABILITIES [rev2 POWER_NEGOTIATION]
[   96.394416] Setting usb_comm capable false
[   96.395083] cc=0 cc1=0 cc2=5 vbus=0 vconn=sink polarity=1
[   96.395089] Requesting PDO 1: 9000 mV, 2000 mA
[   96.395093] PD TX, header: 0x1042
[   96.397404] PD TX complete, status: 0
[   96.397424] pending state change SNK_NEGOTIATE_CAPABILITIES -> HARD_RESET_SEND @ 60 ms [rev2 POWER_NEGOTIATION]
[   96.400826] PD RX, header: 0x363 [1]
[   96.400829] state change SNK_NEGOTIATE_CAPABILITIES -> SNK_TRANSITION_SINK [rev2 POWER_NEGOTIATION]
[   96.400832] pending state change SNK_TRANSITION_SINK -> HARD_RESET_SEND @ 500 ms [rev2 POWER_NEGOTIATION]
[   96.577315] PD RX, header: 0x566 [1]
[   96.577321] Setting voltage/current limit 9000 mV 2000 mA
[   96.578363] set_auto_vbus_discharge_threshold mode:3 pps_active:n vbus:9000 ret:0
[   96.578370] state change SNK_TRANSITION_SINK -> SNK_READY [rev2 POWER_NEGOTIATION]

With the patch:
[  168.398573] CC1: 0 -> 0, CC2: 0 -> 5 [state TOGGLING, polarity 0, connected]
[  168.398605] state change TOGGLING -> SNK_ATTACH_WAIT [rev3 NONE_AMS]
[  168.398619] pending state change SNK_ATTACH_WAIT -> SNK_DEBOUNCED @ 170 ms [rev3 NONE_AMS]
[  168.522348] VBUS on
[  168.568676] state change SNK_ATTACH_WAIT -> SNK_DEBOUNCED [delayed 170 ms]
[  168.568684] state change SNK_DEBOUNCED -> SNK_ATTACHED [rev3 NONE_AMS]
[  168.568688] polarity 1
[  168.569867] set_auto_vbus_discharge_threshold mode:0 pps_active:n vbus:5000 ret:0
[  168.570158] enable vbus discharge ret:0
[  168.570161] Requesting mux state 1, usb-role 2, orientation 2
[  168.570504] state change SNK_ATTACHED -> SNK_STARTUP [rev3 NONE_AMS]
[  168.570634] pending state change SNK_STARTUP -> SNK_DISCOVERY @ 500 ms [rev3 NONE_AMS]
[  169.070689] state change SNK_STARTUP -> SNK_DISCOVERY [delayed 500 ms]
[  169.070695] Setting voltage/current limit 5000 mV 3000 mA
[  169.070702] vbus=0 charge:=1
[  169.072719] state change SNK_DISCOVERY -> SNK_WAIT_CAPABILITIES [rev3 NONE_AMS]
[  169.073145] pending state change SNK_WAIT_CAPABILITIES -> HARD_RESET_SEND @ 450 ms [rev3 NONE_AMS]
[  169.077162] PD RX, header: 0x2161 [1]
[  169.077172]  PDO 0: type 0, 5000 mV, 3000 mA [E]
[  169.077178]  PDO 1: type 0, 9000 mV, 2000 mA []
[  169.077183] state change SNK_WAIT_CAPABILITIES -> SNK_NEGOTIATE_CAPABILITIES [rev2 POWER_NEGOTIATION]
[  169.077191] Setting usb_comm capable false
[  169.077753] cc=0 cc1=0 cc2=5 vbus=0 vconn=sink polarity=1
[  169.077759] Requesting PDO 1: 9000 mV, 2000 mA
[  169.077762] PD TX, header: 0x1042
[  169.079990] PD TX complete, status: 0
[  169.080013] pending state change SNK_NEGOTIATE_CAPABILITIES -> HARD_RESET_SEND @ 60 ms [rev2 POWER_NEGOTIATION]
[  169.083183] VBUS on
[  169.084195] PD RX, header: 0x363 [1]
[  169.084200] state change SNK_NEGOTIATE_CAPABILITIES -> SNK_TRANSITION_SINK [rev2 POWER_NEGOTIATION]
[  169.084206] Setting standby current 5000 mV @ 500 mA
[  169.084209] Setting voltage/current limit 5000 mV 500 mA
[  169.084220] pending state change SNK_TRANSITION_SINK -> HARD_RESET_SEND @ 500 ms [rev2 POWER_NEGOTIATION]
[  169.260222] PD RX, header: 0x566 [1]
[  169.260227] Setting voltage/current limit 9000 mV 2000 mA
[  169.261315] set_auto_vbus_discharge_threshold mode:3 pps_active:n vbus:9000 ret:0
[  169.261321] state change SNK_TRANSITION_SINK -> SNK_READY [rev2 POWER_NEGOTIATION]
[  169.261570] AMS POWER_NEGOTIATION finished

Fixes: f0690a25a140b ("staging: typec: USB Type-C Port Manager (tcpm)")
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Signed-off-by: Badhri Jagan Sridharan <badhri@google.com>
Link: https://lore.kernel.org/r/20210414024000.4175263-1-badhri@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/typec/tcpm/tcpm.c | 17 +++++++++++++++++
 include/linux/usb/pd.h        |  2 ++
 2 files changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c
index 1c32bdf62852..aedc8bb9532a 100644
--- a/drivers/usb/typec/tcpm/tcpm.c
+++ b/drivers/usb/typec/tcpm/tcpm.c
@@ -4131,6 +4131,23 @@ static void run_state_machine(struct tcpm_port *port)
 		}
 		break;
 	case SNK_TRANSITION_SINK:
+		/* From the USB PD spec:
+		 * "The Sink Shall transition to Sink Standby before a positive or
+		 * negative voltage transition of VBUS. During Sink Standby
+		 * the Sink Shall reduce its power draw to pSnkStdby."
+		 *
+		 * This is not applicable to PPS though as the port can continue
+		 * to draw negotiated power without switching to standby.
+		 */
+		if (port->supply_voltage != port->req_supply_voltage && !port->pps_data.active &&
+		    port->current_limit * port->supply_voltage / 1000 > PD_P_SNK_STDBY_MW) {
+			u32 stdby_ma = PD_P_SNK_STDBY_MW * 1000 / port->supply_voltage;
+
+			tcpm_log(port, "Setting standby current %u mV @ %u mA",
+				 port->supply_voltage, stdby_ma);
+			tcpm_set_current_limit(port, stdby_ma, port->supply_voltage);
+		}
+		fallthrough;
 	case SNK_TRANSITION_SINK_VBUS:
 		tcpm_set_state(port, hard_reset_state(port),
 			       PD_T_PS_TRANSITION);
diff --git a/include/linux/usb/pd.h b/include/linux/usb/pd.h
index 70d681918d01..bf00259493e0 100644
--- a/include/linux/usb/pd.h
+++ b/include/linux/usb/pd.h
@@ -493,4 +493,6 @@ static inline unsigned int rdo_max_power(u32 rdo)
 #define PD_N_CAPS_COUNT		(PD_T_NO_RESPONSE / PD_T_SEND_SOURCE_CAP)
 #define PD_N_HARD_RESET_COUNT	2
 
+#define PD_P_SNK_STDBY_MW	2500	/* 2500 mW */
+
 #endif /* __LINUX_USB_PD_H */
-- 
cgit v1.2.3


From 1c7c1488898e1b3fd15d38f373a8c947a2d9b1e3 Mon Sep 17 00:00:00 2001
From: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Date: Wed, 10 Feb 2021 15:07:36 +0800
Subject: mfd: dbx500-prcmu: Use true and false for bool variable

Fix the following coccicheck warning:

./include/linux/mfd/db8500-prcmu.h:723:8-9: WARNING: return of 0/1 in
function 'db8500_prcmu_is_ac_wake_requested' with return type bool.

Reported-by: Abaci Robot <abaci@linux.alibaba.com>
Signed-off-by: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/db8500-prcmu.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/db8500-prcmu.h b/include/linux/mfd/db8500-prcmu.h
index 4b63d3ecdcff..a62de3d155ed 100644
--- a/include/linux/mfd/db8500-prcmu.h
+++ b/include/linux/mfd/db8500-prcmu.h
@@ -720,7 +720,7 @@ static inline int db8500_prcmu_load_a9wdog(u8 id, u32 val)
 
 static inline bool db8500_prcmu_is_ac_wake_requested(void)
 {
-	return 0;
+	return false;
 }
 
 static inline int db8500_prcmu_set_arm_opp(u8 opp)
-- 
cgit v1.2.3


From 911490a5ca2a3289ac734ca7c127c7dbc72466cc Mon Sep 17 00:00:00 2001
From: Luca Ceresoli <luca@lucaceresoli.net>
Date: Fri, 19 Feb 2021 23:39:09 +0100
Subject: mfd: lp87565: Remove unused define 'LP87565_NUM_BUCK'

This define appears incorrect, but it is completely unused so it can be
removed.

Signed-off-by: Luca Ceresoli <luca@lucaceresoli.net>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/lp87565.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/lp87565.h b/include/linux/mfd/lp87565.h
index d44ddfb6bb63..5640e6088fe6 100644
--- a/include/linux/mfd/lp87565.h
+++ b/include/linux/mfd/lp87565.h
@@ -237,9 +237,6 @@ enum lp87565_device_type {
 #define LP87565_GOIO2_OUT			BIT(1)
 #define LP87565_GOIO1_OUT			BIT(0)
 
-/* Number of step-down converters available */
-#define LP87565_NUM_BUCK		6
-
 enum LP87565_regulator_id {
 	/* BUCK's */
 	LP87565_BUCK_0,
-- 
cgit v1.2.3


From 16f961544bfd7170f75d805d7585e09023671dbc Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Sun, 7 Mar 2021 21:23:25 +0100
Subject: mfd: Remove support for AB3100

The ST-Ericsson U300 platform has been removed, so this driver is no
longer needed.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/Kconfig        |  26 +-
 drivers/mfd/Makefile       |   2 -
 drivers/mfd/ab3100-core.c  | 929 ---------------------------------------------
 drivers/mfd/ab3100-otp.c   | 240 ------------
 include/linux/mfd/ab3100.h | 128 -------
 5 files changed, 1 insertion(+), 1324 deletions(-)
 delete mode 100644 drivers/mfd/ab3100-core.c
 delete mode 100644 drivers/mfd/ab3100-otp.c
 delete mode 100644 include/linux/mfd/ab3100.h

(limited to 'include/linux')

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 6e31210915c7..2152126c5b0b 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -1235,7 +1235,7 @@ config MFD_SC27XX_PMIC
 
 config ABX500_CORE
 	bool "ST-Ericsson ABX500 Mixed Signal Circuit register functions"
-	default y if ARCH_U300 || ARCH_U8500 || COMPILE_TEST
+	default y if ARCH_U8500 || COMPILE_TEST
 	help
 	  Say yes here if you have the ABX500 Mixed Signal IC family
 	  chips. This core driver expose register access functions.
@@ -1243,30 +1243,6 @@ config ABX500_CORE
 	  remain unchanged when IC changes. Binding of the functions to
 	  actual register access is done by the IC core driver.
 
-config AB3100_CORE
-	bool "ST-Ericsson AB3100 Mixed Signal Circuit core functions"
-	depends on I2C=y && ABX500_CORE
-	select MFD_CORE
-	default y if ARCH_U300
-	help
-	  Select this to enable the AB3100 Mixed Signal IC core
-	  functionality. This connects to a AB3100 on the I2C bus
-	  and expose a number of symbols needed for dependent devices
-	  to read and write registers and subscribe to events from
-	  this multi-functional IC. This is needed to use other features
-	  of the AB3100 such as battery-backed RTC, charging control,
-	  LEDs, vibrator, system power and temperature, power management
-	  and ALSA sound.
-
-config AB3100_OTP
-	tristate "ST-Ericsson AB3100 OTP functions"
-	depends on AB3100_CORE
-	default y if AB3100_CORE
-	help
-	  Select this to enable the AB3100 Mixed Signal IC OTP (one-time
-	  programmable memory) support. This exposes a sysfs file to read
-	  out OTP values.
-
 config AB8500_CORE
 	bool "ST-Ericsson AB8500 Mixed Signal Power Management chip"
 	depends on ABX500_CORE && MFD_DB8500_PRCMU
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index 279b80822147..4f6d2b8a5f76 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -178,8 +178,6 @@ obj-$(CONFIG_MFD_PCF50633)	+= pcf50633.o
 obj-$(CONFIG_PCF50633_ADC)	+= pcf50633-adc.o
 obj-$(CONFIG_PCF50633_GPIO)	+= pcf50633-gpio.o
 obj-$(CONFIG_ABX500_CORE)	+= abx500-core.o
-obj-$(CONFIG_AB3100_CORE)	+= ab3100-core.o
-obj-$(CONFIG_AB3100_OTP)	+= ab3100-otp.o
 obj-$(CONFIG_AB8500_DEBUG)	+= ab8500-debugfs.o
 obj-$(CONFIG_MFD_DB8500_PRCMU)	+= db8500-prcmu.o
 # ab8500-core need to come after db8500-prcmu (which provides the channel)
diff --git a/drivers/mfd/ab3100-core.c b/drivers/mfd/ab3100-core.c
deleted file mode 100644
index ee71ae04b5e6..000000000000
--- a/drivers/mfd/ab3100-core.c
+++ /dev/null
@@ -1,929 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2007-2010 ST-Ericsson
- * Low-level core for exclusive access to the AB3100 IC on the I2C bus
- * and some basic chip-configuration.
- * Author: Linus Walleij <linus.walleij@stericsson.com>
- */
-
-#include <linux/i2c.h>
-#include <linux/mutex.h>
-#include <linux/list.h>
-#include <linux/notifier.h>
-#include <linux/slab.h>
-#include <linux/err.h>
-#include <linux/init.h>
-#include <linux/platform_device.h>
-#include <linux/device.h>
-#include <linux/interrupt.h>
-#include <linux/random.h>
-#include <linux/debugfs.h>
-#include <linux/seq_file.h>
-#include <linux/uaccess.h>
-#include <linux/mfd/core.h>
-#include <linux/mfd/ab3100.h>
-#include <linux/mfd/abx500.h>
-
-/* These are the only registers inside AB3100 used in this main file */
-
-/* Interrupt event registers */
-#define AB3100_EVENTA1		0x21
-#define AB3100_EVENTA2		0x22
-#define AB3100_EVENTA3		0x23
-
-/* AB3100 DAC converter registers */
-#define AB3100_DIS		0x00
-#define AB3100_D0C		0x01
-#define AB3100_D1C		0x02
-#define AB3100_D2C		0x03
-#define AB3100_D3C		0x04
-
-/* Chip ID register */
-#define AB3100_CID		0x20
-
-/* AB3100 interrupt registers */
-#define AB3100_IMRA1		0x24
-#define AB3100_IMRA2		0x25
-#define AB3100_IMRA3		0x26
-#define AB3100_IMRB1		0x2B
-#define AB3100_IMRB2		0x2C
-#define AB3100_IMRB3		0x2D
-
-/* System Power Monitoring and control registers */
-#define AB3100_MCA		0x2E
-#define AB3100_MCB		0x2F
-
-/* SIM power up */
-#define AB3100_SUP		0x50
-
-/*
- * I2C communication
- *
- * The AB3100 is usually assigned address 0x48 (7-bit)
- * The chip is defined in the platform i2c_board_data section.
- */
-static int ab3100_get_chip_id(struct device *dev)
-{
-	struct ab3100 *ab3100 = dev_get_drvdata(dev->parent);
-
-	return (int)ab3100->chip_id;
-}
-
-static int ab3100_set_register_interruptible(struct ab3100 *ab3100,
-	u8 reg, u8 regval)
-{
-	u8 regandval[2] = {reg, regval};
-	int err;
-
-	err = mutex_lock_interruptible(&ab3100->access_mutex);
-	if (err)
-		return err;
-
-	/*
-	 * A two-byte write message with the first byte containing the register
-	 * number and the second byte containing the value to be written
-	 * effectively sets a register in the AB3100.
-	 */
-	err = i2c_master_send(ab3100->i2c_client, regandval, 2);
-	if (err < 0) {
-		dev_err(ab3100->dev,
-			"write error (write register): %d\n",
-			err);
-	} else if (err != 2) {
-		dev_err(ab3100->dev,
-			"write error (write register)\n"
-			"  %d bytes transferred (expected 2)\n",
-			err);
-		err = -EIO;
-	} else {
-		/* All is well */
-		err = 0;
-	}
-	mutex_unlock(&ab3100->access_mutex);
-	return err;
-}
-
-static int set_register_interruptible(struct device *dev,
-	u8 bank, u8 reg, u8 value)
-{
-	struct ab3100 *ab3100 = dev_get_drvdata(dev->parent);
-
-	return ab3100_set_register_interruptible(ab3100, reg, value);
-}
-
-/*
- * The test registers exist at an I2C bus address up one
- * from the ordinary base. They are not supposed to be used
- * in production code, but sometimes you have to do that
- * anyway. It's currently only used from this file so declare
- * it static and do not export.
- */
-static int ab3100_set_test_register_interruptible(struct ab3100 *ab3100,
-				    u8 reg, u8 regval)
-{
-	u8 regandval[2] = {reg, regval};
-	int err;
-
-	err = mutex_lock_interruptible(&ab3100->access_mutex);
-	if (err)
-		return err;
-
-	err = i2c_master_send(ab3100->testreg_client, regandval, 2);
-	if (err < 0) {
-		dev_err(ab3100->dev,
-			"write error (write test register): %d\n",
-			err);
-	} else if (err != 2) {
-		dev_err(ab3100->dev,
-			"write error (write test register)\n"
-			"  %d bytes transferred (expected 2)\n",
-			err);
-		err = -EIO;
-	} else {
-		/* All is well */
-		err = 0;
-	}
-	mutex_unlock(&ab3100->access_mutex);
-
-	return err;
-}
-
-static int ab3100_get_register_interruptible(struct ab3100 *ab3100,
-					     u8 reg, u8 *regval)
-{
-	int err;
-
-	err = mutex_lock_interruptible(&ab3100->access_mutex);
-	if (err)
-		return err;
-
-	/*
-	 * AB3100 require an I2C "stop" command between each message, else
-	 * it will not work. The only way of achieveing this with the
-	 * message transport layer is to send the read and write messages
-	 * separately.
-	 */
-	err = i2c_master_send(ab3100->i2c_client, &reg, 1);
-	if (err < 0) {
-		dev_err(ab3100->dev,
-			"write error (send register address): %d\n",
-			err);
-		goto get_reg_out_unlock;
-	} else if (err != 1) {
-		dev_err(ab3100->dev,
-			"write error (send register address)\n"
-			"  %d bytes transferred (expected 1)\n",
-			err);
-		err = -EIO;
-		goto get_reg_out_unlock;
-	} else {
-		/* All is well */
-		err = 0;
-	}
-
-	err = i2c_master_recv(ab3100->i2c_client, regval, 1);
-	if (err < 0) {
-		dev_err(ab3100->dev,
-			"write error (read register): %d\n",
-			err);
-		goto get_reg_out_unlock;
-	} else if (err != 1) {
-		dev_err(ab3100->dev,
-			"write error (read register)\n"
-			"  %d bytes transferred (expected 1)\n",
-			err);
-		err = -EIO;
-		goto get_reg_out_unlock;
-	} else {
-		/* All is well */
-		err = 0;
-	}
-
- get_reg_out_unlock:
-	mutex_unlock(&ab3100->access_mutex);
-	return err;
-}
-
-static int get_register_interruptible(struct device *dev, u8 bank, u8 reg,
-				      u8 *value)
-{
-	struct ab3100 *ab3100 = dev_get_drvdata(dev->parent);
-
-	return ab3100_get_register_interruptible(ab3100, reg, value);
-}
-
-static int ab3100_get_register_page_interruptible(struct ab3100 *ab3100,
-			     u8 first_reg, u8 *regvals, u8 numregs)
-{
-	int err;
-
-	if (ab3100->chip_id == 0xa0 ||
-	    ab3100->chip_id == 0xa1)
-		/* These don't support paged reads */
-		return -EIO;
-
-	err = mutex_lock_interruptible(&ab3100->access_mutex);
-	if (err)
-		return err;
-
-	/*
-	 * Paged read also require an I2C "stop" command.
-	 */
-	err = i2c_master_send(ab3100->i2c_client, &first_reg, 1);
-	if (err < 0) {
-		dev_err(ab3100->dev,
-			"write error (send first register address): %d\n",
-			err);
-		goto get_reg_page_out_unlock;
-	} else if (err != 1) {
-		dev_err(ab3100->dev,
-			"write error (send first register address)\n"
-			"  %d bytes transferred (expected 1)\n",
-			err);
-		err = -EIO;
-		goto get_reg_page_out_unlock;
-	}
-
-	err = i2c_master_recv(ab3100->i2c_client, regvals, numregs);
-	if (err < 0) {
-		dev_err(ab3100->dev,
-			"write error (read register page): %d\n",
-			err);
-		goto get_reg_page_out_unlock;
-	} else if (err != numregs) {
-		dev_err(ab3100->dev,
-			"write error (read register page)\n"
-			"  %d bytes transferred (expected %d)\n",
-			err, numregs);
-		err = -EIO;
-		goto get_reg_page_out_unlock;
-	}
-
-	/* All is well */
-	err = 0;
-
- get_reg_page_out_unlock:
-	mutex_unlock(&ab3100->access_mutex);
-	return err;
-}
-
-static int get_register_page_interruptible(struct device *dev, u8 bank,
-	u8 first_reg, u8 *regvals, u8 numregs)
-{
-	struct ab3100 *ab3100 = dev_get_drvdata(dev->parent);
-
-	return ab3100_get_register_page_interruptible(ab3100,
-			first_reg, regvals, numregs);
-}
-
-static int ab3100_mask_and_set_register_interruptible(struct ab3100 *ab3100,
-				 u8 reg, u8 andmask, u8 ormask)
-{
-	u8 regandval[2] = {reg, 0};
-	int err;
-
-	err = mutex_lock_interruptible(&ab3100->access_mutex);
-	if (err)
-		return err;
-
-	/* First read out the target register */
-	err = i2c_master_send(ab3100->i2c_client, &reg, 1);
-	if (err < 0) {
-		dev_err(ab3100->dev,
-			"write error (maskset send address): %d\n",
-			err);
-		goto get_maskset_unlock;
-	} else if (err != 1) {
-		dev_err(ab3100->dev,
-			"write error (maskset send address)\n"
-			"  %d bytes transferred (expected 1)\n",
-			err);
-		err = -EIO;
-		goto get_maskset_unlock;
-	}
-
-	err = i2c_master_recv(ab3100->i2c_client, &regandval[1], 1);
-	if (err < 0) {
-		dev_err(ab3100->dev,
-			"write error (maskset read register): %d\n",
-			err);
-		goto get_maskset_unlock;
-	} else if (err != 1) {
-		dev_err(ab3100->dev,
-			"write error (maskset read register)\n"
-			"  %d bytes transferred (expected 1)\n",
-			err);
-		err = -EIO;
-		goto get_maskset_unlock;
-	}
-
-	/* Modify the register */
-	regandval[1] &= andmask;
-	regandval[1] |= ormask;
-
-	/* Write the register */
-	err = i2c_master_send(ab3100->i2c_client, regandval, 2);
-	if (err < 0) {
-		dev_err(ab3100->dev,
-			"write error (write register): %d\n",
-			err);
-		goto get_maskset_unlock;
-	} else if (err != 2) {
-		dev_err(ab3100->dev,
-			"write error (write register)\n"
-			"  %d bytes transferred (expected 2)\n",
-			err);
-		err = -EIO;
-		goto get_maskset_unlock;
-	}
-
-	/* All is well */
-	err = 0;
-
- get_maskset_unlock:
-	mutex_unlock(&ab3100->access_mutex);
-	return err;
-}
-
-static int mask_and_set_register_interruptible(struct device *dev, u8 bank,
-	u8 reg, u8 bitmask, u8 bitvalues)
-{
-	struct ab3100 *ab3100 = dev_get_drvdata(dev->parent);
-
-	return ab3100_mask_and_set_register_interruptible(ab3100,
-			reg, bitmask, (bitmask & bitvalues));
-}
-
-/*
- * Register a simple callback for handling any AB3100 events.
- */
-int ab3100_event_register(struct ab3100 *ab3100,
-			  struct notifier_block *nb)
-{
-	return blocking_notifier_chain_register(&ab3100->event_subscribers,
-					       nb);
-}
-EXPORT_SYMBOL(ab3100_event_register);
-
-/*
- * Remove a previously registered callback.
- */
-int ab3100_event_unregister(struct ab3100 *ab3100,
-			    struct notifier_block *nb)
-{
-	return blocking_notifier_chain_unregister(&ab3100->event_subscribers,
-					    nb);
-}
-EXPORT_SYMBOL(ab3100_event_unregister);
-
-
-static int ab3100_event_registers_startup_state_get(struct device *dev,
-					     u8 *event)
-{
-	struct ab3100 *ab3100 = dev_get_drvdata(dev->parent);
-
-	if (!ab3100->startup_events_read)
-		return -EAGAIN; /* Try again later */
-	memcpy(event, ab3100->startup_events, 3);
-
-	return 0;
-}
-
-static struct abx500_ops ab3100_ops = {
-	.get_chip_id = ab3100_get_chip_id,
-	.set_register = set_register_interruptible,
-	.get_register = get_register_interruptible,
-	.get_register_page = get_register_page_interruptible,
-	.set_register_page = NULL,
-	.mask_and_set_register = mask_and_set_register_interruptible,
-	.event_registers_startup_state_get =
-		ab3100_event_registers_startup_state_get,
-	.startup_irq_enabled = NULL,
-};
-
-/*
- * This is a threaded interrupt handler so we can make some
- * I2C calls etc.
- */
-static irqreturn_t ab3100_irq_handler(int irq, void *data)
-{
-	struct ab3100 *ab3100 = data;
-	u8 event_regs[3];
-	u32 fatevent;
-	int err;
-
-	err = ab3100_get_register_page_interruptible(ab3100, AB3100_EVENTA1,
-				       event_regs, 3);
-	if (err)
-		goto err_event;
-
-	fatevent = (event_regs[0] << 16) |
-		(event_regs[1] << 8) |
-		event_regs[2];
-
-	if (!ab3100->startup_events_read) {
-		ab3100->startup_events[0] = event_regs[0];
-		ab3100->startup_events[1] = event_regs[1];
-		ab3100->startup_events[2] = event_regs[2];
-		ab3100->startup_events_read = true;
-	}
-	/*
-	 * The notified parties will have to mask out the events
-	 * they're interested in and react to them. They will be
-	 * notified on all events, then they use the fatevent value
-	 * to determine if they're interested.
-	 */
-	blocking_notifier_call_chain(&ab3100->event_subscribers,
-				     fatevent, NULL);
-
-	dev_dbg(ab3100->dev,
-		"IRQ Event: 0x%08x\n", fatevent);
-
-	return IRQ_HANDLED;
-
- err_event:
-	dev_dbg(ab3100->dev,
-		"error reading event status\n");
-	return IRQ_HANDLED;
-}
-
-#ifdef CONFIG_DEBUG_FS
-/*
- * Some debugfs entries only exposed if we're using debug
- */
-static int ab3100_registers_print(struct seq_file *s, void *p)
-{
-	struct ab3100 *ab3100 = s->private;
-	u8 value;
-	u8 reg;
-
-	seq_puts(s, "AB3100 registers:\n");
-
-	for (reg = 0; reg < 0xff; reg++) {
-		ab3100_get_register_interruptible(ab3100, reg, &value);
-		seq_printf(s, "[0x%x]:  0x%x\n", reg, value);
-	}
-	return 0;
-}
-
-static int ab3100_registers_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, ab3100_registers_print, inode->i_private);
-}
-
-static const struct file_operations ab3100_registers_fops = {
-	.open = ab3100_registers_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-	.owner = THIS_MODULE,
-};
-
-struct ab3100_get_set_reg_priv {
-	struct ab3100 *ab3100;
-	bool mode;
-};
-
-static ssize_t ab3100_get_set_reg(struct file *file,
-				  const char __user *user_buf,
-				  size_t count, loff_t *ppos)
-{
-	struct ab3100_get_set_reg_priv *priv = file->private_data;
-	struct ab3100 *ab3100 = priv->ab3100;
-	char buf[32];
-	ssize_t buf_size;
-	int regp;
-	u8 user_reg;
-	int err;
-	int i = 0;
-
-	/* Get userspace string and assure termination */
-	buf_size = min((ssize_t)count, (ssize_t)(sizeof(buf)-1));
-	if (copy_from_user(buf, user_buf, buf_size))
-		return -EFAULT;
-	buf[buf_size] = 0;
-
-	/*
-	 * The idea is here to parse a string which is either
-	 * "0xnn" for reading a register, or "0xaa 0xbb" for
-	 * writing 0xbb to the register 0xaa. First move past
-	 * whitespace and then begin to parse the register.
-	 */
-	while ((i < buf_size) && (buf[i] == ' '))
-		i++;
-	regp = i;
-
-	/*
-	 * Advance pointer to end of string then terminate
-	 * the register string. This is needed to satisfy
-	 * the kstrtou8() function.
-	 */
-	while ((i < buf_size) && (buf[i] != ' '))
-		i++;
-	buf[i] = '\0';
-
-	err = kstrtou8(&buf[regp], 16, &user_reg);
-	if (err)
-		return err;
-
-	/* Either we read or we write a register here */
-	if (!priv->mode) {
-		/* Reading */
-		u8 regvalue;
-
-		ab3100_get_register_interruptible(ab3100, user_reg, &regvalue);
-
-		dev_info(ab3100->dev,
-			 "debug read AB3100 reg[0x%02x]: 0x%02x\n",
-			 user_reg, regvalue);
-	} else {
-		int valp;
-		u8 user_value;
-		u8 regvalue;
-
-		/*
-		 * Writing, we need some value to write to
-		 * the register so keep parsing the string
-		 * from userspace.
-		 */
-		i++;
-		while ((i < buf_size) && (buf[i] == ' '))
-			i++;
-		valp = i;
-		while ((i < buf_size) && (buf[i] != ' '))
-			i++;
-		buf[i] = '\0';
-
-		err = kstrtou8(&buf[valp], 16, &user_value);
-		if (err)
-			return err;
-
-		ab3100_set_register_interruptible(ab3100, user_reg, user_value);
-		ab3100_get_register_interruptible(ab3100, user_reg, &regvalue);
-
-		dev_info(ab3100->dev,
-			 "debug write reg[0x%02x]\n"
-			 "  with 0x%02x, after readback: 0x%02x\n",
-			 user_reg, user_value, regvalue);
-	}
-	return buf_size;
-}
-
-static const struct file_operations ab3100_get_set_reg_fops = {
-	.open = simple_open,
-	.write = ab3100_get_set_reg,
-	.llseek = noop_llseek,
-};
-
-static struct ab3100_get_set_reg_priv ab3100_get_priv;
-static struct ab3100_get_set_reg_priv ab3100_set_priv;
-
-static void ab3100_setup_debugfs(struct ab3100 *ab3100)
-{
-	struct dentry *ab3100_dir;
-
-	ab3100_dir = debugfs_create_dir("ab3100", NULL);
-
-	debugfs_create_file("registers", S_IRUGO, ab3100_dir, ab3100,
-			    &ab3100_registers_fops);
-
-	ab3100_get_priv.ab3100 = ab3100;
-	ab3100_get_priv.mode = false;
-	debugfs_create_file("get_reg", S_IWUSR, ab3100_dir, &ab3100_get_priv,
-			    &ab3100_get_set_reg_fops);
-
-	ab3100_set_priv.ab3100 = ab3100;
-	ab3100_set_priv.mode = true;
-	debugfs_create_file("set_reg", S_IWUSR, ab3100_dir, &ab3100_set_priv,
-			    &ab3100_get_set_reg_fops);
-}
-#else
-static inline void ab3100_setup_debugfs(struct ab3100 *ab3100)
-{
-}
-#endif
-
-/*
- * Basic set-up, datastructure creation/destruction and I2C interface.
- * This sets up a default config in the AB3100 chip so that it
- * will work as expected.
- */
-
-struct ab3100_init_setting {
-	u8 abreg;
-	u8 setting;
-};
-
-static const struct ab3100_init_setting ab3100_init_settings[] = {
-	{
-		.abreg = AB3100_MCA,
-		.setting = 0x01
-	}, {
-		.abreg = AB3100_MCB,
-		.setting = 0x30
-	}, {
-		.abreg = AB3100_IMRA1,
-		.setting = 0x00
-	}, {
-		.abreg = AB3100_IMRA2,
-		.setting = 0xFF
-	}, {
-		.abreg = AB3100_IMRA3,
-		.setting = 0x01
-	}, {
-		.abreg = AB3100_IMRB1,
-		.setting = 0xBF
-	}, {
-		.abreg = AB3100_IMRB2,
-		.setting = 0xFF
-	}, {
-		.abreg = AB3100_IMRB3,
-		.setting = 0xFF
-	}, {
-		.abreg = AB3100_SUP,
-		.setting = 0x00
-	}, {
-		.abreg = AB3100_DIS,
-		.setting = 0xF0
-	}, {
-		.abreg = AB3100_D0C,
-		.setting = 0x00
-	}, {
-		.abreg = AB3100_D1C,
-		.setting = 0x00
-	}, {
-		.abreg = AB3100_D2C,
-		.setting = 0x00
-	}, {
-		.abreg = AB3100_D3C,
-		.setting = 0x00
-	},
-};
-
-static int ab3100_setup(struct ab3100 *ab3100)
-{
-	int err = 0;
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(ab3100_init_settings); i++) {
-		err = ab3100_set_register_interruptible(ab3100,
-					  ab3100_init_settings[i].abreg,
-					  ab3100_init_settings[i].setting);
-		if (err)
-			goto exit_no_setup;
-	}
-
-	/*
-	 * Special trick to make the AB3100 use the 32kHz clock (RTC)
-	 * bit 3 in test register 0x02 is a special, undocumented test
-	 * register bit that only exist in AB3100 P1E
-	 */
-	if (ab3100->chip_id == 0xc4) {
-		dev_warn(ab3100->dev,
-			 "AB3100 P1E variant detected forcing chip to 32KHz\n");
-		err = ab3100_set_test_register_interruptible(ab3100,
-			0x02, 0x08);
-	}
-
- exit_no_setup:
-	return err;
-}
-
-/* The subdevices of the AB3100 */
-static struct mfd_cell ab3100_devs[] = {
-	{
-		.name = "ab3100-dac",
-		.id = -1,
-	},
-	{
-		.name = "ab3100-leds",
-		.id = -1,
-	},
-	{
-		.name = "ab3100-power",
-		.id = -1,
-	},
-	{
-		.name = "ab3100-regulators",
-		.of_compatible = "stericsson,ab3100-regulators",
-		.id = -1,
-	},
-	{
-		.name = "ab3100-sim",
-		.id = -1,
-	},
-	{
-		.name = "ab3100-uart",
-		.id = -1,
-	},
-	{
-		.name = "ab3100-rtc",
-		.id = -1,
-	},
-	{
-		.name = "ab3100-charger",
-		.id = -1,
-	},
-	{
-		.name = "ab3100-boost",
-		.id = -1,
-	},
-	{
-		.name = "ab3100-adc",
-		.id = -1,
-	},
-	{
-		.name = "ab3100-fuelgauge",
-		.id = -1,
-	},
-	{
-		.name = "ab3100-vibrator",
-		.id = -1,
-	},
-	{
-		.name = "ab3100-otp",
-		.id = -1,
-	},
-	{
-		.name = "ab3100-codec",
-		.id = -1,
-	},
-};
-
-struct ab_family_id {
-	u8	id;
-	char	*name;
-};
-
-static const struct ab_family_id ids[] = {
-	/* AB3100 */
-	{
-		.id = 0xc0,
-		.name = "P1A"
-	}, {
-		.id = 0xc1,
-		.name = "P1B"
-	}, {
-		.id = 0xc2,
-		.name = "P1C"
-	}, {
-		.id = 0xc3,
-		.name = "P1D"
-	}, {
-		.id = 0xc4,
-		.name = "P1E"
-	}, {
-		.id = 0xc5,
-		.name = "P1F/R1A"
-	}, {
-		.id = 0xc6,
-		.name = "P1G/R1A"
-	}, {
-		.id = 0xc7,
-		.name = "P2A/R2A"
-	}, {
-		.id = 0xc8,
-		.name = "P2B/R2B"
-	},
-	/* AB3000 variants, not supported */
-	{
-		.id = 0xa0
-	}, {
-		.id = 0xa1
-	}, {
-		.id = 0xa2
-	}, {
-		.id = 0xa3
-	}, {
-		.id = 0xa4
-	}, {
-		.id = 0xa5
-	}, {
-		.id = 0xa6
-	}, {
-		.id = 0xa7
-	},
-	/* Terminator */
-	{
-		.id = 0x00,
-	},
-};
-
-static int ab3100_probe(struct i2c_client *client,
-				  const struct i2c_device_id *id)
-{
-	struct ab3100 *ab3100;
-	struct ab3100_platform_data *ab3100_plf_data =
-		dev_get_platdata(&client->dev);
-	int err;
-	int i;
-
-	ab3100 = devm_kzalloc(&client->dev, sizeof(struct ab3100), GFP_KERNEL);
-	if (!ab3100)
-		return -ENOMEM;
-
-	/* Initialize data structure */
-	mutex_init(&ab3100->access_mutex);
-	BLOCKING_INIT_NOTIFIER_HEAD(&ab3100->event_subscribers);
-
-	ab3100->i2c_client = client;
-	ab3100->dev = &ab3100->i2c_client->dev;
-
-	i2c_set_clientdata(client, ab3100);
-
-	/* Read chip ID register */
-	err = ab3100_get_register_interruptible(ab3100, AB3100_CID,
-						&ab3100->chip_id);
-	if (err) {
-		dev_err(&client->dev,
-			"failed to communicate with AB3100 chip\n");
-		goto exit_no_detect;
-	}
-
-	for (i = 0; ids[i].id != 0x0; i++) {
-		if (ids[i].id == ab3100->chip_id) {
-			if (ids[i].name)
-				break;
-
-			dev_err(&client->dev, "AB3000 is not supported\n");
-			goto exit_no_detect;
-		}
-	}
-
-	snprintf(&ab3100->chip_name[0],
-		 sizeof(ab3100->chip_name) - 1, "AB3100 %s", ids[i].name);
-
-	if (ids[i].id == 0x0) {
-		dev_err(&client->dev, "unknown analog baseband chip id: 0x%x\n",
-			ab3100->chip_id);
-		dev_err(&client->dev,
-			"accepting it anyway. Please update the driver.\n");
-		goto exit_no_detect;
-	}
-
-	dev_info(&client->dev, "Detected chip: %s\n",
-		 &ab3100->chip_name[0]);
-
-	/* Attach a second dummy i2c_client to the test register address */
-	ab3100->testreg_client = i2c_new_dummy_device(client->adapter,
-					       client->addr + 1);
-	if (IS_ERR(ab3100->testreg_client)) {
-		err = PTR_ERR(ab3100->testreg_client);
-		goto exit_no_testreg_client;
-	}
-
-	err = ab3100_setup(ab3100);
-	if (err)
-		goto exit_no_setup;
-
-	err = devm_request_threaded_irq(&client->dev,
-					client->irq, NULL, ab3100_irq_handler,
-					IRQF_ONESHOT, "ab3100-core", ab3100);
-	if (err)
-		goto exit_no_irq;
-
-	err = abx500_register_ops(&client->dev, &ab3100_ops);
-	if (err)
-		goto exit_no_ops;
-
-	/* Set up and register the platform devices. */
-	for (i = 0; i < ARRAY_SIZE(ab3100_devs); i++) {
-		ab3100_devs[i].platform_data = ab3100_plf_data;
-		ab3100_devs[i].pdata_size = sizeof(struct ab3100_platform_data);
-	}
-
-	err = mfd_add_devices(&client->dev, 0, ab3100_devs,
-			      ARRAY_SIZE(ab3100_devs), NULL, 0, NULL);
-
-	ab3100_setup_debugfs(ab3100);
-
-	return 0;
-
- exit_no_ops:
- exit_no_irq:
- exit_no_setup:
-	i2c_unregister_device(ab3100->testreg_client);
- exit_no_testreg_client:
- exit_no_detect:
-	return err;
-}
-
-static const struct i2c_device_id ab3100_id[] = {
-	{ "ab3100", 0 },
-	{ }
-};
-
-static struct i2c_driver ab3100_driver = {
-	.driver = {
-		.name			= "ab3100",
-		.suppress_bind_attrs	= true,
-	},
-	.id_table	= ab3100_id,
-	.probe		= ab3100_probe,
-};
-
-static int __init ab3100_i2c_init(void)
-{
-	return i2c_add_driver(&ab3100_driver);
-}
-subsys_initcall(ab3100_i2c_init);
diff --git a/drivers/mfd/ab3100-otp.c b/drivers/mfd/ab3100-otp.c
deleted file mode 100644
index c393102e3a39..000000000000
--- a/drivers/mfd/ab3100-otp.c
+++ /dev/null
@@ -1,240 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * drivers/mfd/ab3100_otp.c
- *
- * Copyright (C) 2007-2009 ST-Ericsson AB
- * Driver to read out OTP from the AB3100 Mixed-signal circuit
- * Author: Linus Walleij <linus.walleij@stericsson.com>
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/platform_device.h>
-#include <linux/mfd/abx500.h>
-#include <linux/debugfs.h>
-#include <linux/seq_file.h>
-
-/* The OTP registers */
-#define AB3100_OTP0		0xb0
-#define AB3100_OTP1		0xb1
-#define AB3100_OTP2		0xb2
-#define AB3100_OTP3		0xb3
-#define AB3100_OTP4		0xb4
-#define AB3100_OTP5		0xb5
-#define AB3100_OTP6		0xb6
-#define AB3100_OTP7		0xb7
-#define AB3100_OTPP		0xbf
-
-/**
- * struct ab3100_otp
- * @dev: containing device
- * @locked: whether the OTP is locked, after locking, no more bits
- *       can be changed but before locking it is still possible
- *       to change bits from 1->0.
- * @freq: clocking frequency for the OTP, this frequency is either
- *       32768Hz or 1MHz/30
- * @paf: product activation flag, indicates whether this is a real
- *       product (paf true) or a lab board etc (paf false)
- * @imeich: if this is set it is possible to override the
- *       IMEI number found in the tac, fac and svn fields with
- *       (secured) software
- * @cid: customer ID
- * @tac: type allocation code of the IMEI
- * @fac: final assembly code of the IMEI
- * @svn: software version number of the IMEI
- * @debugfs: a debugfs file used when dumping to file
- */
-struct ab3100_otp {
-	struct device *dev;
-	bool locked;
-	u32 freq;
-	bool paf;
-	bool imeich;
-	u16 cid:14;
-	u32 tac:20;
-	u8 fac;
-	u32 svn:20;
-	struct dentry *debugfs;
-};
-
-static int __init ab3100_otp_read(struct ab3100_otp *otp)
-{
-	u8 otpval[8];
-	u8 otpp;
-	int err;
-
-	err = abx500_get_register_interruptible(otp->dev, 0,
-		AB3100_OTPP, &otpp);
-	if (err) {
-		dev_err(otp->dev, "unable to read OTPP register\n");
-		return err;
-	}
-
-	err = abx500_get_register_page_interruptible(otp->dev, 0,
-		AB3100_OTP0, otpval, 8);
-	if (err) {
-		dev_err(otp->dev, "unable to read OTP register page\n");
-		return err;
-	}
-
-	/* Cache OTP properties, they never change by nature */
-	otp->locked = (otpp & 0x80);
-	otp->freq = (otpp & 0x40) ? 32768 : 34100;
-	otp->paf = (otpval[1] & 0x80);
-	otp->imeich = (otpval[1] & 0x40);
-	otp->cid = ((otpval[1] << 8) | otpval[0]) & 0x3fff;
-	otp->tac = ((otpval[4] & 0x0f) << 16) | (otpval[3] << 8) | otpval[2];
-	otp->fac = ((otpval[5] & 0x0f) << 4) | (otpval[4] >> 4);
-	otp->svn = (otpval[7] << 12) | (otpval[6] << 4) | (otpval[5] >> 4);
-	return 0;
-}
-
-/*
- * This is a simple debugfs human-readable file that dumps out
- * the contents of the OTP.
- */
-#ifdef CONFIG_DEBUG_FS
-static int ab3100_show_otp(struct seq_file *s, void *v)
-{
-	struct ab3100_otp *otp = s->private;
-
-	seq_printf(s, "OTP is %s\n", otp->locked ? "LOCKED" : "UNLOCKED");
-	seq_printf(s, "OTP clock switch startup is %uHz\n", otp->freq);
-	seq_printf(s, "PAF is %s\n", otp->paf ? "SET" : "NOT SET");
-	seq_printf(s, "IMEI is %s\n", otp->imeich ?
-		   "CHANGEABLE" : "NOT CHANGEABLE");
-	seq_printf(s, "CID: 0x%04x (decimal: %d)\n", otp->cid, otp->cid);
-	seq_printf(s, "IMEI: %u-%u-%u\n", otp->tac, otp->fac, otp->svn);
-	return 0;
-}
-
-static int ab3100_otp_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, ab3100_show_otp, inode->i_private);
-}
-
-static const struct file_operations ab3100_otp_operations = {
-	.open		= ab3100_otp_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-static void __init ab3100_otp_init_debugfs(struct device *dev,
-					   struct ab3100_otp *otp)
-{
-	otp->debugfs = debugfs_create_file("ab3100_otp", S_IFREG | S_IRUGO,
-					   NULL, otp, &ab3100_otp_operations);
-}
-
-static void __exit ab3100_otp_exit_debugfs(struct ab3100_otp *otp)
-{
-	debugfs_remove(otp->debugfs);
-}
-#else
-/* Compile this out if debugfs not selected */
-static inline void __init ab3100_otp_init_debugfs(struct device *dev,
-						  struct ab3100_otp *otp)
-{
-}
-
-static inline void __exit ab3100_otp_exit_debugfs(struct ab3100_otp *otp)
-{
-}
-#endif
-
-#define SHOW_AB3100_ATTR(name) \
-static ssize_t ab3100_otp_##name##_show(struct device *dev, \
-			       struct device_attribute *attr, \
-			       char *buf) \
-{\
-	struct ab3100_otp *otp = dev_get_drvdata(dev); \
-	return sprintf(buf, "%u\n", otp->name); \
-}
-
-SHOW_AB3100_ATTR(locked)
-SHOW_AB3100_ATTR(freq)
-SHOW_AB3100_ATTR(paf)
-SHOW_AB3100_ATTR(imeich)
-SHOW_AB3100_ATTR(cid)
-SHOW_AB3100_ATTR(fac)
-SHOW_AB3100_ATTR(tac)
-SHOW_AB3100_ATTR(svn)
-
-static struct device_attribute ab3100_otp_attrs[] = {
-	__ATTR(locked, S_IRUGO, ab3100_otp_locked_show, NULL),
-	__ATTR(freq, S_IRUGO, ab3100_otp_freq_show, NULL),
-	__ATTR(paf, S_IRUGO, ab3100_otp_paf_show, NULL),
-	__ATTR(imeich, S_IRUGO, ab3100_otp_imeich_show, NULL),
-	__ATTR(cid, S_IRUGO, ab3100_otp_cid_show, NULL),
-	__ATTR(fac, S_IRUGO, ab3100_otp_fac_show, NULL),
-	__ATTR(tac, S_IRUGO, ab3100_otp_tac_show, NULL),
-	__ATTR(svn, S_IRUGO, ab3100_otp_svn_show, NULL),
-};
-
-static int __init ab3100_otp_probe(struct platform_device *pdev)
-{
-	struct ab3100_otp *otp;
-	int err = 0;
-	int i;
-
-	otp = devm_kzalloc(&pdev->dev, sizeof(struct ab3100_otp), GFP_KERNEL);
-	if (!otp)
-		return -ENOMEM;
-
-	otp->dev = &pdev->dev;
-
-	/* Replace platform data coming in with a local struct */
-	platform_set_drvdata(pdev, otp);
-
-	err = ab3100_otp_read(otp);
-	if (err)
-		return err;
-
-	dev_info(&pdev->dev, "AB3100 OTP readout registered\n");
-
-	/* sysfs entries */
-	for (i = 0; i < ARRAY_SIZE(ab3100_otp_attrs); i++) {
-		err = device_create_file(&pdev->dev,
-					 &ab3100_otp_attrs[i]);
-		if (err)
-			goto err;
-	}
-
-	/* debugfs entries */
-	ab3100_otp_init_debugfs(&pdev->dev, otp);
-
-	return 0;
-
-err:
-	while (--i >= 0)
-		device_remove_file(&pdev->dev, &ab3100_otp_attrs[i]);
-	return err;
-}
-
-static int __exit ab3100_otp_remove(struct platform_device *pdev)
-{
-	struct ab3100_otp *otp = platform_get_drvdata(pdev);
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(ab3100_otp_attrs); i++)
-		device_remove_file(&pdev->dev,
-				   &ab3100_otp_attrs[i]);
-	ab3100_otp_exit_debugfs(otp);
-	return 0;
-}
-
-static struct platform_driver ab3100_otp_driver = {
-	.driver = {
-		.name = "ab3100-otp",
-	},
-	.remove	 = __exit_p(ab3100_otp_remove),
-};
-
-module_platform_driver_probe(ab3100_otp_driver, ab3100_otp_probe);
-
-MODULE_AUTHOR("Linus Walleij <linus.walleij@stericsson.com>");
-MODULE_DESCRIPTION("AB3100 OTP Readout Driver");
-MODULE_LICENSE("GPL");
diff --git a/include/linux/mfd/ab3100.h b/include/linux/mfd/ab3100.h
deleted file mode 100644
index a881d8495186..000000000000
--- a/include/linux/mfd/ab3100.h
+++ /dev/null
@@ -1,128 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2007-2009 ST-Ericsson AB
- * AB3100 core access functions
- * Author: Linus Walleij <linus.walleij@stericsson.com>
- */
-
-#include <linux/regulator/machine.h>
-
-struct device;
-
-#ifndef MFD_AB3100_H
-#define MFD_AB3100_H
-
-
-#define AB3100_P1A	0xc0
-#define AB3100_P1B	0xc1
-#define AB3100_P1C	0xc2
-#define AB3100_P1D	0xc3
-#define AB3100_P1E	0xc4
-#define AB3100_P1F	0xc5
-#define AB3100_P1G	0xc6
-#define AB3100_R2A	0xc7
-#define AB3100_R2B	0xc8
-
-/*
- * AB3100, EVENTA1, A2 and A3 event register flags
- * these are catenated into a single 32-bit flag in the code
- * for event notification broadcasts.
- */
-#define AB3100_EVENTA1_ONSWA				(0x01<<16)
-#define AB3100_EVENTA1_ONSWB				(0x02<<16)
-#define AB3100_EVENTA1_ONSWC				(0x04<<16)
-#define AB3100_EVENTA1_DCIO				(0x08<<16)
-#define AB3100_EVENTA1_OVER_TEMP			(0x10<<16)
-#define AB3100_EVENTA1_SIM_OFF				(0x20<<16)
-#define AB3100_EVENTA1_VBUS				(0x40<<16)
-#define AB3100_EVENTA1_VSET_USB				(0x80<<16)
-
-#define AB3100_EVENTA2_READY_TX				(0x01<<8)
-#define AB3100_EVENTA2_READY_RX				(0x02<<8)
-#define AB3100_EVENTA2_OVERRUN_ERROR			(0x04<<8)
-#define AB3100_EVENTA2_FRAMING_ERROR			(0x08<<8)
-#define AB3100_EVENTA2_CHARG_OVERCURRENT		(0x10<<8)
-#define AB3100_EVENTA2_MIDR				(0x20<<8)
-#define AB3100_EVENTA2_BATTERY_REM			(0x40<<8)
-#define AB3100_EVENTA2_ALARM				(0x80<<8)
-
-#define AB3100_EVENTA3_ADC_TRIG5			(0x01)
-#define AB3100_EVENTA3_ADC_TRIG4			(0x02)
-#define AB3100_EVENTA3_ADC_TRIG3			(0x04)
-#define AB3100_EVENTA3_ADC_TRIG2			(0x08)
-#define AB3100_EVENTA3_ADC_TRIGVBAT			(0x10)
-#define AB3100_EVENTA3_ADC_TRIGVTX			(0x20)
-#define AB3100_EVENTA3_ADC_TRIG1			(0x40)
-#define AB3100_EVENTA3_ADC_TRIG0			(0x80)
-
-/* AB3100, STR register flags */
-#define AB3100_STR_ONSWA				(0x01)
-#define AB3100_STR_ONSWB				(0x02)
-#define AB3100_STR_ONSWC				(0x04)
-#define AB3100_STR_DCIO					(0x08)
-#define AB3100_STR_BOOT_MODE				(0x10)
-#define AB3100_STR_SIM_OFF				(0x20)
-#define AB3100_STR_BATT_REMOVAL				(0x40)
-#define AB3100_STR_VBUS					(0x80)
-
-/*
- * AB3100 contains 8 regulators, one external regulator controller
- * and a buck converter, further the LDO E and buck converter can
- * have separate settings if they are in sleep mode, this is
- * modeled as a separate regulator.
- */
-#define AB3100_NUM_REGULATORS				10
-
-/**
- * struct ab3100
- * @access_mutex: lock out concurrent accesses to the AB3100 registers
- * @dev: pointer to the containing device
- * @i2c_client: I2C client for this chip
- * @testreg_client: secondary client for test registers
- * @chip_name: name of this chip variant
- * @chip_id: 8 bit chip ID for this chip variant
- * @event_subscribers: event subscribers are listed here
- * @startup_events: a copy of the first reading of the event registers
- * @startup_events_read: whether the first events have been read
- *
- * This struct is PRIVATE and devices using it should NOT
- * access ANY fields. It is used as a token for calling the
- * AB3100 functions.
- */
-struct ab3100 {
-	struct mutex access_mutex;
-	struct device *dev;
-	struct i2c_client *i2c_client;
-	struct i2c_client *testreg_client;
-	char chip_name[32];
-	u8 chip_id;
-	struct blocking_notifier_head event_subscribers;
-	u8 startup_events[3];
-	bool startup_events_read;
-};
-
-/**
- * struct ab3100_platform_data
- * Data supplied to initialize board connections to the AB3100
- * @reg_constraints: regulator constraints for target board
- *     the order of these constraints are: LDO A, C, D, E,
- *     F, G, H, K, EXT and BUCK.
- * @reg_initvals: initial values for the regulator registers
- *     plus two sleep settings for LDO E and the BUCK converter.
- *     exactly AB3100_NUM_REGULATORS+2 values must be sent in.
- *     Order: LDO A, C, E, E sleep, F, G, H, K, EXT, BUCK,
- *     BUCK sleep, LDO D. (LDO D need to be initialized last.)
- * @external_voltage: voltage level of the external regulator.
- */
-struct ab3100_platform_data {
-	struct regulator_init_data reg_constraints[AB3100_NUM_REGULATORS];
-	u8 reg_initvals[AB3100_NUM_REGULATORS+2];
-	int external_voltage;
-};
-
-int ab3100_event_register(struct ab3100 *ab3100,
-			  struct notifier_block *nb);
-int ab3100_event_unregister(struct ab3100 *ab3100,
-			    struct notifier_block *nb);
-
-#endif /*  MFD_AB3100_H */
-- 
cgit v1.2.3


From d9b326b2c3673f939941806146aee38e5c635fd0 Mon Sep 17 00:00:00 2001
From: Xu Yilun <yilun.xu@intel.com>
Date: Wed, 10 Mar 2021 23:55:45 +0800
Subject: mfd: intel-m10-bmc: Fix the register access range

This patch fixes the max register address of the MAX 10 BMC. The range
0x20000000 ~ 0x200000fc is for control registers of the QSPI flash
controller, which are not accessible to the host.

Signed-off-by: Xu Yilun <yilun.xu@intel.com>
Reviewed-by: Tom Rix <trix@redhat.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/intel-m10-bmc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/intel-m10-bmc.h b/include/linux/mfd/intel-m10-bmc.h
index 74d4e193966a..9b54ca13eac3 100644
--- a/include/linux/mfd/intel-m10-bmc.h
+++ b/include/linux/mfd/intel-m10-bmc.h
@@ -11,7 +11,7 @@
 
 #define M10BMC_LEGACY_SYS_BASE		0x300400
 #define M10BMC_SYS_BASE			0x300800
-#define M10BMC_MEM_END			0x200000fc
+#define M10BMC_MEM_END			0x1fffffff
 
 /* Register offset of system registers */
 #define NIOS2_FW_VERSION		0x0
-- 
cgit v1.2.3


From 5893f4d1f43036664010e3ae1d3f7a98b2165a5d Mon Sep 17 00:00:00 2001
From: Xu Yilun <yilun.xu@intel.com>
Date: Wed, 10 Mar 2021 23:55:46 +0800
Subject: mfd: intel-m10-bmc: Simplify the legacy version reg definition

The version register is the only one in the legacy I/O space to be
accessed, so it is not necessary to define the legacy base & version
register offset. A direct definition of the legacy version register
address would be fine.

Signed-off-by: Xu Yilun <yilun.xu@intel.com>
Reviewed-by: Tom Rix <trix@redhat.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/intel-m10-bmc.c       | 17 +++++++----------
 include/linux/mfd/intel-m10-bmc.h |  2 +-
 2 files changed, 8 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/intel-m10-bmc.c b/drivers/mfd/intel-m10-bmc.c
index 06c977519479..90d0448a5500 100644
--- a/drivers/mfd/intel-m10-bmc.c
+++ b/drivers/mfd/intel-m10-bmc.c
@@ -116,17 +116,14 @@ static int check_m10bmc_version(struct intel_m10bmc *ddata)
 	int ret;
 
 	/*
-	 * This check is to filter out the very old legacy BMC versions,
-	 * M10BMC_LEGACY_SYS_BASE is the offset to this old block of mmio
-	 * registers. In the old BMC chips, the BMC version info is stored
-	 * in this old version register (M10BMC_LEGACY_SYS_BASE +
-	 * M10BMC_BUILD_VER), so its read out value would have not been
-	 * LEGACY_INVALID (0xffffffff). But in new BMC chips that the
-	 * driver supports, the value of this register should be
-	 * LEGACY_INVALID.
+	 * This check is to filter out the very old legacy BMC versions. In the
+	 * old BMC chips, the BMC version info is stored in the old version
+	 * register (M10BMC_LEGACY_BUILD_VER), so its read out value would have
+	 * not been M10BMC_VER_LEGACY_INVALID (0xffffffff). But in new BMC
+	 * chips that the driver supports, the value of this register should be
+	 * M10BMC_VER_LEGACY_INVALID.
 	 */
-	ret = m10bmc_raw_read(ddata,
-			      M10BMC_LEGACY_SYS_BASE + M10BMC_BUILD_VER, &v);
+	ret = m10bmc_raw_read(ddata, M10BMC_LEGACY_BUILD_VER, &v);
 	if (ret)
 		return -ENODEV;
 
diff --git a/include/linux/mfd/intel-m10-bmc.h b/include/linux/mfd/intel-m10-bmc.h
index 9b54ca13eac3..4f1071febb9e 100644
--- a/include/linux/mfd/intel-m10-bmc.h
+++ b/include/linux/mfd/intel-m10-bmc.h
@@ -9,7 +9,7 @@
 
 #include <linux/regmap.h>
 
-#define M10BMC_LEGACY_SYS_BASE		0x300400
+#define M10BMC_LEGACY_BUILD_VER		0x300468
 #define M10BMC_SYS_BASE			0x300800
 #define M10BMC_MEM_END			0x1fffffff
 
-- 
cgit v1.2.3


From 8169f74ca6f318f4187536050d2f5408fce9c264 Mon Sep 17 00:00:00 2001
From: Matthew Gerlach <matthew.gerlach@linux.intel.com>
Date: Wed, 10 Mar 2021 23:55:47 +0800
Subject: mfd: intel-m10-bmc: Add access table configuration to the regmap

This patch adds access tables to the MAX 10 BMC regmap. This prevents
the host from accessing the unwanted I/O space. It also filters out the
invalid outputs when reading the regmap debugfs interface.

Signed-off-by: Matthew Gerlach <matthew.gerlach@linux.intel.com>
Signed-off-by: Xu Yilun <yilun.xu@intel.com>
Reviewed-by: Tom Rix <trix@redhat.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/intel-m10-bmc.c       | 13 +++++++++++++
 include/linux/mfd/intel-m10-bmc.h |  5 ++++-
 2 files changed, 17 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/intel-m10-bmc.c b/drivers/mfd/intel-m10-bmc.c
index 90d0448a5500..1161933cbd4a 100644
--- a/drivers/mfd/intel-m10-bmc.c
+++ b/drivers/mfd/intel-m10-bmc.c
@@ -23,10 +23,23 @@ static struct mfd_cell m10bmc_pacn3000_subdevs[] = {
 	{ .name = "n3000bmc-secure" },
 };
 
+static const struct regmap_range m10bmc_regmap_range[] = {
+	regmap_reg_range(M10BMC_LEGACY_BUILD_VER, M10BMC_LEGACY_BUILD_VER),
+	regmap_reg_range(M10BMC_SYS_BASE, M10BMC_SYS_END),
+	regmap_reg_range(M10BMC_FLASH_BASE, M10BMC_FLASH_END),
+};
+
+static const struct regmap_access_table m10bmc_access_table = {
+	.yes_ranges	= m10bmc_regmap_range,
+	.n_yes_ranges	= ARRAY_SIZE(m10bmc_regmap_range),
+};
+
 static struct regmap_config intel_m10bmc_regmap_config = {
 	.reg_bits = 32,
 	.val_bits = 32,
 	.reg_stride = 4,
+	.wr_table = &m10bmc_access_table,
+	.rd_table = &m10bmc_access_table,
 	.max_register = M10BMC_MEM_END,
 };
 
diff --git a/include/linux/mfd/intel-m10-bmc.h b/include/linux/mfd/intel-m10-bmc.h
index 4f1071febb9e..c4eb38c13eda 100644
--- a/include/linux/mfd/intel-m10-bmc.h
+++ b/include/linux/mfd/intel-m10-bmc.h
@@ -11,7 +11,10 @@
 
 #define M10BMC_LEGACY_BUILD_VER		0x300468
 #define M10BMC_SYS_BASE			0x300800
-#define M10BMC_MEM_END			0x1fffffff
+#define M10BMC_SYS_END			0x300fff
+#define M10BMC_FLASH_BASE		0x10000000
+#define M10BMC_FLASH_END		0x1fffffff
+#define M10BMC_MEM_END			M10BMC_FLASH_END
 
 /* Register offset of system registers */
 #define NIOS2_FW_VERSION		0x0
-- 
cgit v1.2.3


From 586478bfc9f7e16504d6f64cf18bcbdf6fd0cbc9 Mon Sep 17 00:00:00 2001
From: Hubert Streidl <hubert.streidl@de.bosch.com>
Date: Tue, 16 Mar 2021 17:22:37 +0100
Subject: mfd: da9063: Support SMBus and I2C mode

By default the PMIC DA9063 2-wire interface is SMBus compliant. This
means the PMIC will automatically reset the interface when the clock
signal ceases for more than the SMBus timeout of 35 ms.

If the I2C driver / device is not capable of creating atomic I2C
transactions, a context change can cause a ceasing of the clock signal.
This can happen if for example a real-time thread is scheduled. Then
the DA9063 in SMBus mode will reset the 2-wire interface. Subsequently
a write message could end up in the wrong register. This could cause
unpredictable system behavior.

The DA9063 PMIC also supports an I2C compliant mode for the 2-wire
interface. This mode does not reset the interface when the clock
signal ceases. Thus the problem depicted above does not occur.

This patch tests for the bus functionality "I2C_FUNC_I2C". It can
reasonably be assumed that the bus cannot obey SMBus timings if
this functionality is set. SMBus commands most probably are emulated
in this case which is prone to the latency issue described above.

This patch enables the I2C bus mode if I2C_FUNC_I2C is set or
otherwise keeps the default SMBus mode.

Signed-off-by: Hubert Streidl <hubert.streidl@de.bosch.com>
Signed-off-by: Mark Jonas <mark.jonas@de.bosch.com>
Reviewed-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/da9063-i2c.c             | 10 ++++++++++
 include/linux/mfd/da9063/registers.h |  3 +++
 2 files changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mfd/da9063-i2c.c b/drivers/mfd/da9063-i2c.c
index 3781d0bb7786..783a14af18e2 100644
--- a/drivers/mfd/da9063-i2c.c
+++ b/drivers/mfd/da9063-i2c.c
@@ -442,6 +442,16 @@ static int da9063_i2c_probe(struct i2c_client *i2c,
 		return ret;
 	}
 
+	/* If SMBus is not available and only I2C is possible, enter I2C mode */
+	if (i2c_check_functionality(i2c->adapter, I2C_FUNC_I2C)) {
+		ret = regmap_clear_bits(da9063->regmap, DA9063_REG_CONFIG_J,
+					DA9063_TWOWIRE_TO);
+		if (ret < 0) {
+			dev_err(da9063->dev, "Failed to set Two-Wire Bus Mode.\n");
+			return -EIO;
+		}
+	}
+
 	return da9063_device_init(da9063, i2c->irq);
 }
 
diff --git a/include/linux/mfd/da9063/registers.h b/include/linux/mfd/da9063/registers.h
index 1dbabf1b3cb8..6e0f66a2e727 100644
--- a/include/linux/mfd/da9063/registers.h
+++ b/include/linux/mfd/da9063/registers.h
@@ -1037,6 +1037,9 @@
 #define		DA9063_NONKEY_PIN_AUTODOWN	0x02
 #define		DA9063_NONKEY_PIN_AUTOFLPRT	0x03
 
+/* DA9063_REG_CONFIG_J (addr=0x10F) */
+#define DA9063_TWOWIRE_TO			0x40
+
 /* DA9063_REG_MON_REG_5 (addr=0x116) */
 #define DA9063_MON_A8_IDX_MASK			0x07
 #define		DA9063_MON_A8_IDX_NONE		0x00
-- 
cgit v1.2.3


From 42e59982917a25ad254b74e6e8decee5e684763d Mon Sep 17 00:00:00 2001
From: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Date: Mon, 1 Mar 2021 16:42:19 +0200
Subject: mfd: core: Add support for software nodes

The old device property API is going to be removed and
replaced with the newer software node API. This prepares the MFD
subsystem for the transition.

Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/mfd-core.c   | 9 +++++++++
 include/linux/mfd/core.h | 4 ++++
 2 files changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mfd/mfd-core.c b/drivers/mfd/mfd-core.c
index fc00aaccb5f7..e24008b94aac 100644
--- a/drivers/mfd/mfd-core.c
+++ b/drivers/mfd/mfd-core.c
@@ -244,6 +244,12 @@ static int mfd_add_device(struct device *parent, int id,
 			goto fail_of_entry;
 	}
 
+	if (cell->swnode) {
+		ret = device_add_software_node(&pdev->dev, cell->swnode);
+		if (ret)
+			goto fail_of_entry;
+	}
+
 	for (r = 0; r < cell->num_resources; r++) {
 		res[r].name = cell->resources[r].name;
 		res[r].flags = cell->resources[r].flags;
@@ -304,6 +310,7 @@ fail_of_entry:
 			list_del(&of_entry->list);
 			kfree(of_entry);
 		}
+	device_remove_software_node(&pdev->dev);
 fail_alias:
 	regulator_bulk_unregister_supply_alias(&pdev->dev,
 					       cell->parent_supplies,
@@ -372,6 +379,8 @@ static int mfd_remove_devices_fn(struct device *dev, void *data)
 	regulator_bulk_unregister_supply_alias(dev, cell->parent_supplies,
 					       cell->num_parent_supplies);
 
+	device_remove_software_node(&pdev->dev);
+
 	platform_device_unregister(pdev);
 	return 0;
 }
diff --git a/include/linux/mfd/core.h b/include/linux/mfd/core.h
index 2009c4b936d9..9ec599167fe6 100644
--- a/include/linux/mfd/core.h
+++ b/include/linux/mfd/core.h
@@ -50,6 +50,7 @@
 #define MFD_DEP_LEVEL_HIGH 1
 
 struct irq_domain;
+struct software_node;
 struct property_entry;
 
 /* Matches ACPI PNP id, either _HID or _CID, or ACPI _ADR */
@@ -78,6 +79,9 @@ struct mfd_cell {
 	void			*platform_data;
 	size_t			pdata_size;
 
+	/* Software node for the device. */
+	const struct software_node *swnode;
+
 	/* device properties passed to the sub devices drivers */
 	const struct property_entry *properties;
 
-- 
cgit v1.2.3


From b4a66acc0997cff7cb9a4c3992e97808700aa1ff Mon Sep 17 00:00:00 2001
From: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Date: Mon, 1 Mar 2021 16:42:22 +0200
Subject: mfd: core: Remove support for dangling device properties

From now on, only complete software nodes are accepted.

Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/mfd-core.c   | 6 ------
 include/linux/mfd/core.h | 4 ----
 2 files changed, 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/mfd-core.c b/drivers/mfd/mfd-core.c
index e24008b94aac..6dcff4b8e15e 100644
--- a/drivers/mfd/mfd-core.c
+++ b/drivers/mfd/mfd-core.c
@@ -238,12 +238,6 @@ static int mfd_add_device(struct device *parent, int id,
 			goto fail_of_entry;
 	}
 
-	if (cell->properties) {
-		ret = platform_device_add_properties(pdev, cell->properties);
-		if (ret)
-			goto fail_of_entry;
-	}
-
 	if (cell->swnode) {
 		ret = device_add_software_node(&pdev->dev, cell->swnode);
 		if (ret)
diff --git a/include/linux/mfd/core.h b/include/linux/mfd/core.h
index 9ec599167fe6..0bc7cba798a3 100644
--- a/include/linux/mfd/core.h
+++ b/include/linux/mfd/core.h
@@ -51,7 +51,6 @@
 
 struct irq_domain;
 struct software_node;
-struct property_entry;
 
 /* Matches ACPI PNP id, either _HID or _CID, or ACPI _ADR */
 struct mfd_cell_acpi_match {
@@ -82,9 +81,6 @@ struct mfd_cell {
 	/* Software node for the device. */
 	const struct software_node *swnode;
 
-	/* device properties passed to the sub devices drivers */
-	const struct property_entry *properties;
-
 	/*
 	 * Device Tree compatible string
 	 * See: Documentation/devicetree/usage-model.rst Chapter 2.2 for details
-- 
cgit v1.2.3


From 4502647e60cb4afd74f74d648bc2990954c1b73a Mon Sep 17 00:00:00 2001
From: Andreas Kemnade <andreas@kemnade.info>
Date: Sun, 14 Mar 2021 12:02:36 +0100
Subject: mfd: rn5t618: Do not cache various USB related registers

These registers get reset to their OTP defaults after USB plugging.
While at it, also add a missing register for detecting the
charger type.

Signed-off-by: Andreas Kemnade <andreas@kemnade.info>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/rn5t618.c       | 3 +++
 include/linux/mfd/rn5t618.h | 1 +
 2 files changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mfd/rn5t618.c b/drivers/mfd/rn5t618.c
index dc452df1f1bf..6ed04e6dbc78 100644
--- a/drivers/mfd/rn5t618.c
+++ b/drivers/mfd/rn5t618.c
@@ -45,8 +45,11 @@ static bool rn5t618_volatile_reg(struct device *dev, unsigned int reg)
 	case RN5T618_INTMON:
 	case RN5T618_RTC_CTRL1 ... RN5T618_RTC_CTRL2:
 	case RN5T618_RTC_SECONDS ... RN5T618_RTC_YEAR:
+	case RN5T618_CHGCTL1:
+	case RN5T618_REGISET1 ... RN5T618_REGISET2:
 	case RN5T618_CHGSTATE:
 	case RN5T618_CHGCTRL_IRR ... RN5T618_CHGERR_MONI:
+	case RN5T618_GCHGDET:
 	case RN5T618_CONTROL ... RN5T618_CC_AVEREG0:
 		return true;
 	default:
diff --git a/include/linux/mfd/rn5t618.h b/include/linux/mfd/rn5t618.h
index fba0df13d9a8..8aa0bda1af4f 100644
--- a/include/linux/mfd/rn5t618.h
+++ b/include/linux/mfd/rn5t618.h
@@ -188,6 +188,7 @@
 #define RN5T618_CHGOSCSCORESET3		0xd7
 #define RN5T618_CHGOSCFREQSET1		0xd8
 #define RN5T618_CHGOSCFREQSET2		0xd9
+#define RN5T618_GCHGDET			0xda
 #define RN5T618_CONTROL			0xe0
 #define RN5T618_SOC			0xe1
 #define RN5T618_RE_CAP_H		0xe2
-- 
cgit v1.2.3


From d1157530d476ffce4485182eea5b492065362a09 Mon Sep 17 00:00:00 2001
From: Andreas Kemnade <andreas@kemnade.info>
Date: Mon, 15 Mar 2021 20:18:32 +0100
Subject: mfd: ntxec: Support for EC in Tolino Shine 2 HD
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add the version of the EC in the Tolino Shine 2 HD
to the supported versions. It seems not to have an RTC
and does not ack data written to it.
The vendor kernel happily ignores write errors; using
I2C via userspace i2c-set also shows the error.
So add a quirk to ignore that error.

PWM can be successfully configured despite that error.

Signed-off-by: Andreas Kemnade <andreas@kemnade.info>
Reviewed-by: Jonathan Neuschäfer <j.neuschaefer@gmx.net>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/ntxec.c       | 56 ++++++++++++++++++++++++++++++++++++++++++++---
 include/linux/mfd/ntxec.h |  1 +
 2 files changed, 54 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/ntxec.c b/drivers/mfd/ntxec.c
index 957de2b03529..b711e73eedcb 100644
--- a/drivers/mfd/ntxec.c
+++ b/drivers/mfd/ntxec.c
@@ -96,6 +96,38 @@ static struct notifier_block ntxec_restart_handler = {
 	.priority = 128,
 };
 
+static int regmap_ignore_write(void *context,
+			       unsigned int reg, unsigned int val)
+
+{
+	struct regmap *regmap = context;
+
+	regmap_write(regmap, reg, val);
+
+	return 0;
+}
+
+static int regmap_wrap_read(void *context, unsigned int reg,
+			    unsigned int *val)
+{
+	struct regmap *regmap = context;
+
+	return regmap_read(regmap, reg, val);
+}
+
+/*
+ * Some firmware versions do not ack written data, add a wrapper. It
+ * is used to stack another regmap on top.
+ */
+static const struct regmap_config regmap_config_noack = {
+	.name = "ntxec_noack",
+	.reg_bits = 8,
+	.val_bits = 16,
+	.cache_type = REGCACHE_NONE,
+	.reg_write = regmap_ignore_write,
+	.reg_read = regmap_wrap_read
+};
+
 static const struct regmap_config regmap_config = {
 	.name = "ntxec",
 	.reg_bits = 8,
@@ -104,16 +136,22 @@ static const struct regmap_config regmap_config = {
 	.val_format_endian = REGMAP_ENDIAN_BIG,
 };
 
-static const struct mfd_cell ntxec_subdevices[] = {
+static const struct mfd_cell ntxec_subdev[] = {
 	{ .name = "ntxec-rtc" },
 	{ .name = "ntxec-pwm" },
 };
 
+static const struct mfd_cell ntxec_subdev_pwm[] = {
+	{ .name = "ntxec-pwm" },
+};
+
 static int ntxec_probe(struct i2c_client *client)
 {
 	struct ntxec *ec;
 	unsigned int version;
 	int res;
+	const struct mfd_cell *subdevs;
+	size_t n_subdevs;
 
 	ec = devm_kmalloc(&client->dev, sizeof(*ec), GFP_KERNEL);
 	if (!ec)
@@ -137,6 +175,18 @@ static int ntxec_probe(struct i2c_client *client)
 	/* Bail out if we encounter an unknown firmware version */
 	switch (version) {
 	case NTXEC_VERSION_KOBO_AURA:
+		subdevs = ntxec_subdev;
+		n_subdevs = ARRAY_SIZE(ntxec_subdev);
+		break;
+	case NTXEC_VERSION_TOLINO_SHINE2:
+		subdevs = ntxec_subdev_pwm;
+		n_subdevs = ARRAY_SIZE(ntxec_subdev_pwm);
+		/* Another regmap stacked on top of the other */
+		ec->regmap = devm_regmap_init(ec->dev, NULL,
+					      ec->regmap,
+					      &regmap_config_noack);
+		if (IS_ERR(ec->regmap))
+			return PTR_ERR(ec->regmap);
 		break;
 	default:
 		dev_err(ec->dev,
@@ -181,8 +231,8 @@ static int ntxec_probe(struct i2c_client *client)
 
 	i2c_set_clientdata(client, ec);
 
-	res = devm_mfd_add_devices(ec->dev, PLATFORM_DEVID_NONE, ntxec_subdevices,
-				   ARRAY_SIZE(ntxec_subdevices), NULL, 0, NULL);
+	res = devm_mfd_add_devices(ec->dev, PLATFORM_DEVID_NONE,
+				   subdevs, n_subdevs, NULL, 0, NULL);
 	if (res)
 		dev_err(ec->dev, "Failed to add subdevices: %d\n", res);
 
diff --git a/include/linux/mfd/ntxec.h b/include/linux/mfd/ntxec.h
index 361204d125f1..26ab3b8eb612 100644
--- a/include/linux/mfd/ntxec.h
+++ b/include/linux/mfd/ntxec.h
@@ -33,5 +33,6 @@ static inline __be16 ntxec_reg8(u8 value)
 
 /* Known firmware versions */
 #define NTXEC_VERSION_KOBO_AURA	0xd726	/* found in Kobo Aura */
+#define NTXEC_VERSION_TOLINO_SHINE2 0xf110 /* found in Tolino Shine 2 HD */
 
 #endif
-- 
cgit v1.2.3


From 0b79c53e8a1c5c26c5a364c8f041ca6890a29f08 Mon Sep 17 00:00:00 2001
From: Timon Baetz <timon.baetz@protonmail.com>
Date: Wed, 27 Jan 2021 07:32:42 +0000
Subject: mfd: max8997: Replace 8998 with 8997

The max8997 header is using "max8998" in some identifiers.
Fix it by replacing 8998 with 8997 in enum and macro.

Signed-off-by: Timon Baetz <timon.baetz@protonmail.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/max8997.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/max8997.h b/include/linux/mfd/max8997.h
index e955e2f0a2cc..6c98edcf4b0b 100644
--- a/include/linux/mfd/max8997.h
+++ b/include/linux/mfd/max8997.h
@@ -14,13 +14,13 @@
  * others and b) it can be enabled simply by using MAX17042 driver.
  */
 
-#ifndef __LINUX_MFD_MAX8998_H
-#define __LINUX_MFD_MAX8998_H
+#ifndef __LINUX_MFD_MAX8997_H
+#define __LINUX_MFD_MAX8997_H
 
 #include <linux/regulator/consumer.h>
 
 /* MAX8997/8966 regulator IDs */
-enum max8998_regulators {
+enum max8997_regulators {
 	MAX8997_LDO1 = 0,
 	MAX8997_LDO2,
 	MAX8997_LDO3,
@@ -207,4 +207,4 @@ struct max8997_platform_data {
 	struct max8997_led_platform_data *led_pdata;
 };
 
-#endif /* __LINUX_MFD_MAX8998_H */
+#endif /* __LINUX_MFD_MAX8997_H */
-- 
cgit v1.2.3


From 5a517b5bf687028149d55ba50b393c288a054601 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 31 Mar 2021 18:48:51 +0300
Subject: i2c: designware: Get rid of legacy platform data

Platform data is a legacy interface to supply device properties
to the driver. In this case we no longer have any in-kernel users
for it. Just remove it for good.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Wolfram Sang <wsa@kernel.org>
Acked-by: Jarkko Nikula <jarkko.nikula@linux.intel.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/i2c/busses/i2c-designware-platdrv.c  |  7 +------
 include/linux/platform_data/i2c-designware.h | 13 -------------
 2 files changed, 1 insertion(+), 19 deletions(-)
 delete mode 100644 include/linux/platform_data/i2c-designware.h

(limited to 'include/linux')

diff --git a/drivers/i2c/busses/i2c-designware-platdrv.c b/drivers/i2c/busses/i2c-designware-platdrv.c
index 0dfeb2d11603..4b37f28ec0c6 100644
--- a/drivers/i2c/busses/i2c-designware-platdrv.c
+++ b/drivers/i2c/busses/i2c-designware-platdrv.c
@@ -22,7 +22,6 @@
 #include <linux/mfd/syscon.h>
 #include <linux/module.h>
 #include <linux/of.h>
-#include <linux/platform_data/i2c-designware.h>
 #include <linux/platform_device.h>
 #include <linux/pm.h>
 #include <linux/pm_runtime.h>
@@ -206,7 +205,6 @@ static const struct dmi_system_id dw_i2c_hwmon_class_dmi[] = {
 
 static int dw_i2c_plat_probe(struct platform_device *pdev)
 {
-	struct dw_i2c_platform_data *pdata = dev_get_platdata(&pdev->dev);
 	struct i2c_adapter *adap;
 	struct dw_i2c_dev *dev;
 	struct i2c_timings *t;
@@ -236,10 +234,7 @@ static int dw_i2c_plat_probe(struct platform_device *pdev)
 	reset_control_deassert(dev->rst);
 
 	t = &dev->timings;
-	if (pdata)
-		t->bus_freq_hz = pdata->i2c_scl_freq;
-	else
-		i2c_parse_fw_timings(&pdev->dev, t, false);
+	i2c_parse_fw_timings(&pdev->dev, t, false);
 
 	i2c_dw_adjust_bus_speed(dev);
 
diff --git a/include/linux/platform_data/i2c-designware.h b/include/linux/platform_data/i2c-designware.h
deleted file mode 100644
index 014c4a5a7e13..000000000000
--- a/include/linux/platform_data/i2c-designware.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright(c) 2014 Intel Corporation.
- */
-
-#ifndef I2C_DESIGNWARE_H
-#define I2C_DESIGNWARE_H
-
-struct dw_i2c_platform_data {
-	unsigned int i2c_scl_freq;
-};
-
-#endif
-- 
cgit v1.2.3


From ed25b4f00b61e109b29dc443dd1333b9b0da9bb4 Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Thu, 1 Apr 2021 22:18:40 +0800
Subject: mfd: twl: Remove unused inline function twl4030charger_usb_en()

There is no caller in tree, so remove it.

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/twl.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/twl.h b/include/linux/mfd/twl.h
index 089e8942223a..8871cc5188a0 100644
--- a/include/linux/mfd/twl.h
+++ b/include/linux/mfd/twl.h
@@ -781,8 +781,6 @@ int twl4030_sih_setup(struct device *dev, int module, int irq_base);
 #define TWL4030_VAUX3_DEV_GRP		0x1F
 #define TWL4030_VAUX3_DEDICATED		0x22
 
-static inline int twl4030charger_usb_en(int enable) { return 0; }
-
 /*----------------------------------------------------------------------*/
 
 /* Linux-specific regulator identifiers ... for now, we only support
-- 
cgit v1.2.3


From f9386c91574fe6da9f4fca9a47734816b0db0019 Mon Sep 17 00:00:00 2001
From: Russ Weight <russell.h.weight@intel.com>
Date: Mon, 12 Apr 2021 12:53:28 -0700
Subject: mfd: intel-m10-bmc: Add support for MAX10 BMC Secure Updates

Add macros and definitions required by the MAX10 BMC
Secure Update driver.

Signed-off-by: Russ Weight <russell.h.weight@intel.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/intel-m10-bmc.h | 85 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 85 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/intel-m10-bmc.h b/include/linux/mfd/intel-m10-bmc.h
index c4eb38c13eda..f0044b14136e 100644
--- a/include/linux/mfd/intel-m10-bmc.h
+++ b/include/linux/mfd/intel-m10-bmc.h
@@ -16,6 +16,9 @@
 #define M10BMC_FLASH_END		0x1fffffff
 #define M10BMC_MEM_END			M10BMC_FLASH_END
 
+#define M10BMC_STAGING_BASE		0x18000000
+#define M10BMC_STAGING_SIZE		0x3800000
+
 /* Register offset of system registers */
 #define NIOS2_FW_VERSION		0x0
 #define M10BMC_MAC_LOW			0x10
@@ -33,6 +36,88 @@
 #define M10BMC_VER_PCB_INFO_MSK		GENMASK(31, 24)
 #define M10BMC_VER_LEGACY_INVALID	0xffffffff
 
+/* Secure update doorbell register, in system register region */
+#define M10BMC_DOORBELL			0x400
+
+/* Authorization Result register, in system register region */
+#define M10BMC_AUTH_RESULT		0x404
+
+/* Doorbell register fields */
+#define DRBL_RSU_REQUEST		BIT(0)
+#define DRBL_RSU_PROGRESS		GENMASK(7, 4)
+#define DRBL_HOST_STATUS		GENMASK(11, 8)
+#define DRBL_RSU_STATUS			GENMASK(23, 16)
+#define DRBL_PKVL_EEPROM_LOAD_SEC	BIT(24)
+#define DRBL_PKVL1_POLL_EN		BIT(25)
+#define DRBL_PKVL2_POLL_EN		BIT(26)
+#define DRBL_CONFIG_SEL			BIT(28)
+#define DRBL_REBOOT_REQ			BIT(29)
+#define DRBL_REBOOT_DISABLED		BIT(30)
+
+/* Progress states */
+#define RSU_PROG_IDLE			0x0
+#define RSU_PROG_PREPARE		0x1
+#define RSU_PROG_READY			0x3
+#define RSU_PROG_AUTHENTICATING		0x4
+#define RSU_PROG_COPYING		0x5
+#define RSU_PROG_UPDATE_CANCEL		0x6
+#define RSU_PROG_PROGRAM_KEY_HASH	0x7
+#define RSU_PROG_RSU_DONE		0x8
+#define RSU_PROG_PKVL_PROM_DONE		0x9
+
+/* Device and error states */
+#define RSU_STAT_NORMAL			0x0
+#define RSU_STAT_TIMEOUT		0x1
+#define RSU_STAT_AUTH_FAIL		0x2
+#define RSU_STAT_COPY_FAIL		0x3
+#define RSU_STAT_FATAL			0x4
+#define RSU_STAT_PKVL_REJECT		0x5
+#define RSU_STAT_NON_INC		0x6
+#define RSU_STAT_ERASE_FAIL		0x7
+#define RSU_STAT_WEAROUT		0x8
+#define RSU_STAT_NIOS_OK		0x80
+#define RSU_STAT_USER_OK		0x81
+#define RSU_STAT_FACTORY_OK		0x82
+#define RSU_STAT_USER_FAIL		0x83
+#define RSU_STAT_FACTORY_FAIL		0x84
+#define RSU_STAT_NIOS_FLASH_ERR		0x85
+#define RSU_STAT_FPGA_FLASH_ERR		0x86
+
+#define HOST_STATUS_IDLE		0x0
+#define HOST_STATUS_WRITE_DONE		0x1
+#define HOST_STATUS_ABORT_RSU		0x2
+
+#define rsu_prog(doorbell)	FIELD_GET(DRBL_RSU_PROGRESS, doorbell)
+#define rsu_stat(doorbell)	FIELD_GET(DRBL_RSU_STATUS, doorbell)
+
+/* interval 100ms and timeout 5s */
+#define NIOS_HANDSHAKE_INTERVAL_US	(100 * 1000)
+#define NIOS_HANDSHAKE_TIMEOUT_US	(5 * 1000 * 1000)
+
+/* RSU PREP Timeout (2 minutes) to erase flash staging area */
+#define RSU_PREP_INTERVAL_MS		100
+#define RSU_PREP_TIMEOUT_MS		(2 * 60 * 1000)
+
+/* RSU Complete Timeout (40 minutes) for full flash update */
+#define RSU_COMPLETE_INTERVAL_MS	1000
+#define RSU_COMPLETE_TIMEOUT_MS		(40 * 60 * 1000)
+
+/* Addresses for security related data in FLASH */
+#define BMC_REH_ADDR	0x17ffc004
+#define BMC_PROG_ADDR	0x17ffc000
+#define BMC_PROG_MAGIC	0x5746
+
+#define SR_REH_ADDR	0x17ffd004
+#define SR_PROG_ADDR	0x17ffd000
+#define SR_PROG_MAGIC	0x5253
+
+#define PR_REH_ADDR	0x17ffe004
+#define PR_PROG_ADDR	0x17ffe000
+#define PR_PROG_MAGIC	0x5250
+
+/* Address of 4KB inverted bit vector containing staging area FLASH count */
+#define STAGING_FLASH_COUNT	0x17ffb000
+
 /**
  * struct intel_m10bmc - Intel MAX 10 BMC parent driver data structure
  * @dev: this device
-- 
cgit v1.2.3


From 4bad58ebc8bc4f20d89cff95417c9b4674769709 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 23 Mar 2021 22:05:39 +0100
Subject: signal: Allow tasks to cache one sigqueue struct

The idea for this originates from the real time tree to make signal
delivery for realtime applications more efficient. In quite a few of these
application scenarios a control task signals workers to start their
computations. There is usually only one signal per worker in flight.  This
works nicely as long as the kmem cache allocations do not hit the slow path
and cause latencies.

To cure this an optimistic caching was introduced (limited to RT tasks)
which allows a task to cache a single sigqueue in a pointer in task_struct
instead of handing it back to the kmem cache after consuming a signal. When
the next signal is sent to the task then the cached sigqueue is used
instead of allocating a new one. This solved the problem for this set of
application scenarios nicely.

The task cache is not preallocated so the first signal sent to a task goes
always to the cache allocator. The cached sigqueue stays around until the
task exits and is freed when task::sighand is dropped.

After posting this solution for mainline the discussion came up whether
this would be useful in general and should not be limited to realtime
tasks: https://lore.kernel.org/r/m11rcu7nbr.fsf@fess.ebiederm.org

One concern leading to the original limitation was to avoid a large amount
of pointlessly cached sigqueues in alive tasks. The other concern was
vs. RLIMIT_SIGPENDING as these cached sigqueues are not accounted for.

The accounting problem is real, but on the other hand slightly academic.
After gathering some statistics it turned out that after boot of a regular
distro install there are less than 10 sigqueues cached in ~1500 tasks.

In case of a 'mass fork and fire signal to child' scenario the extra 80
bytes of memory per task are well in the noise of the overall memory
consumption of the fork bomb.

If this should be limited then this would need an extra counter in struct
user, more atomic instructions and a separate rlimit. Yet another tunable
which is mostly unused.

The caching is actually used. After boot and a full kernel compile on a
64CPU machine with make -j128 the number of 'allocations' looks like this:

  From slab:	   23996
  From task cache: 52223

I.e. it reduces the number of slab cache operations by ~68%.

A typical pattern there is:

<...>-58490 __sigqueue_alloc:  for 58488 from slab ffff8881132df460
<...>-58488 __sigqueue_free:   cache ffff8881132df460
<...>-58488 __sigqueue_alloc:  for 1149 from cache ffff8881103dc550
  bash-1149 exit_task_sighand: free ffff8881132df460
  bash-1149 __sigqueue_free:   cache ffff8881103dc550

The interesting sequence is that the exiting task 58488 grabs the sigqueue
from bash's task cache to signal exit and bash sticks it back into its own
cache. Lather, rinse and repeat.

The caching is probably not noticeable for the general use case, but the
benefit for latency sensitive applications is clear. While kmem caches are
usually just serving from the fast path, the slab merging (default) can,
depending on the usage pattern of the merged slabs, cause occasional slow
path allocations.

The time spared per cached entry is a few microseconds per signal which is
not relevant for e.g. a kernel build, but for signal heavy workloads it's
measurable.

As there is no real downside of this caching mechanism making it
unconditionally available is preferred over more conditional code or new
magic tunables.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lkml.kernel.org/r/87sg4lbmxo.fsf@nanos.tec.linutronix.de
---
 include/linux/sched.h  |  1 +
 include/linux/signal.h |  1 +
 kernel/exit.c          |  1 +
 kernel/fork.c          |  1 +
 kernel/signal.c        | 44 ++++++++++++++++++++++++++++++++++++++++++--
 5 files changed, 46 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 05572e2140ad..f5ca798acb3a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -984,6 +984,7 @@ struct task_struct {
 	/* Signal handlers: */
 	struct signal_struct		*signal;
 	struct sighand_struct __rcu		*sighand;
+	struct sigqueue			*sigqueue_cache;
 	sigset_t			blocked;
 	sigset_t			real_blocked;
 	/* Restored if set_restore_sigmask() was used: */
diff --git a/include/linux/signal.h b/include/linux/signal.h
index 205526c4003a..c3cbea266136 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -265,6 +265,7 @@ static inline void init_sigpending(struct sigpending *sig)
 }
 
 extern void flush_sigqueue(struct sigpending *queue);
+extern void exit_task_sigqueue_cache(struct task_struct *tsk);
 
 /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */
 static inline int valid_signal(unsigned long sig)
diff --git a/kernel/exit.c b/kernel/exit.c
index 04029e35e69a..0596526ed9ea 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -162,6 +162,7 @@ static void __exit_signal(struct task_struct *tsk)
 		flush_sigqueue(&sig->shared_pending);
 		tty_kref_put(tty);
 	}
+	exit_task_sigqueue_cache(tsk);
 }
 
 static void delayed_put_task_struct(struct rcu_head *rhp)
diff --git a/kernel/fork.c b/kernel/fork.c
index d3171e8e88e5..3c43a9f3e75c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1995,6 +1995,7 @@ static __latent_entropy struct task_struct *copy_process(
 	spin_lock_init(&p->alloc_lock);
 
 	init_sigpending(&p->pending);
+	p->sigqueue_cache = NULL;
 
 	p->utime = p->stime = p->gtime = 0;
 #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
diff --git a/kernel/signal.c b/kernel/signal.c
index 568a2e2fc9ab..2d9463e05ae6 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -433,7 +433,16 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
 	rcu_read_unlock();
 
 	if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) {
-		q = kmem_cache_alloc(sigqueue_cachep, gfp_flags);
+		/*
+		 * Preallocation does not hold sighand::siglock so it can't
+		 * use the cache. The lockless caching requires that only
+		 * one consumer and only one producer run at a time.
+		 */
+		q = READ_ONCE(t->sigqueue_cache);
+		if (!q || sigqueue_flags)
+			q = kmem_cache_alloc(sigqueue_cachep, gfp_flags);
+		else
+			WRITE_ONCE(t->sigqueue_cache, NULL);
 	} else {
 		print_dropped_signal(sig);
 	}
@@ -450,13 +459,44 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
 	return q;
 }
 
+void exit_task_sigqueue_cache(struct task_struct *tsk)
+{
+	/* Race free because @tsk is mopped up */
+	struct sigqueue *q = tsk->sigqueue_cache;
+
+	if (q) {
+		tsk->sigqueue_cache = NULL;
+		/*
+		 * Hand it back to the cache as the task might
+		 * be self reaping which would leak the object.
+		 */
+		 kmem_cache_free(sigqueue_cachep, q);
+	}
+}
+
+static void sigqueue_cache_or_free(struct sigqueue *q)
+{
+	/*
+	 * Cache one sigqueue per task. This pairs with the consumer side
+	 * in __sigqueue_alloc() and needs READ/WRITE_ONCE() to prevent the
+	 * compiler from store tearing and to tell KCSAN that the data race
+	 * is intentional when run without holding current->sighand->siglock,
+	 * which is fine as current obviously cannot run __sigqueue_free()
+	 * concurrently.
+	 */
+	if (!READ_ONCE(current->sigqueue_cache))
+		WRITE_ONCE(current->sigqueue_cache, q);
+	else
+		kmem_cache_free(sigqueue_cachep, q);
+}
+
 static void __sigqueue_free(struct sigqueue *q)
 {
 	if (q->flags & SIGQUEUE_PREALLOC)
 		return;
 	if (atomic_dec_and_test(&q->user->sigpending))
 		free_uid(q->user);
-	kmem_cache_free(sigqueue_cachep, q);
+	sigqueue_cache_or_free(q);
 }
 
 void flush_sigqueue(struct sigpending *queue)
-- 
cgit v1.2.3


From 6308a5f06be08f3ea1f1a895a9ef54c7b65c4c35 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Tue, 2 Mar 2021 21:27:47 +0200
Subject: net/mlx5: E-Switch, Make vport number u16

Vport number is 16-bit field in hardware. Make it u16.

Move location of vport in the structure so that it reduces a hole
in the structure.

Signed-off-by: Parav Pandit <parav@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h          | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 3 ++-
 include/linux/mlx5/eswitch.h                               | 3 +--
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index c7a73dbd64b4..a4b9f78bf4d6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -152,7 +152,6 @@ enum mlx5_eswitch_vport_event {
 
 struct mlx5_vport {
 	struct mlx5_core_dev    *dev;
-	int                     vport;
 	struct hlist_head       uc_list[MLX5_L2_ADDR_HASH_SIZE];
 	struct hlist_head       mc_list[MLX5_L2_ADDR_HASH_SIZE];
 	struct mlx5_flow_handle *promisc_rule;
@@ -174,6 +173,7 @@ struct mlx5_vport {
 		u32 max_rate;
 	} qos;
 
+	u16 vport;
 	bool                    enabled;
 	enum mlx5_eswitch_vport_event enabled_events;
 	struct devlink_port *dl_port;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 1f58e84bdfc6..bbb707117296 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -986,12 +986,13 @@ static void mlx5_eswitch_del_send_to_vport_meta_rules(struct mlx5_eswitch *esw)
 static int
 mlx5_eswitch_add_send_to_vport_meta_rules(struct mlx5_eswitch *esw)
 {
-	int num_vfs, vport_num, rule_idx = 0, err = 0;
 	struct mlx5_flow_destination dest = {};
 	struct mlx5_flow_act flow_act = {0};
+	int num_vfs, rule_idx = 0, err = 0;
 	struct mlx5_flow_handle *flow_rule;
 	struct mlx5_flow_handle **flows;
 	struct mlx5_flow_spec *spec;
+	u16 vport_num;
 
 	num_vfs = esw->esw_funcs.num_vfs;
 	flows = kvzalloc(num_vfs * sizeof(*flows), GFP_KERNEL);
diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h
index 429a710c5a99..9cf1da2883c6 100644
--- a/include/linux/mlx5/eswitch.h
+++ b/include/linux/mlx5/eswitch.h
@@ -152,8 +152,7 @@ mlx5_eswitch_vport_match_metadata_enabled(const struct mlx5_eswitch *esw)
 };
 
 static inline u32
-mlx5_eswitch_get_vport_metadata_for_match(struct mlx5_eswitch *esw,
-					  int vport_num)
+mlx5_eswitch_get_vport_metadata_for_match(struct mlx5_eswitch *esw, u16 vport_num)
 {
 	return 0;
 };
-- 
cgit v1.2.3


From f4da56529da602010979e8497d1f02eaf5df8883 Mon Sep 17 00:00:00 2001
From: Tan Tee Min <tee.min.tan@intel.com>
Date: Wed, 14 Apr 2021 08:16:17 +0800
Subject: net: stmmac: Add support for external trigger timestamping

The Synopsys MAC controller supports an auxiliary snapshot feature that
allows the user to store a snapshot of the system time based on an external
event.

This patch adds support for the above mentioned feature. Users will be
able to trigger capturing of the time snapshot from user-space using
applications such as testptp or any other application that uses the
PTP_EXTTS_REQUEST ioctl request.

Cc: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: Tan Tee Min <tee.min.tan@intel.com>
Co-developed-by: Wong Vee Khee <vee.khee.wong@linux.intel.com>
Signed-off-by: Wong Vee Khee <vee.khee.wong@linux.intel.com>
Acked-by: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c  | 10 ++++++
 drivers/net/ethernet/stmicro/stmmac/hwif.h         |  5 +++
 drivers/net/ethernet/stmicro/stmmac/stmmac.h       |  3 ++
 .../net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c  | 39 +++++++++++++++++++++
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  |  2 ++
 drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c   | 40 +++++++++++++++++++++-
 drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h   |  1 +
 include/linux/stmmac.h                             |  2 ++
 8 files changed, 101 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
index 60566598d644..ec140fc4a0f5 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c
@@ -296,6 +296,13 @@ static int intel_crosststamp(ktime_t *device,
 
 	intel_priv = priv->plat->bsp_priv;
 
+	/* Both internal crosstimestamping and external triggered event
+	 * timestamping cannot be run concurrently.
+	 */
+	if (priv->plat->ext_snapshot_en)
+		return -EBUSY;
+
+	mutex_lock(&priv->aux_ts_lock);
 	/* Enable Internal snapshot trigger */
 	acr_value = readl(ptpaddr + PTP_ACR);
 	acr_value &= ~PTP_ACR_MASK;
@@ -321,6 +328,8 @@ static int intel_crosststamp(ktime_t *device,
 	acr_value = readl(ptpaddr + PTP_ACR);
 	acr_value |= PTP_ACR_ATSFC;
 	writel(acr_value, ptpaddr + PTP_ACR);
+	/* Release the mutex */
+	mutex_unlock(&priv->aux_ts_lock);
 
 	/* Trigger Internal snapshot signal
 	 * Create a rising edge by just toggle the GPO1 to low
@@ -520,6 +529,7 @@ static int intel_mgbe_common_data(struct pci_dev *pdev,
 	plat->mdio_bus_data->phy_mask |= 1 << INTEL_MGBE_XPCS_ADDR;
 
 	plat->int_snapshot_num = AUX_SNAPSHOT1;
+	plat->ext_snapshot_num = AUX_SNAPSHOT0;
 
 	plat->has_crossts = true;
 	plat->crosststamp = intel_crosststamp;
diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h
index 2b5022ef1e52..2cc91759b91f 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.h
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h
@@ -504,6 +504,8 @@ struct stmmac_ops {
 #define stmmac_fpe_irq_status(__priv, __args...) \
 	stmmac_do_callback(__priv, mac, fpe_irq_status, __args)
 
+struct stmmac_priv;
+
 /* PTP and HW Timer helpers */
 struct stmmac_hwtimestamp {
 	void (*config_hw_tstamping) (void __iomem *ioaddr, u32 data);
@@ -515,6 +517,7 @@ struct stmmac_hwtimestamp {
 			       int add_sub, int gmac4);
 	void (*get_systime) (void __iomem *ioaddr, u64 *systime);
 	void (*get_ptptime)(void __iomem *ioaddr, u64 *ptp_time);
+	void (*timestamp_interrupt)(struct stmmac_priv *priv);
 };
 
 #define stmmac_config_hw_tstamping(__priv, __args...) \
@@ -531,6 +534,8 @@ struct stmmac_hwtimestamp {
 	stmmac_do_void_callback(__priv, ptp, get_systime, __args)
 #define stmmac_get_ptptime(__priv, __args...) \
 	stmmac_do_void_callback(__priv, ptp, get_ptptime, __args)
+#define stmmac_timestamp_interrupt(__priv, __args...) \
+	stmmac_do_void_callback(__priv, ptp, timestamp_interrupt, __args)
 
 /* Helpers to manage the descriptors for chain and ring modes */
 struct stmmac_mode_ops {
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
index b8a42260066d..b6cd43eda7ac 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
@@ -250,6 +250,9 @@ struct stmmac_priv {
 	int use_riwt;
 	int irq_wake;
 	spinlock_t ptp_lock;
+	/* Protects auxiliary snapshot registers from concurrent access. */
+	struct mutex aux_ts_lock;
+
 	void __iomem *mmcaddr;
 	void __iomem *ptpaddr;
 	unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)];
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c
index 113c51bcc0b5..074e2cdfb0fa 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c
@@ -12,8 +12,11 @@
 #include <linux/io.h>
 #include <linux/iopoll.h>
 #include <linux/delay.h>
+#include <linux/ptp_clock_kernel.h>
 #include "common.h"
 #include "stmmac_ptp.h"
+#include "dwmac4.h"
+#include "stmmac.h"
 
 static void config_hw_tstamping(void __iomem *ioaddr, u32 data)
 {
@@ -163,6 +166,41 @@ static void get_ptptime(void __iomem *ptpaddr, u64 *ptp_time)
 	*ptp_time = ns;
 }
 
+static void timestamp_interrupt(struct stmmac_priv *priv)
+{
+	u32 num_snapshot, ts_status, tsync_int;
+	struct ptp_clock_event event;
+	unsigned long flags;
+	u64 ptp_time;
+	int i;
+
+	tsync_int = readl(priv->ioaddr + GMAC_INT_STATUS) & GMAC_INT_TSIE;
+
+	if (!tsync_int)
+		return;
+
+	/* Read timestamp status to clear interrupt from either external
+	 * timestamp or start/end of PPS.
+	 */
+	ts_status = readl(priv->ioaddr + GMAC_TIMESTAMP_STATUS);
+
+	if (!priv->plat->ext_snapshot_en)
+		return;
+
+	num_snapshot = (ts_status & GMAC_TIMESTAMP_ATSNS_MASK) >>
+		       GMAC_TIMESTAMP_ATSNS_SHIFT;
+
+	for (i = 0; i < num_snapshot; i++) {
+		spin_lock_irqsave(&priv->ptp_lock, flags);
+		get_ptptime(priv->ptpaddr, &ptp_time);
+		spin_unlock_irqrestore(&priv->ptp_lock, flags);
+		event.type = PTP_CLOCK_EXTTS;
+		event.index = 0;
+		event.timestamp = ptp_time;
+		ptp_clock_event(priv->ptp_clock, &event);
+	}
+}
+
 const struct stmmac_hwtimestamp stmmac_ptp = {
 	.config_hw_tstamping = config_hw_tstamping,
 	.init_systime = init_systime,
@@ -171,4 +209,5 @@ const struct stmmac_hwtimestamp stmmac_ptp = {
 	.adjust_systime = adjust_systime,
 	.get_systime = get_systime,
 	.get_ptptime = get_ptptime,
+	.timestamp_interrupt = timestamp_interrupt,
 };
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index e3e22200a4fd..3a5ca5833ce1 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -5687,6 +5687,8 @@ static void stmmac_common_interrupt(struct stmmac_priv *priv)
 			else
 				netif_carrier_off(priv->dev);
 		}
+
+		stmmac_timestamp_interrupt(priv, priv);
 	}
 }
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
index b164ae22e35f..4e86cdf2bc9f 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
@@ -135,7 +135,10 @@ static int stmmac_enable(struct ptp_clock_info *ptp,
 {
 	struct stmmac_priv *priv =
 	    container_of(ptp, struct stmmac_priv, ptp_clock_ops);
+	void __iomem *ptpaddr = priv->ptpaddr;
+	void __iomem *ioaddr = priv->hw->pcsr;
 	struct stmmac_pps_cfg *cfg;
+	u32 intr_value, acr_value;
 	int ret = -EOPNOTSUPP;
 	unsigned long flags;
 
@@ -159,6 +162,37 @@ static int stmmac_enable(struct ptp_clock_info *ptp,
 					     priv->systime_flags);
 		spin_unlock_irqrestore(&priv->ptp_lock, flags);
 		break;
+	case PTP_CLK_REQ_EXTTS:
+		priv->plat->ext_snapshot_en = on;
+		mutex_lock(&priv->aux_ts_lock);
+		acr_value = readl(ptpaddr + PTP_ACR);
+		acr_value &= ~PTP_ACR_MASK;
+		if (on) {
+			/* Enable External snapshot trigger */
+			acr_value |= priv->plat->ext_snapshot_num;
+			acr_value |= PTP_ACR_ATSFC;
+			netdev_dbg(priv->dev, "Auxiliary Snapshot %d enabled.\n",
+				   priv->plat->ext_snapshot_num >>
+				   PTP_ACR_ATSEN_SHIFT);
+			/* Enable Timestamp Interrupt */
+			intr_value = readl(ioaddr + GMAC_INT_EN);
+			intr_value |= GMAC_INT_TSIE;
+			writel(intr_value, ioaddr + GMAC_INT_EN);
+
+		} else {
+			netdev_dbg(priv->dev, "Auxiliary Snapshot %d disabled.\n",
+				   priv->plat->ext_snapshot_num >>
+				   PTP_ACR_ATSEN_SHIFT);
+			/* Disable Timestamp Interrupt */
+			intr_value = readl(ioaddr + GMAC_INT_EN);
+			intr_value &= ~GMAC_INT_TSIE;
+			writel(intr_value, ioaddr + GMAC_INT_EN);
+		}
+		writel(acr_value, ptpaddr + PTP_ACR);
+		mutex_unlock(&priv->aux_ts_lock);
+		ret = 0;
+		break;
+
 	default:
 		break;
 	}
@@ -202,7 +236,7 @@ static struct ptp_clock_info stmmac_ptp_clock_ops = {
 	.name = "stmmac ptp",
 	.max_adj = 62500000,
 	.n_alarm = 0,
-	.n_ext_ts = 0,
+	.n_ext_ts = 0, /* will be overwritten in stmmac_ptp_register */
 	.n_per_out = 0, /* will be overwritten in stmmac_ptp_register */
 	.n_pins = 0,
 	.pps = 0,
@@ -237,8 +271,10 @@ void stmmac_ptp_register(struct stmmac_priv *priv)
 		stmmac_ptp_clock_ops.max_adj = priv->plat->ptp_max_adj;
 
 	stmmac_ptp_clock_ops.n_per_out = priv->dma_cap.pps_out_num;
+	stmmac_ptp_clock_ops.n_ext_ts = priv->dma_cap.aux_snapshot_n;
 
 	spin_lock_init(&priv->ptp_lock);
+	mutex_init(&priv->aux_ts_lock);
 	priv->ptp_clock_ops = stmmac_ptp_clock_ops;
 
 	priv->ptp_clock = ptp_clock_register(&priv->ptp_clock_ops,
@@ -264,4 +300,6 @@ void stmmac_ptp_unregister(struct stmmac_priv *priv)
 		pr_debug("Removed PTP HW clock successfully on %s\n",
 			 priv->dev->name);
 	}
+
+	mutex_destroy(&priv->aux_ts_lock);
 }
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h
index f88727ce4d30..53172a439810 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h
@@ -73,6 +73,7 @@
 #define	PTP_ACR_ATSEN1		BIT(5)	/* Auxiliary Snapshot 1 Enable */
 #define	PTP_ACR_ATSEN2		BIT(6)	/* Auxiliary Snapshot 2 Enable */
 #define	PTP_ACR_ATSEN3		BIT(7)	/* Auxiliary Snapshot 3 Enable */
+#define	PTP_ACR_ATSEN_SHIFT	5	/* Auxiliary Snapshot shift */
 #define	PTP_ACR_MASK		GENMASK(7, 4)	/* Aux Snapshot Mask */
 #define	PMC_ART_VALUE0		0x01	/* PMC_ART[15:0] timer value */
 #define	PMC_ART_VALUE1		0x02	/* PMC_ART[31:16] timer value */
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index e338ef7abc00..97edb31d6310 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -238,6 +238,8 @@ struct plat_stmmacenet_data {
 	struct pci_dev *pdev;
 	bool has_crossts;
 	int int_snapshot_num;
+	int ext_snapshot_num;
+	bool ext_snapshot_en;
 	bool multi_msi_en;
 	int msi_mac_vec;
 	int msi_wol_vec;
-- 
cgit v1.2.3


From 98602c010ceba82f2c2384122dbd07bc965fd367 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Thu, 8 Apr 2021 14:51:22 +0200
Subject: tty: create internal tty.h file

There are a number of functions and #defines in include/linux/tty.h that
do not belong there as they are private to the tty core code.

Create an initial drivers/tty/tty.h file and copy the odd "tty logging"
macros into it to seed the file with some initial things that we know
nothing outside of the tty core should be calling.

Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Cc: Jiri Slaby <jirislaby@kernel.org>
Link: https://lore.kernel.org/r/20210408125134.3016837-2-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/n_tty.c       |  1 +
 drivers/tty/pty.c         |  1 +
 drivers/tty/tty.h         | 21 +++++++++++++++++++++
 drivers/tty/tty_io.c      |  1 +
 drivers/tty/tty_jobctrl.c |  1 +
 drivers/tty/tty_ldisc.c   |  1 +
 drivers/tty/tty_port.c    |  1 +
 include/linux/tty.h       | 12 ------------
 8 files changed, 27 insertions(+), 12 deletions(-)
 create mode 100644 drivers/tty/tty.h

(limited to 'include/linux')

diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c
index a6f3c5c148eb..9686c5d10571 100644
--- a/drivers/tty/n_tty.c
+++ b/drivers/tty/n_tty.c
@@ -49,6 +49,7 @@
 #include <linux/module.h>
 #include <linux/ratelimit.h>
 #include <linux/vmalloc.h>
+#include "tty.h"
 
 /*
  * Until this number of characters is queued in the xmit buffer, select will
diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c
index 106265a3ff9c..9b5d4ae5d8f2 100644
--- a/drivers/tty/pty.c
+++ b/drivers/tty/pty.c
@@ -29,6 +29,7 @@
 #include <linux/file.h>
 #include <linux/ioctl.h>
 #include <linux/compat.h>
+#include "tty.h"
 
 #undef TTY_DEBUG_HANGUP
 #ifdef TTY_DEBUG_HANGUP
diff --git a/drivers/tty/tty.h b/drivers/tty/tty.h
new file mode 100644
index 000000000000..f4cd20261e91
--- /dev/null
+++ b/drivers/tty/tty.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * TTY core internal functions
+ */
+
+#ifndef _TTY_INTERNAL_H
+#define _TTY_INTERNAL_H
+
+#define tty_msg(fn, tty, f, ...) \
+	fn("%s %s: " f, tty_driver_name(tty), tty_name(tty), ##__VA_ARGS__)
+
+#define tty_debug(tty, f, ...)	tty_msg(pr_debug, tty, f, ##__VA_ARGS__)
+#define tty_info(tty, f, ...)	tty_msg(pr_info, tty, f, ##__VA_ARGS__)
+#define tty_notice(tty, f, ...)	tty_msg(pr_notice, tty, f, ##__VA_ARGS__)
+#define tty_warn(tty, f, ...)	tty_msg(pr_warn, tty, f, ##__VA_ARGS__)
+#define tty_err(tty, f, ...)	tty_msg(pr_err, tty, f, ##__VA_ARGS__)
+
+#define tty_info_ratelimited(tty, f, ...) \
+		tty_msg(pr_info_ratelimited, tty, f, ##__VA_ARGS__)
+
+#endif
diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index 36e6b28fb80d..7bb3524e1111 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -108,6 +108,7 @@
 
 #include <linux/kmod.h>
 #include <linux/nsproxy.h>
+#include "tty.h"
 
 #undef TTY_DEBUG_HANGUP
 #ifdef TTY_DEBUG_HANGUP
diff --git a/drivers/tty/tty_jobctrl.c b/drivers/tty/tty_jobctrl.c
index 72df97b27227..7813dc910a19 100644
--- a/drivers/tty/tty_jobctrl.c
+++ b/drivers/tty/tty_jobctrl.c
@@ -11,6 +11,7 @@
 #include <linux/tty.h>
 #include <linux/fcntl.h>
 #include <linux/uaccess.h>
+#include "tty.h"
 
 static int is_ignored(int sig)
 {
diff --git a/drivers/tty/tty_ldisc.c b/drivers/tty/tty_ldisc.c
index fdc4fa31ca72..03f414172f34 100644
--- a/drivers/tty/tty_ldisc.c
+++ b/drivers/tty/tty_ldisc.c
@@ -19,6 +19,7 @@
 #include <linux/seq_file.h>
 #include <linux/uaccess.h>
 #include <linux/ratelimit.h>
+#include "tty.h"
 
 #undef LDISC_DEBUG_HANGUP
 
diff --git a/drivers/tty/tty_port.c b/drivers/tty/tty_port.c
index 346d20f4a486..303c198fbf5c 100644
--- a/drivers/tty/tty_port.c
+++ b/drivers/tty/tty_port.c
@@ -18,6 +18,7 @@
 #include <linux/delay.h>
 #include <linux/module.h>
 #include <linux/serdev.h>
+#include "tty.h"
 
 static int tty_port_default_receive_buf(struct tty_port *port,
 					const unsigned char *p,
diff --git a/include/linux/tty.h b/include/linux/tty.h
index a228c0ee484f..c8af52451587 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -772,16 +772,4 @@ static inline void proc_tty_register_driver(struct tty_driver *d) {}
 static inline void proc_tty_unregister_driver(struct tty_driver *d) {}
 #endif
 
-#define tty_msg(fn, tty, f, ...) \
-	fn("%s %s: " f, tty_driver_name(tty), tty_name(tty), ##__VA_ARGS__)
-
-#define tty_debug(tty, f, ...)	tty_msg(pr_debug, tty, f, ##__VA_ARGS__)
-#define tty_info(tty, f, ...)	tty_msg(pr_info, tty, f, ##__VA_ARGS__)
-#define tty_notice(tty, f, ...)	tty_msg(pr_notice, tty, f, ##__VA_ARGS__)
-#define tty_warn(tty, f, ...)	tty_msg(pr_warn, tty, f, ##__VA_ARGS__)
-#define tty_err(tty, f, ...)	tty_msg(pr_err, tty, f, ##__VA_ARGS__)
-
-#define tty_info_ratelimited(tty, f, ...) \
-		tty_msg(pr_info_ratelimited, tty, f, ##__VA_ARGS__)
-
 #endif
-- 
cgit v1.2.3


From da5d669e00d2c437b3f508d60add417fc74f4bb6 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Thu, 8 Apr 2021 14:51:29 +0200
Subject: tty: audit: move some local functions out of tty.h

The functions tty_audit_add_data() and tty_audit_tiocsti() are local to
the tty core code, and do not need to be in a "kernel-wide" header file
so move them to drivers/tty/tty.h

Cc: Jiri Slaby <jirislaby@kernel.org>
Link: https://lore.kernel.org/r/20210408125134.3016837-9-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/tty.h       | 14 ++++++++++++++
 drivers/tty/tty_audit.c |  1 +
 include/linux/tty.h     | 10 ----------
 3 files changed, 15 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/tty.h b/drivers/tty/tty.h
index 75624d7d84ae..55003840cd68 100644
--- a/drivers/tty/tty.h
+++ b/drivers/tty/tty.h
@@ -17,4 +17,18 @@
 #define tty_info_ratelimited(tty, f, ...) \
 		tty_msg(pr_info_ratelimited, tty, f, ##__VA_ARGS__)
 
+/* tty_audit.c */
+#ifdef CONFIG_AUDIT
+void tty_audit_add_data(struct tty_struct *tty, const void *data, size_t size);
+void tty_audit_tiocsti(struct tty_struct *tty, char ch);
+#else
+static inline void tty_audit_add_data(struct tty_struct *tty, const void *data,
+				      size_t size)
+{
+}
+static inline void tty_audit_tiocsti(struct tty_struct *tty, char ch)
+{
+}
+#endif
+
 #endif
diff --git a/drivers/tty/tty_audit.c b/drivers/tty/tty_audit.c
index 32898aabcd06..ca7afd7b2716 100644
--- a/drivers/tty/tty_audit.c
+++ b/drivers/tty/tty_audit.c
@@ -10,6 +10,7 @@
 #include <linux/audit.h>
 #include <linux/slab.h>
 #include <linux/tty.h>
+#include "tty.h"
 
 struct tty_audit_buf {
 	struct mutex mutex;	/* Protects all data below */
diff --git a/include/linux/tty.h b/include/linux/tty.h
index c8af52451587..516c63c1bfe2 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -717,20 +717,10 @@ static inline void n_tty_init(void) { }
 
 /* tty_audit.c */
 #ifdef CONFIG_AUDIT
-extern void tty_audit_add_data(struct tty_struct *tty, const void *data,
-			       size_t size);
 extern void tty_audit_exit(void);
 extern void tty_audit_fork(struct signal_struct *sig);
-extern void tty_audit_tiocsti(struct tty_struct *tty, char ch);
 extern int tty_audit_push(void);
 #else
-static inline void tty_audit_add_data(struct tty_struct *tty, const void *data,
-				      size_t size)
-{
-}
-static inline void tty_audit_tiocsti(struct tty_struct *tty, char ch)
-{
-}
 static inline void tty_audit_exit(void)
 {
 }
-- 
cgit v1.2.3


From 6c80c0b94b94192d9a34b400f8237703c6475f4d Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Thu, 8 Apr 2021 14:51:30 +0200
Subject: tty: move some internal tty lock enums and functions out of tty.h

Move the TTY_LOCK_* enums and tty_ldisc lock functions out of the global
tty.h into the local header file to clean things up.

Cc: Jiri Slaby <jirislaby@kernel.org>
Link: https://lore.kernel.org/r/20210408125134.3016837-10-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/tty.h        | 26 ++++++++++++++++++++++++++
 drivers/tty/tty_buffer.c |  2 +-
 drivers/tty/tty_mutex.c  |  1 +
 include/linux/tty.h      | 26 --------------------------
 4 files changed, 28 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/tty.h b/drivers/tty/tty.h
index 55003840cd68..af997a4174c5 100644
--- a/drivers/tty/tty.h
+++ b/drivers/tty/tty.h
@@ -17,6 +17,32 @@
 #define tty_info_ratelimited(tty, f, ...) \
 		tty_msg(pr_info_ratelimited, tty, f, ##__VA_ARGS__)
 
+/*
+ * Lock subclasses for tty locks
+ *
+ * TTY_LOCK_NORMAL is for normal ttys and master ptys.
+ * TTY_LOCK_SLAVE is for slave ptys only.
+ *
+ * Lock subclasses are necessary for handling nested locking with pty pairs.
+ * tty locks which use nested locking:
+ *
+ * legacy_mutex - Nested tty locks are necessary for releasing pty pairs.
+ *		  The stable lock order is master pty first, then slave pty.
+ * termios_rwsem - The stable lock order is tty_buffer lock->termios_rwsem.
+ *		   Subclassing this lock enables the slave pty to hold its
+ *		   termios_rwsem when claiming the master tty_buffer lock.
+ * tty_buffer lock - slave ptys can claim nested buffer lock when handling
+ *		     signal chars. The stable lock order is slave pty, then
+ *		     master.
+ */
+enum {
+	TTY_LOCK_NORMAL = 0,
+	TTY_LOCK_SLAVE,
+};
+
+int tty_ldisc_lock(struct tty_struct *tty, unsigned long timeout);
+void tty_ldisc_unlock(struct tty_struct *tty);
+
 /* tty_audit.c */
 #ifdef CONFIG_AUDIT
 void tty_audit_add_data(struct tty_struct *tty, const void *data, size_t size);
diff --git a/drivers/tty/tty_buffer.c b/drivers/tty/tty_buffer.c
index 6d4995a5f318..9733469a14b2 100644
--- a/drivers/tty/tty_buffer.c
+++ b/drivers/tty/tty_buffer.c
@@ -17,7 +17,7 @@
 #include <linux/delay.h>
 #include <linux/module.h>
 #include <linux/ratelimit.h>
-
+#include "tty.h"
 
 #define MIN_TTYB_SIZE	256
 #define TTYB_ALIGN_MASK	255
diff --git a/drivers/tty/tty_mutex.c b/drivers/tty/tty_mutex.c
index 2640635ee177..393518a24cfe 100644
--- a/drivers/tty/tty_mutex.c
+++ b/drivers/tty/tty_mutex.c
@@ -4,6 +4,7 @@
 #include <linux/kallsyms.h>
 #include <linux/semaphore.h>
 #include <linux/sched.h>
+#include "tty.h"
 
 /* Legacy tty mutex glue */
 
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 516c63c1bfe2..031e7e4b5c97 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -16,30 +16,6 @@
 #include <linux/llist.h>
 
 
-/*
- * Lock subclasses for tty locks
- *
- * TTY_LOCK_NORMAL is for normal ttys and master ptys.
- * TTY_LOCK_SLAVE is for slave ptys only.
- *
- * Lock subclasses are necessary for handling nested locking with pty pairs.
- * tty locks which use nested locking:
- *
- * legacy_mutex - Nested tty locks are necessary for releasing pty pairs.
- *		  The stable lock order is master pty first, then slave pty.
- * termios_rwsem - The stable lock order is tty_buffer lock->termios_rwsem.
- *		   Subclassing this lock enables the slave pty to hold its
- *		   termios_rwsem when claiming the master tty_buffer lock.
- * tty_buffer lock - slave ptys can claim nested buffer lock when handling
- *		     signal chars. The stable lock order is slave pty, then
- *		     master.
- */
-
-enum {
-	TTY_LOCK_NORMAL = 0,
-	TTY_LOCK_SLAVE,
-};
-
 /*
  * (Note: the *_driver.minor_start values 1, 64, 128, 192 are
  * hardcoded at present.)
@@ -419,8 +395,6 @@ extern struct tty_struct *tty_kopen_exclusive(dev_t device);
 extern struct tty_struct *tty_kopen_shared(dev_t device);
 extern void tty_kclose(struct tty_struct *tty);
 extern int tty_dev_name_to_number(const char *name, dev_t *number);
-extern int tty_ldisc_lock(struct tty_struct *tty, unsigned long timeout);
-extern void tty_ldisc_unlock(struct tty_struct *tty);
 extern ssize_t redirected_tty_write(struct kiocb *, struct iov_iter *);
 extern struct file *tty_release_redirect(struct tty_struct *tty);
 #else
-- 
cgit v1.2.3


From dd9f611442578ca8392e90adbca5815d79946b5e Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Thu, 8 Apr 2021 14:51:31 +0200
Subject: tty: make tty_release_redirect() static

No one calls this outside of the tty_io.c file, so mark this static and
do not export the symbol anymore.

Cc: Jiri Slaby <jirislaby@kernel.org>
Link: https://lore.kernel.org/r/20210408125134.3016837-11-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/tty_io.c | 3 +--
 include/linux/tty.h  | 1 -
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index 7bb3524e1111..5b5e99604989 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -547,7 +547,7 @@ EXPORT_SYMBOL_GPL(tty_wakeup);
  *	This is available to the pty code so if the master closes, if the
  *	slave is a redirect it can release the redirect.
  */
-struct file *tty_release_redirect(struct tty_struct *tty)
+static struct file *tty_release_redirect(struct tty_struct *tty)
 {
 	struct file *f = NULL;
 
@@ -560,7 +560,6 @@ struct file *tty_release_redirect(struct tty_struct *tty)
 
 	return f;
 }
-EXPORT_SYMBOL_GPL(tty_release_redirect);
 
 /**
  *	__tty_hangup		-	actual handler for hangup events
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 031e7e4b5c97..23e5028ab52d 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -396,7 +396,6 @@ extern struct tty_struct *tty_kopen_shared(dev_t device);
 extern void tty_kclose(struct tty_struct *tty);
 extern int tty_dev_name_to_number(const char *name, dev_t *number);
 extern ssize_t redirected_tty_write(struct kiocb *, struct iov_iter *);
-extern struct file *tty_release_redirect(struct tty_struct *tty);
 #else
 static inline void tty_kref_put(struct tty_struct *tty)
 { }
-- 
cgit v1.2.3


From 9f72cab1596327e1011ab4599c07b165e0fb45db Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Thu, 8 Apr 2021 14:51:32 +0200
Subject: tty: move some tty-only functions to drivers/tty/tty.h

The flow change and redirected_tty_write() logic is internal to the tty
core only, so move it out of the include/linux/tty.h file.

Cc: Jiri Slaby <jirislaby@kernel.org>
Link: https://lore.kernel.org/r/20210408125134.3016837-12-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/tty.h       | 17 +++++++++++++++++
 drivers/tty/tty_ioctl.c |  1 +
 include/linux/tty.h     | 16 ----------------
 3 files changed, 18 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/tty.h b/drivers/tty/tty.h
index af997a4174c5..cf0b1346b346 100644
--- a/drivers/tty/tty.h
+++ b/drivers/tty/tty.h
@@ -40,6 +40,21 @@ enum {
 	TTY_LOCK_SLAVE,
 };
 
+/* Values for tty->flow_change */
+#define TTY_THROTTLE_SAFE	1
+#define TTY_UNTHROTTLE_SAFE	2
+
+static inline void __tty_set_flow_change(struct tty_struct *tty, int val)
+{
+	tty->flow_change = val;
+}
+
+static inline void tty_set_flow_change(struct tty_struct *tty, int val)
+{
+	tty->flow_change = val;
+	smp_mb();
+}
+
 int tty_ldisc_lock(struct tty_struct *tty, unsigned long timeout);
 void tty_ldisc_unlock(struct tty_struct *tty);
 
@@ -57,4 +72,6 @@ static inline void tty_audit_tiocsti(struct tty_struct *tty, char ch)
 }
 #endif
 
+ssize_t redirected_tty_write(struct kiocb *, struct iov_iter *);
+
 #endif
diff --git a/drivers/tty/tty_ioctl.c b/drivers/tty/tty_ioctl.c
index 420e7678330f..41f7449d0464 100644
--- a/drivers/tty/tty_ioctl.c
+++ b/drivers/tty/tty_ioctl.c
@@ -21,6 +21,7 @@
 #include <linux/bitops.h>
 #include <linux/mutex.h>
 #include <linux/compat.h>
+#include "tty.h"
 
 #include <asm/io.h>
 #include <linux/uaccess.h>
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 23e5028ab52d..cf65618e7bd9 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -349,21 +349,6 @@ struct tty_file_private {
 #define TTY_LDISC_CHANGING	20	/* Change pending - non-block IO */
 #define TTY_LDISC_HALTED	22	/* Line discipline is halted */
 
-/* Values for tty->flow_change */
-#define TTY_THROTTLE_SAFE 1
-#define TTY_UNTHROTTLE_SAFE 2
-
-static inline void __tty_set_flow_change(struct tty_struct *tty, int val)
-{
-	tty->flow_change = val;
-}
-
-static inline void tty_set_flow_change(struct tty_struct *tty, int val)
-{
-	tty->flow_change = val;
-	smp_mb();
-}
-
 static inline bool tty_io_nonblock(struct tty_struct *tty, struct file *file)
 {
 	return file->f_flags & O_NONBLOCK ||
@@ -395,7 +380,6 @@ extern struct tty_struct *tty_kopen_exclusive(dev_t device);
 extern struct tty_struct *tty_kopen_shared(dev_t device);
 extern void tty_kclose(struct tty_struct *tty);
 extern int tty_dev_name_to_number(const char *name, dev_t *number);
-extern ssize_t redirected_tty_write(struct kiocb *, struct iov_iter *);
 #else
 static inline void tty_kref_put(struct tty_struct *tty)
 { }
-- 
cgit v1.2.3


From 5ffa6e344a1c92a27c242f500fc74e6eb361a4bc Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Thu, 8 Apr 2021 14:51:34 +0200
Subject: tty: clean include/linux/tty.h up

There are a lot of tty-core-only functions that are listed in
include/linux/tty.h.  Move them to drivers/tty/tty.h so that no one else
can accidentally call them or think that they are public functions.

Cc: Jiri Slaby <jirislaby@kernel.org>
Link: https://lore.kernel.org/r/20210408125134.3016837-14-gregkh@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/n_gsm.c        |  1 +
 drivers/tty/n_hdlc.c       |  1 +
 drivers/tty/tty.h          | 37 +++++++++++++++++++++++++++++++++++++
 drivers/tty/tty_baudrate.c |  1 +
 include/linux/tty.h        | 34 ----------------------------------
 5 files changed, 40 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/n_gsm.c b/drivers/tty/n_gsm.c
index 2514d529f5f3..5fea02cfb0cc 100644
--- a/drivers/tty/n_gsm.c
+++ b/drivers/tty/n_gsm.c
@@ -50,6 +50,7 @@
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/gsmmux.h>
+#include "tty.h"
 
 static int debug;
 module_param(debug, int, 0600);
diff --git a/drivers/tty/n_hdlc.c b/drivers/tty/n_hdlc.c
index 290c757db1b9..b0f33e8ac819 100644
--- a/drivers/tty/n_hdlc.c
+++ b/drivers/tty/n_hdlc.c
@@ -100,6 +100,7 @@
 
 #include <asm/termios.h>
 #include <linux/uaccess.h>
+#include "tty.h"
 
 /*
  * Buffers for individual HDLC frames
diff --git a/drivers/tty/tty.h b/drivers/tty/tty.h
index cf0b1346b346..b710c5ef89ab 100644
--- a/drivers/tty/tty.h
+++ b/drivers/tty/tty.h
@@ -58,6 +58,43 @@ static inline void tty_set_flow_change(struct tty_struct *tty, int val)
 int tty_ldisc_lock(struct tty_struct *tty, unsigned long timeout);
 void tty_ldisc_unlock(struct tty_struct *tty);
 
+int __tty_check_change(struct tty_struct *tty, int sig);
+int tty_check_change(struct tty_struct *tty);
+void __stop_tty(struct tty_struct *tty);
+void __start_tty(struct tty_struct *tty);
+void tty_vhangup_session(struct tty_struct *tty);
+void tty_open_proc_set_tty(struct file *filp, struct tty_struct *tty);
+int tty_signal_session_leader(struct tty_struct *tty, int exit_session);
+void session_clear_tty(struct pid *session);
+void tty_buffer_free_all(struct tty_port *port);
+void tty_buffer_flush(struct tty_struct *tty, struct tty_ldisc *ld);
+void tty_buffer_init(struct tty_port *port);
+void tty_buffer_set_lock_subclass(struct tty_port *port);
+bool tty_buffer_restart_work(struct tty_port *port);
+bool tty_buffer_cancel_work(struct tty_port *port);
+void tty_buffer_flush_work(struct tty_port *port);
+speed_t tty_termios_input_baud_rate(struct ktermios *termios);
+void tty_ldisc_hangup(struct tty_struct *tty, bool reset);
+int tty_ldisc_reinit(struct tty_struct *tty, int disc);
+long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+long tty_jobctrl_ioctl(struct tty_struct *tty, struct tty_struct *real_tty,
+		       struct file *file, unsigned int cmd, unsigned long arg);
+void tty_default_fops(struct file_operations *fops);
+struct tty_struct *alloc_tty_struct(struct tty_driver *driver, int idx);
+int tty_alloc_file(struct file *file);
+void tty_add_file(struct tty_struct *tty, struct file *file);
+void tty_free_file(struct file *file);
+int tty_release(struct inode *inode, struct file *filp);
+
+#define tty_is_writelocked(tty)  (mutex_is_locked(&tty->atomic_write_lock))
+
+int tty_ldisc_setup(struct tty_struct *tty, struct tty_struct *o_tty);
+void tty_ldisc_release(struct tty_struct *tty);
+int __must_check tty_ldisc_init(struct tty_struct *tty);
+void tty_ldisc_deinit(struct tty_struct *tty);
+
+void tty_sysctl_init(void);
+
 /* tty_audit.c */
 #ifdef CONFIG_AUDIT
 void tty_audit_add_data(struct tty_struct *tty, const void *data, size_t size);
diff --git a/drivers/tty/tty_baudrate.c b/drivers/tty/tty_baudrate.c
index 6551b188b736..48b5de659c77 100644
--- a/drivers/tty/tty_baudrate.c
+++ b/drivers/tty/tty_baudrate.c
@@ -8,6 +8,7 @@
 #include <linux/termios.h>
 #include <linux/tty.h>
 #include <linux/export.h>
+#include "tty.h"
 
 
 /*
diff --git a/include/linux/tty.h b/include/linux/tty.h
index cf65618e7bd9..e5d6b1f28823 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -432,11 +432,7 @@ static inline struct tty_struct *tty_kref_get(struct tty_struct *tty)
 
 extern const char *tty_driver_name(const struct tty_struct *tty);
 extern void tty_wait_until_sent(struct tty_struct *tty, long timeout);
-extern int __tty_check_change(struct tty_struct *tty, int sig);
-extern int tty_check_change(struct tty_struct *tty);
-extern void __stop_tty(struct tty_struct *tty);
 extern void stop_tty(struct tty_struct *tty);
-extern void __start_tty(struct tty_struct *tty);
 extern void start_tty(struct tty_struct *tty);
 extern int tty_register_driver(struct tty_driver *driver);
 extern void tty_unregister_driver(struct tty_driver *driver);
@@ -463,23 +459,11 @@ extern int tty_get_icount(struct tty_struct *tty,
 extern int is_current_pgrp_orphaned(void);
 extern void tty_hangup(struct tty_struct *tty);
 extern void tty_vhangup(struct tty_struct *tty);
-extern void tty_vhangup_session(struct tty_struct *tty);
 extern int tty_hung_up_p(struct file *filp);
 extern void do_SAK(struct tty_struct *tty);
 extern void __do_SAK(struct tty_struct *tty);
-extern void tty_open_proc_set_tty(struct file *filp, struct tty_struct *tty);
-extern int tty_signal_session_leader(struct tty_struct *tty, int exit_session);
-extern void session_clear_tty(struct pid *session);
 extern void no_tty(void);
-extern void tty_buffer_free_all(struct tty_port *port);
-extern void tty_buffer_flush(struct tty_struct *tty, struct tty_ldisc *ld);
-extern void tty_buffer_init(struct tty_port *port);
-extern void tty_buffer_set_lock_subclass(struct tty_port *port);
-extern bool tty_buffer_restart_work(struct tty_port *port);
-extern bool tty_buffer_cancel_work(struct tty_port *port);
-extern void tty_buffer_flush_work(struct tty_port *port);
 extern speed_t tty_termios_baud_rate(struct ktermios *termios);
-extern speed_t tty_termios_input_baud_rate(struct ktermios *termios);
 extern void tty_termios_encode_baud_rate(struct ktermios *termios,
 						speed_t ibaud, speed_t obaud);
 extern void tty_encode_baud_rate(struct tty_struct *tty,
@@ -507,27 +491,16 @@ extern int tty_set_termios(struct tty_struct *tty, struct ktermios *kt);
 extern struct tty_ldisc *tty_ldisc_ref(struct tty_struct *);
 extern void tty_ldisc_deref(struct tty_ldisc *);
 extern struct tty_ldisc *tty_ldisc_ref_wait(struct tty_struct *);
-extern void tty_ldisc_hangup(struct tty_struct *tty, bool reset);
-extern int tty_ldisc_reinit(struct tty_struct *tty, int disc);
 extern const struct seq_operations tty_ldiscs_seq_ops;
 
 extern void tty_wakeup(struct tty_struct *tty);
 extern void tty_ldisc_flush(struct tty_struct *tty);
 
-extern long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 extern int tty_mode_ioctl(struct tty_struct *tty, struct file *file,
 			unsigned int cmd, unsigned long arg);
-extern long tty_jobctrl_ioctl(struct tty_struct *tty, struct tty_struct *real_tty,
-			      struct file *file, unsigned int cmd, unsigned long arg);
 extern int tty_perform_flush(struct tty_struct *tty, unsigned long arg);
-extern void tty_default_fops(struct file_operations *fops);
-extern struct tty_struct *alloc_tty_struct(struct tty_driver *driver, int idx);
-extern int tty_alloc_file(struct file *file);
-extern void tty_add_file(struct tty_struct *tty, struct file *file);
-extern void tty_free_file(struct file *file);
 extern struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx);
 extern void tty_release_struct(struct tty_struct *tty, int idx);
-extern int tty_release(struct inode *inode, struct file *filp);
 extern void tty_init_termios(struct tty_struct *tty);
 extern void tty_save_termios(struct tty_struct *tty);
 extern int tty_standard_install(struct tty_driver *driver,
@@ -535,8 +508,6 @@ extern int tty_standard_install(struct tty_driver *driver,
 
 extern struct mutex tty_mutex;
 
-#define tty_is_writelocked(tty)  (mutex_is_locked(&tty->atomic_write_lock))
-
 extern void tty_port_init(struct tty_port *port);
 extern void tty_port_link_device(struct tty_port *port,
 		struct tty_driver *driver, unsigned index);
@@ -656,13 +627,8 @@ static inline int tty_port_users(struct tty_port *port)
 extern int tty_register_ldisc(int disc, struct tty_ldisc_ops *new_ldisc);
 extern int tty_unregister_ldisc(int disc);
 extern int tty_set_ldisc(struct tty_struct *tty, int disc);
-extern int tty_ldisc_setup(struct tty_struct *tty, struct tty_struct *o_tty);
-extern void tty_ldisc_release(struct tty_struct *tty);
-extern int __must_check tty_ldisc_init(struct tty_struct *tty);
-extern void tty_ldisc_deinit(struct tty_struct *tty);
 extern int tty_ldisc_receive_buf(struct tty_ldisc *ld, const unsigned char *p,
 				 char *f, int count);
-extern void tty_sysctl_init(void);
 
 /* n_tty.c */
 extern void n_tty_inherit_ops(struct tty_ldisc_ops *ops);
-- 
cgit v1.2.3


From d3debfcc4e3f65f1370ad4ca2ab61e7f0ff683cd Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Wed, 14 Apr 2021 17:41:14 +0100
Subject: bug: Provide dummy version of bug_get_file_line() when !GENERIC_BUG

Provide the missing dummy bug_get_file_line() implementation when
GENERIC_BUG isn't selected.

Reported-by: kernel test robot <lkp@intel.com>
Fixes: 26dbc7e299c7 ("bug: Factor out a getter for a bug's file line")
Cc: Andrew Scull <ascull@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 include/linux/bug.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bug.h b/include/linux/bug.h
index e3841bee4c8d..348acf2558f3 100644
--- a/include/linux/bug.h
+++ b/include/linux/bug.h
@@ -61,6 +61,13 @@ static inline enum bug_trap_type report_bug(unsigned long bug_addr,
 	return BUG_TRAP_TYPE_BUG;
 }
 
+struct bug_entry;
+static inline void bug_get_file_line(struct bug_entry *bug, const char **file,
+				     unsigned int *line)
+{
+	*file = NULL;
+	*line = 0;
+}
 
 static inline void generic_bug_clear_once(void) {}
 
-- 
cgit v1.2.3


From c5797f8a64158f724238d13fa5a4b351b03fe42d Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 15 Apr 2021 15:53:13 -0700
Subject: ethtool: move ethtool_stats_init

We'll need it for FEC stats as well.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h | 6 ++++++
 net/ethtool/pause.c     | 6 ------
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 9f6f323af59a..069100b252bd 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -244,6 +244,12 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32,
 
 #define ETHTOOL_STAT_NOT_SET	(~0ULL)
 
+static inline void ethtool_stats_init(u64 *stats, unsigned int n)
+{
+	while (n--)
+		stats[n] = ETHTOOL_STAT_NOT_SET;
+}
+
 /**
  * struct ethtool_pause_stats - statistics for IEEE 802.3x pause frames
  * @tx_pause_frames: transmitted pause frame count. Reported to user space
diff --git a/net/ethtool/pause.c b/net/ethtool/pause.c
index 09998dc5c185..f1967c121278 100644
--- a/net/ethtool/pause.c
+++ b/net/ethtool/pause.c
@@ -21,12 +21,6 @@ const struct nla_policy ethnl_pause_get_policy[] = {
 		NLA_POLICY_NESTED(ethnl_header_policy_stats),
 };
 
-static void ethtool_stats_init(u64 *stats, unsigned int n)
-{
-	while (n--)
-		stats[n] = ETHTOOL_STAT_NOT_SET;
-}
-
 static int pause_prepare_data(const struct ethnl_req_info *req_base,
 			      struct ethnl_reply_data *reply_base,
 			      struct genl_info *info)
-- 
cgit v1.2.3


From be85dbfeb37c8c4d4344da2ee594d78034b82489 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 15 Apr 2021 15:53:15 -0700
Subject: ethtool: add FEC statistics

Similarly to pause statistics add stats for FEC.

The IEEE standard mandates two sets of counters:
 - 30.5.1.1.17 aFECCorrectedBlocks
 - 30.5.1.1.18 aFECUncorrectableBlocks
where block is a block of bits FEC operates on.
Each of these counters is defined per lane (PCS instance).

Multiple vendors provide number of corrected _bits_ rather
than/as well as blocks.

This set adds the 2 standard-based block counters and an extra
one for corrected bits.

Counters are exposed to user space via netlink in new attributes.
Each attribute carries an array of u64s, first element is
the total count, and the following ones are a per-lane break down.

Much like with pause stats the operation will not fail when driver
does not implement the get_fec_stats callback (nor can the driver
fail the operation by returning an error). If stats can't be
reported the relevant attributes will be empty.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ethtool-netlink.rst | 21 ++++++++
 Documentation/networking/statistics.rst      |  2 +
 include/linux/ethtool.h                      | 40 +++++++++++++++
 include/uapi/linux/ethtool_netlink.h         | 14 ++++++
 net/ethtool/fec.c                            | 73 +++++++++++++++++++++++++++-
 5 files changed, 149 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst
index bbecffc7b11a..f8219e2f489e 100644
--- a/Documentation/networking/ethtool-netlink.rst
+++ b/Documentation/networking/ethtool-netlink.rst
@@ -1302,6 +1302,7 @@ Kernel response contents:
   ``ETHTOOL_A_FEC_MODES``                bitset  configured modes
   ``ETHTOOL_A_FEC_AUTO``                 bool    FEC mode auto selection
   ``ETHTOOL_A_FEC_ACTIVE``               u32     index of active FEC mode
+  ``ETHTOOL_A_FEC_STATS``                nested  FEC statistics
   =====================================  ======  ==========================
 
 ``ETHTOOL_A_FEC_ACTIVE`` is the bit index of the FEC link mode currently
@@ -1315,6 +1316,26 @@ This is equivalent to the ``ETHTOOL_FEC_AUTO`` bit of the ioctl interface.
 ``ETHTOOL_A_FEC_MODES`` carry the current FEC configuration using link mode
 bits (rather than old ``ETHTOOL_FEC_*`` bits).
 
+``ETHTOOL_A_FEC_STATS`` are reported if ``ETHTOOL_FLAG_STATS`` was set in
+``ETHTOOL_A_HEADER_FLAGS``.
+Each attribute carries an array of 64bit statistics. First entry in the array
+contains the total number of events on the port, while the following entries
+are counters corresponding to lanes/PCS instances. The number of entries in
+the array will be:
+
++--------------+---------------------------------------------+
+| `0`          | device does not support FEC statistics      |
++--------------+---------------------------------------------+
+| `1`          | device does not support per-lane break down |
++--------------+---------------------------------------------+
+| `1 + #lanes` | device has full support for FEC stats       |
++--------------+---------------------------------------------+
+
+Drivers fill in the statistics in the following structure:
+
+.. kernel-doc:: include/linux/ethtool.h
+    :identifiers: ethtool_fec_stats
+
 FEC_SET
 =======
 
diff --git a/Documentation/networking/statistics.rst b/Documentation/networking/statistics.rst
index 234abedc29b2..b748fe44ee02 100644
--- a/Documentation/networking/statistics.rst
+++ b/Documentation/networking/statistics.rst
@@ -130,6 +130,7 @@ the `ETHTOOL_FLAG_STATS` flag in `ETHTOOL_A_HEADER_FLAGS`. Currently
 statistics are supported in the following commands:
 
   - `ETHTOOL_MSG_PAUSE_GET`
+  - `ETHTOOL_MSG_FEC_GET`
 
 debugfs
 -------
@@ -176,3 +177,4 @@ translated to netlink attributes when dumped. Drivers must not overwrite
 the statistics they don't report with 0.
 
 - ethtool_pause_stats()
+- ethtool_fec_stats()
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 069100b252bd..112a85b57f1f 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -269,6 +269,39 @@ struct ethtool_pause_stats {
 	u64 rx_pause_frames;
 };
 
+#define ETHTOOL_MAX_LANES	8
+
+/**
+ * struct ethtool_fec_stats - statistics for IEEE 802.3 FEC
+ * @corrected_blocks: number of received blocks corrected by FEC
+ *	Reported to user space as %ETHTOOL_A_FEC_STAT_CORRECTED.
+ *
+ *	Equivalent to `30.5.1.1.17 aFECCorrectedBlocks` from the standard.
+ *
+ * @uncorrectable_blocks: number of received blocks FEC was not able to correct
+ *	Reported to user space as %ETHTOOL_A_FEC_STAT_UNCORR.
+ *
+ *	Equivalent to `30.5.1.1.18 aFECUncorrectableBlocks` from the standard.
+ *
+ * @corrected_bits: number of bits corrected by FEC
+ *	Similar to @corrected_blocks but counts individual bit changes,
+ *	not entire FEC data blocks. This is a non-standard statistic.
+ *	Reported to user space as %ETHTOOL_A_FEC_STAT_CORR_BITS.
+ *
+ * @lanes: per-lane/PCS-instance counts as defined by the standard
+ * @total: error counts for the entire port, for drivers incapable of reporting
+ *	per-lane stats
+ *
+ * Drivers should fill in either only total or per-lane statistics, core
+ * will take care of adding lane values up to produce the total.
+ */
+struct ethtool_fec_stats {
+	struct ethtool_fec_stat {
+		u64 total;
+		u64 lanes[ETHTOOL_MAX_LANES];
+	} corrected_blocks, uncorrectable_blocks, corrected_bits;
+};
+
 #define ETH_MODULE_EEPROM_PAGE_LEN	128
 #define ETH_MODULE_MAX_I2C_ADDRESS	0x7f
 
@@ -439,6 +472,11 @@ struct ethtool_module_eeprom {
  *	ignored (use %__ETHTOOL_LINK_MODE_MASK_NBITS instead of the latter),
  *	any change to them will be overwritten by kernel. Returns a negative
  *	error code or zero.
+ * @get_fec_stats: Report FEC statistics.
+ *	Core will sum up per-lane stats to get the total.
+ *	Drivers must not zero statistics which they don't report. The stats
+ *	structure is initialized to ETHTOOL_STAT_NOT_SET indicating driver does
+ *	not report statistics.
  * @get_fecparam: Get the network device Forward Error Correction parameters.
  * @set_fecparam: Set the network device Forward Error Correction parameters.
  * @get_ethtool_phy_stats: Return extended statistics about the PHY device.
@@ -544,6 +582,8 @@ struct ethtool_ops {
 				      struct ethtool_link_ksettings *);
 	int	(*set_link_ksettings)(struct net_device *,
 				      const struct ethtool_link_ksettings *);
+	void	(*get_fec_stats)(struct net_device *dev,
+				 struct ethtool_fec_stats *fec_stats);
 	int	(*get_fecparam)(struct net_device *,
 				      struct ethtool_fecparam *);
 	int	(*set_fecparam)(struct net_device *,
diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h
index 9612dcd48a6a..3a2b31ccbc5b 100644
--- a/include/uapi/linux/ethtool_netlink.h
+++ b/include/uapi/linux/ethtool_netlink.h
@@ -643,11 +643,25 @@ enum {
 	ETHTOOL_A_FEC_MODES,				/* bitset */
 	ETHTOOL_A_FEC_AUTO,				/* u8 */
 	ETHTOOL_A_FEC_ACTIVE,				/* u32 */
+	ETHTOOL_A_FEC_STATS,				/* nest - _A_FEC_STAT */
 
 	__ETHTOOL_A_FEC_CNT,
 	ETHTOOL_A_FEC_MAX = (__ETHTOOL_A_FEC_CNT - 1)
 };
 
+enum {
+	ETHTOOL_A_FEC_STAT_UNSPEC,
+	ETHTOOL_A_FEC_STAT_PAD,
+
+	ETHTOOL_A_FEC_STAT_CORRECTED,			/* array, u64 */
+	ETHTOOL_A_FEC_STAT_UNCORR,			/* array, u64 */
+	ETHTOOL_A_FEC_STAT_CORR_BITS,			/* array, u64 */
+
+	/* add new constants above here */
+	__ETHTOOL_A_FEC_STAT_CNT,
+	ETHTOOL_A_FEC_STAT_MAX = (__ETHTOOL_A_FEC_STAT_CNT - 1)
+};
+
 /* MODULE EEPROM */
 
 enum {
diff --git a/net/ethtool/fec.c b/net/ethtool/fec.c
index 3e7d091ee7aa..8738dafd5417 100644
--- a/net/ethtool/fec.c
+++ b/net/ethtool/fec.c
@@ -13,6 +13,10 @@ struct fec_reply_data {
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(fec_link_modes);
 	u32 active_fec;
 	u8 fec_auto;
+	struct fec_stat_grp {
+		u64 stats[1 + ETHTOOL_MAX_LANES];
+		u8 cnt;
+	} corr, uncorr, corr_bits;
 };
 
 #define FEC_REPDATA(__reply_base) \
@@ -21,7 +25,7 @@ struct fec_reply_data {
 #define ETHTOOL_FEC_MASK	((ETHTOOL_FEC_LLRS << 1) - 1)
 
 const struct nla_policy ethnl_fec_get_policy[ETHTOOL_A_FEC_HEADER + 1] = {
-	[ETHTOOL_A_FEC_HEADER]	= NLA_POLICY_NESTED(ethnl_header_policy),
+	[ETHTOOL_A_FEC_HEADER]	= NLA_POLICY_NESTED(ethnl_header_policy_stats),
 };
 
 static void
@@ -64,6 +68,28 @@ ethtool_link_modes_to_fecparam(struct ethtool_fecparam *fec,
 	return 0;
 }
 
+static void
+fec_stats_recalc(struct fec_stat_grp *grp, struct ethtool_fec_stat *stats)
+{
+	int i;
+
+	if (stats->lanes[0] == ETHTOOL_STAT_NOT_SET) {
+		grp->stats[0] = stats->total;
+		grp->cnt = stats->total != ETHTOOL_STAT_NOT_SET;
+		return;
+	}
+
+	grp->cnt = 1;
+	grp->stats[0] = 0;
+	for (i = 0; i < ETHTOOL_MAX_LANES; i++) {
+		if (stats->lanes[i] == ETHTOOL_STAT_NOT_SET)
+			break;
+
+		grp->stats[0] += stats->lanes[i];
+		grp->stats[grp->cnt++] = stats->lanes[i];
+	}
+}
+
 static int fec_prepare_data(const struct ethnl_req_info *req_base,
 			    struct ethnl_reply_data *reply_base,
 			    struct genl_info *info)
@@ -82,6 +108,17 @@ static int fec_prepare_data(const struct ethnl_req_info *req_base,
 	ret = dev->ethtool_ops->get_fecparam(dev, &fec);
 	if (ret)
 		goto out_complete;
+	if (req_base->flags & ETHTOOL_FLAG_STATS &&
+	    dev->ethtool_ops->get_fec_stats) {
+		struct ethtool_fec_stats stats;
+
+		ethtool_stats_init((u64 *)&stats, sizeof(stats) / 8);
+		dev->ethtool_ops->get_fec_stats(dev, &stats);
+
+		fec_stats_recalc(&data->corr, &stats.corrected_blocks);
+		fec_stats_recalc(&data->uncorr, &stats.uncorrectable_blocks);
+		fec_stats_recalc(&data->corr_bits, &stats.corrected_bits);
+	}
 
 	WARN_ON_ONCE(fec.reserved);
 
@@ -120,9 +157,40 @@ static int fec_reply_size(const struct ethnl_req_info *req_base,
 	len += nla_total_size(sizeof(u8)) +	/* _FEC_AUTO */
 	       nla_total_size(sizeof(u32));	/* _FEC_ACTIVE */
 
+	if (req_base->flags & ETHTOOL_FLAG_STATS)
+		len += 3 * nla_total_size_64bit(sizeof(u64) *
+						(1 + ETHTOOL_MAX_LANES));
+
 	return len;
 }
 
+static int fec_put_stats(struct sk_buff *skb, const struct fec_reply_data *data)
+{
+	struct nlattr *nest;
+
+	nest = nla_nest_start(skb, ETHTOOL_A_FEC_STATS);
+	if (!nest)
+		return -EMSGSIZE;
+
+	if (nla_put_64bit(skb, ETHTOOL_A_FEC_STAT_CORRECTED,
+			  sizeof(u64) * data->corr.cnt,
+			  data->corr.stats, ETHTOOL_A_FEC_STAT_PAD) ||
+	    nla_put_64bit(skb, ETHTOOL_A_FEC_STAT_UNCORR,
+			  sizeof(u64) * data->uncorr.cnt,
+			  data->uncorr.stats, ETHTOOL_A_FEC_STAT_PAD) ||
+	    nla_put_64bit(skb, ETHTOOL_A_FEC_STAT_CORR_BITS,
+			  sizeof(u64) * data->corr_bits.cnt,
+			  data->corr_bits.stats, ETHTOOL_A_FEC_STAT_PAD))
+		goto err_cancel;
+
+	nla_nest_end(skb, nest);
+	return 0;
+
+err_cancel:
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
 static int fec_fill_reply(struct sk_buff *skb,
 			  const struct ethnl_req_info *req_base,
 			  const struct ethnl_reply_data *reply_base)
@@ -143,6 +211,9 @@ static int fec_fill_reply(struct sk_buff *skb,
 	     nla_put_u32(skb, ETHTOOL_A_FEC_ACTIVE, data->active_fec)))
 		return -EMSGSIZE;
 
+	if (req_base->flags & ETHTOOL_FLAG_STATS && fec_put_stats(skb, data))
+		return -EMSGSIZE;
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 80e5d1ff5d5f1ed5167a69b7c2fe86071b615f6b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 15 Apr 2021 19:46:50 -0400
Subject: useful constants: struct qstr for ".."

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c            | 2 ++
 fs/ext2/namei.c        | 3 +--
 fs/ext4/namei.c        | 3 +--
 fs/f2fs/dir.c          | 4 +---
 fs/f2fs/namei.c        | 3 +--
 fs/fuse/inode.c        | 3 +--
 fs/nilfs2/namei.c      | 3 +--
 fs/udf/namei.c         | 3 +--
 fs/ufs/super.c         | 3 +--
 include/linux/dcache.h | 1 +
 10 files changed, 11 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dcache.c b/fs/dcache.c
index 7d24ff7eb206..cf871a81f4fd 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -84,6 +84,8 @@ const struct qstr empty_name = QSTR_INIT("", 0);
 EXPORT_SYMBOL(empty_name);
 const struct qstr slash_name = QSTR_INIT("/", 1);
 EXPORT_SYMBOL(slash_name);
+const struct qstr dotdot_name = QSTR_INIT("..", 2);
+EXPORT_SYMBOL(dotdot_name);
 
 /*
  * This is the single most critical data structure when it comes
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 3367384d344d..765fb389550f 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -81,11 +81,10 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, uns
 
 struct dentry *ext2_get_parent(struct dentry *child)
 {
-	struct qstr dotdot = QSTR_INIT("..", 2);
 	ino_t ino;
 	int res;
 
-	res = ext2_inode_by_name(d_inode(child), &dotdot, &ino);
+	res = ext2_inode_by_name(d_inode(child), &dotdot_name, &ino);
 	if (res)
 		return ERR_PTR(res);
 
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 686bf982c84e..a9db885abb8b 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1739,11 +1739,10 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
 struct dentry *ext4_get_parent(struct dentry *child)
 {
 	__u32 ino;
-	static const struct qstr dotdot = QSTR_INIT("..", 2);
 	struct ext4_dir_entry_2 * de;
 	struct buffer_head *bh;
 
-	bh = ext4_find_entry(d_inode(child), &dotdot, &de, NULL);
+	bh = ext4_find_entry(d_inode(child), &dotdot_name, &de, NULL);
 	if (IS_ERR(bh))
 		return ERR_CAST(bh);
 	if (!bh)
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index e6270a867be1..e211a1b6b013 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -449,9 +449,7 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
 
 struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p)
 {
-	struct qstr dotdot = QSTR_INIT("..", 2);
-
-	return f2fs_find_entry(dir, &dotdot, p);
+	return f2fs_find_entry(dir, &dotdot_name, p);
 }
 
 ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr,
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 17bd072a5d39..f2ad7931e949 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -416,9 +416,8 @@ out:
 
 struct dentry *f2fs_get_parent(struct dentry *child)
 {
-	struct qstr dotdot = QSTR_INIT("..", 2);
 	struct page *page;
-	unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot, &page);
+	unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot_name, &page);
 	if (!ino) {
 		if (IS_ERR(page))
 			return ERR_CAST(page);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index b0e18b470e91..8665e4d24691 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -872,14 +872,13 @@ static struct dentry *fuse_get_parent(struct dentry *child)
 	struct inode *inode;
 	struct dentry *parent;
 	struct fuse_entry_out outarg;
-	const struct qstr name = QSTR_INIT("..", 2);
 	int err;
 
 	if (!fc->export_support)
 		return ERR_PTR(-ESTALE);
 
 	err = fuse_lookup_name(child_inode->i_sb, get_node_id(child_inode),
-			       &name, &outarg, &inode);
+			       &dotdot_name, &outarg, &inode);
 	if (err) {
 		if (err == -ENOENT)
 			return ERR_PTR(-ESTALE);
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index ecace5f96a95..b2709874ced5 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -440,10 +440,9 @@ static struct dentry *nilfs_get_parent(struct dentry *child)
 {
 	unsigned long ino;
 	struct inode *inode;
-	struct qstr dotdot = QSTR_INIT("..", 2);
 	struct nilfs_root *root;
 
-	ino = nilfs_inode_by_name(d_inode(child), &dotdot);
+	ino = nilfs_inode_by_name(d_inode(child), &dotdot_name);
 	if (!ino)
 		return ERR_PTR(-ENOENT);
 
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index f146b3089f3d..3ae9f1e91984 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -1215,11 +1215,10 @@ static struct dentry *udf_get_parent(struct dentry *child)
 {
 	struct kernel_lb_addr tloc;
 	struct inode *inode = NULL;
-	struct qstr dotdot = QSTR_INIT("..", 2);
 	struct fileIdentDesc cfi;
 	struct udf_fileident_bh fibh;
 
-	if (!udf_find_entry(d_inode(child), &dotdot, &fibh, &cfi))
+	if (!udf_find_entry(d_inode(child), &dotdot_name, &fibh, &cfi))
 		return ERR_PTR(-EACCES);
 
 	if (fibh.sbh != fibh.ebh)
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 983558b572c7..74028b5a7b0a 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -128,10 +128,9 @@ static struct dentry *ufs_fh_to_parent(struct super_block *sb, struct fid *fid,
 
 static struct dentry *ufs_get_parent(struct dentry *child)
 {
-	struct qstr dot_dot = QSTR_INIT("..", 2);
 	ino_t ino;
 
-	ino = ufs_inode_by_name(d_inode(child), &dot_dot);
+	ino = ufs_inode_by_name(d_inode(child), &dotdot_name);
 	if (!ino)
 		return ERR_PTR(-ENOENT);
 	return d_obtain_alias(ufs_iget(child->d_sb, ino));
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 4ecde5d8250c..9e23d33bb6f1 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -59,6 +59,7 @@ struct qstr {
 
 extern const struct qstr empty_name;
 extern const struct qstr slash_name;
+extern const struct qstr dotdot_name;
 
 struct dentry_stat_t {
 	long nr_dentry;
-- 
cgit v1.2.3


From 42eb0d54c08a0331d6d295420f602237968d792b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 25 Mar 2021 09:22:09 +0100
Subject: fs: split receive_fd_replace from __receive_fd

receive_fd_replace shares almost no code with the general case, so split
it out.  Also remove the "Bump the sock usage counts" comment from
both copies, as that is now what __receive_sock actually does.

[AV: ... and make the only user of receive_fd_replace() choose between
it and receive_fd() according to what userland had passed to it in
flags]

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/file.c            | 39 +++++++++++++++++++--------------------
 include/linux/file.h | 11 ++++-------
 kernel/seccomp.c     | 17 ++++++++++++-----
 3 files changed, 35 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/fs/file.c b/fs/file.c
index f3a4bac2cbe9..d8ccb95a7f41 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -1068,8 +1068,6 @@ out_unlock:
 
 /**
  * __receive_fd() - Install received file into file descriptor table
- *
- * @fd: fd to install into (if negative, a new fd will be allocated)
  * @file: struct file that was received from another process
  * @ufd: __user pointer to write new fd number to
  * @o_flags: the O_* flags to apply to the new fd entry
@@ -1083,7 +1081,7 @@ out_unlock:
  *
  * Returns newly install fd or -ve on error.
  */
-int __receive_fd(int fd, struct file *file, int __user *ufd, unsigned int o_flags)
+int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags)
 {
 	int new_fd;
 	int error;
@@ -1092,32 +1090,33 @@ int __receive_fd(int fd, struct file *file, int __user *ufd, unsigned int o_flag
 	if (error)
 		return error;
 
-	if (fd < 0) {
-		new_fd = get_unused_fd_flags(o_flags);
-		if (new_fd < 0)
-			return new_fd;
-	} else {
-		new_fd = fd;
-	}
+	new_fd = get_unused_fd_flags(o_flags);
+	if (new_fd < 0)
+		return new_fd;
 
 	if (ufd) {
 		error = put_user(new_fd, ufd);
 		if (error) {
-			if (fd < 0)
-				put_unused_fd(new_fd);
+			put_unused_fd(new_fd);
 			return error;
 		}
 	}
 
-	if (fd < 0) {
-		fd_install(new_fd, get_file(file));
-	} else {
-		error = replace_fd(new_fd, file, o_flags);
-		if (error)
-			return error;
-	}
+	fd_install(new_fd, get_file(file));
+	__receive_sock(file);
+	return new_fd;
+}
 
-	/* Bump the sock usage counts, if any. */
+int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags)
+{
+	int error;
+
+	error = security_file_receive(file);
+	if (error)
+		return error;
+	error = replace_fd(new_fd, file, o_flags);
+	if (error)
+		return error;
 	__receive_sock(file);
 	return new_fd;
 }
diff --git a/include/linux/file.h b/include/linux/file.h
index 225982792fa2..2de2e4613d7b 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -92,23 +92,20 @@ extern void put_unused_fd(unsigned int fd);
 
 extern void fd_install(unsigned int fd, struct file *file);
 
-extern int __receive_fd(int fd, struct file *file, int __user *ufd,
+extern int __receive_fd(struct file *file, int __user *ufd,
 			unsigned int o_flags);
 static inline int receive_fd_user(struct file *file, int __user *ufd,
 				  unsigned int o_flags)
 {
 	if (ufd == NULL)
 		return -EFAULT;
-	return __receive_fd(-1, file, ufd, o_flags);
+	return __receive_fd(file, ufd, o_flags);
 }
 static inline int receive_fd(struct file *file, unsigned int o_flags)
 {
-	return __receive_fd(-1, file, NULL, o_flags);
-}
-static inline int receive_fd_replace(int fd, struct file *file, unsigned int o_flags)
-{
-	return __receive_fd(fd, file, NULL, o_flags);
+	return __receive_fd(file, NULL, o_flags);
 }
+int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags);
 
 extern void flush_delayed_fput(void);
 extern void __fput_sync(struct file *);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 1d60fc2c9987..4fe19cecaa94 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -119,8 +119,11 @@ struct seccomp_kaddfd {
 	int fd;
 	unsigned int flags;
 
-	/* To only be set on reply */
-	int ret;
+	union {
+		bool setfd;
+		/* To only be set on reply */
+		int ret;
+	};
 	struct completion completion;
 	struct list_head list;
 };
@@ -1069,7 +1072,11 @@ static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd)
 	 * that it has been handled.
 	 */
 	list_del_init(&addfd->list);
-	addfd->ret = receive_fd_replace(addfd->fd, addfd->file, addfd->flags);
+	if (!addfd->setfd)
+		addfd->ret = receive_fd(addfd->file, addfd->flags);
+	else
+		addfd->ret = receive_fd_replace(addfd->fd, addfd->file,
+						addfd->flags);
 	complete(&addfd->completion);
 }
 
@@ -1583,8 +1590,8 @@ static long seccomp_notify_addfd(struct seccomp_filter *filter,
 		return -EBADF;
 
 	kaddfd.flags = addfd.newfd_flags;
-	kaddfd.fd = (addfd.flags & SECCOMP_ADDFD_FLAG_SETFD) ?
-		    addfd.newfd : -1;
+	kaddfd.setfd = addfd.flags & SECCOMP_ADDFD_FLAG_SETFD;
+	kaddfd.fd = addfd.newfd;
 	init_completion(&kaddfd.completion);
 
 	ret = mutex_lock_interruptible(&filter->notify_lock);
-- 
cgit v1.2.3


From 8d663f34f8afcf5fc6a84c3cc4fa28cc84d58e39 Mon Sep 17 00:00:00 2001
From: Lin Feng <linf@wangsu.com>
Date: Thu, 15 Apr 2021 11:39:20 +0800
Subject: blk-mq: bypass IO scheduler's limit_depth for passthrough request

Commit 01e99aeca39796003 ("blk-mq: insert passthrough request into
hctx->dispatch directly") gives high priority to passthrough requests and
bypasses the underlying IO scheduler. But when we allocate a tag for such a
request it still runs the io-scheduler's limit_depth callback, while what we
really want is to give the full sbitmap-depth capability to such a request
for acquiring an available tag.
blktrace shows PC requests(dmraid -s -c -i) hit bfq's limit_depth:
  8,0    2        0     0.000000000 39952 1,0  m   N bfq [bfq_limit_depth] wr_busy 0 sync 0 depth 8
  8,0    2        1     0.000008134 39952  D   R 4 [dmraid]
  8,0    2        2     0.000021538    24  C   R [0]
  8,0    2        0     0.000035442 39952 1,0  m   N bfq [bfq_limit_depth] wr_busy 0 sync 0 depth 8
  8,0    2        3     0.000038813 39952  D   R 24 [dmraid]
  8,0    2        4     0.000044356    24  C   R [0]

This patch introduces a new wrapper to keep the code readable.

Signed-off-by: Lin Feng <linf@wangsu.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20210415033920.213963-1-linf@wangsu.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c         | 3 ++-
 include/linux/blkdev.h | 6 ++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index d4d7c1caa439..927189a55575 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -361,11 +361,12 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
 
 	if (e) {
 		/*
-		 * Flush requests are special and go directly to the
+		 * Flush/passthrough requests are special and go directly to the
 		 * dispatch list. Don't include reserved tags in the
 		 * limiting, as it isn't useful.
 		 */
 		if (!op_is_flush(data->cmd_flags) &&
+		    !blk_op_is_passthrough(data->cmd_flags) &&
 		    e->type->ops.limit_depth &&
 		    !(data->flags & BLK_MQ_REQ_RESERVED))
 			e->type->ops.limit_depth(data->cmd_flags, data);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 30d2090583ad..f2e77ba97550 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -274,6 +274,12 @@ static inline bool bio_is_passthrough(struct bio *bio)
 	return blk_op_is_scsi(op) || blk_op_is_private(op);
 }
 
+static inline bool blk_op_is_passthrough(unsigned int op)
+{
+	return (blk_op_is_scsi(op & REQ_OP_MASK) ||
+			blk_op_is_private(op & REQ_OP_MASK));
+}
+
 static inline unsigned short req_get_ioprio(struct request *req)
 {
 	return req->ioprio;
-- 
cgit v1.2.3


From ef54c1a476aef7eef26fe13ea10dc090952c00f8 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 8 Apr 2021 12:35:56 +0200
Subject: perf: Rework perf_event_exit_event()

Make perf_event_exit_event() more robust, such that we can use it from
other contexts. Specifically the upcoming remove_on_exec.

For this to work we need to address a few issues. Remove_on_exec will
not destroy the entire context, so we cannot rely on TASK_TOMBSTONE to
disable event_function_call() and we thus have to use
perf_remove_from_context().

When using perf_remove_from_context(), there are two races to consider.
The first is against close(), where we can have concurrent tear-down
of the event. The second is against child_list iteration, which should
not find a half baked event.

To address this, teach perf_remove_from_context() to special-case
!ctx->is_active and teach it about DETACH_CHILD.

[ elver@google.com: fix racing parent/child exit in sync_child_event(). ]
Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20210408103605.1676875-2-elver@google.com
---
 include/linux/perf_event.h |   1 +
 kernel/events/core.c       | 142 +++++++++++++++++++++++++--------------------
 2 files changed, 80 insertions(+), 63 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 3f7f89ea5e51..3d478abf411c 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -607,6 +607,7 @@ struct swevent_hlist {
 #define PERF_ATTACH_TASK_DATA	0x08
 #define PERF_ATTACH_ITRACE	0x10
 #define PERF_ATTACH_SCHED_CB	0x20
+#define PERF_ATTACH_CHILD	0x40
 
 struct perf_cgroup;
 struct perf_buffer;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f07943183041..318ff7b021b4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2205,6 +2205,26 @@ out:
 	perf_event__header_size(leader);
 }
 
+static void sync_child_event(struct perf_event *child_event);
+
+static void perf_child_detach(struct perf_event *event)
+{
+	struct perf_event *parent_event = event->parent;
+
+	if (!(event->attach_state & PERF_ATTACH_CHILD))
+		return;
+
+	event->attach_state &= ~PERF_ATTACH_CHILD;
+
+	if (WARN_ON_ONCE(!parent_event))
+		return;
+
+	lockdep_assert_held(&parent_event->child_mutex);
+
+	sync_child_event(event);
+	list_del_init(&event->child_list);
+}
+
 static bool is_orphaned_event(struct perf_event *event)
 {
 	return event->state == PERF_EVENT_STATE_DEAD;
@@ -2312,6 +2332,7 @@ group_sched_out(struct perf_event *group_event,
 }
 
 #define DETACH_GROUP	0x01UL
+#define DETACH_CHILD	0x02UL
 
 /*
  * Cross CPU call to remove a performance event
@@ -2335,6 +2356,8 @@ __perf_remove_from_context(struct perf_event *event,
 	event_sched_out(event, cpuctx, ctx);
 	if (flags & DETACH_GROUP)
 		perf_group_detach(event);
+	if (flags & DETACH_CHILD)
+		perf_child_detach(event);
 	list_del_event(event, ctx);
 
 	if (!ctx->nr_events && ctx->is_active) {
@@ -2363,25 +2386,21 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long fla
 
 	lockdep_assert_held(&ctx->mutex);
 
-	event_function_call(event, __perf_remove_from_context, (void *)flags);
-
 	/*
-	 * The above event_function_call() can NO-OP when it hits
-	 * TASK_TOMBSTONE. In that case we must already have been detached
-	 * from the context (by perf_event_exit_event()) but the grouping
-	 * might still be in-tact.
+	 * Because of perf_event_exit_task(), perf_remove_from_context() ought
+	 * to work in the face of TASK_TOMBSTONE, unlike every other
+	 * event_function_call() user.
 	 */
-	WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
-	if ((flags & DETACH_GROUP) &&
-	    (event->attach_state & PERF_ATTACH_GROUP)) {
-		/*
-		 * Since in that case we cannot possibly be scheduled, simply
-		 * detach now.
-		 */
-		raw_spin_lock_irq(&ctx->lock);
-		perf_group_detach(event);
+	raw_spin_lock_irq(&ctx->lock);
+	if (!ctx->is_active) {
+		__perf_remove_from_context(event, __get_cpu_context(ctx),
+					   ctx, (void *)flags);
 		raw_spin_unlock_irq(&ctx->lock);
+		return;
 	}
+	raw_spin_unlock_irq(&ctx->lock);
+
+	event_function_call(event, __perf_remove_from_context, (void *)flags);
 }
 
 /*
@@ -12377,14 +12396,17 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
 }
 EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
 
-static void sync_child_event(struct perf_event *child_event,
-			       struct task_struct *child)
+static void sync_child_event(struct perf_event *child_event)
 {
 	struct perf_event *parent_event = child_event->parent;
 	u64 child_val;
 
-	if (child_event->attr.inherit_stat)
-		perf_event_read_event(child_event, child);
+	if (child_event->attr.inherit_stat) {
+		struct task_struct *task = child_event->ctx->task;
+
+		if (task && task != TASK_TOMBSTONE)
+			perf_event_read_event(child_event, task);
+	}
 
 	child_val = perf_event_count(child_event);
 
@@ -12399,60 +12421,53 @@ static void sync_child_event(struct perf_event *child_event,
 }
 
 static void
-perf_event_exit_event(struct perf_event *child_event,
-		      struct perf_event_context *child_ctx,
-		      struct task_struct *child)
+perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
 {
-	struct perf_event *parent_event = child_event->parent;
+	struct perf_event *parent_event = event->parent;
+	unsigned long detach_flags = 0;
 
-	/*
-	 * Do not destroy the 'original' grouping; because of the context
-	 * switch optimization the original events could've ended up in a
-	 * random child task.
-	 *
-	 * If we were to destroy the original group, all group related
-	 * operations would cease to function properly after this random
-	 * child dies.
-	 *
-	 * Do destroy all inherited groups, we don't care about those
-	 * and being thorough is better.
-	 */
-	raw_spin_lock_irq(&child_ctx->lock);
-	WARN_ON_ONCE(child_ctx->is_active);
+	if (parent_event) {
+		/*
+		 * Do not destroy the 'original' grouping; because of the
+		 * context switch optimization the original events could've
+		 * ended up in a random child task.
+		 *
+		 * If we were to destroy the original group, all group related
+		 * operations would cease to function properly after this
+		 * random child dies.
+		 *
+		 * Do destroy all inherited groups, we don't care about those
+		 * and being thorough is better.
+		 */
+		detach_flags = DETACH_GROUP | DETACH_CHILD;
+		mutex_lock(&parent_event->child_mutex);
+	}
 
-	if (parent_event)
-		perf_group_detach(child_event);
-	list_del_event(child_event, child_ctx);
-	perf_event_set_state(child_event, PERF_EVENT_STATE_EXIT); /* is_event_hup() */
-	raw_spin_unlock_irq(&child_ctx->lock);
+	perf_remove_from_context(event, detach_flags);
+
+	raw_spin_lock_irq(&ctx->lock);
+	if (event->state > PERF_EVENT_STATE_EXIT)
+		perf_event_set_state(event, PERF_EVENT_STATE_EXIT);
+	raw_spin_unlock_irq(&ctx->lock);
 
 	/*
-	 * Parent events are governed by their filedesc, retain them.
+	 * Child events can be freed.
 	 */
-	if (!parent_event) {
-		perf_event_wakeup(child_event);
+	if (parent_event) {
+		mutex_unlock(&parent_event->child_mutex);
+		/*
+		 * Kick perf_poll() for is_event_hup();
+		 */
+		perf_event_wakeup(parent_event);
+		free_event(event);
+		put_event(parent_event);
 		return;
 	}
-	/*
-	 * Child events can be cleaned up.
-	 */
-
-	sync_child_event(child_event, child);
 
 	/*
-	 * Remove this event from the parent's list
-	 */
-	WARN_ON_ONCE(parent_event->ctx->parent_ctx);
-	mutex_lock(&parent_event->child_mutex);
-	list_del_init(&child_event->child_list);
-	mutex_unlock(&parent_event->child_mutex);
-
-	/*
-	 * Kick perf_poll() for is_event_hup().
+	 * Parent events are governed by their filedesc, retain them.
 	 */
-	perf_event_wakeup(parent_event);
-	free_event(child_event);
-	put_event(parent_event);
+	perf_event_wakeup(event);
 }
 
 static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
@@ -12509,7 +12524,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
 	perf_event_task(child, child_ctx, 0);
 
 	list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
-		perf_event_exit_event(child_event, child_ctx, child);
+		perf_event_exit_event(child_event, child_ctx);
 
 	mutex_unlock(&child_ctx->mutex);
 
@@ -12769,6 +12784,7 @@ inherit_event(struct perf_event *parent_event,
 	 */
 	raw_spin_lock_irqsave(&child_ctx->lock, flags);
 	add_event_to_ctx(child_event, child_ctx);
+	child_event->attach_state |= PERF_ATTACH_CHILD;
 	raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
 
 	/*
-- 
cgit v1.2.3


From 2b26f0aa004995f49f7b6f4100dd0e4c39a9ed5f Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Thu, 8 Apr 2021 12:35:58 +0200
Subject: perf: Support only inheriting events if cloned with CLONE_THREAD

Adds bit perf_event_attr::inherit_thread, to restrict inheriting events
to the case where the child was cloned with CLONE_THREAD.

This option supports the case where an event is supposed to be
process-wide only (including subthreads), but should not propagate
beyond the current process's shared environment.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/lkml/YBvj6eJR%2FDY2TsEB@hirez.programming.kicks-ass.net/
---
 include/linux/perf_event.h      |  5 +++--
 include/uapi/linux/perf_event.h |  3 ++-
 kernel/events/core.c            | 21 ++++++++++++++-------
 kernel/fork.c                   |  2 +-
 4 files changed, 20 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 3d478abf411c..1660039199b2 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -958,7 +958,7 @@ extern void __perf_event_task_sched_in(struct task_struct *prev,
 				       struct task_struct *task);
 extern void __perf_event_task_sched_out(struct task_struct *prev,
 					struct task_struct *next);
-extern int perf_event_init_task(struct task_struct *child);
+extern int perf_event_init_task(struct task_struct *child, u64 clone_flags);
 extern void perf_event_exit_task(struct task_struct *child);
 extern void perf_event_free_task(struct task_struct *task);
 extern void perf_event_delayed_put(struct task_struct *task);
@@ -1449,7 +1449,8 @@ perf_event_task_sched_in(struct task_struct *prev,
 static inline void
 perf_event_task_sched_out(struct task_struct *prev,
 			  struct task_struct *next)			{ }
-static inline int perf_event_init_task(struct task_struct *child)	{ return 0; }
+static inline int perf_event_init_task(struct task_struct *child,
+				       u64 clone_flags)			{ return 0; }
 static inline void perf_event_exit_task(struct task_struct *child)	{ }
 static inline void perf_event_free_task(struct task_struct *task)	{ }
 static inline void perf_event_delayed_put(struct task_struct *task)	{ }
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index ad15e40d7f5d..813efb65fea8 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -389,7 +389,8 @@ struct perf_event_attr {
 				cgroup         :  1, /* include cgroup events */
 				text_poke      :  1, /* include text poke events */
 				build_id       :  1, /* use build id in mmap2 events */
-				__reserved_1   : 29;
+				inherit_thread :  1, /* children only inherit if cloned with CLONE_THREAD */
+				__reserved_1   : 28;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 10ed2cd434dc..3e3c00fd0b2e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11653,6 +11653,9 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
 	    (attr->sample_type & PERF_SAMPLE_WEIGHT_STRUCT))
 		return -EINVAL;
 
+	if (!attr->inherit && attr->inherit_thread)
+		return -EINVAL;
+
 out:
 	return ret;
 
@@ -12873,12 +12876,13 @@ static int
 inherit_task_group(struct perf_event *event, struct task_struct *parent,
 		   struct perf_event_context *parent_ctx,
 		   struct task_struct *child, int ctxn,
-		   int *inherited_all)
+		   u64 clone_flags, int *inherited_all)
 {
 	int ret;
 	struct perf_event_context *child_ctx;
 
-	if (!event->attr.inherit) {
+	if (!event->attr.inherit ||
+	    (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD))) {
 		*inherited_all = 0;
 		return 0;
 	}
@@ -12910,7 +12914,8 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
 /*
  * Initialize the perf_event context in task_struct
  */
-static int perf_event_init_context(struct task_struct *child, int ctxn)
+static int perf_event_init_context(struct task_struct *child, int ctxn,
+				   u64 clone_flags)
 {
 	struct perf_event_context *child_ctx, *parent_ctx;
 	struct perf_event_context *cloned_ctx;
@@ -12950,7 +12955,8 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
 	 */
 	perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
 		ret = inherit_task_group(event, parent, parent_ctx,
-					 child, ctxn, &inherited_all);
+					 child, ctxn, clone_flags,
+					 &inherited_all);
 		if (ret)
 			goto out_unlock;
 	}
@@ -12966,7 +12972,8 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
 
 	perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
 		ret = inherit_task_group(event, parent, parent_ctx,
-					 child, ctxn, &inherited_all);
+					 child, ctxn, clone_flags,
+					 &inherited_all);
 		if (ret)
 			goto out_unlock;
 	}
@@ -13008,7 +13015,7 @@ out_unlock:
 /*
  * Initialize the perf_event context in task_struct
  */
-int perf_event_init_task(struct task_struct *child)
+int perf_event_init_task(struct task_struct *child, u64 clone_flags)
 {
 	int ctxn, ret;
 
@@ -13017,7 +13024,7 @@ int perf_event_init_task(struct task_struct *child)
 	INIT_LIST_HEAD(&child->perf_event_list);
 
 	for_each_task_context_nr(ctxn) {
-		ret = perf_event_init_context(child, ctxn);
+		ret = perf_event_init_context(child, ctxn, clone_flags);
 		if (ret) {
 			perf_event_free_task(child);
 			return ret;
diff --git a/kernel/fork.c b/kernel/fork.c
index 0acc8ed1076b..3728a645771c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2078,7 +2078,7 @@ static __latent_entropy struct task_struct *copy_process(
 	if (retval)
 		goto bad_fork_cleanup_policy;
 
-	retval = perf_event_init_task(p);
+	retval = perf_event_init_task(p, clone_flags);
 	if (retval)
 		goto bad_fork_cleanup_policy;
 	retval = audit_alloc(p);
-- 
cgit v1.2.3


From fb6cc127e0b6e629252cdd0f77d5a1f49db95b92 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Thu, 8 Apr 2021 12:36:00 +0200
Subject: signal: Introduce TRAP_PERF si_code and si_perf to siginfo

Introduces the TRAP_PERF si_code, and associated siginfo_t field
si_perf. These will be used by the perf event subsystem to send signals
(if requested) to the task where an event occurred.

Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org> # m68k
Acked-by: Arnd Bergmann <arnd@arndb.de> # asm-generic
Link: https://lkml.kernel.org/r/20210408103605.1676875-6-elver@google.com
---
 arch/m68k/kernel/signal.c          |  3 +++
 arch/x86/kernel/signal_compat.c    |  5 ++++-
 fs/signalfd.c                      |  4 ++++
 include/linux/compat.h             |  2 ++
 include/linux/signal.h             |  1 +
 include/uapi/asm-generic/siginfo.h |  6 +++++-
 include/uapi/linux/signalfd.h      |  4 +++-
 kernel/signal.c                    | 11 +++++++++++
 8 files changed, 33 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c
index 349570f16a78..a4b7ee1df211 100644
--- a/arch/m68k/kernel/signal.c
+++ b/arch/m68k/kernel/signal.c
@@ -622,6 +622,9 @@ static inline void siginfo_build_tests(void)
 	/* _sigfault._addr_pkey */
 	BUILD_BUG_ON(offsetof(siginfo_t, si_pkey) != 0x12);
 
+	/* _sigfault._perf */
+	BUILD_BUG_ON(offsetof(siginfo_t, si_perf) != 0x10);
+
 	/* _sigpoll */
 	BUILD_BUG_ON(offsetof(siginfo_t, si_band)   != 0x0c);
 	BUILD_BUG_ON(offsetof(siginfo_t, si_fd)     != 0x10);
diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
index a5330ff498f0..0e5d0a7e203b 100644
--- a/arch/x86/kernel/signal_compat.c
+++ b/arch/x86/kernel/signal_compat.c
@@ -29,7 +29,7 @@ static inline void signal_compat_build_tests(void)
 	BUILD_BUG_ON(NSIGFPE  != 15);
 	BUILD_BUG_ON(NSIGSEGV != 9);
 	BUILD_BUG_ON(NSIGBUS  != 5);
-	BUILD_BUG_ON(NSIGTRAP != 5);
+	BUILD_BUG_ON(NSIGTRAP != 6);
 	BUILD_BUG_ON(NSIGCHLD != 6);
 	BUILD_BUG_ON(NSIGSYS  != 2);
 
@@ -138,6 +138,9 @@ static inline void signal_compat_build_tests(void)
 	BUILD_BUG_ON(offsetof(siginfo_t, si_pkey) != 0x20);
 	BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pkey) != 0x14);
 
+	BUILD_BUG_ON(offsetof(siginfo_t, si_perf) != 0x18);
+	BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf) != 0x10);
+
 	CHECK_CSI_OFFSET(_sigpoll);
 	CHECK_CSI_SIZE  (_sigpoll, 2*sizeof(int));
 	CHECK_SI_SIZE   (_sigpoll, 4*sizeof(int));
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 456046e15873..040a1142915f 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -134,6 +134,10 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
 #endif
 		new.ssi_addr_lsb = (short) kinfo->si_addr_lsb;
 		break;
+	case SIL_PERF_EVENT:
+		new.ssi_addr = (long) kinfo->si_addr;
+		new.ssi_perf = kinfo->si_perf;
+		break;
 	case SIL_CHLD:
 		new.ssi_pid    = kinfo->si_pid;
 		new.ssi_uid    = kinfo->si_uid;
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 6e65be753603..c8821d966812 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -236,6 +236,8 @@ typedef struct compat_siginfo {
 					char _dummy_pkey[__COMPAT_ADDR_BND_PKEY_PAD];
 					u32 _pkey;
 				} _addr_pkey;
+				/* used when si_code=TRAP_PERF */
+				compat_u64 _perf;
 			};
 		} _sigfault;
 
diff --git a/include/linux/signal.h b/include/linux/signal.h
index 205526c4003a..1e98548d7cf6 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -43,6 +43,7 @@ enum siginfo_layout {
 	SIL_FAULT_MCEERR,
 	SIL_FAULT_BNDERR,
 	SIL_FAULT_PKUERR,
+	SIL_PERF_EVENT,
 	SIL_CHLD,
 	SIL_RT,
 	SIL_SYS,
diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h
index d2597000407a..d0bb9125c853 100644
--- a/include/uapi/asm-generic/siginfo.h
+++ b/include/uapi/asm-generic/siginfo.h
@@ -91,6 +91,8 @@ union __sifields {
 				char _dummy_pkey[__ADDR_BND_PKEY_PAD];
 				__u32 _pkey;
 			} _addr_pkey;
+			/* used when si_code=TRAP_PERF */
+			__u64 _perf;
 		};
 	} _sigfault;
 
@@ -155,6 +157,7 @@ typedef struct siginfo {
 #define si_lower	_sifields._sigfault._addr_bnd._lower
 #define si_upper	_sifields._sigfault._addr_bnd._upper
 #define si_pkey		_sifields._sigfault._addr_pkey._pkey
+#define si_perf		_sifields._sigfault._perf
 #define si_band		_sifields._sigpoll._band
 #define si_fd		_sifields._sigpoll._fd
 #define si_call_addr	_sifields._sigsys._call_addr
@@ -253,7 +256,8 @@ typedef struct siginfo {
 #define TRAP_BRANCH     3	/* process taken branch trap */
 #define TRAP_HWBKPT     4	/* hardware breakpoint/watchpoint */
 #define TRAP_UNK	5	/* undiagnosed trap */
-#define NSIGTRAP	5
+#define TRAP_PERF	6	/* perf event with sigtrap=1 */
+#define NSIGTRAP	6
 
 /*
  * There is an additional set of SIGTRAP si_codes used by ptrace
diff --git a/include/uapi/linux/signalfd.h b/include/uapi/linux/signalfd.h
index 83429a05b698..7e333042c7e3 100644
--- a/include/uapi/linux/signalfd.h
+++ b/include/uapi/linux/signalfd.h
@@ -39,6 +39,8 @@ struct signalfd_siginfo {
 	__s32 ssi_syscall;
 	__u64 ssi_call_addr;
 	__u32 ssi_arch;
+	__u32 __pad3;
+	__u64 ssi_perf;
 
 	/*
 	 * Pad strcture to 128 bytes. Remember to update the
@@ -49,7 +51,7 @@ struct signalfd_siginfo {
 	 * comes out of a read(2) and we really don't want to have
 	 * a compat on read(2).
 	 */
-	__u8 __pad[28];
+	__u8 __pad[16];
 };
 
 
diff --git a/kernel/signal.c b/kernel/signal.c
index ba4d1ef39a9e..f68351825e5e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1199,6 +1199,7 @@ static inline bool has_si_pid_and_uid(struct kernel_siginfo *info)
 	case SIL_FAULT_MCEERR:
 	case SIL_FAULT_BNDERR:
 	case SIL_FAULT_PKUERR:
+	case SIL_PERF_EVENT:
 	case SIL_SYS:
 		ret = false;
 		break;
@@ -2531,6 +2532,7 @@ static void hide_si_addr_tag_bits(struct ksignal *ksig)
 	case SIL_FAULT_MCEERR:
 	case SIL_FAULT_BNDERR:
 	case SIL_FAULT_PKUERR:
+	case SIL_PERF_EVENT:
 		ksig->info.si_addr = arch_untagged_si_addr(
 			ksig->info.si_addr, ksig->sig, ksig->info.si_code);
 		break;
@@ -3333,6 +3335,10 @@ void copy_siginfo_to_external32(struct compat_siginfo *to,
 #endif
 		to->si_pkey = from->si_pkey;
 		break;
+	case SIL_PERF_EVENT:
+		to->si_addr = ptr_to_compat(from->si_addr);
+		to->si_perf = from->si_perf;
+		break;
 	case SIL_CHLD:
 		to->si_pid = from->si_pid;
 		to->si_uid = from->si_uid;
@@ -3413,6 +3419,10 @@ static int post_copy_siginfo_from_user32(kernel_siginfo_t *to,
 #endif
 		to->si_pkey = from->si_pkey;
 		break;
+	case SIL_PERF_EVENT:
+		to->si_addr = compat_ptr(from->si_addr);
+		to->si_perf = from->si_perf;
+		break;
 	case SIL_CHLD:
 		to->si_pid    = from->si_pid;
 		to->si_uid    = from->si_uid;
@@ -4593,6 +4603,7 @@ static inline void siginfo_buildtime_checks(void)
 	CHECK_OFFSET(si_lower);
 	CHECK_OFFSET(si_upper);
 	CHECK_OFFSET(si_pkey);
+	CHECK_OFFSET(si_perf);
 
 	/* sigpoll */
 	CHECK_OFFSET(si_band);
-- 
cgit v1.2.3


From 97ba62b278674293762c3d91f724f1bb922f04e0 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Thu, 8 Apr 2021 12:36:01 +0200
Subject: perf: Add support for SIGTRAP on perf events

Adds bit perf_event_attr::sigtrap, which can be set to cause events to
send SIGTRAP (with si_code TRAP_PERF) to the task where the event
occurred. The primary motivation is to support synchronous signals on
perf events in the task where an event (such as breakpoints) triggered.

To distinguish perf events based on the event type, the type is set in
si_errno. For events that are associated with an address, si_addr is
copied from perf_sample_data.

The new field perf_event_attr::sig_data is copied to si_perf, which
allows user space to disambiguate which event (of the same type)
triggered the signal. For example, user space could encode the relevant
information it cares about in sig_data.

We note that the choice of an opaque u64 provides the simplest and most
flexible option. Alternatives where a reference to some user space data
is passed back suffer from the problem that modification of referenced
data (be it the event fd, or the perf_event_attr) can race with the
signal being delivered (of course, the same caveat applies if user space
decides to store a pointer in sig_data, but the ABI explicitly avoids
prescribing such a design).

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Dmitry Vyukov <dvyukov@google.com>
Link: https://lore.kernel.org/lkml/YBv3rAT566k+6zjg@hirez.programming.kicks-ass.net/
---
 include/linux/perf_event.h      |  1 +
 include/uapi/linux/perf_event.h | 10 ++++++++-
 kernel/events/core.c            | 49 ++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 58 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 1660039199b2..7d7280aa4e22 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -735,6 +735,7 @@ struct perf_event {
 	int				pending_wakeup;
 	int				pending_kill;
 	int				pending_disable;
+	unsigned long			pending_addr;	/* SIGTRAP */
 	struct irq_work			pending;
 
 	atomic_t			event_limit;
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 8c5b9f5ad63f..31b00e3b69c9 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -311,6 +311,7 @@ enum perf_event_read_format {
 #define PERF_ATTR_SIZE_VER4	104	/* add: sample_regs_intr */
 #define PERF_ATTR_SIZE_VER5	112	/* add: aux_watermark */
 #define PERF_ATTR_SIZE_VER6	120	/* add: aux_sample_size */
+#define PERF_ATTR_SIZE_VER7	128	/* add: sig_data */
 
 /*
  * Hardware event_id to monitor via a performance monitoring event:
@@ -391,7 +392,8 @@ struct perf_event_attr {
 				build_id       :  1, /* use build id in mmap2 events */
 				inherit_thread :  1, /* children only inherit if cloned with CLONE_THREAD */
 				remove_on_exec :  1, /* event is removed from task on exec */
-				__reserved_1   : 27;
+				sigtrap        :  1, /* send synchronous SIGTRAP on event */
+				__reserved_1   : 26;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
@@ -443,6 +445,12 @@ struct perf_event_attr {
 	__u16	__reserved_2;
 	__u32	aux_sample_size;
 	__u32	__reserved_3;
+
+	/*
+	 * User provided data if sigtrap=1, passed back to user via
+	 * siginfo_t::si_perf, e.g. to permit user to identify the event.
+	 */
+	__u64	sig_data;
 };
 
 /*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e4a584bceb7a..6f0723c711a9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6392,6 +6392,33 @@ void perf_event_wakeup(struct perf_event *event)
 	}
 }
 
+static void perf_sigtrap(struct perf_event *event)
+{
+	struct kernel_siginfo info;
+
+	/*
+	 * We'd expect this to only occur if the irq_work is delayed and either
+	 * ctx->task or current has changed in the meantime. This can be the
+	 * case on architectures that do not implement arch_irq_work_raise().
+	 */
+	if (WARN_ON_ONCE(event->ctx->task != current))
+		return;
+
+	/*
+	 * perf_pending_event() can race with the task exiting.
+	 */
+	if (current->flags & PF_EXITING)
+		return;
+
+	clear_siginfo(&info);
+	info.si_signo = SIGTRAP;
+	info.si_code = TRAP_PERF;
+	info.si_errno = event->attr.type;
+	info.si_perf = event->attr.sig_data;
+	info.si_addr = (void __user *)event->pending_addr;
+	force_sig_info(&info);
+}
+
 static void perf_pending_event_disable(struct perf_event *event)
 {
 	int cpu = READ_ONCE(event->pending_disable);
@@ -6401,6 +6428,13 @@ static void perf_pending_event_disable(struct perf_event *event)
 
 	if (cpu == smp_processor_id()) {
 		WRITE_ONCE(event->pending_disable, -1);
+
+		if (event->attr.sigtrap) {
+			perf_sigtrap(event);
+			atomic_set_release(&event->event_limit, 1); /* rearm event */
+			return;
+		}
+
 		perf_event_disable_local(event);
 		return;
 	}
@@ -9103,6 +9137,7 @@ static int __perf_event_overflow(struct perf_event *event,
 	if (events && atomic_dec_and_test(&event->event_limit)) {
 		ret = 1;
 		event->pending_kill = POLL_HUP;
+		event->pending_addr = data->addr;
 
 		perf_event_disable_inatomic(event);
 	}
@@ -11384,6 +11419,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 		if (!task || cpu != -1)
 			return ERR_PTR(-EINVAL);
 	}
+	if (attr->sigtrap && !task) {
+		/* Requires a task: avoid signalling random tasks. */
+		return ERR_PTR(-EINVAL);
+	}
 
 	node = (cpu >= 0) ? cpu_to_node(cpu) : -1;
 	event = kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO,
@@ -11432,6 +11471,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 
 	event->state		= PERF_EVENT_STATE_INACTIVE;
 
+	if (event->attr.sigtrap)
+		atomic_set(&event->event_limit, 1);
+
 	if (task) {
 		event->attach_state = PERF_ATTACH_TASK;
 		/*
@@ -11710,6 +11752,9 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
 	if (attr->remove_on_exec && attr->enable_on_exec)
 		return -EINVAL;
 
+	if (attr->sigtrap && !attr->remove_on_exec)
+		return -EINVAL;
+
 out:
 	return ret;
 
@@ -12936,7 +12981,9 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
 	struct perf_event_context *child_ctx;
 
 	if (!event->attr.inherit ||
-	    (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD))) {
+	    (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD)) ||
+	    /* Do not inherit if sigtrap and signal handlers were cleared. */
+	    (event->attr.sigtrap && (clone_flags & CLONE_CLEAR_SIGHAND))) {
 		*inherited_all = 0;
 		return 0;
 	}
-- 
cgit v1.2.3


From b02a4fd8148f655095d9e3d6eddd8f0042bcc27c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 25 Jan 2021 16:46:49 +0100
Subject: cpumask: Make cpu_{online,possible,present,active}() inline

Prepare for addition of another mask. Primarily a code movement to
avoid having to create more #ifdef, but while there, convert
everything with an argument to an inline function.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <valentin.schneider@arm.com>
Link: https://lkml.kernel.org/r/20210310150109.045447765@infradead.org
---
 include/linux/cpumask.h | 97 +++++++++++++++++++++++++++++++++----------------
 1 file changed, 66 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 383684e30f12..a58433668bb2 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -98,37 +98,6 @@ extern struct cpumask __cpu_active_mask;
 
 extern atomic_t __num_online_cpus;
 
-#if NR_CPUS > 1
-/**
- * num_online_cpus() - Read the number of online CPUs
- *
- * Despite the fact that __num_online_cpus is of type atomic_t, this
- * interface gives only a momentary snapshot and is not protected against
- * concurrent CPU hotplug operations unless invoked from a cpuhp_lock held
- * region.
- */
-static inline unsigned int num_online_cpus(void)
-{
-	return atomic_read(&__num_online_cpus);
-}
-#define num_possible_cpus()	cpumask_weight(cpu_possible_mask)
-#define num_present_cpus()	cpumask_weight(cpu_present_mask)
-#define num_active_cpus()	cpumask_weight(cpu_active_mask)
-#define cpu_online(cpu)		cpumask_test_cpu((cpu), cpu_online_mask)
-#define cpu_possible(cpu)	cpumask_test_cpu((cpu), cpu_possible_mask)
-#define cpu_present(cpu)	cpumask_test_cpu((cpu), cpu_present_mask)
-#define cpu_active(cpu)		cpumask_test_cpu((cpu), cpu_active_mask)
-#else
-#define num_online_cpus()	1U
-#define num_possible_cpus()	1U
-#define num_present_cpus()	1U
-#define num_active_cpus()	1U
-#define cpu_online(cpu)		((cpu) == 0)
-#define cpu_possible(cpu)	((cpu) == 0)
-#define cpu_present(cpu)	((cpu) == 0)
-#define cpu_active(cpu)		((cpu) == 0)
-#endif
-
 extern cpumask_t cpus_booted_once_mask;
 
 static inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits)
@@ -894,6 +863,72 @@ static inline const struct cpumask *get_cpu_mask(unsigned int cpu)
 	return to_cpumask(p);
 }
 
+#if NR_CPUS > 1
+/**
+ * num_online_cpus() - Read the number of online CPUs
+ *
+ * Despite the fact that __num_online_cpus is of type atomic_t, this
+ * interface gives only a momentary snapshot and is not protected against
+ * concurrent CPU hotplug operations unless invoked from a cpuhp_lock held
+ * region.
+ */
+static inline unsigned int num_online_cpus(void)
+{
+	return atomic_read(&__num_online_cpus);
+}
+#define num_possible_cpus()	cpumask_weight(cpu_possible_mask)
+#define num_present_cpus()	cpumask_weight(cpu_present_mask)
+#define num_active_cpus()	cpumask_weight(cpu_active_mask)
+
+static inline bool cpu_online(unsigned int cpu)
+{
+	return cpumask_test_cpu(cpu, cpu_online_mask);
+}
+
+static inline bool cpu_possible(unsigned int cpu)
+{
+	return cpumask_test_cpu(cpu, cpu_possible_mask);
+}
+
+static inline bool cpu_present(unsigned int cpu)
+{
+	return cpumask_test_cpu(cpu, cpu_present_mask);
+}
+
+static inline bool cpu_active(unsigned int cpu)
+{
+	return cpumask_test_cpu(cpu, cpu_active_mask);
+}
+
+#else
+
+#define num_online_cpus()	1U
+#define num_possible_cpus()	1U
+#define num_present_cpus()	1U
+#define num_active_cpus()	1U
+
+static inline bool cpu_online(unsigned int cpu)
+{
+	return cpu == 0;
+}
+
+static inline bool cpu_possible(unsigned int cpu)
+{
+	return cpu == 0;
+}
+
+static inline bool cpu_present(unsigned int cpu)
+{
+	return cpu == 0;
+}
+
+static inline bool cpu_active(unsigned int cpu)
+{
+	return cpu == 0;
+}
+
+#endif /* NR_CPUS > 1 */
+
 #define cpu_is_offline(cpu)	unlikely(!cpu_online(cpu))
 
 #if NR_CPUS <= BITS_PER_LONG
-- 
cgit v1.2.3


From e40f74c535b8a0ecf3ef0388b51a34cdadb34fb5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 19 Jan 2021 18:43:45 +0100
Subject: cpumask: Introduce DYING mask

Introduce a cpumask that indicates (for each CPU) what direction the
CPU hotplug is currently going. Notably, it tracks rollbacks. Eg. when
an up fails and we do a roll-back down, it will accurately reflect the
direction.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <valentin.schneider@arm.com>
Link: https://lkml.kernel.org/r/20210310150109.151441252@infradead.org
---
 include/linux/cpumask.h | 20 ++++++++++++++++++++
 kernel/cpu.c            |  6 ++++++
 2 files changed, 26 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index a58433668bb2..e6b948a6000d 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -91,10 +91,12 @@ extern struct cpumask __cpu_possible_mask;
 extern struct cpumask __cpu_online_mask;
 extern struct cpumask __cpu_present_mask;
 extern struct cpumask __cpu_active_mask;
+extern struct cpumask __cpu_dying_mask;
 #define cpu_possible_mask ((const struct cpumask *)&__cpu_possible_mask)
 #define cpu_online_mask   ((const struct cpumask *)&__cpu_online_mask)
 #define cpu_present_mask  ((const struct cpumask *)&__cpu_present_mask)
 #define cpu_active_mask   ((const struct cpumask *)&__cpu_active_mask)
+#define cpu_dying_mask    ((const struct cpumask *)&__cpu_dying_mask)
 
 extern atomic_t __num_online_cpus;
 
@@ -826,6 +828,14 @@ set_cpu_active(unsigned int cpu, bool active)
 		cpumask_clear_cpu(cpu, &__cpu_active_mask);
 }
 
+static inline void
+set_cpu_dying(unsigned int cpu, bool dying)
+{
+	if (dying)
+		cpumask_set_cpu(cpu, &__cpu_dying_mask);
+	else
+		cpumask_clear_cpu(cpu, &__cpu_dying_mask);
+}
 
 /**
  * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask *
@@ -900,6 +910,11 @@ static inline bool cpu_active(unsigned int cpu)
 	return cpumask_test_cpu(cpu, cpu_active_mask);
 }
 
+static inline bool cpu_dying(unsigned int cpu)
+{
+	return cpumask_test_cpu(cpu, cpu_dying_mask);
+}
+
 #else
 
 #define num_online_cpus()	1U
@@ -927,6 +942,11 @@ static inline bool cpu_active(unsigned int cpu)
 	return cpu == 0;
 }
 
+static inline bool cpu_dying(unsigned int cpu)
+{
+	return false;
+}
+
 #endif /* NR_CPUS > 1 */
 
 #define cpu_is_offline(cpu)	unlikely(!cpu_online(cpu))
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 23505d6abd45..838dcf238f92 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -160,6 +160,9 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
 	int (*cb)(unsigned int cpu);
 	int ret, cnt;
 
+	if (cpu_dying(cpu) != !bringup)
+		set_cpu_dying(cpu, !bringup);
+
 	if (st->fail == state) {
 		st->fail = CPUHP_INVALID;
 		return -EAGAIN;
@@ -2512,6 +2515,9 @@ EXPORT_SYMBOL(__cpu_present_mask);
 struct cpumask __cpu_active_mask __read_mostly;
 EXPORT_SYMBOL(__cpu_active_mask);
 
+struct cpumask __cpu_dying_mask __read_mostly;
+EXPORT_SYMBOL(__cpu_dying_mask);
+
 atomic_t __num_online_cpus __read_mostly;
 EXPORT_SYMBOL(__num_online_cpus);
 
-- 
cgit v1.2.3


From 8a99b6833c884fa0e7919030d93fecedc69fc625 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 24 Mar 2021 11:43:21 +0100
Subject: sched: Move SCHED_DEBUG sysctl to debugfs

Stop polluting sysctl with undocumented knobs that really are debug
only; move them all to /debug/sched/ along with the existing
/debug/sched_* files.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Tested-by: Valentin Schneider <valentin.schneider@arm.com>
Link: https://lkml.kernel.org/r/20210412102001.287610138@infradead.org
---
 include/linux/sched/sysctl.h |  8 ++--
 kernel/sched/core.c          |  4 +-
 kernel/sched/debug.c         | 74 ++++++++++++++++++++++++++++++++--
 kernel/sched/fair.c          |  9 +----
 kernel/sched/sched.h         |  2 +
 kernel/sysctl.c              | 96 --------------------------------------------
 6 files changed, 80 insertions(+), 113 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 3c31ba88aca5..0a3f34638cf5 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -26,10 +26,11 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
 enum { sysctl_hung_task_timeout_secs = 0 };
 #endif
 
+extern unsigned int sysctl_sched_child_runs_first;
+
 extern unsigned int sysctl_sched_latency;
 extern unsigned int sysctl_sched_min_granularity;
 extern unsigned int sysctl_sched_wakeup_granularity;
-extern unsigned int sysctl_sched_child_runs_first;
 
 enum sched_tunable_scaling {
 	SCHED_TUNABLESCALING_NONE,
@@ -37,7 +38,7 @@ enum sched_tunable_scaling {
 	SCHED_TUNABLESCALING_LINEAR,
 	SCHED_TUNABLESCALING_END,
 };
-extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
+extern unsigned int sysctl_sched_tunable_scaling;
 
 extern unsigned int sysctl_numa_balancing_scan_delay;
 extern unsigned int sysctl_numa_balancing_scan_period_min;
@@ -47,9 +48,6 @@ extern unsigned int sysctl_numa_balancing_scan_size;
 #ifdef CONFIG_SCHED_DEBUG
 extern __read_mostly unsigned int sysctl_sched_migration_cost;
 extern __read_mostly unsigned int sysctl_sched_nr_migrate;
-
-int sched_proc_update_handler(struct ctl_table *table, int write,
-		void *buffer, size_t *length, loff_t *ppos);
 #endif
 
 /*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7d031da20df3..bac30db29d3f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5504,9 +5504,11 @@ static const struct file_operations sched_dynamic_fops = {
 	.release	= single_release,
 };
 
+extern struct dentry *debugfs_sched;
+
 static __init int sched_init_debug_dynamic(void)
 {
-	debugfs_create_file("sched_preempt", 0644, NULL, NULL, &sched_dynamic_fops);
+	debugfs_create_file("sched_preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops);
 	return 0;
 }
 late_initcall(sched_init_debug_dynamic);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4b49cc2af5c4..2093b9086e34 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -169,15 +169,81 @@ static const struct file_operations sched_feat_fops = {
 	.release	= single_release,
 };
 
+#ifdef CONFIG_SMP
+
+static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf,
+				   size_t cnt, loff_t *ppos)
+{
+	char buf[16];
+
+	if (cnt > 15)		/* leave room for the terminating NUL */
+		cnt = 15;
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+	buf[cnt] = '\0';	/* kstrtouint() parses a NUL-terminated string */
+
+	if (kstrtouint(buf, 10, &sysctl_sched_tunable_scaling))
+		return -EINVAL;
+
+	if (sched_update_scaling())
+		return -EINVAL;
+
+	*ppos += cnt;
+	return cnt;
+}
+
+static int sched_scaling_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "%d\n", sysctl_sched_tunable_scaling);
+	return 0;
+}
+
+static int sched_scaling_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, sched_scaling_show, NULL);
+}
+
+static const struct file_operations sched_scaling_fops = {
+	.open		= sched_scaling_open,
+	.write		= sched_scaling_write,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+#endif /* SMP */
+
 __read_mostly bool sched_debug_enabled;
 
+struct dentry *debugfs_sched;
+
 static __init int sched_init_debug(void)
 {
-	debugfs_create_file("sched_features", 0644, NULL, NULL,
-			&sched_feat_fops);
+	struct dentry __maybe_unused *numa;
 
-	debugfs_create_bool("sched_debug", 0644, NULL,
-			&sched_debug_enabled);
+	debugfs_sched = debugfs_create_dir("sched", NULL);
+
+	debugfs_create_file("features", 0644, debugfs_sched, NULL, &sched_feat_fops);
+	debugfs_create_bool("debug_enabled", 0644, debugfs_sched, &sched_debug_enabled);
+
+	debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency);
+	debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity);
+	debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity);
+
+#ifdef CONFIG_SMP
+	debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops);
+	debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost);
+	debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate);
+#endif
+
+#ifdef CONFIG_NUMA_BALANCING
+	numa = debugfs_create_dir("numa_balancing", debugfs_sched);
+
+	debugfs_create_u32("scan_delay_ms", 0644, numa, &sysctl_numa_balancing_scan_delay);
+	debugfs_create_u32("scan_period_min_ms", 0644, numa, &sysctl_numa_balancing_scan_period_min);
+	debugfs_create_u32("scan_period_max_ms", 0644, numa, &sysctl_numa_balancing_scan_period_max);
+	debugfs_create_u32("scan_size_mb", 0644, numa, &sysctl_numa_balancing_scan_size);
+#endif
 
 	return 0;
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9b8ae02f1994..b3ea14c08a9b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -49,7 +49,7 @@ static unsigned int normalized_sysctl_sched_latency	= 6000000ULL;
  *
  * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
  */
-enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
+unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
 
 /*
  * Minimal preemption granularity for CPU-bound tasks:
@@ -634,15 +634,10 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
  * Scheduling class statistics methods:
  */
 
-int sched_proc_update_handler(struct ctl_table *table, int write,
-		void *buffer, size_t *lenp, loff_t *ppos)
+int sched_update_scaling(void)
 {
-	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 	unsigned int factor = get_update_sysctl_factor();
 
-	if (ret || !write)
-		return ret;
-
 	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
 					sysctl_sched_min_granularity);
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7e7e936b4938..123ff3bb90bc 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1568,6 +1568,8 @@ static inline void unregister_sched_domain_sysctl(void)
 }
 #endif
 
+extern int sched_update_scaling(void);
+
 extern void flush_smp_call_function_from_idle(void);
 
 #else /* !CONFIG_SMP: */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 17f1cc9988ca..4bff44d47154 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -184,17 +184,6 @@ static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT;
 int sysctl_legacy_va_layout;
 #endif
 
-#ifdef CONFIG_SCHED_DEBUG
-static int min_sched_granularity_ns = 100000;		/* 100 usecs */
-static int max_sched_granularity_ns = NSEC_PER_SEC;	/* 1 second */
-static int min_wakeup_granularity_ns;			/* 0 usecs */
-static int max_wakeup_granularity_ns = NSEC_PER_SEC;	/* 1 second */
-#ifdef CONFIG_SMP
-static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
-static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
-#endif /* CONFIG_SMP */
-#endif /* CONFIG_SCHED_DEBUG */
-
 #ifdef CONFIG_COMPACTION
 static int min_extfrag_threshold;
 static int max_extfrag_threshold = 1000;
@@ -1659,91 +1648,6 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
-#ifdef CONFIG_SCHED_DEBUG
-	{
-		.procname	= "sched_min_granularity_ns",
-		.data		= &sysctl_sched_min_granularity,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= sched_proc_update_handler,
-		.extra1		= &min_sched_granularity_ns,
-		.extra2		= &max_sched_granularity_ns,
-	},
-	{
-		.procname	= "sched_latency_ns",
-		.data		= &sysctl_sched_latency,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= sched_proc_update_handler,
-		.extra1		= &min_sched_granularity_ns,
-		.extra2		= &max_sched_granularity_ns,
-	},
-	{
-		.procname	= "sched_wakeup_granularity_ns",
-		.data		= &sysctl_sched_wakeup_granularity,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= sched_proc_update_handler,
-		.extra1		= &min_wakeup_granularity_ns,
-		.extra2		= &max_wakeup_granularity_ns,
-	},
-#ifdef CONFIG_SMP
-	{
-		.procname	= "sched_tunable_scaling",
-		.data		= &sysctl_sched_tunable_scaling,
-		.maxlen		= sizeof(enum sched_tunable_scaling),
-		.mode		= 0644,
-		.proc_handler	= sched_proc_update_handler,
-		.extra1		= &min_sched_tunable_scaling,
-		.extra2		= &max_sched_tunable_scaling,
-	},
-	{
-		.procname	= "sched_migration_cost_ns",
-		.data		= &sysctl_sched_migration_cost,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
-	{
-		.procname	= "sched_nr_migrate",
-		.data		= &sysctl_sched_nr_migrate,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
-#endif /* CONFIG_SMP */
-#ifdef CONFIG_NUMA_BALANCING
-	{
-		.procname	= "numa_balancing_scan_delay_ms",
-		.data		= &sysctl_numa_balancing_scan_delay,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
-	{
-		.procname	= "numa_balancing_scan_period_min_ms",
-		.data		= &sysctl_numa_balancing_scan_period_min,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
-	{
-		.procname	= "numa_balancing_scan_period_max_ms",
-		.data		= &sysctl_numa_balancing_scan_period_max,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
-	{
-		.procname	= "numa_balancing_scan_size_mb",
-		.data		= &sysctl_numa_balancing_scan_size,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= SYSCTL_ONE,
-	},
-#endif /* CONFIG_NUMA_BALANCING */
-#endif /* CONFIG_SCHED_DEBUG */
 #ifdef CONFIG_SCHEDSTATS
 	{
 		.procname	= "sched_schedstats",
-- 
cgit v1.2.3


From 9af0440ec86ebdab075e1b3d231f81fe7decb575 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 25 Mar 2021 10:53:55 +0100
Subject: debugfs: Implement debugfs_create_str()

Implement debugfs_create_str() to easily display names and such in
debugfs.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Tested-by: Valentin Schneider <valentin.schneider@arm.com>
Link: https://lkml.kernel.org/r/20210412102001.415407080@infradead.org
---
 fs/debugfs/file.c       | 91 +++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/debugfs.h | 17 +++++++++
 2 files changed, 108 insertions(+)

(limited to 'include/linux')

diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 686e0ad28788..9b78e9ee5429 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -865,6 +865,97 @@ struct dentry *debugfs_create_bool(const char *name, umode_t mode,
 }
 EXPORT_SYMBOL_GPL(debugfs_create_bool);
 
+ssize_t debugfs_read_file_str(struct file *file, char __user *user_buf,
+			      size_t count, loff_t *ppos)
+{
+	struct dentry *dentry = F_DENTRY(file);
+	char *str, *copy = NULL;
+	int copy_len, len;
+	ssize_t ret;
+
+	ret = debugfs_file_get(dentry);
+	if (unlikely(ret))
+		return ret;
+
+	str = *(char **)file->private_data;
+	len = strlen(str) + 1;
+	copy = kmalloc(len, GFP_KERNEL);
+	if (!copy) {
+		debugfs_file_put(dentry);
+		return -ENOMEM;
+	}
+
+	copy_len = strscpy(copy, str, len);
+	debugfs_file_put(dentry);
+	if (copy_len < 0) {
+		kfree(copy);
+		return copy_len;
+	}
+
+	copy[copy_len] = '\n';	/* the NUL slot becomes the trailing newline */
+	/* Hand out copy_len + 1 bytes so the newline is part of the read. */
+	ret = simple_read_from_buffer(user_buf, count, ppos, copy, copy_len + 1);
+	kfree(copy);
+
+	return ret;
+}
+
+static ssize_t debugfs_write_file_str(struct file *file, const char __user *user_buf,
+				      size_t count, loff_t *ppos)
+{
+	/* Strings are exposed read-only: every write attempt is rejected. */
+	return -EINVAL;
+}
+
+static const struct file_operations fops_str = {
+	.read =		debugfs_read_file_str,
+	.write =	debugfs_write_file_str,	/* always fails with -EINVAL */
+	.open =		simple_open,
+	.llseek =	default_llseek,
+};
+
+static const struct file_operations fops_str_ro = {
+	.read =		debugfs_read_file_str,	/* no .write handler in this variant */
+	.open =		simple_open,
+	.llseek =	default_llseek,
+};
+
+static const struct file_operations fops_str_wo = {
+	.write =	debugfs_write_file_str,	/* always fails with -EINVAL; no .read here */
+	.open =		simple_open,
+	.llseek =	default_llseek,
+};
+
+/**
+ * debugfs_create_str - create a debugfs file that is used to read a string value
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *          directory dentry if set.  If this parameter is %NULL, then the
+ *          file will be created in the root of the debugfs filesystem.
+ * @value: a pointer to the string pointer (char *) that the file should
+ *         read from.
+ *
+ * This function creates a file in debugfs with the given name that
+ * exposes the string that @value points to.  Writing is not supported:
+ * the only write handler defined for these files always returns -EINVAL.
+ *
+ * This function returns nothing: the dentry of the new file is not handed
+ * back to the caller and a failure to create the file is not reported.
+ * NOTE(review): with no dentry to pass to debugfs_remove(), cleanup
+ * presumably has to go through @parent (no automatic cleanup happens if
+ * your module is unloaded, you are responsible here) - confirm.
+ *
+ * If debugfs is not enabled in the kernel, this function is an empty
+ * inline stub that does nothing.
+ */
+void debugfs_create_str(const char *name, umode_t mode,
+			struct dentry *parent, char **value)
+{
+	debugfs_create_mode_unsafe(name, mode, parent, value, &fops_str,
+				   &fops_str_ro, &fops_str_wo);
+}
+
 static ssize_t read_file_blob(struct file *file, char __user *user_buf,
 			      size_t count, loff_t *ppos)
 {
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index d6c4cc9ecc77..1fdb4343af9c 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -128,6 +128,8 @@ void debugfs_create_atomic_t(const char *name, umode_t mode,
 			     struct dentry *parent, atomic_t *value);
 struct dentry *debugfs_create_bool(const char *name, umode_t mode,
 				  struct dentry *parent, bool *value);
+void debugfs_create_str(const char *name, umode_t mode,
+			struct dentry *parent, char **value);
 
 struct dentry *debugfs_create_blob(const char *name, umode_t mode,
 				  struct dentry *parent,
@@ -156,6 +158,9 @@ ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf,
 ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf,
 				size_t count, loff_t *ppos);
 
+ssize_t debugfs_read_file_str(struct file *file, char __user *user_buf,
+			      size_t count, loff_t *ppos);
+
 #else
 
 #include <linux/err.h>
@@ -297,6 +302,11 @@ static inline struct dentry *debugfs_create_bool(const char *name, umode_t mode,
 	return ERR_PTR(-ENODEV);
 }
 
+static inline void debugfs_create_str(const char *name, umode_t mode,
+				      struct dentry *parent,
+				      char **value)
+{ }
+
 static inline struct dentry *debugfs_create_blob(const char *name, umode_t mode,
 				  struct dentry *parent,
 				  struct debugfs_blob_wrapper *blob)
@@ -348,6 +358,13 @@ static inline ssize_t debugfs_write_file_bool(struct file *file,
 	return -ENODEV;
 }
 
+static inline ssize_t debugfs_read_file_str(struct file *file,
+					    char __user *user_buf,
+					    size_t count, loff_t *ppos)
+{
+	return -ENODEV;
+}
+
 #endif
 
 /**
-- 
cgit v1.2.3


From c0aec6680b6c82fe893a546e322e1130cd5cf21e Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Thu, 1 Apr 2021 14:56:25 +0100
Subject: iommu: Statically set module owner

It happens that the 3 drivers which first supported being modular are
also ones which play games with their pgsize_bitmap, so have non-const
iommu_ops where dynamically setting the owner manages to work out OK.
However, it's less than ideal to force that upon all drivers which want
to be modular - like the new sprd-iommu driver which now has a potential
bug in that regard - so let's just statically set the module owner and
let ops remain const wherever possible.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Will Deacon <will@kernel.org>
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/31423b99ff609c3d4b291c701a7a7a810d9ce8dc.1617285386.git.robin.murphy@arm.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 1 +
 drivers/iommu/arm/arm-smmu/arm-smmu.c       | 1 +
 drivers/iommu/sprd-iommu.c                  | 1 +
 drivers/iommu/virtio-iommu.c                | 1 +
 include/linux/iommu.h                       | 9 +--------
 5 files changed, 5 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 6f8ffc86a115..bee7ce246dd2 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -2674,6 +2674,7 @@ static struct iommu_ops arm_smmu_ops = {
 	.sva_unbind		= arm_smmu_sva_unbind,
 	.sva_get_pasid		= arm_smmu_sva_get_pasid,
 	.pgsize_bitmap		= -1UL, /* Restricted during device attach */
+	.owner			= THIS_MODULE,
 };
 
 /* Probing and initialisation functions */
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index 3c6adcdb201b..837a2f1816b7 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -1577,6 +1577,7 @@ static struct iommu_ops arm_smmu_ops = {
 	.put_resv_regions	= generic_iommu_put_resv_regions,
 	.def_domain_type	= arm_smmu_def_domain_type,
 	.pgsize_bitmap		= -1UL, /* Restricted during device attach */
+	.owner			= THIS_MODULE,
 };
 
 static void arm_smmu_device_reset(struct arm_smmu_device *smmu)
diff --git a/drivers/iommu/sprd-iommu.c b/drivers/iommu/sprd-iommu.c
index 3811a5085b50..89272f9fd8f3 100644
--- a/drivers/iommu/sprd-iommu.c
+++ b/drivers/iommu/sprd-iommu.c
@@ -436,6 +436,7 @@ static const struct iommu_ops sprd_iommu_ops = {
 	.device_group	= sprd_iommu_device_group,
 	.of_xlate	= sprd_iommu_of_xlate,
 	.pgsize_bitmap	= ~0UL << SPRD_IOMMU_PAGE_SHIFT,
+	.owner		= THIS_MODULE,
 };
 
 static const struct of_device_id sprd_iommu_of_match[] = {
diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
index 2bfdd5734844..594ed827e944 100644
--- a/drivers/iommu/virtio-iommu.c
+++ b/drivers/iommu/virtio-iommu.c
@@ -945,6 +945,7 @@ static struct iommu_ops viommu_ops = {
 	.get_resv_regions	= viommu_get_resv_regions,
 	.put_resv_regions	= generic_iommu_put_resv_regions,
 	.of_xlate		= viommu_of_xlate,
+	.owner			= THIS_MODULE,
 };
 
 static int viommu_init_vqs(struct viommu_dev *viommu)
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 9223a8266b08..ce904cf4e774 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -361,19 +361,12 @@ int  iommu_device_link(struct iommu_device   *iommu, struct device *link);
 void iommu_device_unlink(struct iommu_device *iommu, struct device *link);
 int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain);
 
-static inline void __iommu_device_set_ops(struct iommu_device *iommu,
+static inline void iommu_device_set_ops(struct iommu_device *iommu,
 					  const struct iommu_ops *ops)
 {
 	iommu->ops = ops;
 }
 
-#define iommu_device_set_ops(iommu, ops)				\
-do {									\
-	struct iommu_ops *__ops = (struct iommu_ops *)(ops);		\
-	__ops->owner = THIS_MODULE;					\
-	__iommu_device_set_ops(iommu, __ops);				\
-} while (0)
-
 static inline void iommu_device_set_fwnode(struct iommu_device *iommu,
 					   struct fwnode_handle *fwnode)
 {
-- 
cgit v1.2.3


From 2d471b20c55e13c98d1dba413bf2de618e89cdac Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Thu, 1 Apr 2021 14:56:26 +0100
Subject: iommu: Streamline registration interface

Rather than have separate opaque setter functions that are easy to
overlook and lead to repetitive boilerplate in drivers, let's pass the
relevant initialisation parameters directly to iommu_device_register().

Acked-by: Will Deacon <will@kernel.org>
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/ab001b87c533b6f4db71eb90db6f888953986c36.1617285386.git.robin.murphy@arm.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/amd/init.c                    |  3 +--
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c |  5 +----
 drivers/iommu/arm/arm-smmu/arm-smmu.c       |  5 +----
 drivers/iommu/arm/arm-smmu/qcom_iommu.c     |  5 +----
 drivers/iommu/exynos-iommu.c                |  5 +----
 drivers/iommu/fsl_pamu_domain.c             |  4 +---
 drivers/iommu/intel/dmar.c                  |  4 +---
 drivers/iommu/intel/iommu.c                 |  3 +--
 drivers/iommu/iommu.c                       | 19 +++++++++++++++++-
 drivers/iommu/ipmmu-vmsa.c                  |  6 +-----
 drivers/iommu/msm_iommu.c                   |  5 +----
 drivers/iommu/mtk_iommu.c                   |  5 +----
 drivers/iommu/mtk_iommu_v1.c                |  4 +---
 drivers/iommu/omap-iommu.c                  |  5 +----
 drivers/iommu/rockchip-iommu.c              |  5 +----
 drivers/iommu/s390-iommu.c                  |  4 +---
 drivers/iommu/sprd-iommu.c                  |  5 +----
 drivers/iommu/sun50i-iommu.c                |  5 +----
 drivers/iommu/tegra-gart.c                  |  5 +----
 drivers/iommu/tegra-smmu.c                  |  5 +----
 drivers/iommu/virtio-iommu.c                |  5 +----
 include/linux/iommu.h                       | 30 ++++++-----------------------
 22 files changed, 44 insertions(+), 98 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
index 54f6e99528f3..d006724f4dc2 100644
--- a/drivers/iommu/amd/init.c
+++ b/drivers/iommu/amd/init.c
@@ -1888,8 +1888,7 @@ static int __init iommu_init_pci(struct amd_iommu *iommu)
 
 	iommu_device_sysfs_add(&iommu->iommu, &iommu->dev->dev,
 			       amd_iommu_groups, "ivhd%d", iommu->index);
-	iommu_device_set_ops(&iommu->iommu, &amd_iommu_ops);
-	iommu_device_register(&iommu->iommu);
+	iommu_device_register(&iommu->iommu, &amd_iommu_ops, NULL);
 
 	return pci_enable_device(iommu->dev);
 }
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index bee7ce246dd2..54b2f27b81d4 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -3666,10 +3666,7 @@ static int arm_smmu_device_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
-	iommu_device_set_ops(&smmu->iommu, &arm_smmu_ops);
-	iommu_device_set_fwnode(&smmu->iommu, dev->fwnode);
-
-	ret = iommu_device_register(&smmu->iommu);
+	ret = iommu_device_register(&smmu->iommu, &arm_smmu_ops, dev);
 	if (ret) {
 		dev_err(dev, "Failed to register iommu\n");
 		return ret;
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index 837a2f1816b7..6f72c4d208ca 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -2161,10 +2161,7 @@ static int arm_smmu_device_probe(struct platform_device *pdev)
 		return err;
 	}
 
-	iommu_device_set_ops(&smmu->iommu, &arm_smmu_ops);
-	iommu_device_set_fwnode(&smmu->iommu, dev->fwnode);
-
-	err = iommu_device_register(&smmu->iommu);
+	err = iommu_device_register(&smmu->iommu, &arm_smmu_ops, dev);
 	if (err) {
 		dev_err(dev, "Failed to register iommu\n");
 		return err;
diff --git a/drivers/iommu/arm/arm-smmu/qcom_iommu.c b/drivers/iommu/arm/arm-smmu/qcom_iommu.c
index 7f280c8d5c53..4294abe389b2 100644
--- a/drivers/iommu/arm/arm-smmu/qcom_iommu.c
+++ b/drivers/iommu/arm/arm-smmu/qcom_iommu.c
@@ -847,10 +847,7 @@ static int qcom_iommu_device_probe(struct platform_device *pdev)
 		return ret;
 	}
 
-	iommu_device_set_ops(&qcom_iommu->iommu, &qcom_iommu_ops);
-	iommu_device_set_fwnode(&qcom_iommu->iommu, dev->fwnode);
-
-	ret = iommu_device_register(&qcom_iommu->iommu);
+	ret = iommu_device_register(&qcom_iommu->iommu, &qcom_iommu_ops, dev);
 	if (ret) {
 		dev_err(dev, "Failed to register iommu\n");
 		return ret;
diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c
index 8fa9a591fb96..7623d8c371f5 100644
--- a/drivers/iommu/exynos-iommu.c
+++ b/drivers/iommu/exynos-iommu.c
@@ -630,10 +630,7 @@ static int exynos_sysmmu_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
-	iommu_device_set_ops(&data->iommu, &exynos_iommu_ops);
-	iommu_device_set_fwnode(&data->iommu, &dev->of_node->fwnode);
-
-	ret = iommu_device_register(&data->iommu);
+	ret = iommu_device_register(&data->iommu, &exynos_iommu_ops, dev);
 	if (ret)
 		return ret;
 
diff --git a/drivers/iommu/fsl_pamu_domain.c b/drivers/iommu/fsl_pamu_domain.c
index 0ac781186dbd..a47f47307109 100644
--- a/drivers/iommu/fsl_pamu_domain.c
+++ b/drivers/iommu/fsl_pamu_domain.c
@@ -474,9 +474,7 @@ int __init pamu_domain_init(void)
 	if (ret)
 		return ret;
 
-	iommu_device_set_ops(&pamu_iommu, &fsl_pamu_ops);
-
-	ret = iommu_device_register(&pamu_iommu);
+	ret = iommu_device_register(&pamu_iommu, &fsl_pamu_ops, NULL);
 	if (ret) {
 		iommu_device_sysfs_remove(&pamu_iommu);
 		pr_err("Can't register iommu device\n");
diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
index 6971397805f3..1757ac1e1623 100644
--- a/drivers/iommu/intel/dmar.c
+++ b/drivers/iommu/intel/dmar.c
@@ -1140,9 +1140,7 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd)
 		if (err)
 			goto err_unmap;
 
-		iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
-
-		err = iommu_device_register(&iommu->iommu);
+		err = iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
 		if (err)
 			goto err_unmap;
 	}
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 0e04d450c38a..708f430af1c4 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -4396,8 +4396,7 @@ int __init intel_iommu_init(void)
 		iommu_device_sysfs_add(&iommu->iommu, NULL,
 				       intel_iommu_groups,
 				       "%s", iommu->name);
-		iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
-		iommu_device_register(&iommu->iommu);
+		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
 	}
 	up_read(&dmar_global_lock);
 
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index c4ad9c644802..808ab70d5df5 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -142,8 +142,25 @@ static int __init iommu_subsys_init(void)
 }
 subsys_initcall(iommu_subsys_init);
 
-int iommu_device_register(struct iommu_device *iommu)
+/**
+ * iommu_device_register() - Register an IOMMU hardware instance
+ * @iommu: IOMMU handle for the instance
+ * @ops:   IOMMU ops to associate with the instance
+ * @hwdev: (optional) actual instance device, used for fwnode lookup
+ *
+ * Return: 0 on success, or an error.
+ */
+int iommu_device_register(struct iommu_device *iommu,
+			  const struct iommu_ops *ops, struct device *hwdev)
 {
+	/* We need to be able to take module references appropriately */
+	if (WARN_ON(is_module_address((unsigned long)ops) && !ops->owner))
+		return -EINVAL;
+
+	iommu->ops = ops;
+	if (hwdev)
+		iommu->fwnode = hwdev->fwnode;
+
 	spin_lock(&iommu_device_lock);
 	list_add_tail(&iommu->list, &iommu_device_list);
 	spin_unlock(&iommu_device_lock);
diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c
index eaaec0a55cc6..aaa6a4d59057 100644
--- a/drivers/iommu/ipmmu-vmsa.c
+++ b/drivers/iommu/ipmmu-vmsa.c
@@ -1076,11 +1076,7 @@ static int ipmmu_probe(struct platform_device *pdev)
 		if (ret)
 			return ret;
 
-		iommu_device_set_ops(&mmu->iommu, &ipmmu_ops);
-		iommu_device_set_fwnode(&mmu->iommu,
-					&pdev->dev.of_node->fwnode);
-
-		ret = iommu_device_register(&mmu->iommu);
+		ret = iommu_device_register(&mmu->iommu, &ipmmu_ops, &pdev->dev);
 		if (ret)
 			return ret;
 
diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c
index f0ba6a09b434..7880f307cb2d 100644
--- a/drivers/iommu/msm_iommu.c
+++ b/drivers/iommu/msm_iommu.c
@@ -792,10 +792,7 @@ static int msm_iommu_probe(struct platform_device *pdev)
 		goto fail;
 	}
 
-	iommu_device_set_ops(&iommu->iommu, &msm_iommu_ops);
-	iommu_device_set_fwnode(&iommu->iommu, &pdev->dev.of_node->fwnode);
-
-	ret = iommu_device_register(&iommu->iommu);
+	ret = iommu_device_register(&iommu->iommu, &msm_iommu_ops, &pdev->dev);
 	if (ret) {
 		pr_err("Could not register msm-smmu at %pa\n", &ioaddr);
 		goto fail;
diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c
index cdda3a85fc08..e06b8a0e2b56 100644
--- a/drivers/iommu/mtk_iommu.c
+++ b/drivers/iommu/mtk_iommu.c
@@ -892,10 +892,7 @@ static int mtk_iommu_probe(struct platform_device *pdev)
 	if (ret)
 		goto out_link_remove;
 
-	iommu_device_set_ops(&data->iommu, &mtk_iommu_ops);
-	iommu_device_set_fwnode(&data->iommu, &pdev->dev.of_node->fwnode);
-
-	ret = iommu_device_register(&data->iommu);
+	ret = iommu_device_register(&data->iommu, &mtk_iommu_ops, dev);
 	if (ret)
 		goto out_sysfs_remove;
 
diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c
index 8ba9a2ec5509..5915d7b38211 100644
--- a/drivers/iommu/mtk_iommu_v1.c
+++ b/drivers/iommu/mtk_iommu_v1.c
@@ -616,9 +616,7 @@ static int mtk_iommu_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
-	iommu_device_set_ops(&data->iommu, &mtk_iommu_ops);
-
-	ret = iommu_device_register(&data->iommu);
+	ret = iommu_device_register(&data->iommu, &mtk_iommu_ops, dev);
 	if (ret)
 		goto out_sysfs_remove;
 
diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c
index 71f29c0927fc..26e517eb0dd3 100644
--- a/drivers/iommu/omap-iommu.c
+++ b/drivers/iommu/omap-iommu.c
@@ -1235,10 +1235,7 @@ static int omap_iommu_probe(struct platform_device *pdev)
 		if (err)
 			goto out_group;
 
-		iommu_device_set_ops(&obj->iommu, &omap_iommu_ops);
-		iommu_device_set_fwnode(&obj->iommu, &of->fwnode);
-
-		err = iommu_device_register(&obj->iommu);
+		err = iommu_device_register(&obj->iommu, &omap_iommu_ops, &pdev->dev);
 		if (err)
 			goto out_sysfs;
 	}
diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c
index e5d86b7177de..7a2932772fdf 100644
--- a/drivers/iommu/rockchip-iommu.c
+++ b/drivers/iommu/rockchip-iommu.c
@@ -1196,10 +1196,7 @@ static int rk_iommu_probe(struct platform_device *pdev)
 	if (err)
 		goto err_put_group;
 
-	iommu_device_set_ops(&iommu->iommu, &rk_iommu_ops);
-	iommu_device_set_fwnode(&iommu->iommu, &dev->of_node->fwnode);
-
-	err = iommu_device_register(&iommu->iommu);
+	err = iommu_device_register(&iommu->iommu, &rk_iommu_ops, dev);
 	if (err)
 		goto err_remove_sysfs;
 
diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c
index 8895dbb705eb..6019e58ce4fb 100644
--- a/drivers/iommu/s390-iommu.c
+++ b/drivers/iommu/s390-iommu.c
@@ -333,9 +333,7 @@ int zpci_init_iommu(struct zpci_dev *zdev)
 	if (rc)
 		goto out_err;
 
-	iommu_device_set_ops(&zdev->iommu_dev, &s390_iommu_ops);
-
-	rc = iommu_device_register(&zdev->iommu_dev);
+	rc = iommu_device_register(&zdev->iommu_dev, &s390_iommu_ops, NULL);
 	if (rc)
 		goto out_sysfs;
 
diff --git a/drivers/iommu/sprd-iommu.c b/drivers/iommu/sprd-iommu.c
index 89272f9fd8f3..73dfd9946312 100644
--- a/drivers/iommu/sprd-iommu.c
+++ b/drivers/iommu/sprd-iommu.c
@@ -508,10 +508,7 @@ static int sprd_iommu_probe(struct platform_device *pdev)
 	if (ret)
 		goto put_group;
 
-	iommu_device_set_ops(&sdev->iommu, &sprd_iommu_ops);
-	iommu_device_set_fwnode(&sdev->iommu, &dev->of_node->fwnode);
-
-	ret = iommu_device_register(&sdev->iommu);
+	ret = iommu_device_register(&sdev->iommu, &sprd_iommu_ops, dev);
 	if (ret)
 		goto remove_sysfs;
 
diff --git a/drivers/iommu/sun50i-iommu.c b/drivers/iommu/sun50i-iommu.c
index ea6db1341916..181bb1c3437c 100644
--- a/drivers/iommu/sun50i-iommu.c
+++ b/drivers/iommu/sun50i-iommu.c
@@ -968,10 +968,7 @@ static int sun50i_iommu_probe(struct platform_device *pdev)
 	if (ret)
 		goto err_free_group;
 
-	iommu_device_set_ops(&iommu->iommu, &sun50i_iommu_ops);
-	iommu_device_set_fwnode(&iommu->iommu, &pdev->dev.of_node->fwnode);
-
-	ret = iommu_device_register(&iommu->iommu);
+	ret = iommu_device_register(&iommu->iommu, &sun50i_iommu_ops, &pdev->dev);
 	if (ret)
 		goto err_remove_sysfs;
 
diff --git a/drivers/iommu/tegra-gart.c b/drivers/iommu/tegra-gart.c
index 6f130e51f072..6a358f92c7e5 100644
--- a/drivers/iommu/tegra-gart.c
+++ b/drivers/iommu/tegra-gart.c
@@ -353,10 +353,7 @@ struct gart_device *tegra_gart_probe(struct device *dev, struct tegra_mc *mc)
 	if (err)
 		goto free_gart;
 
-	iommu_device_set_ops(&gart->iommu, &gart_iommu_ops);
-	iommu_device_set_fwnode(&gart->iommu, dev->fwnode);
-
-	err = iommu_device_register(&gart->iommu);
+	err = iommu_device_register(&gart->iommu, &gart_iommu_ops, dev);
 	if (err)
 		goto remove_sysfs;
 
diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c
index 602aab98c079..1e98dc63ad13 100644
--- a/drivers/iommu/tegra-smmu.c
+++ b/drivers/iommu/tegra-smmu.c
@@ -1145,10 +1145,7 @@ struct tegra_smmu *tegra_smmu_probe(struct device *dev,
 	if (err)
 		return ERR_PTR(err);
 
-	iommu_device_set_ops(&smmu->iommu, &tegra_smmu_ops);
-	iommu_device_set_fwnode(&smmu->iommu, dev->fwnode);
-
-	err = iommu_device_register(&smmu->iommu);
+	err = iommu_device_register(&smmu->iommu, &tegra_smmu_ops, dev);
 	if (err)
 		goto remove_sysfs;
 
diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
index 594ed827e944..7c02481a81b4 100644
--- a/drivers/iommu/virtio-iommu.c
+++ b/drivers/iommu/virtio-iommu.c
@@ -1066,10 +1066,7 @@ static int viommu_probe(struct virtio_device *vdev)
 	if (ret)
 		goto err_free_vqs;
 
-	iommu_device_set_ops(&viommu->iommu, &viommu_ops);
-	iommu_device_set_fwnode(&viommu->iommu, parent_dev->fwnode);
-
-	iommu_device_register(&viommu->iommu);
+	iommu_device_register(&viommu->iommu, &viommu_ops, parent_dev);
 
 #ifdef CONFIG_PCI
 	if (pci_bus_type.iommu_ops != &viommu_ops) {
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index ce904cf4e774..32d448050bf7 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -350,7 +350,9 @@ struct dev_iommu {
 	void				*priv;
 };
 
-int  iommu_device_register(struct iommu_device *iommu);
+int iommu_device_register(struct iommu_device *iommu,
+			  const struct iommu_ops *ops,
+			  struct device *hwdev);
 void iommu_device_unregister(struct iommu_device *iommu);
 int  iommu_device_sysfs_add(struct iommu_device *iommu,
 			    struct device *parent,
@@ -361,18 +363,6 @@ int  iommu_device_link(struct iommu_device   *iommu, struct device *link);
 void iommu_device_unlink(struct iommu_device *iommu, struct device *link);
 int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain);
 
-static inline void iommu_device_set_ops(struct iommu_device *iommu,
-					  const struct iommu_ops *ops)
-{
-	iommu->ops = ops;
-}
-
-static inline void iommu_device_set_fwnode(struct iommu_device *iommu,
-					   struct fwnode_handle *fwnode)
-{
-	iommu->fwnode = fwnode;
-}
-
 static inline struct iommu_device *dev_to_iommu_device(struct device *dev)
 {
 	return (struct iommu_device *)dev_get_drvdata(dev);
@@ -858,21 +848,13 @@ static inline int iommu_set_pgtable_quirks(struct iommu_domain *domain,
 	return 0;
 }
 
-static inline int  iommu_device_register(struct iommu_device *iommu)
+static inline int iommu_device_register(struct iommu_device *iommu,
+					const struct iommu_ops *ops,
+					struct device *hwdev)
 {
 	return -ENODEV;
 }
 
-static inline void iommu_device_set_ops(struct iommu_device *iommu,
-					const struct iommu_ops *ops)
-{
-}
-
-static inline void iommu_device_set_fwnode(struct iommu_device *iommu,
-					   struct fwnode_handle *fwnode)
-{
-}
-
 static inline struct iommu_device *dev_to_iommu_device(struct device *dev)
 {
 	return NULL;
-- 
cgit v1.2.3


From 7f100744749e4fe547dece3bb6557fae5f0a7252 Mon Sep 17 00:00:00 2001
From: Vidya Sagar <vidyas@nvidia.com>
Date: Fri, 16 Apr 2021 19:15:37 +0530
Subject: PCI: tegra: Add Tegra194 MCFG quirks for ECAM errata

The PCIe controller in Tegra194 SoC is not ECAM-compliant.  With the
current hardware design, ECAM can be enabled only for one controller (the
C5 controller) with bus numbers starting from 160 instead of 0. A different
approach is taken to avoid this abnormal way of enabling ECAM for just one
controller but to enable configuration space access for all the other
controllers. In this approach, ops are added through MCFG quirk mechanism
which access the configuration spaces by dynamically programming iATU
(internal Address Translation Unit) to generate respective configuration
accesses just like the way it is done in DesignWare core sub-system.

This issue is specific to Tegra194 and it would be fixed in the future
generations of Tegra SoCs.

Link: https://lore.kernel.org/r/20210416134537.19474-1-vidyas@nvidia.com
Signed-off-by: Vidya Sagar <vidyas@nvidia.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/acpi/pci_mcfg.c                    |   7 ++
 drivers/pci/controller/dwc/Makefile        |   2 +-
 drivers/pci/controller/dwc/pcie-tegra194.c | 102 +++++++++++++++++++++++++++++
 include/linux/pci-ecam.h                   |   1 +
 4 files changed, 111 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/pci_mcfg.c b/drivers/acpi/pci_mcfg.c
index 95f23acd5b80..53cab975f612 100644
--- a/drivers/acpi/pci_mcfg.c
+++ b/drivers/acpi/pci_mcfg.c
@@ -116,6 +116,13 @@ static struct mcfg_fixup mcfg_quirks[] = {
 	THUNDER_ECAM_QUIRK(2, 12),
 	THUNDER_ECAM_QUIRK(2, 13),
 
+	{ "NVIDIA", "TEGRA194", 1, 0, MCFG_BUS_ANY, &tegra194_pcie_ops},
+	{ "NVIDIA", "TEGRA194", 1, 1, MCFG_BUS_ANY, &tegra194_pcie_ops},
+	{ "NVIDIA", "TEGRA194", 1, 2, MCFG_BUS_ANY, &tegra194_pcie_ops},
+	{ "NVIDIA", "TEGRA194", 1, 3, MCFG_BUS_ANY, &tegra194_pcie_ops},
+	{ "NVIDIA", "TEGRA194", 1, 4, MCFG_BUS_ANY, &tegra194_pcie_ops},
+	{ "NVIDIA", "TEGRA194", 1, 5, MCFG_BUS_ANY, &tegra194_pcie_ops},
+
 #define XGENE_V1_ECAM_MCFG(rev, seg) \
 	{"APM   ", "XGENE   ", rev, seg, MCFG_BUS_ANY, \
 		&xgene_v1_pcie_ecam_ops }
diff --git a/drivers/pci/controller/dwc/Makefile b/drivers/pci/controller/dwc/Makefile
index a751553fa0db..dbb981876556 100644
--- a/drivers/pci/controller/dwc/Makefile
+++ b/drivers/pci/controller/dwc/Makefile
@@ -17,7 +17,6 @@ obj-$(CONFIG_PCIE_INTEL_GW) += pcie-intel-gw.o
 obj-$(CONFIG_PCIE_KIRIN) += pcie-kirin.o
 obj-$(CONFIG_PCIE_HISI_STB) += pcie-histb.o
 obj-$(CONFIG_PCI_MESON) += pci-meson.o
-obj-$(CONFIG_PCIE_TEGRA194) += pcie-tegra194.o
 obj-$(CONFIG_PCIE_UNIPHIER) += pcie-uniphier.o
 obj-$(CONFIG_PCIE_UNIPHIER_EP) += pcie-uniphier-ep.o
 
@@ -34,4 +33,5 @@ obj-$(CONFIG_PCIE_UNIPHIER_EP) += pcie-uniphier-ep.o
 ifdef CONFIG_PCI
 obj-$(CONFIG_ARM64) += pcie-al.o
 obj-$(CONFIG_ARM64) += pcie-hisi.o
+obj-$(CONFIG_ARM64) += pcie-tegra194.o
 endif
diff --git a/drivers/pci/controller/dwc/pcie-tegra194.c b/drivers/pci/controller/dwc/pcie-tegra194.c
index 6fa216e52d14..a3979d358d06 100644
--- a/drivers/pci/controller/dwc/pcie-tegra194.c
+++ b/drivers/pci/controller/dwc/pcie-tegra194.c
@@ -22,6 +22,8 @@
 #include <linux/of_irq.h>
 #include <linux/of_pci.h>
 #include <linux/pci.h>
+#include <linux/pci-acpi.h>
+#include <linux/pci-ecam.h>
 #include <linux/phy/phy.h>
 #include <linux/pinctrl/consumer.h>
 #include <linux/platform_device.h>
@@ -311,6 +313,104 @@ struct tegra_pcie_dw_of_data {
 	enum dw_pcie_device_mode mode;
 };
 
+#if defined(CONFIG_ACPI) && defined(CONFIG_PCI_QUIRKS)
+struct tegra194_pcie_ecam  {
+	void __iomem *config_base;	/* start of cfg->win; config accesses routed via the ATU */
+	void __iomem *iatu_base;	/* cfg->win + 256K; ATU registers are written here */
+	void __iomem *dbi_base;		/* cfg->win + 512K; used for slot 0 on the first bus */
+};
+
+static int tegra194_acpi_init(struct pci_config_window *cfg)
+{
+	struct device *dev = cfg->parent;
+	struct tegra194_pcie_ecam *pcie_ecam;
+
+	pcie_ecam = devm_kzalloc(dev, sizeof(*pcie_ecam), GFP_KERNEL);
+	if (!pcie_ecam)
+		return -ENOMEM;
+	/* All three regions sit at fixed offsets inside the cfg->win window. */
+	pcie_ecam->config_base = cfg->win;
+	pcie_ecam->iatu_base = cfg->win + SZ_256K;
+	pcie_ecam->dbi_base = cfg->win + SZ_512K;
+	cfg->priv = pcie_ecam;	/* read back via cfg->priv in tegra194_map_bus() */
+
+	return 0;
+}
+
+static void atu_reg_write(struct tegra194_pcie_ecam *pcie_ecam, int index,
+			  u32 val, u32 reg)
+{
+	u32 offset = PCIE_GET_ATU_OUTB_UNR_REG_OFFSET(index);	/* derived from @index only */
+
+	writel(val, pcie_ecam->iatu_base + offset + reg);	/* iATU base + offset + @reg */
+}
+
+static void program_outbound_atu(struct tegra194_pcie_ecam *pcie_ecam,
+				 int index, int type, u64 cpu_addr,
+				 u64 pci_addr, u64 size)
+{
+	atu_reg_write(pcie_ecam, index, lower_32_bits(cpu_addr),
+		      PCIE_ATU_LOWER_BASE);	/* CPU-side base, low 32 bits */
+	atu_reg_write(pcie_ecam, index, upper_32_bits(cpu_addr),
+		      PCIE_ATU_UPPER_BASE);	/* CPU-side base, high 32 bits */
+	atu_reg_write(pcie_ecam, index, lower_32_bits(pci_addr),
+		      PCIE_ATU_LOWER_TARGET);	/* PCI-side target, low 32 bits */
+	atu_reg_write(pcie_ecam, index, lower_32_bits(cpu_addr + size - 1),
+		      PCIE_ATU_LIMIT);		/* last CPU address, low 32 bits only */
+	atu_reg_write(pcie_ecam, index, upper_32_bits(pci_addr),
+		      PCIE_ATU_UPPER_TARGET);	/* PCI-side target, high 32 bits */
+	atu_reg_write(pcie_ecam, index, type, PCIE_ATU_CR1);	/* caller-supplied @type */
+	atu_reg_write(pcie_ecam, index, PCIE_ATU_ENABLE, PCIE_ATU_CR2); /* enable written last */
+}
+
+static void __iomem *tegra194_map_bus(struct pci_bus *bus,
+				      unsigned int devfn, int where)
+{
+	struct pci_config_window *cfg = bus->sysdata;
+	struct tegra194_pcie_ecam *pcie_ecam = cfg->priv;
+	u32 busdev;
+	int type;
+
+	if (bus->number < cfg->busr.start || bus->number > cfg->busr.end)
+		return NULL;	/* bus is outside this window's bus range */
+	/* First bus of the range: only slot 0 is accepted, served from DBI. */
+	if (bus->number == cfg->busr.start) {
+		if (PCI_SLOT(devfn) == 0)
+			return pcie_ecam->dbi_base + where;
+		else
+			return NULL;
+	}
+	/* Every other bus is reached by reprogramming outbound ATU index 0. */
+	busdev = PCIE_ATU_BUS(bus->number) | PCIE_ATU_DEV(PCI_SLOT(devfn)) |
+		 PCIE_ATU_FUNC(PCI_FUNC(devfn));
+	/* Directly below the first bus: slot 0 only, CFG0; deeper buses use CFG1. */
+	if (bus->parent->number == cfg->busr.start) {
+		if (PCI_SLOT(devfn) == 0)
+			type = PCIE_ATU_TYPE_CFG0;
+		else
+			return NULL;
+	} else {
+		type = PCIE_ATU_TYPE_CFG1;
+	}
+	/* Aim 256K starting at cfg->res.start towards the encoded bus/dev/fn. */
+	program_outbound_atu(pcie_ecam, 0, type, cfg->res.start, busdev,
+			     SZ_256K);
+
+	return pcie_ecam->config_base + where;
+}
+
+const struct pci_ecam_ops tegra194_pcie_ops = {
+	.init		= tegra194_acpi_init,	/* fills cfg->priv with the region pointers */
+	.pci_ops	= {
+		.map_bus	= tegra194_map_bus,	/* DBI for the first bus, ATU otherwise */
+		.read		= pci_generic_config_read,
+		.write		= pci_generic_config_write,
+	}
+};
+#endif /* defined(CONFIG_ACPI) && defined(CONFIG_PCI_QUIRKS) */
+
+#ifdef CONFIG_PCIE_TEGRA194
+
 static inline struct tegra_pcie_dw *to_tegra_pcie(struct dw_pcie *pci)
 {
 	return container_of(pci, struct tegra_pcie_dw, pci);
@@ -2311,3 +2411,5 @@ MODULE_DEVICE_TABLE(of, tegra_pcie_dw_of_match);
 MODULE_AUTHOR("Vidya Sagar <vidyas@nvidia.com>");
 MODULE_DESCRIPTION("NVIDIA PCIe host controller driver");
 MODULE_LICENSE("GPL v2");
+
+#endif /* CONFIG_PCIE_TEGRA194 */
diff --git a/include/linux/pci-ecam.h b/include/linux/pci-ecam.h
index 65d3d83015c3..fbdadd4d8377 100644
--- a/include/linux/pci-ecam.h
+++ b/include/linux/pci-ecam.h
@@ -85,6 +85,7 @@ extern const struct pci_ecam_ops pci_thunder_ecam_ops; /* Cavium ThunderX 1.x */
 extern const struct pci_ecam_ops xgene_v1_pcie_ecam_ops; /* APM X-Gene PCIe v1 */
 extern const struct pci_ecam_ops xgene_v2_pcie_ecam_ops; /* APM X-Gene PCIe v2.x */
 extern const struct pci_ecam_ops al_pcie_ops;	/* Amazon Annapurna Labs PCIe */
+extern const struct pci_ecam_ops tegra194_pcie_ops; /* Tegra194 PCIe */
 #endif
 
 #if IS_ENABLED(CONFIG_PCI_HOST_COMMON)
-- 
cgit v1.2.3


From 7c8056bb366b1b2dc8e4a3cc0b876e15a8ebca2c Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Wed, 10 Feb 2021 17:33:25 +0900
Subject: perf core: Factor out __perf_sw_event_sched

In some cases, we need to check more than whether the software event
is enabled.  So split the condition check and the actual event
handling.  This is a preparation for the next change.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20210210083327.22726-1-namhyung@kernel.org
---
 include/linux/perf_event.h | 33 ++++++++++++---------------------
 1 file changed, 12 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 7d7280aa4e22..92d51a7beaea 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1178,30 +1178,24 @@ DECLARE_PER_CPU(struct pt_regs, __perf_regs[4]);
  * which is guaranteed by us not actually scheduling inside other swevents
  * because those disable preemption.
  */
-static __always_inline void
-perf_sw_event_sched(u32 event_id, u64 nr, u64 addr)
+static __always_inline void __perf_sw_event_sched(u32 event_id, u64 nr, u64 addr)
 {
-	if (static_key_false(&perf_swevent_enabled[event_id])) {
-		struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]);
+	struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]);
 
-		perf_fetch_caller_regs(regs);
-		___perf_sw_event(event_id, nr, regs, addr);
-	}
+	perf_fetch_caller_regs(regs);
+	___perf_sw_event(event_id, nr, regs, addr);
 }
 
 extern struct static_key_false perf_sched_events;
 
-static __always_inline bool
-perf_sw_migrate_enabled(void)
+static __always_inline bool __perf_sw_enabled(int swevt)
 {
-	if (static_key_false(&perf_swevent_enabled[PERF_COUNT_SW_CPU_MIGRATIONS]))
-		return true;
-	return false;
+	return static_key_false(&perf_swevent_enabled[swevt]);
 }
 
 static inline void perf_event_task_migrate(struct task_struct *task)
 {
-	if (perf_sw_migrate_enabled())
+	if (__perf_sw_enabled(PERF_COUNT_SW_CPU_MIGRATIONS))
 		task->sched_migrated = 1;
 }
 
@@ -1211,11 +1205,9 @@ static inline void perf_event_task_sched_in(struct task_struct *prev,
 	if (static_branch_unlikely(&perf_sched_events))
 		__perf_event_task_sched_in(prev, task);
 
-	if (perf_sw_migrate_enabled() && task->sched_migrated) {
-		struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]);
-
-		perf_fetch_caller_regs(regs);
-		___perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, regs, 0);
+	if (__perf_sw_enabled(PERF_COUNT_SW_CPU_MIGRATIONS) &&
+	    task->sched_migrated) {
+		__perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
 		task->sched_migrated = 0;
 	}
 }
@@ -1223,7 +1215,8 @@ static inline void perf_event_task_sched_in(struct task_struct *prev,
 static inline void perf_event_task_sched_out(struct task_struct *prev,
 					     struct task_struct *next)
 {
-	perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0);
+	if (__perf_sw_enabled(PERF_COUNT_SW_CONTEXT_SWITCHES))
+		__perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0);
 
 	if (static_branch_unlikely(&perf_sched_events))
 		__perf_event_task_sched_out(prev, next);
@@ -1480,8 +1473,6 @@ static inline int perf_event_refresh(struct perf_event *event, int refresh)
 static inline void
 perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)	{ }
 static inline void
-perf_sw_event_sched(u32 event_id, u64 nr, u64 addr)			{ }
-static inline void
 perf_bp_event(struct perf_event *event, void *data)			{ }
 
 static inline int perf_register_guest_info_callbacks
-- 
cgit v1.2.3


From d0d1dd628527c77db2391ce0293c1ed344b2365f Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Wed, 10 Feb 2021 17:33:26 +0900
Subject: perf core: Add PERF_COUNT_SW_CGROUP_SWITCHES event

This patch adds a new software event to count context switches
involving cgroup switches.  So it's counted only if cgroups of
previous and next tasks are different.  Note that it only checks the
cgroups in the perf_event subsystem.  For cgroup v2, it shouldn't
matter anyway.

One can argue that we can do this by using existing sched_switch event
with eBPF.  But some systems might not have eBPF for some reason so
I'd like to add this as a simple way.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20210210083327.22726-2-namhyung@kernel.org
---
 include/linux/perf_event.h      | 7 +++++++
 include/uapi/linux/perf_event.h | 1 +
 2 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 92d51a7beaea..8989b2b7268d 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1218,6 +1218,13 @@ static inline void perf_event_task_sched_out(struct task_struct *prev,
 	if (__perf_sw_enabled(PERF_COUNT_SW_CONTEXT_SWITCHES))
 		__perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0);
 
+#ifdef CONFIG_CGROUP_PERF
+	if (__perf_sw_enabled(PERF_COUNT_SW_CGROUP_SWITCHES) &&
+	    perf_cgroup_from_task(prev, NULL) !=
+	    perf_cgroup_from_task(next, NULL))
+		__perf_sw_event_sched(PERF_COUNT_SW_CGROUP_SWITCHES, 1, 0);
+#endif
+
 	if (static_branch_unlikely(&perf_sched_events))
 		__perf_event_task_sched_out(prev, next);
 }
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 31b00e3b69c9..0b58970bab6b 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -112,6 +112,7 @@ enum perf_sw_ids {
 	PERF_COUNT_SW_EMULATION_FAULTS		= 8,
 	PERF_COUNT_SW_DUMMY			= 9,
 	PERF_COUNT_SW_BPF_OUTPUT		= 10,
+	PERF_COUNT_SW_CGROUP_SWITCHES		= 11,
 
 	PERF_COUNT_SW_MAX,			/* non-ABI */
 };
-- 
cgit v1.2.3


From 1df1fc8c62f7527d953c7f3869930067bf5b3f29 Mon Sep 17 00:00:00 2001
From: Tudor Ambarus <tudor.ambarus@microchip.com>
Date: Sat, 3 Apr 2021 09:09:31 +0300
Subject: mtd: core: Constify buf in mtd_write_user_prot_reg()

The write buffer comes from the user and should be const.
Constify the write buffer in mtd core and across all _write_user_prot_reg()
users. cfi_cmdset_{0001, 0002} and onenand_base will pay the cost of an
explicit cast to discard the const qualifier since the beginning, since
they are using an otp_op_t function prototype that is used for both reads
and writes. mtd_dataflash and SPI NOR will benefit from the const buffer
because they are using different paths for writes and reads.

Signed-off-by: Tudor Ambarus <tudor.ambarus@microchip.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20210403060931.7119-1-tudor.ambarus@microchip.com
---
 drivers/mtd/chips/cfi_cmdset_0001.c     | 7 ++++---
 drivers/mtd/chips/cfi_cmdset_0002.c     | 6 +++---
 drivers/mtd/devices/mtd_dataflash.c     | 2 +-
 drivers/mtd/mtdcore.c                   | 2 +-
 drivers/mtd/nand/onenand/onenand_base.c | 5 +++--
 drivers/mtd/spi-nor/core.h              | 6 ++++--
 drivers/mtd/spi-nor/otp.c               | 9 +++++----
 include/linux/mtd/mtd.h                 | 5 +++--
 8 files changed, 24 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/chips/cfi_cmdset_0001.c b/drivers/mtd/chips/cfi_cmdset_0001.c
index b7f5e7977dcd..54f92d09d9cf 100644
--- a/drivers/mtd/chips/cfi_cmdset_0001.c
+++ b/drivers/mtd/chips/cfi_cmdset_0001.c
@@ -72,7 +72,8 @@ static int cfi_intelext_is_locked(struct mtd_info *mtd, loff_t ofs,
 #ifdef CONFIG_MTD_OTP
 static int cfi_intelext_read_fact_prot_reg (struct mtd_info *, loff_t, size_t, size_t *, u_char *);
 static int cfi_intelext_read_user_prot_reg (struct mtd_info *, loff_t, size_t, size_t *, u_char *);
-static int cfi_intelext_write_user_prot_reg (struct mtd_info *, loff_t, size_t, size_t *, u_char *);
+static int cfi_intelext_write_user_prot_reg(struct mtd_info *, loff_t, size_t,
+					    size_t *, const u_char *);
 static int cfi_intelext_lock_user_prot_reg (struct mtd_info *, loff_t, size_t);
 static int cfi_intelext_get_fact_prot_info(struct mtd_info *, size_t,
 					   size_t *, struct otp_info *);
@@ -2447,10 +2448,10 @@ static int cfi_intelext_read_user_prot_reg(struct mtd_info *mtd, loff_t from,
 
 static int cfi_intelext_write_user_prot_reg(struct mtd_info *mtd, loff_t from,
 					    size_t len, size_t *retlen,
-					     u_char *buf)
+					    const u_char *buf)
 {
 	return cfi_intelext_otp_walk(mtd, from, len, retlen,
-				     buf, do_otp_write, 1);
+				     (u_char *)buf, do_otp_write, 1);
 }
 
 static int cfi_intelext_lock_user_prot_reg(struct mtd_info *mtd,
diff --git a/drivers/mtd/chips/cfi_cmdset_0002.c b/drivers/mtd/chips/cfi_cmdset_0002.c
index c5c9a4c3b027..3097e93787f7 100644
--- a/drivers/mtd/chips/cfi_cmdset_0002.c
+++ b/drivers/mtd/chips/cfi_cmdset_0002.c
@@ -80,7 +80,7 @@ static int cfi_amdstd_read_fact_prot_reg(struct mtd_info *, loff_t, size_t,
 static int cfi_amdstd_read_user_prot_reg(struct mtd_info *, loff_t, size_t,
 					 size_t *, u_char *);
 static int cfi_amdstd_write_user_prot_reg(struct mtd_info *, loff_t, size_t,
-					  size_t *, u_char *);
+					  size_t *, const u_char *);
 static int cfi_amdstd_lock_user_prot_reg(struct mtd_info *, loff_t, size_t);
 
 static int cfi_amdstd_panic_write(struct mtd_info *mtd, loff_t to, size_t len,
@@ -1635,9 +1635,9 @@ static int cfi_amdstd_read_user_prot_reg(struct mtd_info *mtd, loff_t from,
 
 static int cfi_amdstd_write_user_prot_reg(struct mtd_info *mtd, loff_t from,
 					  size_t len, size_t *retlen,
-					  u_char *buf)
+					  const u_char *buf)
 {
-	return cfi_amdstd_otp_walk(mtd, from, len, retlen, buf,
+	return cfi_amdstd_otp_walk(mtd, from, len, retlen, (u_char *)buf,
 				   do_otp_write, 1);
 }
 
diff --git a/drivers/mtd/devices/mtd_dataflash.c b/drivers/mtd/devices/mtd_dataflash.c
index 6d1eefe94106..9802e265fca8 100644
--- a/drivers/mtd/devices/mtd_dataflash.c
+++ b/drivers/mtd/devices/mtd_dataflash.c
@@ -527,7 +527,7 @@ static int dataflash_read_user_otp(struct mtd_info *mtd,
 }
 
 static int dataflash_write_user_otp(struct mtd_info *mtd,
-		loff_t from, size_t len, size_t *retlen, u_char *buf)
+		loff_t from, size_t len, size_t *retlen, const u_char *buf)
 {
 	struct spi_message	m;
 	const size_t		l = 4 + 64;
diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c
index 0bc6871c3863..9aaeadd53eb4 100644
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -1889,7 +1889,7 @@ int mtd_read_user_prot_reg(struct mtd_info *mtd, loff_t from, size_t len,
 EXPORT_SYMBOL_GPL(mtd_read_user_prot_reg);
 
 int mtd_write_user_prot_reg(struct mtd_info *mtd, loff_t to, size_t len,
-			    size_t *retlen, u_char *buf)
+			    size_t *retlen, const u_char *buf)
 {
 	struct mtd_info *master = mtd_get_master(mtd);
 	int ret;
diff --git a/drivers/mtd/nand/onenand/onenand_base.c b/drivers/mtd/nand/onenand/onenand_base.c
index a9fdea26ea46..958bac54b190 100644
--- a/drivers/mtd/nand/onenand/onenand_base.c
+++ b/drivers/mtd/nand/onenand/onenand_base.c
@@ -3167,9 +3167,10 @@ static int onenand_read_user_prot_reg(struct mtd_info *mtd, loff_t from,
  * Write user OTP area.
  */
 static int onenand_write_user_prot_reg(struct mtd_info *mtd, loff_t from,
-			size_t len, size_t *retlen, u_char *buf)
+			size_t len, size_t *retlen, const u_char *buf)
 {
-	return onenand_otp_walk(mtd, from, len, retlen, buf, do_otp_write, MTD_OTP_USER);
+	return onenand_otp_walk(mtd, from, len, retlen, (u_char *)buf,
+				do_otp_write, MTD_OTP_USER);
 }
 
 /**
diff --git a/drivers/mtd/spi-nor/core.h b/drivers/mtd/spi-nor/core.h
index e9b6b2e76cdb..28a2e0be97a3 100644
--- a/drivers/mtd/spi-nor/core.h
+++ b/drivers/mtd/spi-nor/core.h
@@ -211,7 +211,8 @@ struct spi_nor_otp_organization {
  */
 struct spi_nor_otp_ops {
 	int (*read)(struct spi_nor *nor, loff_t addr, size_t len, u8 *buf);
-	int (*write)(struct spi_nor *nor, loff_t addr, size_t len, u8 *buf);
+	int (*write)(struct spi_nor *nor, loff_t addr, size_t len,
+		     const u8 *buf);
 	int (*lock)(struct spi_nor *nor, unsigned int region);
 	int (*is_locked)(struct spi_nor *nor, unsigned int region);
 };
@@ -504,7 +505,8 @@ ssize_t spi_nor_write_data(struct spi_nor *nor, loff_t to, size_t len,
 			   const u8 *buf);
 
 int spi_nor_otp_read_secr(struct spi_nor *nor, loff_t addr, size_t len, u8 *buf);
-int spi_nor_otp_write_secr(struct spi_nor *nor, loff_t addr, size_t len, u8 *buf);
+int spi_nor_otp_write_secr(struct spi_nor *nor, loff_t addr, size_t len,
+			   const u8 *buf);
 int spi_nor_otp_lock_sr2(struct spi_nor *nor, unsigned int region);
 int spi_nor_otp_is_locked_sr2(struct spi_nor *nor, unsigned int region);
 
diff --git a/drivers/mtd/spi-nor/otp.c b/drivers/mtd/spi-nor/otp.c
index 5021d40dffbf..fcf38d260345 100644
--- a/drivers/mtd/spi-nor/otp.c
+++ b/drivers/mtd/spi-nor/otp.c
@@ -70,7 +70,8 @@ int spi_nor_otp_read_secr(struct spi_nor *nor, loff_t addr, size_t len, u8 *buf)
  *
  * Return: number of bytes written successfully, -errno otherwise
  */
-int spi_nor_otp_write_secr(struct spi_nor *nor, loff_t addr, size_t len, u8 *buf)
+int spi_nor_otp_write_secr(struct spi_nor *nor, loff_t addr, size_t len,
+			   const u8 *buf)
 {
 	enum spi_nor_protocol write_proto;
 	struct spi_mem_dirmap_desc *wdesc;
@@ -241,7 +242,7 @@ out:
 
 static int spi_nor_mtd_otp_read_write(struct mtd_info *mtd, loff_t ofs,
 				      size_t total_len, size_t *retlen,
-				      u8 *buf, bool is_write)
+				      const u8 *buf, bool is_write)
 {
 	struct spi_nor *nor = mtd_to_spi_nor(mtd);
 	const struct spi_nor_otp_ops *ops = nor->params->otp.ops;
@@ -285,7 +286,7 @@ static int spi_nor_mtd_otp_read_write(struct mtd_info *mtd, loff_t ofs,
 		if (is_write)
 			ret = ops->write(nor, rstart + rofs, len, buf);
 		else
-			ret = ops->read(nor, rstart + rofs, len, buf);
+			ret = ops->read(nor, rstart + rofs, len, (u8 *)buf);
 		if (ret == 0)
 			ret = -EIO;
 		if (ret < 0)
@@ -310,7 +311,7 @@ static int spi_nor_mtd_otp_read(struct mtd_info *mtd, loff_t from, size_t len,
 }
 
 static int spi_nor_mtd_otp_write(struct mtd_info *mtd, loff_t to, size_t len,
-				 size_t *retlen, u8 *buf)
+				 size_t *retlen, const u8 *buf)
 {
 	return spi_nor_mtd_otp_read_write(mtd, to, len, retlen, buf, true);
 }
diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index 4aac200ca8b5..a89955f3cbc8 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -334,7 +334,8 @@ struct mtd_info {
 	int (*_read_user_prot_reg) (struct mtd_info *mtd, loff_t from,
 				    size_t len, size_t *retlen, u_char *buf);
 	int (*_write_user_prot_reg) (struct mtd_info *mtd, loff_t to,
-				     size_t len, size_t *retlen, u_char *buf);
+				     size_t len, size_t *retlen,
+				     const u_char *buf);
 	int (*_lock_user_prot_reg) (struct mtd_info *mtd, loff_t from,
 				    size_t len);
 	int (*_erase_user_prot_reg) (struct mtd_info *mtd, loff_t from,
@@ -518,7 +519,7 @@ int mtd_get_user_prot_info(struct mtd_info *mtd, size_t len, size_t *retlen,
 int mtd_read_user_prot_reg(struct mtd_info *mtd, loff_t from, size_t len,
 			   size_t *retlen, u_char *buf);
 int mtd_write_user_prot_reg(struct mtd_info *mtd, loff_t to, size_t len,
-			    size_t *retlen, u_char *buf);
+			    size_t *retlen, const u_char *buf);
 int mtd_lock_user_prot_reg(struct mtd_info *mtd, loff_t from, size_t len);
 int mtd_erase_user_prot_reg(struct mtd_info *mtd, loff_t from, size_t len);
 
-- 
cgit v1.2.3


From 36830159acbeb9896d7684b5f52db7b22efa197f Mon Sep 17 00:00:00 2001
From: Moshe Tal <moshet@nvidia.com>
Date: Mon, 15 Feb 2021 16:13:02 +0200
Subject: net/mlx5: Add register layout to support extended link state

Add needed structure layouts and defines for pddr register
(Port Diagnostics Database Register) and the troubleshooting page.

This will be used to get extended link state from the monitor opcode
bits.

Signed-off-by: Moshe Tal <moshet@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/driver.h   |  1 +
 include/linux/mlx5/mlx5_ifc.h | 50 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 2da953ad02ed..4e531c2aab52 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -127,6 +127,7 @@ enum {
 	MLX5_REG_PELC		 = 0x500e,
 	MLX5_REG_PVLC		 = 0x500f,
 	MLX5_REG_PCMR		 = 0x5041,
+	MLX5_REG_PDDR		 = 0x5031,
 	MLX5_REG_PMLP		 = 0x5002,
 	MLX5_REG_PPLM		 = 0x5023,
 	MLX5_REG_PCAM		 = 0x507f,
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 1599deee0456..f2c51d6833c6 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -9956,6 +9956,53 @@ struct mlx5_ifc_mirc_reg_bits {
 	u8         reserved_at_20[0x20];
 };
 
+struct mlx5_ifc_pddr_monitor_opcode_bits {
+	u8         reserved_at_0[0x10];
+	u8         monitor_opcode[0x10];
+};
+
+union mlx5_ifc_pddr_troubleshooting_page_status_opcode_auto_bits {
+	struct mlx5_ifc_pddr_monitor_opcode_bits pddr_monitor_opcode;
+	u8         reserved_at_0[0x20];
+};
+
+enum {
+	/* Monitor opcodes */
+	MLX5_PDDR_REG_TRBLSH_GROUP_OPCODE_MONITOR = 0x0,
+};
+
+struct mlx5_ifc_pddr_troubleshooting_page_bits {
+	u8         reserved_at_0[0x10];
+	u8         group_opcode[0x10];
+
+	union mlx5_ifc_pddr_troubleshooting_page_status_opcode_auto_bits status_opcode;
+
+	u8         reserved_at_40[0x20];
+
+	u8         status_message[59][0x20];
+};
+
+union mlx5_ifc_pddr_reg_page_data_auto_bits {
+	struct mlx5_ifc_pddr_troubleshooting_page_bits pddr_troubleshooting_page;
+	u8         reserved_at_0[0x7c0];
+};
+
+enum {
+	MLX5_PDDR_REG_PAGE_SELECT_TROUBLESHOOTING_INFO_PAGE      = 0x1,
+};
+
+struct mlx5_ifc_pddr_reg_bits {
+	u8         reserved_at_0[0x8];
+	u8         local_port[0x8];
+	u8         pnat[0x2];
+	u8         reserved_at_12[0xe];
+
+	u8         reserved_at_20[0x18];
+	u8         page_select[0x8];
+
+	union mlx5_ifc_pddr_reg_page_data_auto_bits page_data;
+};
+
 union mlx5_ifc_ports_control_registers_document_bits {
 	struct mlx5_ifc_bufferx_reg_bits bufferx_reg;
 	struct mlx5_ifc_eth_2819_cntrs_grp_data_layout_bits eth_2819_cntrs_grp_data_layout;
@@ -9970,6 +10017,9 @@ union mlx5_ifc_ports_control_registers_document_bits {
 	struct mlx5_ifc_pamp_reg_bits pamp_reg;
 	struct mlx5_ifc_paos_reg_bits paos_reg;
 	struct mlx5_ifc_pcap_reg_bits pcap_reg;
+	struct mlx5_ifc_pddr_monitor_opcode_bits pddr_monitor_opcode;
+	struct mlx5_ifc_pddr_reg_bits pddr_reg;
+	struct mlx5_ifc_pddr_troubleshooting_page_bits pddr_troubleshooting_page;
 	struct mlx5_ifc_peir_reg_bits peir_reg;
 	struct mlx5_ifc_pelc_reg_bits pelc_reg;
 	struct mlx5_ifc_pfcc_reg_bits pfcc_reg;
-- 
cgit v1.2.3


From 07ff4aed015c564d03fd518d2fb54e5e6948903c Mon Sep 17 00:00:00 2001
From: Marc Kleine-Budde <mkl@pengutronix.de>
Date: Wed, 3 Mar 2021 11:35:44 +0100
Subject: time/timecounter: Mark 1st argument of timecounter_cyc2time() as
 const

The timecounter is not modified in this function. Mark it as const.

Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20210303103544.994855-1-mkl@pengutronix.de
---
 include/linux/timecounter.h | 2 +-
 kernel/time/timecounter.c   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/timecounter.h b/include/linux/timecounter.h
index 754b74a2167f..c6540ceea143 100644
--- a/include/linux/timecounter.h
+++ b/include/linux/timecounter.h
@@ -124,7 +124,7 @@ extern u64 timecounter_read(struct timecounter *tc);
  * This allows conversion of cycle counter values which were generated
  * in the past.
  */
-extern u64 timecounter_cyc2time(struct timecounter *tc,
+extern u64 timecounter_cyc2time(const struct timecounter *tc,
 				u64 cycle_tstamp);
 
 #endif
diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c
index 85b98e727306..e6285288d765 100644
--- a/kernel/time/timecounter.c
+++ b/kernel/time/timecounter.c
@@ -76,7 +76,7 @@ static u64 cc_cyc2ns_backwards(const struct cyclecounter *cc,
 	return ns;
 }
 
-u64 timecounter_cyc2time(struct timecounter *tc,
+u64 timecounter_cyc2time(const struct timecounter *tc,
 			 u64 cycle_tstamp)
 {
 	u64 delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask;
-- 
cgit v1.2.3


From 9a44c1cc63887627284ae232a9626a9f1cd066fc Mon Sep 17 00:00:00 2001
From: Loic Poulain <loic.poulain@linaro.org>
Date: Fri, 16 Apr 2021 10:36:33 +0200
Subject: net: Add a WWAN subsystem

This change introduces initial support for a WWAN framework. Given the
complexity and heterogeneity of existing WWAN hardware and interfaces,
there is no strict definition of what a WWAN device is and how it should
be represented. It's often a collection of multiple devices that perform
the global WWAN feature (netdev, tty, chardev, etc).

One usual way to expose modem controls and configuration is via high
level protocols such as the well known AT command protocol, MBIM or
QMI. The USB modems started to expose them as character devices, and
user daemons such as ModemManager learnt to use them.

This initial version adds the concept of WWAN port, which is a logical
pipe to a modem control protocol. The protocols are exposed raw to the
user via a character device, allowing straightforward support in existing
tools (ModemManager, ofono...). The WWAN core takes care of the generic
part, including character device management, and relies on port driver
operations to receive/submit protocol data.

Since the different devices exposing protocols for the same WWAN hardware
do not necessarily know about each other (e.g. two different USB
interfaces, PCI/MHI channel devices...) and can be created/removed in
different orders, the WWAN core ensures that all WWAN ports contributing
to the 'whole' WWAN feature are grouped under the same virtual WWAN
device, relying on the provided parent device (e.g. mhi controller,
USB device). It's a 'trick' I copied from Johannes's earlier WWAN
subsystem proposal.

This initial version is purposely minimalist, it's essentially moving
the generic part of the previously proposed mhi_wwan_ctrl driver inside
a common WWAN framework, but the implementation is open and flexible
enough to allow extension for further drivers.

Signed-off-by: Loic Poulain <loic.poulain@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/Kconfig          |   2 +
 drivers/net/Makefile         |   1 +
 drivers/net/wwan/Kconfig     |  23 ++
 drivers/net/wwan/Makefile    |   7 +
 drivers/net/wwan/wwan_core.c | 552 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/wwan.h         | 111 +++++++++
 6 files changed, 696 insertions(+)
 create mode 100644 drivers/net/wwan/Kconfig
 create mode 100644 drivers/net/wwan/Makefile
 create mode 100644 drivers/net/wwan/wwan_core.c
 create mode 100644 include/linux/wwan.h

(limited to 'include/linux')

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 5895905b6aa1..74dc8e249faa 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -502,6 +502,8 @@ source "drivers/net/wan/Kconfig"
 
 source "drivers/net/ieee802154/Kconfig"
 
+source "drivers/net/wwan/Kconfig"
+
 config XEN_NETDEV_FRONTEND
 	tristate "Xen network device frontend driver"
 	depends on XEN
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 040e20b81317..7ffd2d03efaf 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -68,6 +68,7 @@ obj-$(CONFIG_SUNGEM_PHY) += sungem_phy.o
 obj-$(CONFIG_WAN) += wan/
 obj-$(CONFIG_WLAN) += wireless/
 obj-$(CONFIG_IEEE802154) += ieee802154/
+obj-$(CONFIG_WWAN) += wwan/
 
 obj-$(CONFIG_VMXNET3) += vmxnet3/
 obj-$(CONFIG_XEN_NETDEV_FRONTEND) += xen-netfront.o
diff --git a/drivers/net/wwan/Kconfig b/drivers/net/wwan/Kconfig
new file mode 100644
index 000000000000..fc3f3a1c80ee
--- /dev/null
+++ b/drivers/net/wwan/Kconfig
@@ -0,0 +1,23 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Wireless WAN device configuration
+#
+
+menuconfig WWAN
+	bool "Wireless WAN"
+	help
+	  This section contains Wireless WAN configuration for WWAN framework
+	  and drivers.
+
+if WWAN
+
+config WWAN_CORE
+	tristate "WWAN Driver Core"
+	help
+	  Say Y here if you want to use the WWAN driver core. This driver
+	  provides a common framework for WWAN drivers.
+
+	  To compile this driver as a module, choose M here: the module will be
+	  called wwan.
+
+endif # WWAN
diff --git a/drivers/net/wwan/Makefile b/drivers/net/wwan/Makefile
new file mode 100644
index 000000000000..934590b9e47d
--- /dev/null
+++ b/drivers/net/wwan/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for the Linux WWAN device drivers.
+#
+
+obj-$(CONFIG_WWAN_CORE) += wwan.o
+wwan-objs += wwan_core.o
diff --git a/drivers/net/wwan/wwan_core.c b/drivers/net/wwan/wwan_core.c
new file mode 100644
index 000000000000..b618b7937846
--- /dev/null
+++ b/drivers/net/wwan/wwan_core.c
@@ -0,0 +1,552 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2021, Linaro Ltd <loic.poulain@linaro.org> */
+
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/idr.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/poll.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/wwan.h>
+
+#define WWAN_MAX_MINORS 256 /* 256 minors allowed with register_chrdev() */
+
+static DEFINE_MUTEX(wwan_register_lock); /* WWAN device create|remove lock */
+static DEFINE_IDA(minors); /* minors for WWAN port chardevs */
+static DEFINE_IDA(wwan_dev_ids); /* for unique WWAN device IDs */
+static struct class *wwan_class;
+static int wwan_major;
+
+#define to_wwan_dev(d) container_of(d, struct wwan_device, dev)
+#define to_wwan_port(d) container_of(d, struct wwan_port, dev)
+
+/* WWAN port flags */
+#define WWAN_PORT_TX_OFF	BIT(0)
+
+/**
+ * struct wwan_device - The structure that defines a WWAN device
+ *
+ * @id: WWAN device unique ID.
+ * @dev: Underlying device.
+ * @port_id: Current available port ID to pick.
+ */
+struct wwan_device {
+	unsigned int id;
+	struct device dev;
+	atomic_t port_id;
+};
+
+/**
+ * struct wwan_port - The structure that defines a WWAN port
+ * @type: Port type
+ * @start_count: Port start counter
+ * @flags: Store port state and capabilities
+ * @ops: Pointer to WWAN port operations
+ * @ops_lock: Protect port ops
+ * @dev: Underlying device
+ * @rxq: Buffer inbound queue
+ * @waitqueue: The waitqueue for port fops (read/write/poll)
+ */
+struct wwan_port {
+	enum wwan_port_type type;
+	unsigned int start_count;
+	unsigned long flags;
+	const struct wwan_port_ops *ops;
+	struct mutex ops_lock; /* Serialize ops + protect against removal */
+	struct device dev;
+	struct sk_buff_head rxq;
+	wait_queue_head_t waitqueue;
+};
+
+static void wwan_dev_destroy(struct device *dev)
+{
+	struct wwan_device *wwandev = to_wwan_dev(dev);
+
+	ida_free(&wwan_dev_ids, wwandev->id);
+	kfree(wwandev);
+}
+
+static const struct device_type wwan_dev_type = {
+	.name    = "wwan_dev",
+	.release = wwan_dev_destroy,
+};
+
+static int wwan_dev_parent_match(struct device *dev, const void *parent)
+{
+	return (dev->type == &wwan_dev_type && dev->parent == parent);
+}
+
+static struct wwan_device *wwan_dev_get_by_parent(struct device *parent)
+{
+	struct device *dev;
+
+	dev = class_find_device(wwan_class, NULL, parent, wwan_dev_parent_match);
+	if (!dev)
+		return ERR_PTR(-ENODEV);
+
+	return to_wwan_dev(dev);
+}
+
+/* This function allocates and registers a new WWAN device OR if a WWAN device
+ * already exist for the given parent, it gets a reference and return it.
+ * This function is not exported (for now), it is called indirectly via
+ * wwan_create_port().
+ */
+static struct wwan_device *wwan_create_dev(struct device *parent)
+{
+	struct wwan_device *wwandev;
+	int err, id;
+
+	/* The 'find-alloc-register' operation must be protected against
+	 * concurrent execution, a WWAN device is possibly shared between
+	 * multiple callers or concurrently unregistered from wwan_remove_dev().
+	 */
+	mutex_lock(&wwan_register_lock);
+
+	/* If wwandev already exists, return it */
+	wwandev = wwan_dev_get_by_parent(parent);
+	if (!IS_ERR(wwandev))
+		goto done_unlock;
+
+	id = ida_alloc(&wwan_dev_ids, GFP_KERNEL);
+	if (id < 0)
+		goto done_unlock;
+
+	wwandev = kzalloc(sizeof(*wwandev), GFP_KERNEL);
+	if (!wwandev) {
+		ida_free(&wwan_dev_ids, id);
+		goto done_unlock;
+	}
+
+	wwandev->dev.parent = parent;
+	wwandev->dev.class = wwan_class;
+	wwandev->dev.type = &wwan_dev_type;
+	wwandev->id = id;
+	dev_set_name(&wwandev->dev, "wwan%d", wwandev->id);
+
+	err = device_register(&wwandev->dev);
+	if (err) {
+		put_device(&wwandev->dev);
+		wwandev = NULL;
+	}
+
+done_unlock:
+	mutex_unlock(&wwan_register_lock);
+
+	return wwandev;
+}
+
+static int is_wwan_child(struct device *dev, void *data)
+{
+	return dev->class == wwan_class;
+}
+
+static void wwan_remove_dev(struct wwan_device *wwandev)
+{
+	int ret;
+
+	/* Prevent concurrent picking from wwan_create_dev */
+	mutex_lock(&wwan_register_lock);
+
+	/* WWAN device is created and registered (get+add) along with its first
+	 * child port, and subsequent port registrations only grab a reference
+	 * (get). The WWAN device must then be unregistered (del+put) along with
+	 * its latest port, and reference simply dropped (put) otherwise.
+	 */
+	ret = device_for_each_child(&wwandev->dev, NULL, is_wwan_child);
+	if (!ret)
+		device_unregister(&wwandev->dev);
+	else
+		put_device(&wwandev->dev);
+
+	mutex_unlock(&wwan_register_lock);
+}
+
+/* ------- WWAN port management ------- */
+
+static void wwan_port_destroy(struct device *dev)
+{
+	struct wwan_port *port = to_wwan_port(dev);
+
+	ida_free(&minors, MINOR(port->dev.devt));
+	skb_queue_purge(&port->rxq);
+	mutex_destroy(&port->ops_lock);
+	kfree(port);
+}
+
+static const struct device_type wwan_port_dev_type = {
+	.name = "wwan_port",
+	.release = wwan_port_destroy,
+};
+
+static int wwan_port_minor_match(struct device *dev, const void *minor)
+{
+	return (dev->type == &wwan_port_dev_type &&
+		MINOR(dev->devt) == *(unsigned int *)minor);
+}
+
+static struct wwan_port *wwan_port_get_by_minor(unsigned int minor)
+{
+	struct device *dev;
+
+	dev = class_find_device(wwan_class, NULL, &minor, wwan_port_minor_match);
+	if (!dev)
+		return ERR_PTR(-ENODEV);
+
+	return to_wwan_port(dev);
+}
+
+/* Keep aligned with wwan_port_type enum */
+static const char * const wwan_port_type_str[] = {
+	"AT",
+	"MBIM",
+	"QMI",
+	"QCDM",
+	"FIREHOSE"
+};
+
+struct wwan_port *wwan_create_port(struct device *parent,
+				   enum wwan_port_type type,
+				   const struct wwan_port_ops *ops,
+				   void *drvdata)
+{
+	struct wwan_device *wwandev;
+	struct wwan_port *port;
+	int minor, err = -ENOMEM;
+
+	if (type >= WWAN_PORT_MAX || !ops)
+		return ERR_PTR(-EINVAL);
+
+	/* A port is always a child of a WWAN device, retrieve (allocate or
+	 * pick) the WWAN device based on the provided parent device.
+	 */
+	wwandev = wwan_create_dev(parent);
+	if (IS_ERR(wwandev))
+		return ERR_CAST(wwandev);
+
+	/* A port is exposed as character device, get a minor */
+	minor = ida_alloc_range(&minors, 0, WWAN_MAX_MINORS - 1, GFP_KERNEL);
+	if (minor < 0)
+		goto error_wwandev_remove;
+
+	port = kzalloc(sizeof(*port), GFP_KERNEL);
+	if (!port) {
+		ida_free(&minors, minor);
+		goto error_wwandev_remove;
+	}
+
+	port->type = type;
+	port->ops = ops;
+	mutex_init(&port->ops_lock);
+	skb_queue_head_init(&port->rxq);
+	init_waitqueue_head(&port->waitqueue);
+
+	port->dev.parent = &wwandev->dev;
+	port->dev.class = wwan_class;
+	port->dev.type = &wwan_port_dev_type;
+	port->dev.devt = MKDEV(wwan_major, minor);
+	dev_set_drvdata(&port->dev, drvdata);
+
+	/* create unique name based on wwan device id, port index and type */
+	dev_set_name(&port->dev, "wwan%up%u%s", wwandev->id,
+		     atomic_inc_return(&wwandev->port_id),
+		     wwan_port_type_str[port->type]);
+
+	err = device_register(&port->dev);
+	if (err)
+		goto error_put_device;
+
+	return port;
+
+error_put_device:
+	put_device(&port->dev);
+error_wwandev_remove:
+	wwan_remove_dev(wwandev);
+
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(wwan_create_port);
+
+void wwan_remove_port(struct wwan_port *port)
+{
+	struct wwan_device *wwandev = to_wwan_dev(port->dev.parent);
+
+	mutex_lock(&port->ops_lock);
+	if (port->start_count)
+		port->ops->stop(port);
+	port->ops = NULL; /* Prevent any new port operations (e.g. from fops) */
+	mutex_unlock(&port->ops_lock);
+
+	wake_up_interruptible(&port->waitqueue);
+
+	skb_queue_purge(&port->rxq);
+	dev_set_drvdata(&port->dev, NULL);
+	device_unregister(&port->dev);
+
+	/* Release related wwan device */
+	wwan_remove_dev(wwandev);
+}
+EXPORT_SYMBOL_GPL(wwan_remove_port);
+
+void wwan_port_rx(struct wwan_port *port, struct sk_buff *skb)
+{
+	skb_queue_tail(&port->rxq, skb);
+	wake_up_interruptible(&port->waitqueue);
+}
+EXPORT_SYMBOL_GPL(wwan_port_rx);
+
+void wwan_port_txon(struct wwan_port *port)
+{
+	clear_bit(WWAN_PORT_TX_OFF, &port->flags);
+	wake_up_interruptible(&port->waitqueue);
+}
+EXPORT_SYMBOL_GPL(wwan_port_txon);
+
+void wwan_port_txoff(struct wwan_port *port)
+{
+	set_bit(WWAN_PORT_TX_OFF, &port->flags);
+}
+EXPORT_SYMBOL_GPL(wwan_port_txoff);
+
+void *wwan_port_get_drvdata(struct wwan_port *port)
+{
+	return dev_get_drvdata(&port->dev);
+}
+EXPORT_SYMBOL_GPL(wwan_port_get_drvdata);
+
+static int wwan_port_op_start(struct wwan_port *port)
+{
+	int ret = 0;
+
+	mutex_lock(&port->ops_lock);
+	if (!port->ops) { /* Port got unplugged */
+		ret = -ENODEV;
+		goto out_unlock;
+	}
+
+	/* If port is already started, don't start again */
+	if (!port->start_count)
+		ret = port->ops->start(port);
+
+	if (!ret)
+		port->start_count++;
+
+out_unlock:
+	mutex_unlock(&port->ops_lock);
+
+	return ret;
+}
+
+static void wwan_port_op_stop(struct wwan_port *port)
+{
+	mutex_lock(&port->ops_lock);
+	port->start_count--;
+	if (port->ops && !port->start_count)
+		port->ops->stop(port);
+	mutex_unlock(&port->ops_lock);
+}
+
+static int wwan_port_op_tx(struct wwan_port *port, struct sk_buff *skb)
+{
+	int ret;
+
+	mutex_lock(&port->ops_lock);
+	if (!port->ops) { /* Port got unplugged */
+		ret = -ENODEV;
+		goto out_unlock;
+	}
+
+	ret = port->ops->tx(port, skb);
+
+out_unlock:
+	mutex_unlock(&port->ops_lock);
+
+	return ret;
+}
+
+static bool is_read_blocked(struct wwan_port *port)
+{
+	return skb_queue_empty(&port->rxq) && port->ops;
+}
+
+static bool is_write_blocked(struct wwan_port *port)
+{
+	return test_bit(WWAN_PORT_TX_OFF, &port->flags) && port->ops;
+}
+
+static int wwan_wait_rx(struct wwan_port *port, bool nonblock)
+{
+	if (!is_read_blocked(port))
+		return 0;
+
+	if (nonblock)
+		return -EAGAIN;
+
+	if (wait_event_interruptible(port->waitqueue, !is_read_blocked(port)))
+		return -ERESTARTSYS;
+
+	return 0;
+}
+
+static int wwan_wait_tx(struct wwan_port *port, bool nonblock)
+{
+	if (!is_write_blocked(port))
+		return 0;
+
+	if (nonblock)
+		return -EAGAIN;
+
+	if (wait_event_interruptible(port->waitqueue, !is_write_blocked(port)))
+		return -ERESTARTSYS;
+
+	return 0;
+}
+
+static int wwan_port_fops_open(struct inode *inode, struct file *file)
+{
+	struct wwan_port *port;
+	int err = 0;
+
+	port = wwan_port_get_by_minor(iminor(inode));
+	if (IS_ERR(port))
+		return PTR_ERR(port);
+
+	file->private_data = port;
+	stream_open(inode, file);
+
+	err = wwan_port_op_start(port);
+	if (err)
+		put_device(&port->dev);
+
+	return err;
+}
+
+static int wwan_port_fops_release(struct inode *inode, struct file *filp)
+{
+	struct wwan_port *port = filp->private_data;
+
+	wwan_port_op_stop(port);
+	put_device(&port->dev);
+
+	return 0;
+}
+
+static ssize_t wwan_port_fops_read(struct file *filp, char __user *buf,
+				   size_t count, loff_t *ppos)
+{
+	struct wwan_port *port = filp->private_data;
+	struct sk_buff *skb;
+	size_t copied;
+	int ret;
+
+	ret = wwan_wait_rx(port, !!(filp->f_flags & O_NONBLOCK));
+	if (ret)
+		return ret;
+
+	skb = skb_dequeue(&port->rxq);
+	if (!skb)
+		return -EIO;
+
+	copied = min_t(size_t, count, skb->len);
+	if (copy_to_user(buf, skb->data, copied)) {
+		kfree_skb(skb);
+		return -EFAULT;
+	}
+	skb_pull(skb, copied);
+
+	/* skb is not fully consumed, keep it in the queue */
+	if (skb->len)
+		skb_queue_head(&port->rxq, skb);
+	else
+		consume_skb(skb);
+
+	return copied;
+}
+
+static ssize_t wwan_port_fops_write(struct file *filp, const char __user *buf,
+				    size_t count, loff_t *offp)
+{
+	struct wwan_port *port = filp->private_data;
+	struct sk_buff *skb;
+	int ret;
+
+	ret = wwan_wait_tx(port, !!(filp->f_flags & O_NONBLOCK));
+	if (ret)
+		return ret;
+
+	skb = alloc_skb(count, GFP_KERNEL);
+	if (!skb)
+		return -ENOMEM;
+
+	if (copy_from_user(skb_put(skb, count), buf, count)) {
+		kfree_skb(skb);
+		return -EFAULT;
+	}
+
+	ret = wwan_port_op_tx(port, skb);
+	if (ret) {
+		kfree_skb(skb);
+		return ret;
+	}
+
+	return count;
+}
+
+static __poll_t wwan_port_fops_poll(struct file *filp, poll_table *wait)
+{
+	struct wwan_port *port = filp->private_data;
+	__poll_t mask = 0;
+
+	poll_wait(filp, &port->waitqueue, wait);
+
+	if (!is_write_blocked(port))
+		mask |= EPOLLOUT | EPOLLWRNORM;
+	if (!is_read_blocked(port))
+		mask |= EPOLLIN | EPOLLRDNORM;
+
+	return mask;
+}
+
+static const struct file_operations wwan_port_fops = {
+	.owner = THIS_MODULE,
+	.open = wwan_port_fops_open,
+	.release = wwan_port_fops_release,
+	.read = wwan_port_fops_read,
+	.write = wwan_port_fops_write,
+	.poll = wwan_port_fops_poll,
+	.llseek = noop_llseek,
+};
+
+static int __init wwan_init(void)
+{
+	wwan_class = class_create(THIS_MODULE, "wwan");
+	if (IS_ERR(wwan_class))
+		return PTR_ERR(wwan_class);
+
+	/* chrdev used for wwan ports */
+	wwan_major = register_chrdev(0, "wwan_port", &wwan_port_fops);
+	if (wwan_major < 0) {
+		class_destroy(wwan_class);
+		return wwan_major;
+	}
+
+	return 0;
+}
+
+static void __exit wwan_exit(void)
+{
+	unregister_chrdev(wwan_major, "wwan_port");
+	class_destroy(wwan_class);
+}
+
+module_init(wwan_init);
+module_exit(wwan_exit);
+
+MODULE_AUTHOR("Loic Poulain <loic.poulain@linaro.org>");
+MODULE_DESCRIPTION("WWAN core");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/wwan.h b/include/linux/wwan.h
new file mode 100644
index 000000000000..aa05a253dcf9
--- /dev/null
+++ b/include/linux/wwan.h
@@ -0,0 +1,111 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (c) 2021, Linaro Ltd <loic.poulain@linaro.org> */
+
+#ifndef __WWAN_H
+#define __WWAN_H
+
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+
+/**
+ * enum wwan_port_type - WWAN port types
+ * @WWAN_PORT_AT: AT commands
+ * @WWAN_PORT_MBIM: Mobile Broadband Interface Model control
+ * @WWAN_PORT_QMI: Qcom modem/MSM interface for modem control
+ * @WWAN_PORT_QCDM: Qcom Modem diagnostic interface
+ * @WWAN_PORT_FIREHOSE: XML based command protocol
+ * @WWAN_PORT_MAX: Number of supported port types
+ */
+enum wwan_port_type {
+	WWAN_PORT_AT,
+	WWAN_PORT_MBIM,
+	WWAN_PORT_QMI,
+	WWAN_PORT_QCDM,
+	WWAN_PORT_FIREHOSE,
+	WWAN_PORT_MAX,
+};
+
+struct wwan_port;
+
+/** struct wwan_port_ops - The WWAN port operations
+ * @start: The routine for starting the WWAN port device.
+ * @stop: The routine for stopping the WWAN port device.
+ * @tx: The routine that sends WWAN port protocol data to the device.
+ *
+ * The wwan_port_ops structure contains a list of low-level operations
+ * that control a WWAN port device. All functions are mandatory.
+ */
+struct wwan_port_ops {
+	int (*start)(struct wwan_port *port);
+	void (*stop)(struct wwan_port *port);
+	int (*tx)(struct wwan_port *port, struct sk_buff *skb);
+};
+
+/**
+ * wwan_create_port - Add a new WWAN port
+ * @parent: Device to use as parent and shared by all WWAN ports
+ * @type: WWAN port type
+ * @ops: WWAN port operations
+ * @drvdata: Pointer to caller driver data
+ *
+ * Allocate and register a new WWAN port. The port will be automatically exposed
+ * to user as a character device and attached to the right virtual WWAN device,
+ * based on the parent pointer. The parent pointer is the device shared by all
+ * components of the same WWAN modem (e.g. USB dev, PCI dev, MHI controller...).
+ *
+ * drvdata will be placed in the WWAN port device driver data and can be
+ * retrieved with wwan_port_get_drvdata().
+ *
+ * This function must be balanced with a call to wwan_remove_port().
+ *
+ * Returns a valid pointer to wwan_port on success or ERR_PTR() on failure
+ */
+struct wwan_port *wwan_create_port(struct device *parent,
+				   enum wwan_port_type type,
+				   const struct wwan_port_ops *ops,
+				   void *drvdata);
+
+/**
+ * wwan_remove_port - Remove a WWAN port
+ * @port: WWAN port to remove
+ *
+ * Remove a previously created port.
+ */
+void wwan_remove_port(struct wwan_port *port);
+
+/**
+ * wwan_port_rx - Receive data from the WWAN port
+ * @port: WWAN port for which data is received
+ * @skb: Pointer to the rx buffer
+ *
+ * A port driver calls this function upon data reception (MBIM, AT...).
+ */
+void wwan_port_rx(struct wwan_port *port, struct sk_buff *skb);
+
+/**
+ * wwan_port_txoff - Stop TX on WWAN port
+ * @port: WWAN port for which TX must be stopped
+ *
+ * Used for TX flow control, a port driver calls this function to indicate TX
+ * is temporarily unavailable (e.g. due to ring buffer fullness).
+ */
+void wwan_port_txoff(struct wwan_port *port);
+
+
+/**
+ * wwan_port_txon - Restart TX on WWAN port
+ * @port: WWAN port for which TX must be restarted
+ *
+ * Used for TX flow control, a port driver calls this function to indicate TX
+ * is available again.
+ */
+void wwan_port_txon(struct wwan_port *port);
+
+/**
+ * wwan_port_get_drvdata - Retrieve driver data from a WWAN port
+ * @port: Related WWAN port
+ */
+void *wwan_port_get_drvdata(struct wwan_port *port);
+
+#endif /* __WWAN_H */
-- 
cgit v1.2.3


From 02c587733c8161355a43e6e110c2e29bd0acff72 Mon Sep 17 00:00:00 2001
From: Walter Wu <walter-zh.wu@mediatek.com>
Date: Fri, 16 Apr 2021 15:46:00 -0700
Subject: kasan: remove redundant config option

CONFIG_KASAN_STACK and CONFIG_KASAN_STACK_ENABLE both enable KASAN stack
instrumentation, but we should only need one config, so remove
CONFIG_KASAN_STACK_ENABLE and make CONFIG_KASAN_STACK workable (see [1]).

For the KASAN stack instrumentation option, gcc gets no prompt and
defaults to y, while clang gets a prompt and defaults to n.

This patch fixes the following compilation warning:

  include/linux/kasan.h:333:30: warning: 'CONFIG_KASAN_STACK' is not defined, evaluates to 0 [-Wundef]

[akpm@linux-foundation.org: fix merge snafu]

Link: https://bugzilla.kernel.org/show_bug.cgi?id=210221 [1]
Link: https://lkml.kernel.org/r/20210226012531.29231-1-walter-zh.wu@mediatek.com
Fixes: d9b571c885a8 ("kasan: fix KASAN_STACK dependency for HW_TAGS")
Signed-off-by: Walter Wu <walter-zh.wu@mediatek.com>
Suggested-by: Dmitry Vyukov <dvyukov@google.com>
Reviewed-by: Nathan Chancellor <natechancellor@gmail.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Andrey Konovalov <andreyknvl@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm64/kernel/sleep.S        |  2 +-
 arch/x86/kernel/acpi/wakeup_64.S |  2 +-
 include/linux/kasan.h            |  2 +-
 lib/Kconfig.kasan                |  9 ++-------
 mm/kasan/common.c                |  2 +-
 mm/kasan/kasan.h                 |  2 +-
 mm/kasan/report_generic.c        |  2 +-
 scripts/Makefile.kasan           | 10 ++++++++--
 security/Kconfig.hardening       |  4 ++--
 9 files changed, 18 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S
index 5bfd9b87f85d..4ea9392f86e0 100644
--- a/arch/arm64/kernel/sleep.S
+++ b/arch/arm64/kernel/sleep.S
@@ -134,7 +134,7 @@ SYM_FUNC_START(_cpu_resume)
 	 */
 	bl	cpu_do_resume
 
-#if defined(CONFIG_KASAN) && CONFIG_KASAN_STACK
+#if defined(CONFIG_KASAN) && defined(CONFIG_KASAN_STACK)
 	mov	x0, sp
 	bl	kasan_unpoison_task_stack_below
 #endif
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index 56b6865afb2a..d5d8a352eafa 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -115,7 +115,7 @@ SYM_FUNC_START(do_suspend_lowlevel)
 	movq	pt_regs_r14(%rax), %r14
 	movq	pt_regs_r15(%rax), %r15
 
-#if defined(CONFIG_KASAN) && CONFIG_KASAN_STACK
+#if defined(CONFIG_KASAN) && defined(CONFIG_KASAN_STACK)
 	/*
 	 * The suspend path may have poisoned some areas deeper in the stack,
 	 * which we now need to unpoison.
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index b91732bd05d7..14f72ec96492 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -330,7 +330,7 @@ static inline bool kasan_check_byte(const void *address)
 
 #endif /* CONFIG_KASAN */
 
-#if defined(CONFIG_KASAN) && CONFIG_KASAN_STACK
+#if defined(CONFIG_KASAN) && defined(CONFIG_KASAN_STACK)
 void kasan_unpoison_task_stack(struct task_struct *task);
 #else
 static inline void kasan_unpoison_task_stack(struct task_struct *task) {}
diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan
index fba9909e31b7..cffc2ebbf185 100644
--- a/lib/Kconfig.kasan
+++ b/lib/Kconfig.kasan
@@ -138,9 +138,10 @@ config KASAN_INLINE
 
 endchoice
 
-config KASAN_STACK_ENABLE
+config KASAN_STACK
 	bool "Enable stack instrumentation (unsafe)" if CC_IS_CLANG && !COMPILE_TEST
 	depends on KASAN_GENERIC || KASAN_SW_TAGS
+	default y if CC_IS_GCC
 	help
 	  The LLVM stack address sanitizer has a know problem that
 	  causes excessive stack usage in a lot of functions, see
@@ -154,12 +155,6 @@ config KASAN_STACK_ENABLE
 	  CONFIG_COMPILE_TEST.	On gcc it is assumed to always be safe
 	  to use and enabled by default.
 
-config KASAN_STACK
-	int
-	depends on KASAN_GENERIC || KASAN_SW_TAGS
-	default 1 if KASAN_STACK_ENABLE || CC_IS_GCC
-	default 0
-
 config KASAN_SW_TAGS_IDENTIFY
 	bool "Enable memory corruption identification"
 	depends on KASAN_SW_TAGS
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index b5e08d4cefec..7b53291dafa1 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -63,7 +63,7 @@ void __kasan_unpoison_range(const void *address, size_t size)
 	kasan_unpoison(address, size);
 }
 
-#if CONFIG_KASAN_STACK
+#ifdef CONFIG_KASAN_STACK
 /* Unpoison the entire stack for a task. */
 void kasan_unpoison_task_stack(struct task_struct *task)
 {
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 8c55634d6edd..3436c6bf7c0c 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -231,7 +231,7 @@ void *kasan_find_first_bad_addr(void *addr, size_t size);
 const char *kasan_get_bug_type(struct kasan_access_info *info);
 void kasan_metadata_fetch_row(char *buffer, void *row);
 
-#if defined(CONFIG_KASAN_GENERIC) && CONFIG_KASAN_STACK
+#if defined(CONFIG_KASAN_GENERIC) && defined(CONFIG_KASAN_STACK)
 void kasan_print_address_stack_frame(const void *addr);
 #else
 static inline void kasan_print_address_stack_frame(const void *addr) { }
diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c
index 41f374585144..de732bc341c5 100644
--- a/mm/kasan/report_generic.c
+++ b/mm/kasan/report_generic.c
@@ -128,7 +128,7 @@ void kasan_metadata_fetch_row(char *buffer, void *row)
 	memcpy(buffer, kasan_mem_to_shadow(row), META_BYTES_PER_ROW);
 }
 
-#if CONFIG_KASAN_STACK
+#ifdef CONFIG_KASAN_STACK
 static bool __must_check tokenize_frame_descr(const char **frame_descr,
 					      char *token, size_t max_tok_len,
 					      unsigned long *value)
diff --git a/scripts/Makefile.kasan b/scripts/Makefile.kasan
index 127012f45166..3d791908ed36 100644
--- a/scripts/Makefile.kasan
+++ b/scripts/Makefile.kasan
@@ -4,6 +4,12 @@ KASAN_SHADOW_OFFSET ?= $(CONFIG_KASAN_SHADOW_OFFSET)
 
 cc-param = $(call cc-option, -mllvm -$(1), $(call cc-option, --param $(1)))
 
+ifdef CONFIG_KASAN_STACK
+	stack_enable := 1
+else
+	stack_enable := 0
+endif
+
 ifdef CONFIG_KASAN_GENERIC
 
 ifdef CONFIG_KASAN_INLINE
@@ -27,7 +33,7 @@ else
 	CFLAGS_KASAN := $(CFLAGS_KASAN_SHADOW) \
 	 $(call cc-param,asan-globals=1) \
 	 $(call cc-param,asan-instrumentation-with-call-threshold=$(call_threshold)) \
-	 $(call cc-param,asan-stack=$(CONFIG_KASAN_STACK)) \
+	 $(call cc-param,asan-stack=$(stack_enable)) \
 	 $(call cc-param,asan-instrument-allocas=1)
 endif
 
@@ -42,7 +48,7 @@ else
 endif
 
 CFLAGS_KASAN := -fsanitize=kernel-hwaddress \
-		$(call cc-param,hwasan-instrument-stack=$(CONFIG_KASAN_STACK)) \
+		$(call cc-param,hwasan-instrument-stack=$(stack_enable)) \
 		$(call cc-param,hwasan-use-short-granules=0) \
 		$(instrumentation_flags)
 
diff --git a/security/Kconfig.hardening b/security/Kconfig.hardening
index 269967c4fc1b..a56c36470cb1 100644
--- a/security/Kconfig.hardening
+++ b/security/Kconfig.hardening
@@ -64,7 +64,7 @@ choice
 	config GCC_PLUGIN_STRUCTLEAK_BYREF
 		bool "zero-init structs passed by reference (strong)"
 		depends on GCC_PLUGINS
-		depends on !(KASAN && KASAN_STACK=1)
+		depends on !(KASAN && KASAN_STACK)
 		select GCC_PLUGIN_STRUCTLEAK
 		help
 		  Zero-initialize any structures on the stack that may
@@ -82,7 +82,7 @@ choice
 	config GCC_PLUGIN_STRUCTLEAK_BYREF_ALL
 		bool "zero-init anything passed by reference (very strong)"
 		depends on GCC_PLUGINS
-		depends on !(KASAN && KASAN_STACK=1)
+		depends on !(KASAN && KASAN_STACK)
 		select GCC_PLUGIN_STRUCTLEAK
 		help
 		  Zero-initialize any stack variables that may be passed
-- 
cgit v1.2.3


From f09ea6fb12723d6726293d68de00b6307368bd76 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Fri, 16 Apr 2021 12:27:39 -0700
Subject: ethtool: add a new command for reading standard stats

Add an interface for reading standard stats, including
stats which don't have a corresponding control interface.

Start with IEEE 802.3 PHY stats. There seems to be only
one stat to expose there.

Define API to not require user space changes when new
stats or groups are added. Groups are based on bitset,
stats have a string set associated.

v1: wrap stats in a nest

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h              |  10 ++
 include/uapi/linux/ethtool.h         |   4 +
 include/uapi/linux/ethtool_netlink.h |  47 ++++++++
 net/ethtool/Makefile                 |   2 +-
 net/ethtool/netlink.c                |  10 ++
 net/ethtool/netlink.h                |   5 +
 net/ethtool/stats.c                  | 200 +++++++++++++++++++++++++++++++++++
 net/ethtool/strset.c                 |  10 ++
 8 files changed, 287 insertions(+), 1 deletion(-)
 create mode 100644 net/ethtool/stats.c

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 112a85b57f1f..2d5455eedbf4 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -250,6 +250,13 @@ static inline void ethtool_stats_init(u64 *stats, unsigned int n)
 		stats[n] = ETHTOOL_STAT_NOT_SET;
 }
 
+/* Basic IEEE 802.3 PHY statistics (30.3.2.1.*), not otherwise exposed
+ * via a more targeted API.
+ */
+struct ethtool_eth_phy_stats {
+	u64 SymbolErrorDuringCarrier;
+};
+
 /**
  * struct ethtool_pause_stats - statistics for IEEE 802.3x pause frames
  * @tx_pause_frames: transmitted pause frame count. Reported to user space
@@ -487,6 +494,7 @@ struct ethtool_module_eeprom {
  * @get_module_eeprom_by_page: Get a region of plug-in module EEPROM data from
  *	specified page. Returns a negative error code or the amount of bytes
  *	read.
+ * @get_eth_phy_stats: Query some of the IEEE 802.3 PHY statistics.
  *
  * All operations are optional (i.e. the function pointer may be set
  * to %NULL) and callers must take this into account.  Callers must
@@ -597,6 +605,8 @@ struct ethtool_ops {
 	int	(*get_module_eeprom_by_page)(struct net_device *dev,
 					     const struct ethtool_module_eeprom *page,
 					     struct netlink_ext_ack *extack);
+	void	(*get_eth_phy_stats)(struct net_device *dev,
+				     struct ethtool_eth_phy_stats *phy_stats);
 };
 
 int ethtool_check_ops(const struct ethtool_ops *ops);
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index f91e079e3108..190ae6e03918 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -669,6 +669,8 @@ enum ethtool_link_ext_substate_cable_issue {
  * @ETH_SS_TS_TX_TYPES: timestamping Tx types
  * @ETH_SS_TS_RX_FILTERS: timestamping Rx filters
  * @ETH_SS_UDP_TUNNEL_TYPES: UDP tunnel types
+ * @ETH_SS_STATS_STD: standardized stats
+ * @ETH_SS_STATS_ETH_PHY: names of IEEE 802.3 PHY statistics
  *
  * @ETH_SS_COUNT: number of defined string sets
  */
@@ -689,6 +691,8 @@ enum ethtool_stringset {
 	ETH_SS_TS_TX_TYPES,
 	ETH_SS_TS_RX_FILTERS,
 	ETH_SS_UDP_TUNNEL_TYPES,
+	ETH_SS_STATS_STD,
+	ETH_SS_STATS_ETH_PHY,
 
 	/* add new constants above here */
 	ETH_SS_COUNT
diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h
index 3a2b31ccbc5b..a54cfe625f34 100644
--- a/include/uapi/linux/ethtool_netlink.h
+++ b/include/uapi/linux/ethtool_netlink.h
@@ -45,6 +45,7 @@ enum {
 	ETHTOOL_MSG_FEC_GET,
 	ETHTOOL_MSG_FEC_SET,
 	ETHTOOL_MSG_MODULE_EEPROM_GET,
+	ETHTOOL_MSG_STATS_GET,
 
 	/* add new constants above here */
 	__ETHTOOL_MSG_USER_CNT,
@@ -86,6 +87,7 @@ enum {
 	ETHTOOL_MSG_FEC_GET_REPLY,
 	ETHTOOL_MSG_FEC_NTF,
 	ETHTOOL_MSG_MODULE_EEPROM_GET_REPLY,
+	ETHTOOL_MSG_STATS_GET_REPLY,
 
 	/* add new constants above here */
 	__ETHTOOL_MSG_KERNEL_CNT,
@@ -679,6 +681,51 @@ enum {
 	ETHTOOL_A_MODULE_EEPROM_MAX = (__ETHTOOL_A_MODULE_EEPROM_CNT - 1)
 };
 
+/* STATS */
+
+enum {
+	ETHTOOL_A_STATS_UNSPEC,
+	ETHTOOL_A_STATS_PAD,
+	ETHTOOL_A_STATS_HEADER,			/* nest - _A_HEADER_* */
+	ETHTOOL_A_STATS_GROUPS,			/* bitset */
+
+	ETHTOOL_A_STATS_GRP,			/* nest - _A_STATS_GRP_* */
+
+	/* add new constants above here */
+	__ETHTOOL_A_STATS_CNT,
+	ETHTOOL_A_STATS_MAX = (__ETHTOOL_A_STATS_CNT - 1)
+};
+
+enum {
+	ETHTOOL_STATS_ETH_PHY,
+
+	/* add new constants above here */
+	__ETHTOOL_STATS_CNT
+};
+
+enum {
+	ETHTOOL_A_STATS_GRP_UNSPEC,
+	ETHTOOL_A_STATS_GRP_PAD,
+
+	ETHTOOL_A_STATS_GRP_ID,			/* u32 */
+	ETHTOOL_A_STATS_GRP_SS_ID,		/* u32 */
+
+	ETHTOOL_A_STATS_GRP_STAT,		/* nest */
+
+	/* add new constants above here */
+	__ETHTOOL_A_STATS_GRP_CNT,
+	ETHTOOL_A_STATS_GRP_MAX = (__ETHTOOL_A_STATS_GRP_CNT - 1)
+};
+
+enum {
+	/* 30.3.2.1.5 aSymbolErrorDuringCarrier */
+	ETHTOOL_A_STATS_ETH_PHY_5_SYM_ERR,
+
+	/* add new constants above here */
+	__ETHTOOL_A_STATS_ETH_PHY_CNT,
+	ETHTOOL_A_STATS_ETH_PHY_MAX = (__ETHTOOL_A_STATS_ETH_PHY_CNT - 1)
+};
+
 /* generic netlink info */
 #define ETHTOOL_GENL_NAME "ethtool"
 #define ETHTOOL_GENL_VERSION 1
diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile
index 83842685fd8c..723c9a8a8cdf 100644
--- a/net/ethtool/Makefile
+++ b/net/ethtool/Makefile
@@ -7,4 +7,4 @@ obj-$(CONFIG_ETHTOOL_NETLINK)	+= ethtool_nl.o
 ethtool_nl-y	:= netlink.o bitset.o strset.o linkinfo.o linkmodes.o \
 		   linkstate.o debug.o wol.o features.o privflags.o rings.o \
 		   channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \
-		   tunnels.o fec.o eeprom.o
+		   tunnels.o fec.o eeprom.o stats.o
diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c
index 5f5d7c4b3d4a..290012d0d11d 100644
--- a/net/ethtool/netlink.c
+++ b/net/ethtool/netlink.c
@@ -247,6 +247,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = {
 	[ETHTOOL_MSG_FEC_GET]		= &ethnl_fec_request_ops,
 	[ETHTOOL_MSG_TSINFO_GET]	= &ethnl_tsinfo_request_ops,
 	[ETHTOOL_MSG_MODULE_EEPROM_GET]	= &ethnl_module_eeprom_request_ops,
+	[ETHTOOL_MSG_STATS_GET]		= &ethnl_stats_request_ops,
 };
 
 static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb)
@@ -942,6 +943,15 @@ static const struct genl_ops ethtool_genl_ops[] = {
 		.policy = ethnl_module_eeprom_get_policy,
 		.maxattr = ARRAY_SIZE(ethnl_module_eeprom_get_policy) - 1,
 	},
+	{
+		.cmd	= ETHTOOL_MSG_STATS_GET,
+		.doit	= ethnl_default_doit,
+		.start	= ethnl_default_start,
+		.dumpit	= ethnl_default_dumpit,
+		.done	= ethnl_default_done,
+		.policy = ethnl_stats_get_policy,
+		.maxattr = ARRAY_SIZE(ethnl_stats_get_policy) - 1,
+	},
 };
 
 static const struct genl_multicast_group ethtool_nl_mcgrps[] = {
diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h
index 4305ac971bb0..9d88983b6597 100644
--- a/net/ethtool/netlink.h
+++ b/net/ethtool/netlink.h
@@ -346,6 +346,7 @@ extern const struct ethnl_request_ops ethnl_eee_request_ops;
 extern const struct ethnl_request_ops ethnl_tsinfo_request_ops;
 extern const struct ethnl_request_ops ethnl_fec_request_ops;
 extern const struct ethnl_request_ops ethnl_module_eeprom_request_ops;
+extern const struct ethnl_request_ops ethnl_stats_request_ops;
 
 extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1];
 extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1];
@@ -380,6 +381,7 @@ extern const struct nla_policy ethnl_tunnel_info_get_policy[ETHTOOL_A_TUNNEL_INF
 extern const struct nla_policy ethnl_fec_get_policy[ETHTOOL_A_FEC_HEADER + 1];
 extern const struct nla_policy ethnl_fec_set_policy[ETHTOOL_A_FEC_AUTO + 1];
 extern const struct nla_policy ethnl_module_eeprom_get_policy[ETHTOOL_A_MODULE_EEPROM_DATA + 1];
+extern const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_GROUPS + 1];
 
 int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info);
 int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info);
@@ -399,4 +401,7 @@ int ethnl_tunnel_info_start(struct netlink_callback *cb);
 int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
 int ethnl_set_fec(struct sk_buff *skb, struct genl_info *info);
 
+extern const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN];
+extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN];
+
 #endif /* _NET_ETHTOOL_NETLINK_H */
diff --git a/net/ethtool/stats.c b/net/ethtool/stats.c
new file mode 100644
index 000000000000..fd8f47178c06
--- /dev/null
+++ b/net/ethtool/stats.c
@@ -0,0 +1,200 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "netlink.h"
+#include "common.h"
+#include "bitset.h"
+
+struct stats_req_info {
+	struct ethnl_req_info		base;
+	DECLARE_BITMAP(stat_mask, __ETHTOOL_STATS_CNT);
+};
+
+#define STATS_REQINFO(__req_base) \
+	container_of(__req_base, struct stats_req_info, base)
+
+struct stats_reply_data {
+	struct ethnl_reply_data		base;
+	struct ethtool_eth_phy_stats	phy_stats;
+};
+
+#define STATS_REPDATA(__reply_base) \
+	container_of(__reply_base, struct stats_reply_data, base)
+
+const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN] = {
+	[ETHTOOL_STATS_ETH_PHY]			= "eth-phy",
+};
+
+const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN] = {
+	[ETHTOOL_A_STATS_ETH_PHY_5_SYM_ERR]	= "SymbolErrorDuringCarrier",
+};
+
+const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_GROUPS + 1] = {
+	[ETHTOOL_A_STATS_HEADER]	=
+		NLA_POLICY_NESTED(ethnl_header_policy),
+	[ETHTOOL_A_STATS_GROUPS]	= { .type = NLA_NESTED },
+};
+
+static int stats_parse_request(struct ethnl_req_info *req_base,
+			       struct nlattr **tb,
+			       struct netlink_ext_ack *extack)
+{
+	struct stats_req_info *req_info = STATS_REQINFO(req_base);
+	bool mod = false;
+	int err;
+
+	err = ethnl_update_bitset(req_info->stat_mask, __ETHTOOL_STATS_CNT,
+				  tb[ETHTOOL_A_STATS_GROUPS], stats_std_names,
+				  extack, &mod);
+	if (err)
+		return err;
+
+	if (!mod) {
+		NL_SET_ERR_MSG(extack, "no stats requested");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int stats_prepare_data(const struct ethnl_req_info *req_base,
+			      struct ethnl_reply_data *reply_base,
+			      struct genl_info *info)
+{
+	const struct stats_req_info *req_info = STATS_REQINFO(req_base);
+	struct stats_reply_data *data = STATS_REPDATA(reply_base);
+	struct net_device *dev = reply_base->dev;
+	int ret;
+
+	ret = ethnl_ops_begin(dev);
+	if (ret < 0)
+		return ret;
+
+	memset(&data->phy_stats, 0xff, sizeof(data->phy_stats));
+
+	if (test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask) &&
+	    dev->ethtool_ops->get_eth_phy_stats)
+		dev->ethtool_ops->get_eth_phy_stats(dev, &data->phy_stats);
+
+	ethnl_ops_complete(dev);
+	return 0;
+}
+
+static int stats_reply_size(const struct ethnl_req_info *req_base,
+			    const struct ethnl_reply_data *reply_base)
+{
+	const struct stats_req_info *req_info = STATS_REQINFO(req_base);
+	unsigned int n_grps = 0, n_stats = 0;
+	int len = 0;
+
+	if (test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask)) {
+		n_stats += sizeof(struct ethtool_eth_phy_stats) / sizeof(u64);
+		n_grps++;
+	}
+
+	len += n_grps * (nla_total_size(0) + /* _A_STATS_GRP */
+			 nla_total_size(4) + /* _A_STATS_GRP_ID */
+			 nla_total_size(4)); /* _A_STATS_GRP_SS_ID */
+	len += n_stats * (nla_total_size(0) + /* _A_STATS_GRP_STAT */
+			  nla_total_size_64bit(sizeof(u64)));
+
+	return len;
+}
+
+static int stat_put(struct sk_buff *skb, u16 attrtype, u64 val)
+{
+	struct nlattr *nest;
+	int ret;
+
+	if (val == ETHTOOL_STAT_NOT_SET)
+		return 0;
+
+	/* We want to start stats attr types from 0, so we don't have a type
+	 * for pad inside ETHTOOL_A_STATS_GRP_STAT. Pad things on the outside
+	 * of ETHTOOL_A_STATS_GRP_STAT. Since we're one nest away from the
+	 * actual attr we're 4B off - nla_need_padding_for_64bit() & co.
+	 * can't be used.
+	 */
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	if (!IS_ALIGNED((unsigned long)skb_tail_pointer(skb), 8))
+		if (!nla_reserve(skb, ETHTOOL_A_STATS_GRP_PAD, 0))
+			return -EMSGSIZE;
+#endif
+
+	nest = nla_nest_start(skb, ETHTOOL_A_STATS_GRP_STAT);
+	if (!nest)
+		return -EMSGSIZE;
+
+	ret = nla_put_u64_64bit(skb, attrtype, val, -1 /* not used */);
+	if (ret) {
+		nla_nest_cancel(skb, nest);
+		return ret;
+	}
+
+	nla_nest_end(skb, nest);
+	return 0;
+}
+
+static int stats_put_phy_stats(struct sk_buff *skb,
+			       const struct stats_reply_data *data)
+{
+	if (stat_put(skb, ETHTOOL_A_STATS_ETH_PHY_5_SYM_ERR,
+		     data->phy_stats.SymbolErrorDuringCarrier))
+		return -EMSGSIZE;
+	return 0;
+}
+
+static int stats_put_stats(struct sk_buff *skb,
+			   const struct stats_reply_data *data,
+			   u32 id, u32 ss_id,
+			   int (*cb)(struct sk_buff *skb,
+				     const struct stats_reply_data *data))
+{
+	struct nlattr *nest;
+
+	nest = nla_nest_start(skb, ETHTOOL_A_STATS_GRP);
+	if (!nest)
+		return -EMSGSIZE;
+
+	if (nla_put_u32(skb, ETHTOOL_A_STATS_GRP_ID, id) ||
+	    nla_put_u32(skb, ETHTOOL_A_STATS_GRP_SS_ID, ss_id))
+		goto err_cancel;
+
+	if (cb(skb, data))
+		goto err_cancel;
+
+	nla_nest_end(skb, nest);
+	return 0;
+
+err_cancel:
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
+static int stats_fill_reply(struct sk_buff *skb,
+			    const struct ethnl_req_info *req_base,
+			    const struct ethnl_reply_data *reply_base)
+{
+	const struct stats_req_info *req_info = STATS_REQINFO(req_base);
+	const struct stats_reply_data *data = STATS_REPDATA(reply_base);
+	int ret = 0;
+
+	if (!ret && test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask))
+		ret = stats_put_stats(skb, data, ETHTOOL_STATS_ETH_PHY,
+				      ETH_SS_STATS_ETH_PHY,
+				      stats_put_phy_stats);
+
+	return ret;
+}
+
+const struct ethnl_request_ops ethnl_stats_request_ops = {
+	.request_cmd		= ETHTOOL_MSG_STATS_GET,
+	.reply_cmd		= ETHTOOL_MSG_STATS_GET_REPLY,
+	.hdr_attr		= ETHTOOL_A_STATS_HEADER,
+	.req_info_size		= sizeof(struct stats_req_info),
+	.reply_data_size	= sizeof(struct stats_reply_data),
+
+	.parse_request		= stats_parse_request,
+	.prepare_data		= stats_prepare_data,
+	.reply_size		= stats_reply_size,
+	.fill_reply		= stats_fill_reply,
+};
diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c
index c3a5489964cd..5f3c73587ff4 100644
--- a/net/ethtool/strset.c
+++ b/net/ethtool/strset.c
@@ -80,6 +80,16 @@ static const struct strset_info info_template[] = {
 		.count		= __ETHTOOL_UDP_TUNNEL_TYPE_CNT,
 		.strings	= udp_tunnel_type_names,
 	},
+	[ETH_SS_STATS_STD] = {
+		.per_dev	= false,
+		.count		= __ETHTOOL_STATS_CNT,
+		.strings	= stats_std_names,
+	},
+	[ETH_SS_STATS_ETH_PHY] = {
+		.per_dev	= false,
+		.count		= __ETHTOOL_A_STATS_ETH_PHY_CNT,
+		.strings	= stats_eth_phy_names,
+	},
 };
 
 struct strset_req_info {
-- 
cgit v1.2.3


From ca2244547ec7505d1cf61d43f5e76e3ffd99cf77 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Fri, 16 Apr 2021 12:27:40 -0700
Subject: ethtool: add interface to read standard MAC stats

Most of the MAC statistics are included in
struct rtnl_link_stats64, but some fields
are aggregated. Besides, it's good to expose
these clearly hardware stats separately.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h              | 31 +++++++++++++
 include/uapi/linux/ethtool.h         |  2 +
 include/uapi/linux/ethtool_netlink.h | 53 +++++++++++++++++++++
 net/ethtool/netlink.h                |  1 +
 net/ethtool/stats.c                  | 90 ++++++++++++++++++++++++++++++++++++
 net/ethtool/strset.c                 |  5 ++
 6 files changed, 182 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 2d5455eedbf4..3c689a13e679 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -250,6 +250,34 @@ static inline void ethtool_stats_init(u64 *stats, unsigned int n)
 		stats[n] = ETHTOOL_STAT_NOT_SET;
 }
 
+/* Basic IEEE 802.3 MAC statistics (30.3.1.1.*), not otherwise exposed
+ * via a more targeted API.
+ */
+struct ethtool_eth_mac_stats {
+	u64 FramesTransmittedOK;
+	u64 SingleCollisionFrames;
+	u64 MultipleCollisionFrames;
+	u64 FramesReceivedOK;
+	u64 FrameCheckSequenceErrors;
+	u64 AlignmentErrors;
+	u64 OctetsTransmittedOK;
+	u64 FramesWithDeferredXmissions;
+	u64 LateCollisions;
+	u64 FramesAbortedDueToXSColls;
+	u64 FramesLostDueToIntMACXmitError;
+	u64 CarrierSenseErrors;
+	u64 OctetsReceivedOK;
+	u64 FramesLostDueToIntMACRcvError;
+	u64 MulticastFramesXmittedOK;
+	u64 BroadcastFramesXmittedOK;
+	u64 FramesWithExcessiveDeferral;
+	u64 MulticastFramesReceivedOK;
+	u64 BroadcastFramesReceivedOK;
+	u64 InRangeLengthErrors;
+	u64 OutOfRangeLengthField;
+	u64 FrameTooLongErrors;
+};
+
 /* Basic IEEE 802.3 PHY statistics (30.3.2.1.*), not otherwise exposed
  * via a more targeted API.
  */
@@ -495,6 +523,7 @@ struct ethtool_module_eeprom {
  *	specified page. Returns a negative error code or the amount of bytes
  *	read.
  * @get_eth_phy_stats: Query some of the IEEE 802.3 PHY statistics.
+ * @get_eth_mac_stats: Query some of the IEEE 802.3 MAC statistics.
  *
  * All operations are optional (i.e. the function pointer may be set
  * to %NULL) and callers must take this into account.  Callers must
@@ -607,6 +636,8 @@ struct ethtool_ops {
 					     struct netlink_ext_ack *extack);
 	void	(*get_eth_phy_stats)(struct net_device *dev,
 				     struct ethtool_eth_phy_stats *phy_stats);
+	void	(*get_eth_mac_stats)(struct net_device *dev,
+				     struct ethtool_eth_mac_stats *mac_stats);
 };
 
 int ethtool_check_ops(const struct ethtool_ops *ops);
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 190ae6e03918..c227376d811a 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -671,6 +671,7 @@ enum ethtool_link_ext_substate_cable_issue {
  * @ETH_SS_UDP_TUNNEL_TYPES: UDP tunnel types
  * @ETH_SS_STATS_STD: standardized stats
  * @ETH_SS_STATS_ETH_PHY: names of IEEE 802.3 PHY statistics
+ * @ETH_SS_STATS_ETH_MAC: names of IEEE 802.3 MAC statistics
  *
  * @ETH_SS_COUNT: number of defined string sets
  */
@@ -693,6 +694,7 @@ enum ethtool_stringset {
 	ETH_SS_UDP_TUNNEL_TYPES,
 	ETH_SS_STATS_STD,
 	ETH_SS_STATS_ETH_PHY,
+	ETH_SS_STATS_ETH_MAC,
 
 	/* add new constants above here */
 	ETH_SS_COUNT
diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h
index a54cfe625f34..f0fbe8f4eb1b 100644
--- a/include/uapi/linux/ethtool_netlink.h
+++ b/include/uapi/linux/ethtool_netlink.h
@@ -698,6 +698,7 @@ enum {
 
 enum {
 	ETHTOOL_STATS_ETH_PHY,
+	ETHTOOL_STATS_ETH_MAC,
 
 	/* add new constants above here */
 	__ETHTOOL_STATS_CNT
@@ -726,6 +727,58 @@ enum {
 	ETHTOOL_A_STATS_ETH_PHY_MAX = (__ETHTOOL_A_STATS_ETH_PHY_CNT - 1)
 };
 
+enum {
+	/* 30.3.1.1.2 aFramesTransmittedOK */
+	ETHTOOL_A_STATS_ETH_MAC_2_TX_PKT,
+	/* 30.3.1.1.3 aSingleCollisionFrames */
+	ETHTOOL_A_STATS_ETH_MAC_3_SINGLE_COL,
+	/* 30.3.1.1.4 aMultipleCollisionFrames */
+	ETHTOOL_A_STATS_ETH_MAC_4_MULTI_COL,
+	/* 30.3.1.1.5 aFramesReceivedOK */
+	ETHTOOL_A_STATS_ETH_MAC_5_RX_PKT,
+	/* 30.3.1.1.6 aFrameCheckSequenceErrors */
+	ETHTOOL_A_STATS_ETH_MAC_6_FCS_ERR,
+	/* 30.3.1.1.7 aAlignmentErrors */
+	ETHTOOL_A_STATS_ETH_MAC_7_ALIGN_ERR,
+	/* 30.3.1.1.8 aOctetsTransmittedOK */
+	ETHTOOL_A_STATS_ETH_MAC_8_TX_BYTES,
+	/* 30.3.1.1.9 aFramesWithDeferredXmissions */
+	ETHTOOL_A_STATS_ETH_MAC_9_TX_DEFER,
+	/* 30.3.1.1.10 aLateCollisions */
+	ETHTOOL_A_STATS_ETH_MAC_10_LATE_COL,
+	/* 30.3.1.1.11 aFramesAbortedDueToXSColls */
+	ETHTOOL_A_STATS_ETH_MAC_11_XS_COL,
+	/* 30.3.1.1.12 aFramesLostDueToIntMACXmitError */
+	ETHTOOL_A_STATS_ETH_MAC_12_TX_INT_ERR,
+	/* 30.3.1.1.13 aCarrierSenseErrors */
+	ETHTOOL_A_STATS_ETH_MAC_13_CS_ERR,
+	/* 30.3.1.1.14 aOctetsReceivedOK */
+	ETHTOOL_A_STATS_ETH_MAC_14_RX_BYTES,
+	/* 30.3.1.1.15 aFramesLostDueToIntMACRcvError */
+	ETHTOOL_A_STATS_ETH_MAC_15_RX_INT_ERR,
+
+	/* 30.3.1.1.18 aMulticastFramesXmittedOK */
+	ETHTOOL_A_STATS_ETH_MAC_18_TX_MCAST,
+	/* 30.3.1.1.19 aBroadcastFramesXmittedOK */
+	ETHTOOL_A_STATS_ETH_MAC_19_TX_BCAST,
+	/* 30.3.1.1.20 aFramesWithExcessiveDeferral */
+	ETHTOOL_A_STATS_ETH_MAC_20_XS_DEFER,
+	/* 30.3.1.1.21 aMulticastFramesReceivedOK */
+	ETHTOOL_A_STATS_ETH_MAC_21_RX_MCAST,
+	/* 30.3.1.1.22 aBroadcastFramesReceivedOK */
+	ETHTOOL_A_STATS_ETH_MAC_22_RX_BCAST,
+	/* 30.3.1.1.23 aInRangeLengthErrors */
+	ETHTOOL_A_STATS_ETH_MAC_23_IR_LEN_ERR,
+	/* 30.3.1.1.24 aOutOfRangeLengthField */
+	ETHTOOL_A_STATS_ETH_MAC_24_OOR_LEN,
+	/* 30.3.1.1.25 aFrameTooLongErrors */
+	ETHTOOL_A_STATS_ETH_MAC_25_TOO_LONG_ERR,
+
+	/* add new constants above here */
+	__ETHTOOL_A_STATS_ETH_MAC_CNT,
+	ETHTOOL_A_STATS_ETH_MAC_MAX = (__ETHTOOL_A_STATS_ETH_MAC_CNT - 1)
+};
+
 /* generic netlink info */
 #define ETHTOOL_GENL_NAME "ethtool"
 #define ETHTOOL_GENL_VERSION 1
diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h
index 9d88983b6597..c70bac5329af 100644
--- a/net/ethtool/netlink.h
+++ b/net/ethtool/netlink.h
@@ -403,5 +403,6 @@ int ethnl_set_fec(struct sk_buff *skb, struct genl_info *info);
 
 extern const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN];
 extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN];
+extern const char stats_eth_mac_names[__ETHTOOL_A_STATS_ETH_MAC_CNT][ETH_GSTRING_LEN];
 
 #endif /* _NET_ETHTOOL_NETLINK_H */
diff --git a/net/ethtool/stats.c b/net/ethtool/stats.c
index fd8f47178c06..e80175872226 100644
--- a/net/ethtool/stats.c
+++ b/net/ethtool/stats.c
@@ -15,6 +15,7 @@ struct stats_req_info {
 struct stats_reply_data {
 	struct ethnl_reply_data		base;
 	struct ethtool_eth_phy_stats	phy_stats;
+	struct ethtool_eth_mac_stats	mac_stats;
 };
 
 #define STATS_REPDATA(__reply_base) \
@@ -22,12 +23,38 @@ struct stats_reply_data {
 
 const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN] = {
 	[ETHTOOL_STATS_ETH_PHY]			= "eth-phy",
+	[ETHTOOL_STATS_ETH_MAC]			= "eth-mac",
 };
 
 const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN] = {
 	[ETHTOOL_A_STATS_ETH_PHY_5_SYM_ERR]	= "SymbolErrorDuringCarrier",
 };
 
+const char stats_eth_mac_names[__ETHTOOL_A_STATS_ETH_MAC_CNT][ETH_GSTRING_LEN] = {
+	[ETHTOOL_A_STATS_ETH_MAC_2_TX_PKT]	= "FramesTransmittedOK",
+	[ETHTOOL_A_STATS_ETH_MAC_3_SINGLE_COL]	= "SingleCollisionFrames",
+	[ETHTOOL_A_STATS_ETH_MAC_4_MULTI_COL]	= "MultipleCollisionFrames",
+	[ETHTOOL_A_STATS_ETH_MAC_5_RX_PKT]	= "FramesReceivedOK",
+	[ETHTOOL_A_STATS_ETH_MAC_6_FCS_ERR]	= "FrameCheckSequenceErrors",
+	[ETHTOOL_A_STATS_ETH_MAC_7_ALIGN_ERR]	= "AlignmentErrors",
+	[ETHTOOL_A_STATS_ETH_MAC_8_TX_BYTES]	= "OctetsTransmittedOK",
+	[ETHTOOL_A_STATS_ETH_MAC_9_TX_DEFER]	= "FramesWithDeferredXmissions",
+	[ETHTOOL_A_STATS_ETH_MAC_10_LATE_COL]	= "LateCollisions",
+	[ETHTOOL_A_STATS_ETH_MAC_11_XS_COL]	= "FramesAbortedDueToXSColls",
+	[ETHTOOL_A_STATS_ETH_MAC_12_TX_INT_ERR]	= "FramesLostDueToIntMACXmitError",
+	[ETHTOOL_A_STATS_ETH_MAC_13_CS_ERR]	= "CarrierSenseErrors",
+	[ETHTOOL_A_STATS_ETH_MAC_14_RX_BYTES]	= "OctetsReceivedOK",
+	[ETHTOOL_A_STATS_ETH_MAC_15_RX_INT_ERR]	= "FramesLostDueToIntMACRcvError",
+	[ETHTOOL_A_STATS_ETH_MAC_18_TX_MCAST]	= "MulticastFramesXmittedOK",
+	[ETHTOOL_A_STATS_ETH_MAC_19_TX_BCAST]	= "BroadcastFramesXmittedOK",
+	[ETHTOOL_A_STATS_ETH_MAC_20_XS_DEFER]	= "FramesWithExcessiveDeferral",
+	[ETHTOOL_A_STATS_ETH_MAC_21_RX_MCAST]	= "MulticastFramesReceivedOK",
+	[ETHTOOL_A_STATS_ETH_MAC_22_RX_BCAST]	= "BroadcastFramesReceivedOK",
+	[ETHTOOL_A_STATS_ETH_MAC_23_IR_LEN_ERR]	= "InRangeLengthErrors",
+	[ETHTOOL_A_STATS_ETH_MAC_24_OOR_LEN]	= "OutOfRangeLengthField",
+	[ETHTOOL_A_STATS_ETH_MAC_25_TOO_LONG_ERR]	= "FrameTooLongErrors",
+};
+
 const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_GROUPS + 1] = {
 	[ETHTOOL_A_STATS_HEADER]	=
 		NLA_POLICY_NESTED(ethnl_header_policy),
@@ -70,10 +97,14 @@ static int stats_prepare_data(const struct ethnl_req_info *req_base,
 		return ret;
 
 	memset(&data->phy_stats, 0xff, sizeof(data->phy_stats));
+	memset(&data->mac_stats, 0xff, sizeof(data->mac_stats));
 
 	if (test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask) &&
 	    dev->ethtool_ops->get_eth_phy_stats)
 		dev->ethtool_ops->get_eth_phy_stats(dev, &data->phy_stats);
+	if (test_bit(ETHTOOL_STATS_ETH_MAC, req_info->stat_mask) &&
+	    dev->ethtool_ops->get_eth_mac_stats)
+		dev->ethtool_ops->get_eth_mac_stats(dev, &data->mac_stats);
 
 	ethnl_ops_complete(dev);
 	return 0;
@@ -90,6 +121,10 @@ static int stats_reply_size(const struct ethnl_req_info *req_base,
 		n_stats += sizeof(struct ethtool_eth_phy_stats) / sizeof(u64);
 		n_grps++;
 	}
+	if (test_bit(ETHTOOL_STATS_ETH_MAC, req_info->stat_mask)) {
+		n_stats += sizeof(struct ethtool_eth_mac_stats) / sizeof(u64);
+		n_grps++;
+	}
 
 	len += n_grps * (nla_total_size(0) + /* _A_STATS_GRP */
 			 nla_total_size(4) + /* _A_STATS_GRP_ID */
@@ -143,6 +178,57 @@ static int stats_put_phy_stats(struct sk_buff *skb,
 	return 0;
 }
 
+static int stats_put_mac_stats(struct sk_buff *skb,
+			       const struct stats_reply_data *data)
+{
+	if (stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_2_TX_PKT,
+		     data->mac_stats.FramesTransmittedOK) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_3_SINGLE_COL,
+		     data->mac_stats.SingleCollisionFrames) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_4_MULTI_COL,
+		     data->mac_stats.MultipleCollisionFrames) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_5_RX_PKT,
+		     data->mac_stats.FramesReceivedOK) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_6_FCS_ERR,
+		     data->mac_stats.FrameCheckSequenceErrors) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_7_ALIGN_ERR,
+		     data->mac_stats.AlignmentErrors) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_8_TX_BYTES,
+		     data->mac_stats.OctetsTransmittedOK) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_9_TX_DEFER,
+		     data->mac_stats.FramesWithDeferredXmissions) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_10_LATE_COL,
+		     data->mac_stats.LateCollisions) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_11_XS_COL,
+		     data->mac_stats.FramesAbortedDueToXSColls) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_12_TX_INT_ERR,
+		     data->mac_stats.FramesLostDueToIntMACXmitError) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_13_CS_ERR,
+		     data->mac_stats.CarrierSenseErrors) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_14_RX_BYTES,
+		     data->mac_stats.OctetsReceivedOK) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_15_RX_INT_ERR,
+		     data->mac_stats.FramesLostDueToIntMACRcvError) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_18_TX_MCAST,
+		     data->mac_stats.MulticastFramesXmittedOK) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_19_TX_BCAST,
+		     data->mac_stats.BroadcastFramesXmittedOK) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_20_XS_DEFER,
+		     data->mac_stats.FramesWithExcessiveDeferral) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_21_RX_MCAST,
+		     data->mac_stats.MulticastFramesReceivedOK) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_22_RX_BCAST,
+		     data->mac_stats.BroadcastFramesReceivedOK) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_23_IR_LEN_ERR,
+		     data->mac_stats.InRangeLengthErrors) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_24_OOR_LEN,
+		     data->mac_stats.OutOfRangeLengthField) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_25_TOO_LONG_ERR,
+		     data->mac_stats.FrameTooLongErrors))
+		return -EMSGSIZE;
+	return 0;
+}
+
 static int stats_put_stats(struct sk_buff *skb,
 			   const struct stats_reply_data *data,
 			   u32 id, u32 ss_id,
@@ -182,6 +268,10 @@ static int stats_fill_reply(struct sk_buff *skb,
 		ret = stats_put_stats(skb, data, ETHTOOL_STATS_ETH_PHY,
 				      ETH_SS_STATS_ETH_PHY,
 				      stats_put_phy_stats);
+	if (!ret && test_bit(ETHTOOL_STATS_ETH_MAC, req_info->stat_mask))
+		ret = stats_put_stats(skb, data, ETHTOOL_STATS_ETH_MAC,
+				      ETH_SS_STATS_ETH_MAC,
+				      stats_put_mac_stats);
 
 	return ret;
 }
diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c
index 5f3c73587ff4..a8aac7bcfcc9 100644
--- a/net/ethtool/strset.c
+++ b/net/ethtool/strset.c
@@ -90,6 +90,11 @@ static const struct strset_info info_template[] = {
 		.count		= __ETHTOOL_A_STATS_ETH_PHY_CNT,
 		.strings	= stats_eth_phy_names,
 	},
+	[ETH_SS_STATS_ETH_MAC] = {
+		.per_dev	= false,
+		.count		= __ETHTOOL_A_STATS_ETH_MAC_CNT,
+		.strings	= stats_eth_mac_names,
+	},
 };
 
 struct strset_req_info {
-- 
cgit v1.2.3


From bfad2b979ddcc330c08bb071eb3c3f7b3411a681 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Fri, 16 Apr 2021 12:27:41 -0700
Subject: ethtool: add interface to read standard MAC Ctrl stats

A number of devices maintain the standard-based MAC control
counters for control frames. Add an API for those.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h              | 12 ++++++++++++
 include/uapi/linux/ethtool.h         |  2 ++
 include/uapi/linux/ethtool_netlink.h | 14 ++++++++++++++
 net/ethtool/netlink.h                |  1 +
 net/ethtool/stats.c                  | 33 +++++++++++++++++++++++++++++++++
 net/ethtool/strset.c                 |  5 +++++
 6 files changed, 67 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 3c689a13e679..1ca6b836f9fe 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -285,6 +285,15 @@ struct ethtool_eth_phy_stats {
 	u64 SymbolErrorDuringCarrier;
 };
 
+/* Basic IEEE 802.3 MAC Ctrl statistics (30.3.3.*), not otherwise exposed
+ * via a more targeted API.
+ */
+struct ethtool_eth_ctrl_stats {
+	u64 MACControlFramesTransmitted;
+	u64 MACControlFramesReceived;
+	u64 UnsupportedOpcodesReceived;
+};
+
 /**
  * struct ethtool_pause_stats - statistics for IEEE 802.3x pause frames
  * @tx_pause_frames: transmitted pause frame count. Reported to user space
@@ -524,6 +533,7 @@ struct ethtool_module_eeprom {
  *	read.
  * @get_eth_phy_stats: Query some of the IEEE 802.3 PHY statistics.
  * @get_eth_mac_stats: Query some of the IEEE 802.3 MAC statistics.
+ * @get_eth_ctrl_stats: Query some of the IEEE 802.3 MAC Ctrl statistics.
  *
  * All operations are optional (i.e. the function pointer may be set
  * to %NULL) and callers must take this into account.  Callers must
@@ -638,6 +648,8 @@ struct ethtool_ops {
 				     struct ethtool_eth_phy_stats *phy_stats);
 	void	(*get_eth_mac_stats)(struct net_device *dev,
 				     struct ethtool_eth_mac_stats *mac_stats);
+	void	(*get_eth_ctrl_stats)(struct net_device *dev,
+				      struct ethtool_eth_ctrl_stats *ctrl_stats);
 };
 
 int ethtool_check_ops(const struct ethtool_ops *ops);
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index c227376d811a..9cb8df89d4f2 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -672,6 +672,7 @@ enum ethtool_link_ext_substate_cable_issue {
  * @ETH_SS_STATS_STD: standardized stats
  * @ETH_SS_STATS_ETH_PHY: names of IEEE 802.3 PHY statistics
  * @ETH_SS_STATS_ETH_MAC: names of IEEE 802.3 MAC statistics
+ * @ETH_SS_STATS_ETH_CTRL: names of IEEE 802.3 MAC Control statistics
  *
  * @ETH_SS_COUNT: number of defined string sets
  */
@@ -695,6 +696,7 @@ enum ethtool_stringset {
 	ETH_SS_STATS_STD,
 	ETH_SS_STATS_ETH_PHY,
 	ETH_SS_STATS_ETH_MAC,
+	ETH_SS_STATS_ETH_CTRL,
 
 	/* add new constants above here */
 	ETH_SS_COUNT
diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h
index f0fbe8f4eb1b..2ea5f049df6a 100644
--- a/include/uapi/linux/ethtool_netlink.h
+++ b/include/uapi/linux/ethtool_netlink.h
@@ -699,6 +699,7 @@ enum {
 enum {
 	ETHTOOL_STATS_ETH_PHY,
 	ETHTOOL_STATS_ETH_MAC,
+	ETHTOOL_STATS_ETH_CTRL,
 
 	/* add new constants above here */
 	__ETHTOOL_STATS_CNT
@@ -779,6 +780,19 @@ enum {
 	ETHTOOL_A_STATS_ETH_MAC_MAX = (__ETHTOOL_A_STATS_ETH_MAC_CNT - 1)
 };
 
+enum {
+	/* 30.3.3.3 aMACControlFramesTransmitted */
+	ETHTOOL_A_STATS_ETH_CTRL_3_TX,
+	/* 30.3.3.4 aMACControlFramesReceived */
+	ETHTOOL_A_STATS_ETH_CTRL_4_RX,
+	/* 30.3.3.5 aUnsupportedOpcodesReceived */
+	ETHTOOL_A_STATS_ETH_CTRL_5_RX_UNSUP,
+
+	/* add new constants above here */
+	__ETHTOOL_A_STATS_ETH_CTRL_CNT,
+	ETHTOOL_A_STATS_ETH_CTRL_MAX = (__ETHTOOL_A_STATS_ETH_CTRL_CNT - 1)
+};
+
 /* generic netlink info */
 #define ETHTOOL_GENL_NAME "ethtool"
 #define ETHTOOL_GENL_VERSION 1
diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h
index c70bac5329af..febfa61e52e2 100644
--- a/net/ethtool/netlink.h
+++ b/net/ethtool/netlink.h
@@ -404,5 +404,6 @@ int ethnl_set_fec(struct sk_buff *skb, struct genl_info *info);
 extern const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN];
 extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN];
 extern const char stats_eth_mac_names[__ETHTOOL_A_STATS_ETH_MAC_CNT][ETH_GSTRING_LEN];
+extern const char stats_eth_ctrl_names[__ETHTOOL_A_STATS_ETH_CTRL_CNT][ETH_GSTRING_LEN];
 
 #endif /* _NET_ETHTOOL_NETLINK_H */
diff --git a/net/ethtool/stats.c b/net/ethtool/stats.c
index e80175872226..f4fded66731c 100644
--- a/net/ethtool/stats.c
+++ b/net/ethtool/stats.c
@@ -16,6 +16,7 @@ struct stats_reply_data {
 	struct ethnl_reply_data		base;
 	struct ethtool_eth_phy_stats	phy_stats;
 	struct ethtool_eth_mac_stats	mac_stats;
+	struct ethtool_eth_ctrl_stats	ctrl_stats;
 };
 
 #define STATS_REPDATA(__reply_base) \
@@ -24,6 +25,7 @@ struct stats_reply_data {
 const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN] = {
 	[ETHTOOL_STATS_ETH_PHY]			= "eth-phy",
 	[ETHTOOL_STATS_ETH_MAC]			= "eth-mac",
+	[ETHTOOL_STATS_ETH_CTRL]		= "eth-ctrl",
 };
 
 const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN] = {
@@ -55,6 +57,12 @@ const char stats_eth_mac_names[__ETHTOOL_A_STATS_ETH_MAC_CNT][ETH_GSTRING_LEN] =
 	[ETHTOOL_A_STATS_ETH_MAC_25_TOO_LONG_ERR]	= "FrameTooLongErrors",
 };
 
+const char stats_eth_ctrl_names[__ETHTOOL_A_STATS_ETH_CTRL_CNT][ETH_GSTRING_LEN] = {
+	[ETHTOOL_A_STATS_ETH_CTRL_3_TX]		= "MACControlFramesTransmitted",
+	[ETHTOOL_A_STATS_ETH_CTRL_4_RX]		= "MACControlFramesReceived",
+	[ETHTOOL_A_STATS_ETH_CTRL_5_RX_UNSUP]	= "UnsupportedOpcodesReceived",
+};
+
 const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_GROUPS + 1] = {
 	[ETHTOOL_A_STATS_HEADER]	=
 		NLA_POLICY_NESTED(ethnl_header_policy),
@@ -98,6 +106,7 @@ static int stats_prepare_data(const struct ethnl_req_info *req_base,
 
 	memset(&data->phy_stats, 0xff, sizeof(data->phy_stats));
 	memset(&data->mac_stats, 0xff, sizeof(data->mac_stats));
+	memset(&data->ctrl_stats, 0xff, sizeof(data->ctrl_stats));
 
 	if (test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask) &&
 	    dev->ethtool_ops->get_eth_phy_stats)
@@ -105,6 +114,9 @@ static int stats_prepare_data(const struct ethnl_req_info *req_base,
 	if (test_bit(ETHTOOL_STATS_ETH_MAC, req_info->stat_mask) &&
 	    dev->ethtool_ops->get_eth_mac_stats)
 		dev->ethtool_ops->get_eth_mac_stats(dev, &data->mac_stats);
+	if (test_bit(ETHTOOL_STATS_ETH_CTRL, req_info->stat_mask) &&
+	    dev->ethtool_ops->get_eth_ctrl_stats)
+		dev->ethtool_ops->get_eth_ctrl_stats(dev, &data->ctrl_stats);
 
 	ethnl_ops_complete(dev);
 	return 0;
@@ -125,6 +137,10 @@ static int stats_reply_size(const struct ethnl_req_info *req_base,
 		n_stats += sizeof(struct ethtool_eth_mac_stats) / sizeof(u64);
 		n_grps++;
 	}
+	if (test_bit(ETHTOOL_STATS_ETH_CTRL, req_info->stat_mask)) {
+		n_stats += sizeof(struct ethtool_eth_ctrl_stats) / sizeof(u64);
+		n_grps++;
+	}
 
 	len += n_grps * (nla_total_size(0) + /* _A_STATS_GRP */
 			 nla_total_size(4) + /* _A_STATS_GRP_ID */
@@ -229,6 +245,19 @@ static int stats_put_mac_stats(struct sk_buff *skb,
 	return 0;
 }
 
+static int stats_put_ctrl_stats(struct sk_buff *skb,
+				const struct stats_reply_data *data)
+{
+	if (stat_put(skb, ETHTOOL_A_STATS_ETH_CTRL_3_TX,
+		     data->ctrl_stats.MACControlFramesTransmitted) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_CTRL_4_RX,
+		     data->ctrl_stats.MACControlFramesReceived) ||
+	    stat_put(skb, ETHTOOL_A_STATS_ETH_CTRL_5_RX_UNSUP,
+		     data->ctrl_stats.UnsupportedOpcodesReceived))
+		return -EMSGSIZE;
+	return 0;
+}
+
 static int stats_put_stats(struct sk_buff *skb,
 			   const struct stats_reply_data *data,
 			   u32 id, u32 ss_id,
@@ -272,6 +301,10 @@ static int stats_fill_reply(struct sk_buff *skb,
 		ret = stats_put_stats(skb, data, ETHTOOL_STATS_ETH_MAC,
 				      ETH_SS_STATS_ETH_MAC,
 				      stats_put_mac_stats);
+	if (!ret && test_bit(ETHTOOL_STATS_ETH_CTRL, req_info->stat_mask))
+		ret = stats_put_stats(skb, data, ETHTOOL_STATS_ETH_CTRL,
+				      ETH_SS_STATS_ETH_CTRL,
+				      stats_put_ctrl_stats);
 
 	return ret;
 }
diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c
index a8aac7bcfcc9..a33c603a7a02 100644
--- a/net/ethtool/strset.c
+++ b/net/ethtool/strset.c
@@ -95,6 +95,11 @@ static const struct strset_info info_template[] = {
 		.count		= __ETHTOOL_A_STATS_ETH_MAC_CNT,
 		.strings	= stats_eth_mac_names,
 	},
+	[ETH_SS_STATS_ETH_CTRL] = {
+		.per_dev	= false,
+		.count		= __ETHTOOL_A_STATS_ETH_CTRL_CNT,
+		.strings	= stats_eth_ctrl_names,
+	},
 };
 
 struct strset_req_info {
-- 
cgit v1.2.3


From a8b06e9d40d8b18c41c8ce060e8dc004fa59e708 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Fri, 16 Apr 2021 12:27:42 -0700
Subject: ethtool: add interface to read RMON stats

Most devices maintain RMON (RFC 2819) stats - particularly
the "histogram" of packets received by size. Unlike other
RFCs which duplicate IEEE stats, the short/oversized frame
counters in RMON don't seem to match IEEE stats 1-to-1 either,
so expose those, too. Do not expose basic packet, CRC errors
etc - those are already otherwise covered.

Because the standard defines packet ranges only up to 1518, and
everything above that should theoretically be "oversized",
devices often create their own ranges.

Going beyond what the RFC defines - expose the "histogram"
in the Tx direction (assume for now that the ranges will
be the same).

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h              | 43 ++++++++++++++++++
 include/uapi/linux/ethtool.h         |  2 +
 include/uapi/linux/ethtool_netlink.h | 23 ++++++++++
 net/ethtool/netlink.h                |  1 +
 net/ethtool/stats.c                  | 87 ++++++++++++++++++++++++++++++++++++
 net/ethtool/strset.c                 |  5 +++
 6 files changed, 161 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 1ca6b836f9fe..e030f7510cd3 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -346,6 +346,44 @@ struct ethtool_fec_stats {
 	} corrected_blocks, uncorrectable_blocks, corrected_bits;
 };
 
+/**
+ * struct ethtool_rmon_hist_range - byte range for histogram statistics
+ * @low: low bound of the bucket (inclusive)
+ * @high: high bound of the bucket (inclusive)
+ */
+struct ethtool_rmon_hist_range {
+	u16 low;
+	u16 high;
+};
+
+#define ETHTOOL_RMON_HIST_MAX	10
+
+/**
+ * struct ethtool_rmon_stats - selected RMON (RFC 2819) statistics
+ * @undersize_pkts: Equivalent to `etherStatsUndersizePkts` from the RFC.
+ * @oversize_pkts: Equivalent to `etherStatsOversizePkts` from the RFC.
+ * @fragments: Equivalent to `etherStatsFragments` from the RFC.
+ * @jabbers: Equivalent to `etherStatsJabbers` from the RFC.
+ * @hist: Packet counter for packet length buckets (e.g.
+ *	`etherStatsPkts128to255Octets` from the RFC).
+ * @hist_tx: Tx counters in similar form to @hist, not defined in the RFC.
+ *
+ * Selection of RMON (RFC 2819) statistics which are not exposed via different
+ * APIs, primarily the packet-length-based counters.
+ * Unfortunately different designs choose different buckets beyond
+ * the 1024B mark (jumbo frame territory), so the definition of the bucket
+ * ranges is left to the driver.
+ */
+struct ethtool_rmon_stats {
+	u64 undersize_pkts;
+	u64 oversize_pkts;
+	u64 fragments;
+	u64 jabbers;
+
+	u64 hist[ETHTOOL_RMON_HIST_MAX];
+	u64 hist_tx[ETHTOOL_RMON_HIST_MAX];
+};
+
 #define ETH_MODULE_EEPROM_PAGE_LEN	128
 #define ETH_MODULE_MAX_I2C_ADDRESS	0x7f
 
@@ -534,6 +572,8 @@ struct ethtool_module_eeprom {
  * @get_eth_phy_stats: Query some of the IEEE 802.3 PHY statistics.
  * @get_eth_mac_stats: Query some of the IEEE 802.3 MAC statistics.
  * @get_eth_ctrl_stats: Query some of the IEEE 802.3 MAC Ctrl statistics.
+ * @get_rmon_stats: Query some of the RMON (RFC 2819) statistics.
+ *	Set %ranges to a pointer to zero-terminated array of byte ranges.
  *
  * All operations are optional (i.e. the function pointer may be set
  * to %NULL) and callers must take this into account.  Callers must
@@ -650,6 +690,9 @@ struct ethtool_ops {
 				     struct ethtool_eth_mac_stats *mac_stats);
 	void	(*get_eth_ctrl_stats)(struct net_device *dev,
 				      struct ethtool_eth_ctrl_stats *ctrl_stats);
+	void	(*get_rmon_stats)(struct net_device *dev,
+				  struct ethtool_rmon_stats *rmon_stats,
+				  const struct ethtool_rmon_hist_range **ranges);
 };
 
 int ethtool_check_ops(const struct ethtool_ops *ops);
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 9cb8df89d4f2..cfef6b08169a 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -673,6 +673,7 @@ enum ethtool_link_ext_substate_cable_issue {
  * @ETH_SS_STATS_ETH_PHY: names of IEEE 802.3 PHY statistics
  * @ETH_SS_STATS_ETH_MAC: names of IEEE 802.3 MAC statistics
  * @ETH_SS_STATS_ETH_CTRL: names of IEEE 802.3 MAC Control statistics
+ * @ETH_SS_STATS_RMON: names of RMON statistics
  *
  * @ETH_SS_COUNT: number of defined string sets
  */
@@ -697,6 +698,7 @@ enum ethtool_stringset {
 	ETH_SS_STATS_ETH_PHY,
 	ETH_SS_STATS_ETH_MAC,
 	ETH_SS_STATS_ETH_CTRL,
+	ETH_SS_STATS_RMON,
 
 	/* add new constants above here */
 	ETH_SS_COUNT
diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h
index 2ea5f049df6a..825cfda1c5d5 100644
--- a/include/uapi/linux/ethtool_netlink.h
+++ b/include/uapi/linux/ethtool_netlink.h
@@ -700,6 +700,7 @@ enum {
 	ETHTOOL_STATS_ETH_PHY,
 	ETHTOOL_STATS_ETH_MAC,
 	ETHTOOL_STATS_ETH_CTRL,
+	ETHTOOL_STATS_RMON,
 
 	/* add new constants above here */
 	__ETHTOOL_STATS_CNT
@@ -714,6 +715,13 @@ enum {
 
 	ETHTOOL_A_STATS_GRP_STAT,		/* nest */
 
+	ETHTOOL_A_STATS_GRP_HIST_RX,		/* nest */
+	ETHTOOL_A_STATS_GRP_HIST_TX,		/* nest */
+
+	ETHTOOL_A_STATS_GRP_HIST_BKT_LOW,	/* u32 */
+	ETHTOOL_A_STATS_GRP_HIST_BKT_HI,	/* u32 */
+	ETHTOOL_A_STATS_GRP_HIST_VAL,		/* u64 */
+
 	/* add new constants above here */
 	__ETHTOOL_A_STATS_GRP_CNT,
 	ETHTOOL_A_STATS_GRP_MAX = (__ETHTOOL_A_STATS_CNT - 1)
@@ -793,6 +801,21 @@ enum {
 	ETHTOOL_A_STATS_ETH_CTRL_MAX = (__ETHTOOL_A_STATS_ETH_CTRL_CNT - 1)
 };
 
+enum {
+	/* etherStatsUndersizePkts */
+	ETHTOOL_A_STATS_RMON_UNDERSIZE,
+	/* etherStatsOversizePkts */
+	ETHTOOL_A_STATS_RMON_OVERSIZE,
+	/* etherStatsFragments */
+	ETHTOOL_A_STATS_RMON_FRAG,
+	/* etherStatsJabbers */
+	ETHTOOL_A_STATS_RMON_JABBER,
+
+	/* add new constants above here */
+	__ETHTOOL_A_STATS_RMON_CNT,
+	ETHTOOL_A_STATS_RMON_MAX = (__ETHTOOL_A_STATS_RMON_CNT - 1)
+};
+
 /* generic netlink info */
 #define ETHTOOL_GENL_NAME "ethtool"
 #define ETHTOOL_GENL_VERSION 1
diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h
index febfa61e52e2..bed3afdf3656 100644
--- a/net/ethtool/netlink.h
+++ b/net/ethtool/netlink.h
@@ -405,5 +405,6 @@ extern const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN];
 extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN];
 extern const char stats_eth_mac_names[__ETHTOOL_A_STATS_ETH_MAC_CNT][ETH_GSTRING_LEN];
 extern const char stats_eth_ctrl_names[__ETHTOOL_A_STATS_ETH_CTRL_CNT][ETH_GSTRING_LEN];
+extern const char stats_rmon_names[__ETHTOOL_A_STATS_RMON_CNT][ETH_GSTRING_LEN];
 
 #endif /* _NET_ETHTOOL_NETLINK_H */
diff --git a/net/ethtool/stats.c b/net/ethtool/stats.c
index f4fded66731c..acb2b080c358 100644
--- a/net/ethtool/stats.c
+++ b/net/ethtool/stats.c
@@ -17,6 +17,8 @@ struct stats_reply_data {
 	struct ethtool_eth_phy_stats	phy_stats;
 	struct ethtool_eth_mac_stats	mac_stats;
 	struct ethtool_eth_ctrl_stats	ctrl_stats;
+	struct ethtool_rmon_stats	rmon_stats;
+	const struct ethtool_rmon_hist_range	*rmon_ranges;
 };
 
 #define STATS_REPDATA(__reply_base) \
@@ -26,6 +28,7 @@ const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN] = {
 	[ETHTOOL_STATS_ETH_PHY]			= "eth-phy",
 	[ETHTOOL_STATS_ETH_MAC]			= "eth-mac",
 	[ETHTOOL_STATS_ETH_CTRL]		= "eth-ctrl",
+	[ETHTOOL_STATS_RMON]			= "rmon",
 };
 
 const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN] = {
@@ -63,6 +66,13 @@ const char stats_eth_ctrl_names[__ETHTOOL_A_STATS_ETH_CTRL_CNT][ETH_GSTRING_LEN]
 	[ETHTOOL_A_STATS_ETH_CTRL_5_RX_UNSUP]	= "UnsupportedOpcodesReceived",
 };
 
+const char stats_rmon_names[__ETHTOOL_A_STATS_RMON_CNT][ETH_GSTRING_LEN] = {
+	[ETHTOOL_A_STATS_RMON_UNDERSIZE]	= "etherStatsUndersizePkts",
+	[ETHTOOL_A_STATS_RMON_OVERSIZE]		= "etherStatsOversizePkts",
+	[ETHTOOL_A_STATS_RMON_FRAG]		= "etherStatsFragments",
+	[ETHTOOL_A_STATS_RMON_JABBER]		= "etherStatsJabbers",
+};
+
 const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_GROUPS + 1] = {
 	[ETHTOOL_A_STATS_HEADER]	=
 		NLA_POLICY_NESTED(ethnl_header_policy),
@@ -107,6 +117,7 @@ static int stats_prepare_data(const struct ethnl_req_info *req_base,
 	memset(&data->phy_stats, 0xff, sizeof(data->phy_stats));
 	memset(&data->mac_stats, 0xff, sizeof(data->mac_stats));
 	memset(&data->ctrl_stats, 0xff, sizeof(data->mac_stats));
+	memset(&data->rmon_stats, 0xff, sizeof(data->rmon_stats));
 
 	if (test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask) &&
 	    dev->ethtool_ops->get_eth_phy_stats)
@@ -117,6 +128,10 @@ static int stats_prepare_data(const struct ethnl_req_info *req_base,
 	if (test_bit(ETHTOOL_STATS_ETH_CTRL, req_info->stat_mask) &&
 	    dev->ethtool_ops->get_eth_ctrl_stats)
 		dev->ethtool_ops->get_eth_ctrl_stats(dev, &data->ctrl_stats);
+	if (test_bit(ETHTOOL_STATS_RMON, req_info->stat_mask) &&
+	    dev->ethtool_ops->get_rmon_stats)
+		dev->ethtool_ops->get_rmon_stats(dev, &data->rmon_stats,
+						 &data->rmon_ranges);
 
 	ethnl_ops_complete(dev);
 	return 0;
@@ -141,6 +156,16 @@ static int stats_reply_size(const struct ethnl_req_info *req_base,
 		n_stats += sizeof(struct ethtool_eth_ctrl_stats) / sizeof(u64);
 		n_grps++;
 	}
+	if (test_bit(ETHTOOL_STATS_RMON, req_info->stat_mask)) {
+		n_stats += sizeof(struct ethtool_rmon_stats) / sizeof(u64);
+		n_grps++;
+		/* Above includes the space for _A_STATS_GRP_HIST_VALs */
+
+		len += (nla_total_size(0) +	/* _A_STATS_GRP_HIST */
+			nla_total_size(4) +	/* _A_STATS_GRP_HIST_BKT_LOW */
+			nla_total_size(4)) *	/* _A_STATS_GRP_HIST_BKT_HI */
+			ETHTOOL_RMON_HIST_MAX * 2;
+	}
 
 	len += n_grps * (nla_total_size(0) + /* _A_STATS_GRP */
 			 nla_total_size(4) + /* _A_STATS_GRP_ID */
@@ -258,6 +283,65 @@ static int stats_put_ctrl_stats(struct sk_buff *skb,
 	return 0;
 }
 
+static int stats_put_rmon_hist(struct sk_buff *skb, u32 attr, const u64 *hist,
+			       const struct ethtool_rmon_hist_range *ranges)
+{
+	struct nlattr *nest;
+	int i;
+
+	if (!ranges)
+		return 0;
+
+	for (i = 0; i <	ETHTOOL_RMON_HIST_MAX; i++) {
+		if (!ranges[i].low && !ranges[i].high)
+			break;
+		if (hist[i] == ETHTOOL_STAT_NOT_SET)
+			continue;
+
+		nest = nla_nest_start(skb, attr);
+		if (!nest)
+			return -EMSGSIZE;
+
+		if (nla_put_u32(skb, ETHTOOL_A_STATS_GRP_HIST_BKT_LOW,
+				ranges[i].low) ||
+		    nla_put_u32(skb, ETHTOOL_A_STATS_GRP_HIST_BKT_HI,
+				ranges[i].high) ||
+		    nla_put_u64_64bit(skb, ETHTOOL_A_STATS_GRP_HIST_VAL,
+				      hist[i], ETHTOOL_A_STATS_GRP_PAD))
+			goto err_cancel_hist;
+
+		nla_nest_end(skb, nest);
+	}
+
+	return 0;
+
+err_cancel_hist:
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
+static int stats_put_rmon_stats(struct sk_buff *skb,
+				const struct stats_reply_data *data)
+{
+	if (stats_put_rmon_hist(skb, ETHTOOL_A_STATS_GRP_HIST_RX,
+				data->rmon_stats.hist, data->rmon_ranges) ||
+	    stats_put_rmon_hist(skb, ETHTOOL_A_STATS_GRP_HIST_TX,
+				data->rmon_stats.hist_tx, data->rmon_ranges))
+		return -EMSGSIZE;
+
+	if (stat_put(skb, ETHTOOL_A_STATS_RMON_UNDERSIZE,
+		     data->rmon_stats.undersize_pkts) ||
+	    stat_put(skb, ETHTOOL_A_STATS_RMON_OVERSIZE,
+		     data->rmon_stats.oversize_pkts) ||
+	    stat_put(skb, ETHTOOL_A_STATS_RMON_FRAG,
+		     data->rmon_stats.fragments) ||
+	    stat_put(skb, ETHTOOL_A_STATS_RMON_JABBER,
+		     data->rmon_stats.jabbers))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
 static int stats_put_stats(struct sk_buff *skb,
 			   const struct stats_reply_data *data,
 			   u32 id, u32 ss_id,
@@ -305,6 +389,9 @@ static int stats_fill_reply(struct sk_buff *skb,
 		ret = stats_put_stats(skb, data, ETHTOOL_STATS_ETH_CTRL,
 				      ETH_SS_STATS_ETH_CTRL,
 				      stats_put_ctrl_stats);
+	if (!ret && test_bit(ETHTOOL_STATS_RMON, req_info->stat_mask))
+		ret = stats_put_stats(skb, data, ETHTOOL_STATS_RMON,
+				      ETH_SS_STATS_RMON, stats_put_rmon_stats);
 
 	return ret;
 }
diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c
index a33c603a7a02..b3029fff715d 100644
--- a/net/ethtool/strset.c
+++ b/net/ethtool/strset.c
@@ -100,6 +100,11 @@ static const struct strset_info info_template[] = {
 		.count		= __ETHTOOL_A_STATS_ETH_CTRL_CNT,
 		.strings	= stats_eth_ctrl_names,
 	},
+	[ETH_SS_STATS_RMON] = {
+		.per_dev	= false,
+		.count		= __ETHTOOL_A_STATS_RMON_CNT,
+		.strings	= stats_rmon_names,
+	},
 };
 
 struct strset_req_info {
-- 
cgit v1.2.3


From 5f7c292b8975c9146063abbb91c0b9cdc1a5e9c5 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 25 Mar 2021 19:19:47 -0700
Subject: KVM: Move prototypes for MMU notifier callbacks to generic code

Move the prototypes for the MMU notifier callbacks out of arch code and
into common code.  There is no benefit to having each arch replicate the
prototypes since any deviation from the invocation in common code will
explode.

No functional change intended.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20210326021957.1424875-9-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/arm64/include/asm/kvm_host.h   | 5 -----
 arch/mips/include/asm/kvm_host.h    | 5 -----
 arch/powerpc/include/asm/kvm_host.h | 7 -------
 arch/x86/include/asm/kvm_host.h     | 6 +-----
 include/linux/kvm_host.h            | 8 ++++++++
 5 files changed, 9 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 3d10e6527f7d..72e6b4600264 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -582,11 +582,6 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
 			      struct kvm_vcpu_events *events);
 
 #define KVM_ARCH_WANT_MMU_NOTIFIER
-int kvm_unmap_hva_range(struct kvm *kvm,
-			unsigned long start, unsigned long end, unsigned flags);
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
 
 void kvm_arm_halt_guest(struct kvm *kvm);
 void kvm_arm_resume_guest(struct kvm *kvm);
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 3a5612e7304c..feaa77036b67 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -967,11 +967,6 @@ enum kvm_mips_fault_result kvm_trap_emul_gva_fault(struct kvm_vcpu *vcpu,
 						   bool write);
 
 #define KVM_ARCH_WANT_MMU_NOTIFIER
-int kvm_unmap_hva_range(struct kvm *kvm,
-			unsigned long start, unsigned long end, unsigned flags);
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
 
 /* Emulation */
 int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out);
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 05fb00d37609..1e83359f286b 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -56,13 +56,6 @@
 
 #define KVM_ARCH_WANT_MMU_NOTIFIER
 
-extern int kvm_unmap_hva_range(struct kvm *kvm,
-			       unsigned long start, unsigned long end,
-			       unsigned flags);
-extern int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
-extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
-extern int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
-
 #define HPTEG_CACHE_NUM			(1 << 15)
 #define HPTEG_HASH_BITS_PTE		13
 #define HPTEG_HASH_BITS_PTE_LONG	12
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a52f973bdff6..99778ac51243 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1718,11 +1718,7 @@ asmlinkage void kvm_spurious_fault(void);
 	_ASM_EXTABLE(666b, 667b)
 
 #define KVM_ARCH_WANT_MMU_NOTIFIER
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
-			unsigned flags);
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
+
 int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
 int kvm_cpu_has_extint(struct kvm_vcpu *v);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1b65e7204344..e6d77353025c 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -218,6 +218,14 @@ bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 #endif
 
+#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
+int kvm_unmap_hva_range(struct kvm *kvm,
+			unsigned long start, unsigned long end, unsigned flags);
+int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
+int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
+#endif
+
 enum {
 	OUTSIDE_GUEST_MODE,
 	IN_GUEST_MODE,
-- 
cgit v1.2.3


From 6c9dd6d26216ad9733e57f382e1669c142494aab Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 2 Apr 2021 17:53:09 +0200
Subject: KVM: constify kvm_arch_flush_remote_tlbs_memslot

memslots are stored in RCU and there should be no need to
change them.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/arm64/kvm/arm.c     | 2 +-
 arch/mips/kvm/mips.c     | 2 +-
 arch/x86/kvm/mmu/mmu.c   | 2 +-
 include/linux/kvm_host.h | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index e575eff76e97..0d92a4e5fe80 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1270,7 +1270,7 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
 }
 
 void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
-					struct kvm_memory_slot *memslot)
+					const struct kvm_memory_slot *memslot)
 {
 	kvm_flush_remote_tlbs(kvm);
 }
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 58a8812e2fa5..7db8234a4407 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -997,7 +997,7 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
 }
 
 void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
-					struct kvm_memory_slot *memslot)
+					const struct kvm_memory_slot *memslot)
 {
 	/* Let implementation handle TLB/GVA invalidation */
 	kvm_mips_callbacks->flush_shadow_memslot(kvm, memslot);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 9a369575c3a5..3546ada27506 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5638,7 +5638,7 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
 }
 
 void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
-					struct kvm_memory_slot *memslot)
+					const struct kvm_memory_slot *memslot)
 {
 	/*
 	 * All current use cases for flushing the TLBs for a specific memslot
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index e6d77353025c..34a974ffc882 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -894,7 +894,7 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot);
 
 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
 void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
-					struct kvm_memory_slot *memslot);
+					const struct kvm_memory_slot *memslot);
 #else /* !CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log);
 int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
-- 
cgit v1.2.3


From 3039bcc744980afe87c612122e47a27306483bc2 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 1 Apr 2021 17:56:50 -0700
Subject: KVM: Move x86's MMU notifier memslot walkers to generic code

Move the hva->gfn lookup for MMU notifiers into common code.  Every arch
does a similar lookup, and some arch code is all but identical across
multiple architectures.

In addition to consolidating code, this will allow introducing
optimizations that will benefit all architectures without incurring
multiple walks of the memslots, e.g. by taking mmu_lock if and only if a
relevant range exists in the memslots.

The use of __always_inline to avoid indirect call retpolines, as done by
x86, may also benefit other architectures.

Consolidating the lookups also fixes a wart in x86, where the legacy MMU
and TDP MMU each do their own memslot walks.

Lastly, future enhancements to the memslot implementation, e.g. to add an
interval tree to track host address, will need to touch far less arch
specific code.

MIPS, PPC, and arm64 will be converted one at a time in future patches.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20210402005658.3024832-3-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |   1 +
 arch/x86/kvm/mmu/mmu.c          | 127 ++++++++--------------
 arch/x86/kvm/mmu/tdp_mmu.c      | 235 ++++++++++++++--------------------------
 arch/x86/kvm/mmu/tdp_mmu.h      |  14 +--
 include/linux/kvm_host.h        |  14 +++
 virt/kvm/kvm_main.c             | 169 ++++++++++++++++++++++++++++-
 6 files changed, 314 insertions(+), 246 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 0d217c3db3f8..fc437a36204d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1727,6 +1727,7 @@ asmlinkage void kvm_spurious_fault(void);
 	_ASM_EXTABLE(666b, 667b)
 
 #define KVM_ARCH_WANT_MMU_NOTIFIER
+#define KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS
 
 int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 3546ada27506..d0f53189c257 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -1298,26 +1298,25 @@ static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
 	return flush;
 }
 
-static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
-			   struct kvm_memory_slot *slot, gfn_t gfn, int level,
-			   unsigned long data)
+static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+			    struct kvm_memory_slot *slot, gfn_t gfn, int level,
+			    pte_t unused)
 {
 	return kvm_zap_rmapp(kvm, rmap_head, slot);
 }
 
-static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
-			     struct kvm_memory_slot *slot, gfn_t gfn, int level,
-			     unsigned long data)
+static bool kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+			      struct kvm_memory_slot *slot, gfn_t gfn, int level,
+			      pte_t pte)
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
 	int need_flush = 0;
 	u64 new_spte;
-	pte_t *ptep = (pte_t *)data;
 	kvm_pfn_t new_pfn;
 
-	WARN_ON(pte_huge(*ptep));
-	new_pfn = pte_pfn(*ptep);
+	WARN_ON(pte_huge(pte));
+	new_pfn = pte_pfn(pte);
 
 restart:
 	for_each_rmap_spte(rmap_head, &iter, sptep) {
@@ -1326,7 +1325,7 @@ restart:
 
 		need_flush = 1;
 
-		if (pte_write(*ptep)) {
+		if (pte_write(pte)) {
 			pte_list_remove(rmap_head, sptep);
 			goto restart;
 		} else {
@@ -1414,86 +1413,52 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
 	     slot_rmap_walk_okay(_iter_);				\
 	     slot_rmap_walk_next(_iter_))
 
-typedef int (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
-			      struct kvm_memory_slot *slot, gfn_t gfn,
-			      int level, unsigned long data);
+typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+			       struct kvm_memory_slot *slot, gfn_t gfn,
+			       int level, pte_t pte);
 
-static __always_inline int kvm_handle_hva_range(struct kvm *kvm,
-						unsigned long start,
-						unsigned long end,
-						unsigned long data,
-						rmap_handler_t handler)
+static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
+						 struct kvm_gfn_range *range,
+						 rmap_handler_t handler)
 {
-	struct kvm_memslots *slots;
-	struct kvm_memory_slot *memslot;
 	struct slot_rmap_walk_iterator iterator;
-	int ret = 0;
-	int i;
-
-	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
-		slots = __kvm_memslots(kvm, i);
-		kvm_for_each_memslot(memslot, slots) {
-			unsigned long hva_start, hva_end;
-			gfn_t gfn_start, gfn_end;
+	bool ret = false;
 
-			hva_start = max(start, memslot->userspace_addr);
-			hva_end = min(end, memslot->userspace_addr +
-				      (memslot->npages << PAGE_SHIFT));
-			if (hva_start >= hva_end)
-				continue;
-			/*
-			 * {gfn(page) | page intersects with [hva_start, hva_end)} =
-			 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
-			 */
-			gfn_start = hva_to_gfn_memslot(hva_start, memslot);
-			gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
-
-			for_each_slot_rmap_range(memslot, PG_LEVEL_4K,
-						 KVM_MAX_HUGEPAGE_LEVEL,
-						 gfn_start, gfn_end - 1,
-						 &iterator)
-				ret |= handler(kvm, iterator.rmap, memslot,
-					       iterator.gfn, iterator.level, data);
-		}
-	}
+	for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
+				 range->start, range->end - 1, &iterator)
+		ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn,
+			       iterator.level, range->pte);
 
 	return ret;
 }
 
-static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
-			  unsigned long data, rmap_handler_t handler)
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-	return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
-}
-
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
-			unsigned flags)
-{
-	int r;
+	bool flush;
 
-	r = kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
+	flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp);
 
 	if (is_tdp_mmu_enabled(kvm))
-		r |= kvm_tdp_mmu_zap_hva_range(kvm, start, end);
+		flush |= kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
 
-	return r;
+	return flush;
 }
 
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-	int r;
+	bool flush;
 
-	r = kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
+	flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmapp);
 
 	if (is_tdp_mmu_enabled(kvm))
-		r |= kvm_tdp_mmu_set_spte_hva(kvm, hva, &pte);
+		flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range);
 
-	return r;
+	return flush;
 }
 
-static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
-			 struct kvm_memory_slot *slot, gfn_t gfn, int level,
-			 unsigned long data)
+static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+			  struct kvm_memory_slot *slot, gfn_t gfn, int level,
+			  pte_t unused)
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
@@ -1505,9 +1470,9 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
 	return young;
 }
 
-static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
-			      struct kvm_memory_slot *slot, gfn_t gfn,
-			      int level, unsigned long data)
+static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+			       struct kvm_memory_slot *slot, gfn_t gfn,
+			       int level, pte_t unused)
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
@@ -1529,29 +1494,31 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 
 	rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
 
-	kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0);
+	kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0));
 	kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
 			KVM_PAGES_PER_HPAGE(sp->role.level));
 }
 
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-	int young = false;
+	bool young;
+
+	young = kvm_handle_gfn_range(kvm, range, kvm_age_rmapp);
 
-	young = kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
 	if (is_tdp_mmu_enabled(kvm))
-		young |= kvm_tdp_mmu_age_hva_range(kvm, start, end);
+		young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
 
 	return young;
 }
 
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-	int young = false;
+	bool young;
+
+	young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmapp);
 
-	young = kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
 	if (is_tdp_mmu_enabled(kvm))
-		young |= kvm_tdp_mmu_test_age_hva(kvm, hva);
+		young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
 
 	return young;
 }
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index ccf0d774a181..ea15f3098eb2 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -873,204 +873,135 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
 	return ret;
 }
 
-typedef int (*tdp_handler_t)(struct kvm *kvm, struct kvm_memory_slot *slot,
-			     struct kvm_mmu_page *root, gfn_t start, gfn_t end,
-			     unsigned long data);
-
-static __always_inline int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm,
-							unsigned long start,
-							unsigned long end,
-							unsigned long data,
-							tdp_handler_t handler)
+bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
+				 bool flush)
 {
-	struct kvm_memslots *slots;
-	struct kvm_memory_slot *memslot;
 	struct kvm_mmu_page *root;
-	int ret = 0;
-	int as_id;
-
-	for (as_id = 0; as_id < KVM_ADDRESS_SPACE_NUM; as_id++) {
-		for_each_tdp_mmu_root_yield_safe(kvm, root, as_id) {
-			slots = __kvm_memslots(kvm, as_id);
-			kvm_for_each_memslot(memslot, slots) {
-				unsigned long hva_start, hva_end;
-				gfn_t gfn_start, gfn_end;
-
-				hva_start = max(start, memslot->userspace_addr);
-				hva_end = min(end, memslot->userspace_addr +
-					(memslot->npages << PAGE_SHIFT));
-				if (hva_start >= hva_end)
-					continue;
-				/*
-				 * {gfn(page) | page intersects with [hva_start, hva_end)} =
-				 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
-				 */
-				gfn_start = hva_to_gfn_memslot(hva_start, memslot);
-				gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
-
-				ret |= handler(kvm, memslot, root, gfn_start,
-					gfn_end, data);
-			}
-		}
-	}
 
-	return ret;
-}
+	for_each_tdp_mmu_root(kvm, root, range->slot->as_id)
+		flush |= zap_gfn_range(kvm, root, range->start, range->end,
+				       false, flush);
 
-static __always_inline int kvm_tdp_mmu_handle_hva(struct kvm *kvm,
-						  unsigned long addr,
-						  unsigned long data,
-						  tdp_handler_t handler)
-{
-	return kvm_tdp_mmu_handle_hva_range(kvm, addr, addr + 1, data, handler);
+	return flush;
 }
 
-static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
-				     struct kvm_memory_slot *slot,
-				     struct kvm_mmu_page *root, gfn_t start,
-				     gfn_t end, unsigned long unused)
-{
-	return zap_gfn_range(kvm, root, start, end, false, false);
-}
+typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
+			      struct kvm_gfn_range *range);
 
-int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
-			      unsigned long end)
+static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
+						   struct kvm_gfn_range *range,
+						   tdp_handler_t handler)
 {
-	return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
-					    zap_gfn_range_hva_wrapper);
+	struct kvm_mmu_page *root;
+	struct tdp_iter iter;
+	bool ret = false;
+
+	rcu_read_lock();
+
+	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
+		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
+			ret |= handler(kvm, &iter, range);
+	}
+
+	rcu_read_unlock();
+
+	return ret;
 }
 
 /*
  * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
  * if any of the GFNs in the range have been accessed.
  */
-static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
-			 struct kvm_mmu_page *root, gfn_t start, gfn_t end,
-			 unsigned long unused)
+static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
+			  struct kvm_gfn_range *range)
 {
-	struct tdp_iter iter;
-	int young = 0;
-	u64 new_spte;
+	u64 new_spte = 0;
 
-	rcu_read_lock();
+	/* If we have a non-accessed entry we don't need to change the pte. */
+	if (!is_accessed_spte(iter->old_spte))
+		return false;
 
-	tdp_root_for_each_leaf_pte(iter, root, start, end) {
+	new_spte = iter->old_spte;
+
+	if (spte_ad_enabled(new_spte)) {
+		new_spte &= ~shadow_accessed_mask;
+	} else {
 		/*
-		 * If we have a non-accessed entry we don't need to change the
-		 * pte.
+		 * Capture the dirty status of the page, so that it doesn't get
+		 * lost when the SPTE is marked for access tracking.
 		 */
-		if (!is_accessed_spte(iter.old_spte))
-			continue;
+		if (is_writable_pte(new_spte))
+			kvm_set_pfn_dirty(spte_to_pfn(new_spte));
 
-		new_spte = iter.old_spte;
-
-		if (spte_ad_enabled(new_spte)) {
-			new_spte &= ~shadow_accessed_mask;
-		} else {
-			/*
-			 * Capture the dirty status of the page, so that it doesn't get
-			 * lost when the SPTE is marked for access tracking.
-			 */
-			if (is_writable_pte(new_spte))
-				kvm_set_pfn_dirty(spte_to_pfn(new_spte));
-
-			new_spte = mark_spte_for_access_track(new_spte);
-		}
-
-		tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte);
-		young = 1;
+		new_spte = mark_spte_for_access_track(new_spte);
 	}
 
-	rcu_read_unlock();
+	tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
 
-	return young;
+	return true;
 }
 
-int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
-			      unsigned long end)
+bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-	return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
-					    age_gfn_range);
+	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
 }
 
-static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
-			struct kvm_mmu_page *root, gfn_t gfn, gfn_t end,
-			unsigned long unused)
+static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
+			 struct kvm_gfn_range *range)
 {
-	struct tdp_iter iter;
-
-	tdp_root_for_each_leaf_pte(iter, root, gfn, end)
-		if (is_accessed_spte(iter.old_spte))
-			return 1;
-
-	return 0;
+	return is_accessed_spte(iter->old_spte);
 }
 
-int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva)
+bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-	return kvm_tdp_mmu_handle_hva(kvm, hva, 0, test_age_gfn);
+	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
 }
 
-/*
- * Handle the changed_pte MMU notifier for the TDP MMU.
- * data is a pointer to the new pte_t mapping the HVA specified by the MMU
- * notifier.
- * Returns non-zero if a flush is needed before releasing the MMU lock.
- */
-static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
-			struct kvm_mmu_page *root, gfn_t gfn, gfn_t end,
-			unsigned long data)
+static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
+			 struct kvm_gfn_range *range)
 {
-	struct tdp_iter iter;
-	pte_t *ptep = (pte_t *)data;
-	kvm_pfn_t new_pfn;
 	u64 new_spte;
-	int need_flush = 0;
-
-	rcu_read_lock();
 
-	WARN_ON(pte_huge(*ptep) || (gfn + 1) != end);
+	/* Huge pages aren't expected to be modified without first being zapped. */
+	WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
 
-	new_pfn = pte_pfn(*ptep);
-
-	tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
-		if (iter.level != PG_LEVEL_4K)
-			continue;
-
-		if (!is_shadow_present_pte(iter.old_spte))
-			break;
-
-		/*
-		 * Note, when changing a read-only SPTE, it's not strictly
-		 * necessary to zero the SPTE before setting the new PFN, but
-		 * doing so preserves the invariant that the PFN of a present
-		 * leaf SPTE can never change.  See __handle_changed_spte().
-		 */
-		tdp_mmu_set_spte(kvm, &iter, 0);
+	if (iter->level != PG_LEVEL_4K ||
+	    !is_shadow_present_pte(iter->old_spte))
+		return false;
 
-		if (!pte_write(*ptep)) {
-			new_spte = kvm_mmu_changed_pte_notifier_make_spte(
-					iter.old_spte, new_pfn);
+	/*
+	 * Note, when changing a read-only SPTE, it's not strictly necessary to
+	 * zero the SPTE before setting the new PFN, but doing so preserves the
+	 * invariant that the PFN of a present leaf SPTE can never change.
+	 * See __handle_changed_spte().
+	 */
+	tdp_mmu_set_spte(kvm, iter, 0);
 
-			tdp_mmu_set_spte(kvm, &iter, new_spte);
-		}
+	if (!pte_write(range->pte)) {
+		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
+								  pte_pfn(range->pte));
 
-		need_flush = 1;
+		tdp_mmu_set_spte(kvm, iter, new_spte);
 	}
 
-	if (need_flush)
-		kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
-
-	rcu_read_unlock();
-
-	return 0;
+	return true;
 }
 
-int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
-			     pte_t *host_ptep)
+/*
+ * Handle the changed_pte MMU notifier for the TDP MMU.
+ * range->pte is the new pte_t for the page being changed by the MMU
+ * notifier.
+ * Flushes remote TLBs itself when needed, and so always returns false.
+ */
+bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
-	return kvm_tdp_mmu_handle_hva(kvm, address, (unsigned long)host_ptep,
-				      set_tdp_spte);
+	bool flush = kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
+
+	/* FIXME: return 'flush' instead of flushing here. */
+	if (flush)
+		kvm_flush_remote_tlbs_with_address(kvm, range->start, 1);
+
+	return false;
 }
 
 /*
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index bf3ce169122e..ee8efa58902f 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -38,15 +38,11 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
 		    int map_writable, int max_level, kvm_pfn_t pfn,
 		    bool prefault);
 
-int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
-			      unsigned long end);
-
-int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
-			      unsigned long end);
-int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva);
-
-int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
-			     pte_t *host_ptep);
+bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
+				 bool flush);
+bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
+bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
+bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
 
 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
 			     int min_level);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 34a974ffc882..0228436bc446 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -219,11 +219,25 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 #endif
 
 #ifdef KVM_ARCH_WANT_MMU_NOTIFIER
+#ifdef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS
+struct kvm_gfn_range {
+	struct kvm_memory_slot *slot;
+	gfn_t start;
+	gfn_t end;
+	pte_t pte;
+	bool may_block;
+};
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
+bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
+#else
 int kvm_unmap_hva_range(struct kvm *kvm,
 			unsigned long start, unsigned long end, unsigned flags);
 int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
 int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
+#endif /* KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS */
 #endif
 
 enum {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5ce91254a793..20836cf342ba 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -451,14 +451,131 @@ static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
 	srcu_read_unlock(&kvm->srcu, idx);
 }
 
+#ifdef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS
+
+typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);
+
+struct kvm_hva_range {
+	unsigned long start;
+	unsigned long end;
+	pte_t pte;
+	hva_handler_t handler;
+	bool flush_on_ret;
+	bool may_block;
+};
+
+static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
+						  const struct kvm_hva_range *range)
+{
+	struct kvm_memory_slot *slot;
+	struct kvm_memslots *slots;
+	struct kvm_gfn_range gfn_range;
+	bool ret = false;
+	int i, idx;
+
+	lockdep_assert_held_write(&kvm->mmu_lock);
+
+	idx = srcu_read_lock(&kvm->srcu);
+
+	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+		slots = __kvm_memslots(kvm, i);
+		kvm_for_each_memslot(slot, slots) {
+			unsigned long hva_start, hva_end;
+
+			hva_start = max(range->start, slot->userspace_addr);
+			hva_end = min(range->end, slot->userspace_addr +
+						  (slot->npages << PAGE_SHIFT));
+			if (hva_start >= hva_end)
+				continue;
+
+			/*
+			 * To optimize for the likely case where the address
+			 * range is covered by zero or one memslots, don't
+			 * bother making these conditional (to avoid writes on
+			 * the second or later invocation of the handler).
+			 */
+			gfn_range.pte = range->pte;
+			gfn_range.may_block = range->may_block;
+
+			/*
+			 * {gfn(page) | page intersects with [hva_start, hva_end)} =
+			 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
+			 */
+			gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
+			gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
+			gfn_range.slot = slot;
+
+			ret |= range->handler(kvm, &gfn_range);
+		}
+	}
+
+	if (range->flush_on_ret && (ret || kvm->tlbs_dirty))
+		kvm_flush_remote_tlbs(kvm);
+
+	srcu_read_unlock(&kvm->srcu, idx);
+
+	/* The notifiers are averse to booleans. :-( */
+	return (int)ret;
+}
+
+static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
+						unsigned long start,
+						unsigned long end,
+						pte_t pte,
+						hva_handler_t handler)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	const struct kvm_hva_range range = {
+		.start		= start,
+		.end		= end,
+		.pte		= pte,
+		.handler	= handler,
+		.flush_on_ret	= true,
+		.may_block	= false,
+	};
+	int ret;
+
+	KVM_MMU_LOCK(kvm);
+	ret = __kvm_handle_hva_range(kvm, &range);
+	KVM_MMU_UNLOCK(kvm);
+
+	return ret;
+}
+
+static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn,
+							 unsigned long start,
+							 unsigned long end,
+							 hva_handler_t handler)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	const struct kvm_hva_range range = {
+		.start		= start,
+		.end		= end,
+		.pte		= __pte(0),
+		.handler	= handler,
+		.flush_on_ret	= false,
+		.may_block	= false,
+	};
+	int ret;
+
+	KVM_MMU_LOCK(kvm);
+	ret = __kvm_handle_hva_range(kvm, &range);
+	KVM_MMU_UNLOCK(kvm);
+
+	return ret;
+}
+#endif /* KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS */
+
 static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
 					struct mm_struct *mm,
 					unsigned long address,
 					pte_t pte)
 {
 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
-	int idx;
 
+#ifndef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS
+	int idx;
+#endif
 	trace_kvm_set_spte_hva(address);
 
 	/*
@@ -468,6 +585,9 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
 	 */
 	WARN_ON_ONCE(!kvm->mmu_notifier_count);
 
+#ifdef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS
+	kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
+#else
 	idx = srcu_read_lock(&kvm->srcu);
 
 	KVM_MMU_LOCK(kvm);
@@ -477,17 +597,32 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
 
 	KVM_MMU_UNLOCK(kvm);
 	srcu_read_unlock(&kvm->srcu, idx);
+#endif
 }
 
 static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 					const struct mmu_notifier_range *range)
 {
 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+#ifdef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS
+	const struct kvm_hva_range hva_range = {
+		.start		= range->start,
+		.end		= range->end,
+		.pte		= __pte(0),
+		.handler	= kvm_unmap_gfn_range,
+		.flush_on_ret	= true,
+		.may_block	= mmu_notifier_range_blockable(range),
+	};
+#else
 	int need_tlb_flush = 0, idx;
+#endif
 
 	trace_kvm_unmap_hva_range(range->start, range->end);
 
+#ifndef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS
 	idx = srcu_read_lock(&kvm->srcu);
+#endif
+
 	KVM_MMU_LOCK(kvm);
 	/*
 	 * The count increase must become visible at unlock time as no
@@ -513,14 +648,21 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 		kvm->mmu_notifier_range_end =
 			max(kvm->mmu_notifier_range_end, range->end);
 	}
+
+#ifdef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS
+	__kvm_handle_hva_range(kvm, &hva_range);
+#else
 	need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end,
 					     range->flags);
 	/* we've to flush the tlb before the pages can be freed */
 	if (need_tlb_flush || kvm->tlbs_dirty)
 		kvm_flush_remote_tlbs(kvm);
+#endif
 
 	KVM_MMU_UNLOCK(kvm);
+#ifndef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS
 	srcu_read_unlock(&kvm->srcu, idx);
+#endif
 
 	return 0;
 }
@@ -554,11 +696,15 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
 					      unsigned long start,
 					      unsigned long end)
 {
+#ifndef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS
 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
 	int young, idx;
-
+#endif
 	trace_kvm_age_hva(start, end);
 
+#ifdef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS
+	return kvm_handle_hva_range(mn, start, end, __pte(0), kvm_age_gfn);
+#else
 	idx = srcu_read_lock(&kvm->srcu);
 	KVM_MMU_LOCK(kvm);
 
@@ -570,6 +716,7 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
 	srcu_read_unlock(&kvm->srcu, idx);
 
 	return young;
+#endif
 }
 
 static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
@@ -577,13 +724,13 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
 					unsigned long start,
 					unsigned long end)
 {
+#ifndef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS
 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
 	int young, idx;
+#endif
 
 	trace_kvm_age_hva(start, end);
 
-	idx = srcu_read_lock(&kvm->srcu);
-	KVM_MMU_LOCK(kvm);
 	/*
 	 * Even though we do not flush TLB, this will still adversely
 	 * affect performance on pre-Haswell Intel EPT, where there is
@@ -597,22 +744,33 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
 	 * cadence. If we find this inaccurate, we might come up with a
 	 * more sophisticated heuristic later.
 	 */
+#ifdef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS
+	return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
+#else
+	idx = srcu_read_lock(&kvm->srcu);
+	KVM_MMU_LOCK(kvm);
 	young = kvm_age_hva(kvm, start, end);
 	KVM_MMU_UNLOCK(kvm);
 	srcu_read_unlock(&kvm->srcu, idx);
 
 	return young;
+#endif
 }
 
 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
 				       struct mm_struct *mm,
 				       unsigned long address)
 {
+#ifndef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS
 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
 	int young, idx;
-
+#endif
 	trace_kvm_test_age_hva(address);
 
+#ifdef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS
+	return kvm_handle_hva_range_no_flush(mn, address, address + 1,
+					     kvm_test_age_gfn);
+#else
 	idx = srcu_read_lock(&kvm->srcu);
 	KVM_MMU_LOCK(kvm);
 	young = kvm_test_age_hva(kvm, address);
@@ -620,6 +778,7 @@ static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
 	srcu_read_unlock(&kvm->srcu, idx);
 
 	return young;
+#endif
 }
 
 static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
-- 
cgit v1.2.3


From b4c5936c47f86295cc76672e8dbeeca8b2379ba6 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 1 Apr 2021 17:56:54 -0700
Subject: KVM: Kill off the old hva-based MMU notifier callbacks

Yank out the hva-based MMU notifier APIs now that all architectures that
use the notifiers have moved to the gfn-based APIs.

No functional change intended.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20210402005658.3024832-7-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/arm64/include/asm/kvm_host.h   |  1 -
 arch/mips/include/asm/kvm_host.h    |  1 -
 arch/powerpc/include/asm/kvm_host.h |  1 -
 arch/x86/include/asm/kvm_host.h     |  1 -
 include/linux/kvm_host.h            |  8 ----
 virt/kvm/kvm_main.c                 | 85 -------------------------------------
 6 files changed, 97 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index eba448bd4c7f..3a708be3ed71 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -586,7 +586,6 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
 			      struct kvm_vcpu_events *events);
 
 #define KVM_ARCH_WANT_MMU_NOTIFIER
-#define KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS
 
 void kvm_arm_halt_guest(struct kvm *kvm);
 void kvm_arm_resume_guest(struct kvm *kvm);
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index b70645670548..d0944a75fc8d 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -960,7 +960,6 @@ enum kvm_mips_fault_result kvm_trap_emul_gva_fault(struct kvm_vcpu *vcpu,
 						   bool write);
 
 #define KVM_ARCH_WANT_MMU_NOTIFIER
-#define KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS
 
 /* Emulation */
 int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out);
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 1335f0001bdd..1e83359f286b 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -55,7 +55,6 @@
 #include <linux/mmu_notifier.h>
 
 #define KVM_ARCH_WANT_MMU_NOTIFIER
-#define KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS
 
 #define HPTEG_CACHE_NUM			(1 << 15)
 #define HPTEG_HASH_BITS_PTE		13
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index fc437a36204d..0d217c3db3f8 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1727,7 +1727,6 @@ asmlinkage void kvm_spurious_fault(void);
 	_ASM_EXTABLE(666b, 667b)
 
 #define KVM_ARCH_WANT_MMU_NOTIFIER
-#define KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS
 
 int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 0228436bc446..6b4dd9500d70 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -219,7 +219,6 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 #endif
 
 #ifdef KVM_ARCH_WANT_MMU_NOTIFIER
-#ifdef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS
 struct kvm_gfn_range {
 	struct kvm_memory_slot *slot;
 	gfn_t start;
@@ -231,13 +230,6 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
 bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
 bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
 bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
-#else
-int kvm_unmap_hva_range(struct kvm *kvm,
-			unsigned long start, unsigned long end, unsigned flags);
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
-#endif /* KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS */
 #endif
 
 enum {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 20836cf342ba..d4c249719a56 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -451,8 +451,6 @@ static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
 	srcu_read_unlock(&kvm->srcu, idx);
 }
 
-#ifdef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS
-
 typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);
 
 struct kvm_hva_range {
@@ -564,8 +562,6 @@ static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn
 
 	return ret;
 }
-#endif /* KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS */
-
 static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
 					struct mm_struct *mm,
 					unsigned long address,
@@ -573,9 +569,6 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
 {
 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
 
-#ifndef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS
-	int idx;
-#endif
 	trace_kvm_set_spte_hva(address);
 
 	/*
@@ -585,26 +578,13 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
 	 */
 	WARN_ON_ONCE(!kvm->mmu_notifier_count);
 
-#ifdef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS
 	kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
-#else
-	idx = srcu_read_lock(&kvm->srcu);
-
-	KVM_MMU_LOCK(kvm);
-
-	if (kvm_set_spte_hva(kvm, address, pte))
-		kvm_flush_remote_tlbs(kvm);
-
-	KVM_MMU_UNLOCK(kvm);
-	srcu_read_unlock(&kvm->srcu, idx);
-#endif
 }
 
 static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 					const struct mmu_notifier_range *range)
 {
 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
-#ifdef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS
 	const struct kvm_hva_range hva_range = {
 		.start		= range->start,
 		.end		= range->end,
@@ -613,16 +593,9 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 		.flush_on_ret	= true,
 		.may_block	= mmu_notifier_range_blockable(range),
 	};
-#else
-	int need_tlb_flush = 0, idx;
-#endif
 
 	trace_kvm_unmap_hva_range(range->start, range->end);
 
-#ifndef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS
-	idx = srcu_read_lock(&kvm->srcu);
-#endif
-
 	KVM_MMU_LOCK(kvm);
 	/*
 	 * The count increase must become visible at unlock time as no
@@ -649,20 +622,9 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 			max(kvm->mmu_notifier_range_end, range->end);
 	}
 
-#ifdef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS
 	__kvm_handle_hva_range(kvm, &hva_range);
-#else
-	need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end,
-					     range->flags);
-	/* we've to flush the tlb before the pages can be freed */
-	if (need_tlb_flush || kvm->tlbs_dirty)
-		kvm_flush_remote_tlbs(kvm);
-#endif
 
 	KVM_MMU_UNLOCK(kvm);
-#ifndef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS
-	srcu_read_unlock(&kvm->srcu, idx);
-#endif
 
 	return 0;
 }
@@ -696,27 +658,9 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
 					      unsigned long start,
 					      unsigned long end)
 {
-#ifndef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS
-	struct kvm *kvm = mmu_notifier_to_kvm(mn);
-	int young, idx;
-#endif
 	trace_kvm_age_hva(start, end);
 
-#ifdef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS
 	return kvm_handle_hva_range(mn, start, end, __pte(0), kvm_age_gfn);
-#else
-	idx = srcu_read_lock(&kvm->srcu);
-	KVM_MMU_LOCK(kvm);
-
-	young = kvm_age_hva(kvm, start, end);
-	if (young)
-		kvm_flush_remote_tlbs(kvm);
-
-	KVM_MMU_UNLOCK(kvm);
-	srcu_read_unlock(&kvm->srcu, idx);
-
-	return young;
-#endif
 }
 
 static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
@@ -724,11 +668,6 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
 					unsigned long start,
 					unsigned long end)
 {
-#ifndef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS
-	struct kvm *kvm = mmu_notifier_to_kvm(mn);
-	int young, idx;
-#endif
-
 	trace_kvm_age_hva(start, end);
 
 	/*
@@ -744,41 +683,17 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
 	 * cadence. If we find this inaccurate, we might come up with a
 	 * more sophisticated heuristic later.
 	 */
-#ifdef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS
 	return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
-#else
-	idx = srcu_read_lock(&kvm->srcu);
-	KVM_MMU_LOCK(kvm);
-	young = kvm_age_hva(kvm, start, end);
-	KVM_MMU_UNLOCK(kvm);
-	srcu_read_unlock(&kvm->srcu, idx);
-
-	return young;
-#endif
 }
 
 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
 				       struct mm_struct *mm,
 				       unsigned long address)
 {
-#ifndef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS
-	struct kvm *kvm = mmu_notifier_to_kvm(mn);
-	int young, idx;
-#endif
 	trace_kvm_test_age_hva(address);
 
-#ifdef KVM_ARCH_WANT_NEW_MMU_NOTIFIER_APIS
 	return kvm_handle_hva_range_no_flush(mn, address, address + 1,
 					     kvm_test_age_gfn);
-#else
-	idx = srcu_read_lock(&kvm->srcu);
-	KVM_MMU_LOCK(kvm);
-	young = kvm_test_age_hva(kvm, address);
-	KVM_MMU_UNLOCK(kvm);
-	srcu_read_unlock(&kvm->srcu, idx);
-
-	return young;
-#endif
 }
 
 static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
-- 
cgit v1.2.3


From 1df53d212c803c0e11a2b10d47ec830d3576b972 Mon Sep 17 00:00:00 2001
From: "Andrea Parri (Microsoft)" <parri.andrea@gmail.com>
Date: Fri, 16 Apr 2021 16:34:47 +0200
Subject: Drivers: hv: vmbus: Introduce and negotiate VMBus protocol version
 5.3

Hyper-V has added VMBus protocol version 5.3.  Allow Linux guests to
negotiate the new version on versions of Hyper-V that support it.

Signed-off-by: Andrea Parri (Microsoft) <parri.andrea@gmail.com>
Reviewed-by: Michael Kelley <mikelley@microsoft.com>
Link: https://lore.kernel.org/r/20210416143449.16185-2-parri.andrea@gmail.com
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 drivers/hv/connection.c | 3 ++-
 include/linux/hyperv.h  | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
index 350e8c5cafa8..dc19d5ae4373 100644
--- a/drivers/hv/connection.c
+++ b/drivers/hv/connection.c
@@ -45,6 +45,7 @@ EXPORT_SYMBOL_GPL(vmbus_proto_version);
  * Table of VMBus versions listed from newest to oldest.
  */
 static __u32 vmbus_versions[] = {
+	VERSION_WIN10_V5_3,
 	VERSION_WIN10_V5_2,
 	VERSION_WIN10_V5_1,
 	VERSION_WIN10_V5,
@@ -60,7 +61,7 @@ static __u32 vmbus_versions[] = {
  * Maximal VMBus protocol version guests can negotiate.  Useful to cap the
  * VMBus version for testing and debugging purpose.
  */
-static uint max_version = VERSION_WIN10_V5_2;
+static uint max_version = VERSION_WIN10_V5_3;
 
 module_param(max_version, uint, S_IRUGO);
 MODULE_PARM_DESC(max_version,
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 2c18c8e768ef..3ce36bbb398e 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -234,6 +234,7 @@ static inline u32 hv_get_avail_to_write_percent(
  * 5 . 0  (Newer Windows 10)
  * 5 . 1  (Windows 10 RS4)
  * 5 . 2  (Windows Server 2019, RS5)
+ * 5 . 3  (Windows Server 2022)
  */
 
 #define VERSION_WS2008  ((0 << 16) | (13))
@@ -245,6 +246,7 @@ static inline u32 hv_get_avail_to_write_percent(
 #define VERSION_WIN10_V5 ((5 << 16) | (0))
 #define VERSION_WIN10_V5_1 ((5 << 16) | (1))
 #define VERSION_WIN10_V5_2 ((5 << 16) | (2))
+#define VERSION_WIN10_V5_3 ((5 << 16) | (3))
 
 /* Make maximum size of pipe payload of 16K */
 #define MAX_PIPE_DATA_PAYLOAD		(sizeof(u8) * 16384)
-- 
cgit v1.2.3


From 870ced0548c895eb0bf8d982400344e5a9b2df00 Mon Sep 17 00:00:00 2001
From: "Andrea Parri (Microsoft)" <parri.andrea@gmail.com>
Date: Fri, 16 Apr 2021 16:34:48 +0200
Subject: Drivers: hv: vmbus: Introduce CHANNELMSG_MODIFYCHANNEL_RESPONSE

Introduce the CHANNELMSG_MODIFYCHANNEL_RESPONSE message type, and code
to receive and process such a message.

Signed-off-by: Andrea Parri (Microsoft) <parri.andrea@gmail.com>
Reviewed-by: Michael Kelley <mikelley@microsoft.com>
Link: https://lore.kernel.org/r/20210416143449.16185-3-parri.andrea@gmail.com
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 drivers/hv/channel.c      | 99 +++++++++++++++++++++++++++++++++++++++--------
 drivers/hv/channel_mgmt.c | 42 ++++++++++++++++++++
 drivers/hv/hv_trace.h     | 15 +++++++
 drivers/hv/vmbus_drv.c    |  4 +-
 include/linux/hyperv.h    | 11 +++++-
 5 files changed, 152 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
index 1c5a418c1962..c2635e913a92 100644
--- a/drivers/hv/channel.c
+++ b/drivers/hv/channel.c
@@ -209,31 +209,96 @@ int vmbus_send_tl_connect_request(const guid_t *shv_guest_servie_id,
 }
 EXPORT_SYMBOL_GPL(vmbus_send_tl_connect_request);
 
+static int send_modifychannel_without_ack(struct vmbus_channel *channel, u32 target_vp)
+{
+	struct vmbus_channel_modifychannel msg;
+	int ret;
+
+	memset(&msg, 0, sizeof(msg));
+	msg.header.msgtype = CHANNELMSG_MODIFYCHANNEL;
+	msg.child_relid = channel->offermsg.child_relid;
+	msg.target_vp = target_vp;
+
+	ret = vmbus_post_msg(&msg, sizeof(msg), true);
+	trace_vmbus_send_modifychannel(&msg, ret);
+
+	return ret;
+}
+
+static int send_modifychannel_with_ack(struct vmbus_channel *channel, u32 target_vp)
+{
+	struct vmbus_channel_modifychannel *msg;
+	struct vmbus_channel_msginfo *info;
+	unsigned long flags;
+	int ret;
+
+	info = kzalloc(sizeof(struct vmbus_channel_msginfo) +
+				sizeof(struct vmbus_channel_modifychannel),
+		       GFP_KERNEL);
+	if (!info)
+		return -ENOMEM;
+
+	init_completion(&info->waitevent);
+	info->waiting_channel = channel;
+
+	msg = (struct vmbus_channel_modifychannel *)info->msg;
+	msg->header.msgtype = CHANNELMSG_MODIFYCHANNEL;
+	msg->child_relid = channel->offermsg.child_relid;
+	msg->target_vp = target_vp;
+
+	spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
+	list_add_tail(&info->msglistentry, &vmbus_connection.chn_msg_list);
+	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
+
+	ret = vmbus_post_msg(msg, sizeof(*msg), true);
+	trace_vmbus_send_modifychannel(msg, ret);
+	if (ret != 0) {
+		spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
+		list_del(&info->msglistentry);
+		spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
+		goto free_info;
+	}
+
+	/*
+	 * Release channel_mutex; otherwise, vmbus_onoffer_rescind() could block on
+	 * the mutex and be unable to signal the completion.
+	 *
+	 * See the caller target_cpu_store() for information about the usage of the
+	 * mutex.
+	 */
+	mutex_unlock(&vmbus_connection.channel_mutex);
+	wait_for_completion(&info->waitevent);
+	mutex_lock(&vmbus_connection.channel_mutex);
+
+	spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
+	list_del(&info->msglistentry);
+	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
+
+	if (info->response.modify_response.status)
+		ret = -EAGAIN;
+
+free_info:
+	kfree(info);
+	return ret;
+}
+
 /*
  * Set/change the vCPU (@target_vp) the channel (@child_relid) will interrupt.
  *
- * CHANNELMSG_MODIFYCHANNEL messages are aynchronous.  Also, Hyper-V does not
- * ACK such messages.  IOW we can't know when the host will stop interrupting
- * the "old" vCPU and start interrupting the "new" vCPU for the given channel.
+ * CHANNELMSG_MODIFYCHANNEL messages are aynchronous.  When VMbus version 5.3
+ * or later is negotiated, Hyper-V always sends an ACK in response to such a
+ * message.  For VMbus version 5.2 and earlier, it never sends an ACK.  With-
+ * out an ACK, we can not know when the host will stop interrupting the "old"
+ * vCPU and start interrupting the "new" vCPU for the given channel.
  *
  * The CHANNELMSG_MODIFYCHANNEL message type is supported since VMBus version
  * VERSION_WIN10_V4_1.
  */
-int vmbus_send_modifychannel(u32 child_relid, u32 target_vp)
+int vmbus_send_modifychannel(struct vmbus_channel *channel, u32 target_vp)
 {
-	struct vmbus_channel_modifychannel conn_msg;
-	int ret;
-
-	memset(&conn_msg, 0, sizeof(conn_msg));
-	conn_msg.header.msgtype = CHANNELMSG_MODIFYCHANNEL;
-	conn_msg.child_relid = child_relid;
-	conn_msg.target_vp = target_vp;
-
-	ret = vmbus_post_msg(&conn_msg, sizeof(conn_msg), true);
-
-	trace_vmbus_send_modifychannel(&conn_msg, ret);
-
-	return ret;
+	if (vmbus_proto_version >= VERSION_WIN10_V5_3)
+		return send_modifychannel_with_ack(channel, target_vp);
+	return send_modifychannel_without_ack(channel, target_vp);
 }
 EXPORT_SYMBOL_GPL(vmbus_send_modifychannel);
 
diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index f3cf4af01e10..4c9e45d1f462 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -1311,6 +1311,46 @@ static void vmbus_ongpadl_created(struct vmbus_channel_message_header *hdr)
 	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
 }
 
+/*
+ * vmbus_onmodifychannel_response - Modify Channel response handler.
+ *
+ * This is invoked when we received a response to our channel modify request.
+ * Find the matching request, copy the response and signal the requesting thread.
+ */
+static void vmbus_onmodifychannel_response(struct vmbus_channel_message_header *hdr)
+{
+	struct vmbus_channel_modifychannel_response *response;
+	struct vmbus_channel_msginfo *msginfo;
+	unsigned long flags;
+
+	response = (struct vmbus_channel_modifychannel_response *)hdr;
+
+	trace_vmbus_onmodifychannel_response(response);
+
+	/*
+	 * Find the modify msg, copy the response and signal/unblock the wait event.
+	 */
+	spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
+
+	list_for_each_entry(msginfo, &vmbus_connection.chn_msg_list, msglistentry) {
+		struct vmbus_channel_message_header *responseheader =
+				(struct vmbus_channel_message_header *)msginfo->msg;
+
+		if (responseheader->msgtype == CHANNELMSG_MODIFYCHANNEL) {
+			struct vmbus_channel_modifychannel *modifymsg;
+
+			modifymsg = (struct vmbus_channel_modifychannel *)msginfo->msg;
+			if (modifymsg->child_relid == response->child_relid) {
+				memcpy(&msginfo->response.modify_response, response,
+				       sizeof(*response));
+				complete(&msginfo->waitevent);
+				break;
+			}
+		}
+	}
+	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
+}
+
 /*
  * vmbus_ongpadl_torndown - GPADL torndown handler.
  *
@@ -1428,6 +1468,8 @@ channel_message_table[CHANNELMSG_COUNT] = {
 	{ CHANNELMSG_TL_CONNECT_REQUEST,	0, NULL, 0},
 	{ CHANNELMSG_MODIFYCHANNEL,		0, NULL, 0},
 	{ CHANNELMSG_TL_CONNECT_RESULT,		0, NULL, 0},
+	{ CHANNELMSG_MODIFYCHANNEL_RESPONSE,	1, vmbus_onmodifychannel_response,
+		sizeof(struct vmbus_channel_modifychannel_response)},
 };
 
 /*
diff --git a/drivers/hv/hv_trace.h b/drivers/hv/hv_trace.h
index 6063bb21bb13..c02a1719e92f 100644
--- a/drivers/hv/hv_trace.h
+++ b/drivers/hv/hv_trace.h
@@ -103,6 +103,21 @@ TRACE_EVENT(vmbus_ongpadl_created,
 		    )
 	);
 
+TRACE_EVENT(vmbus_onmodifychannel_response,
+	    TP_PROTO(const struct vmbus_channel_modifychannel_response *response),
+	    TP_ARGS(response),
+	    TP_STRUCT__entry(
+		    __field(u32, child_relid)
+		    __field(u32, status)
+		    ),
+	    TP_fast_assign(__entry->child_relid = response->child_relid;
+			   __entry->status = response->status;
+		    ),
+	    TP_printk("child_relid 0x%x, status %d",
+		      __entry->child_relid,  __entry->status
+		    )
+	);
+
 TRACE_EVENT(vmbus_ongpadl_torndown,
 	    TP_PROTO(const struct vmbus_channel_gpadl_torndown *gpadltorndown),
 	    TP_ARGS(gpadltorndown),
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 51c40d5e3c8a..b12d6827b222 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -1848,13 +1848,15 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel,
 	if (target_cpu == origin_cpu)
 		goto cpu_store_unlock;
 
-	if (vmbus_send_modifychannel(channel->offermsg.child_relid,
+	if (vmbus_send_modifychannel(channel,
 				     hv_cpu_number_to_vp_number(target_cpu))) {
 		ret = -EIO;
 		goto cpu_store_unlock;
 	}
 
 	/*
+	 * For version before VERSION_WIN10_V5_3, the following warning holds:
+	 *
 	 * Warning.  At this point, there is *no* guarantee that the host will
 	 * have successfully processed the vmbus_send_modifychannel() request.
 	 * See the header comment of vmbus_send_modifychannel() for more info.
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 3ce36bbb398e..9c2373a1cb2d 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -477,6 +477,7 @@ enum vmbus_channel_message_type {
 	CHANNELMSG_TL_CONNECT_REQUEST		= 21,
 	CHANNELMSG_MODIFYCHANNEL		= 22,
 	CHANNELMSG_TL_CONNECT_RESULT		= 23,
+	CHANNELMSG_MODIFYCHANNEL_RESPONSE	= 24,
 	CHANNELMSG_COUNT
 };
 
@@ -590,6 +591,13 @@ struct vmbus_channel_open_result {
 	u32 status;
 } __packed;
 
+/* Modify Channel Result parameters */
+struct vmbus_channel_modifychannel_response {
+	struct vmbus_channel_message_header header;
+	u32 child_relid;
+	u32 status;
+} __packed;
+
 /* Close channel parameters; */
 struct vmbus_channel_close_channel {
 	struct vmbus_channel_message_header header;
@@ -722,6 +730,7 @@ struct vmbus_channel_msginfo {
 		struct vmbus_channel_gpadl_torndown gpadl_torndown;
 		struct vmbus_channel_gpadl_created gpadl_created;
 		struct vmbus_channel_version_response version_response;
+		struct vmbus_channel_modifychannel_response modify_response;
 	} response;
 
 	u32 msgsize;
@@ -1596,7 +1605,7 @@ extern __u32 vmbus_proto_version;
 
 int vmbus_send_tl_connect_request(const guid_t *shv_guest_servie_id,
 				  const guid_t *shv_host_servie_id);
-int vmbus_send_modifychannel(u32 child_relid, u32 target_vp);
+int vmbus_send_modifychannel(struct vmbus_channel *channel, u32 target_vp);
 void vmbus_set_event(struct vmbus_channel *channel);
 
 /* Get the start of the ring buffer. */
-- 
cgit v1.2.3


From efce5b50bad8b63d07719318c34a664ccdb56b70 Mon Sep 17 00:00:00 2001
From: Avraham Stern <avraham.stern@intel.com>
Date: Fri, 9 Apr 2021 12:40:26 +0300
Subject: ieee80211: add the values of ranging parameters max LTF total field

Add an enum with the values of the ranging parameters max LTF total
field, as defined in IEEE802.11az_D2.6, Table 9-322h23fc.

Signed-off-by: Avraham Stern <avraham.stern@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Link: https://lore.kernel.org/r/iwlwifi.20210409123755.d2588ebb1974.I9424c8ade13c4c938cb9999d8ce99d0d4c1cc198@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 72ff75fb1971..25fc7bee868a 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -3861,4 +3861,11 @@ struct ieee80211_neighbor_ap_info {
        u8 channel;
 } __packed;
 
+enum ieee80211_range_params_max_total_ltf {
+	IEEE80211_RANGE_PARAMS_MAX_TOTAL_LTF_4 = 0,
+	IEEE80211_RANGE_PARAMS_MAX_TOTAL_LTF_8,
+	IEEE80211_RANGE_PARAMS_MAX_TOTAL_LTF_16,
+	IEEE80211_RANGE_PARAMS_MAX_TOTAL_LTF_UNSPECIFIED,
+};
+
 #endif /* LINUX_IEEE80211_H */
-- 
cgit v1.2.3


From 76cf42213307f0908e010ac4c2bdcb77113202dd Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 9 Apr 2021 12:40:17 +0300
Subject: wireless: align some HE capabilities with the spec

Some names were changed, align that with the spec as of
802.11ax-D6.1.

Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Link: https://lore.kernel.org/r/iwlwifi.20210409123755.b1e5fbab0d8c.I3eb6076cb0714ec6aec6b8f9dee613ce4a05d825@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/ath11k/mac.c              | 10 +++++-----
 drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c |  6 +++---
 drivers/net/wireless/mediatek/mt76/mt7915/init.c   | 10 +++++-----
 drivers/net/wireless/mediatek/mt76/mt7915/mcu.c    |  4 ++--
 drivers/net/wireless/mediatek/mt76/mt7921/main.c   |  2 +-
 include/linux/ieee80211.h                          | 14 +++++++-------
 net/mac80211/debugfs_sta.c                         | 19 ++++++++++---------
 7 files changed, 33 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c
index faa2e678e63e..343768afedc4 100644
--- a/drivers/net/wireless/ath/ath11k/mac.c
+++ b/drivers/net/wireless/ath/ath11k/mac.c
@@ -3807,7 +3807,7 @@ ath11k_mac_filter_he_cap_mesh(struct ieee80211_he_cap_elem *he_cap_elem)
 	    IEEE80211_HE_MAC_CAP4_BQR;
 	he_cap_elem->mac_cap_info[4] &= ~m;
 
-	m = IEEE80211_HE_MAC_CAP5_SUBCHAN_SELECVITE_TRANSMISSION |
+	m = IEEE80211_HE_MAC_CAP5_SUBCHAN_SELECTIVE_TRANSMISSION |
 	    IEEE80211_HE_MAC_CAP5_UL_2x996_TONE_RU |
 	    IEEE80211_HE_MAC_CAP5_PUNCTURED_SOUNDING |
 	    IEEE80211_HE_MAC_CAP5_HT_VHT_TRIG_FRAME_RX;
@@ -3817,7 +3817,7 @@ ath11k_mac_filter_he_cap_mesh(struct ieee80211_he_cap_elem *he_cap_elem)
 	    IEEE80211_HE_PHY_CAP2_UL_MU_PARTIAL_MU_MIMO;
 	he_cap_elem->phy_cap_info[2] &= ~m;
 
-	m = IEEE80211_HE_PHY_CAP3_RX_HE_MU_PPDU_FROM_NON_AP_STA |
+	m = IEEE80211_HE_PHY_CAP3_RX_PARTIAL_BW_SU_IN_20MHZ_MU |
 	    IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_MASK |
 	    IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_MASK;
 	he_cap_elem->phy_cap_info[3] &= ~m;
@@ -3829,13 +3829,13 @@ ath11k_mac_filter_he_cap_mesh(struct ieee80211_he_cap_elem *he_cap_elem)
 	he_cap_elem->phy_cap_info[5] &= ~m;
 
 	m = IEEE80211_HE_PHY_CAP6_CODEBOOK_SIZE_75_MU |
-	    IEEE80211_HE_PHY_CAP6_TRIG_MU_BEAMFORMER_FB |
+	    IEEE80211_HE_PHY_CAP6_TRIG_MU_BEAMFORMING_PARTIAL_BW_FB |
 	    IEEE80211_HE_PHY_CAP6_TRIG_CQI_FB |
 	    IEEE80211_HE_PHY_CAP6_PARTIAL_BANDWIDTH_DL_MUMIMO;
 	he_cap_elem->phy_cap_info[6] &= ~m;
 
-	m = IEEE80211_HE_PHY_CAP7_SRP_BASED_SR |
-	    IEEE80211_HE_PHY_CAP7_POWER_BOOST_FACTOR_AR |
+	m = IEEE80211_HE_PHY_CAP7_PSR_BASED_SR |
+	    IEEE80211_HE_PHY_CAP7_POWER_BOOST_FACTOR_SUPP |
 	    IEEE80211_HE_PHY_CAP7_STBC_TX_ABOVE_80MHZ |
 	    IEEE80211_HE_PHY_CAP7_STBC_RX_ABOVE_80MHZ;
 	he_cap_elem->phy_cap_info[7] &= ~m;
diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c
index af684f80b0cc..632f20d4027d 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c
@@ -583,11 +583,11 @@ static const struct ieee80211_sband_iftype_data iwl_he_capa[] = {
 					IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_2 |
 					IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_2,
 				.phy_cap_info[6] =
-					IEEE80211_HE_PHY_CAP6_TRIG_SU_BEAMFORMER_FB |
-					IEEE80211_HE_PHY_CAP6_TRIG_MU_BEAMFORMER_FB |
+					IEEE80211_HE_PHY_CAP6_TRIG_SU_BEAMFORMING_FB |
+					IEEE80211_HE_PHY_CAP6_TRIG_MU_BEAMFORMING_PARTIAL_BW_FB |
 					IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT,
 				.phy_cap_info[7] =
-					IEEE80211_HE_PHY_CAP7_POWER_BOOST_FACTOR_AR |
+					IEEE80211_HE_PHY_CAP7_POWER_BOOST_FACTOR_SUPP |
 					IEEE80211_HE_PHY_CAP7_HE_SU_MU_PPDU_4XLTF_AND_08_US_GI |
 					IEEE80211_HE_PHY_CAP7_MAX_NC_1,
 				.phy_cap_info[8] =
diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/init.c b/drivers/net/wireless/mediatek/mt76/mt7915/init.c
index ad4e5b95158b..d9a0587f234b 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7915/init.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7915/init.c
@@ -370,8 +370,8 @@ mt7915_set_stream_he_txbf_caps(struct ieee80211_sta_he_cap *he_cap,
 	    IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_MASK;
 	elem->phy_cap_info[5] &= ~c;
 
-	c = IEEE80211_HE_PHY_CAP6_TRIG_SU_BEAMFORMER_FB |
-	    IEEE80211_HE_PHY_CAP6_TRIG_MU_BEAMFORMER_FB;
+	c = IEEE80211_HE_PHY_CAP6_TRIG_SU_BEAMFORMING_FB |
+	    IEEE80211_HE_PHY_CAP6_TRIG_MU_BEAMFORMING_PARTIAL_BW_FB;
 	elem->phy_cap_info[6] &= ~c;
 
 	elem->phy_cap_info[7] &= ~IEEE80211_HE_PHY_CAP7_MAX_NC_MASK;
@@ -408,8 +408,8 @@ mt7915_set_stream_he_txbf_caps(struct ieee80211_sta_he_cap *he_cap,
 	c = (nss - 1) | (max_t(int, le16_to_cpu(mcs->tx_mcs_160), 1) << 3);
 	elem->phy_cap_info[5] |= c;
 
-	c = IEEE80211_HE_PHY_CAP6_TRIG_SU_BEAMFORMER_FB |
-	    IEEE80211_HE_PHY_CAP6_TRIG_MU_BEAMFORMER_FB;
+	c = IEEE80211_HE_PHY_CAP6_TRIG_SU_BEAMFORMING_FB |
+	    IEEE80211_HE_PHY_CAP6_TRIG_MU_BEAMFORMING_PARTIAL_BW_FB;
 	elem->phy_cap_info[6] |= c;
 
 	/* the maximum cap is 4 x 3, (Nr, Nc) = (3, 2) */
@@ -535,7 +535,7 @@ mt7915_init_he_caps(struct mt7915_phy *phy, enum nl80211_band band,
 				IEEE80211_HE_PHY_CAP6_PARTIAL_BW_EXT_RANGE |
 				IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT;
 			he_cap_elem->phy_cap_info[7] |=
-				IEEE80211_HE_PHY_CAP7_POWER_BOOST_FACTOR_AR |
+				IEEE80211_HE_PHY_CAP7_POWER_BOOST_FACTOR_SUPP |
 				IEEE80211_HE_PHY_CAP7_HE_SU_MU_PPDU_4XLTF_AND_08_US_GI;
 			he_cap_elem->phy_cap_info[8] |=
 				IEEE80211_HE_PHY_CAP8_20MHZ_IN_40MHZ_HE_PPDU_IN_2G |
diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c
index 195929242b72..97ef0265c516 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c
@@ -1821,9 +1821,9 @@ mt7915_mcu_sta_bfer_he(struct ieee80211_sta *sta, struct ieee80211_vif *vif,
 
 	bf->tx_mode = MT_PHY_TYPE_HE_SU;
 	mt7915_mcu_sta_sounding_rate(bf);
-	bf->trigger_su = HE_PHY(CAP6_TRIG_SU_BEAMFORMER_FB,
+	bf->trigger_su = HE_PHY(CAP6_TRIG_SU_BEAMFORMING_FB,
 				pe->phy_cap_info[6]);
-	bf->trigger_mu = HE_PHY(CAP6_TRIG_MU_BEAMFORMER_FB,
+	bf->trigger_mu = HE_PHY(CAP6_TRIG_MU_BEAMFORMING_PARTIAL_BW_FB,
 				pe->phy_cap_info[6]);
 	bfer_nr = HE_PHY(CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_MASK,
 			 ve->phy_cap_info[5]);
diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
index 729f6c42cdde..fdd93926b516 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
@@ -108,7 +108,7 @@ mt7921_init_he_caps(struct mt7921_phy *phy, enum nl80211_band band,
 				IEEE80211_HE_PHY_CAP6_PARTIAL_BW_EXT_RANGE |
 				IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT;
 			he_cap_elem->phy_cap_info[7] |=
-				IEEE80211_HE_PHY_CAP7_POWER_BOOST_FACTOR_AR |
+				IEEE80211_HE_PHY_CAP7_POWER_BOOST_FACTOR_SUPP |
 				IEEE80211_HE_PHY_CAP7_HE_SU_MU_PPDU_4XLTF_AND_08_US_GI;
 			he_cap_elem->phy_cap_info[8] |=
 				IEEE80211_HE_PHY_CAP8_20MHZ_IN_40MHZ_HE_PPDU_IN_2G |
diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 25fc7bee868a..687db25eb85f 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -2020,7 +2020,7 @@ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap,
 #define IEEE80211_HE_MAC_CAP4_BSRP_BQRP_A_MPDU_AGG		0x01
 #define IEEE80211_HE_MAC_CAP4_QTP				0x02
 #define IEEE80211_HE_MAC_CAP4_BQR				0x04
-#define IEEE80211_HE_MAC_CAP4_SRP_RESP				0x08
+#define IEEE80211_HE_MAC_CAP4_PSR_RESP				0x08
 #define IEEE80211_HE_MAC_CAP4_NDP_FB_REP			0x10
 #define IEEE80211_HE_MAC_CAP4_OPS				0x20
 #define IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU			0x40
@@ -2031,7 +2031,7 @@ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap,
 
 #define IEEE80211_HE_MAC_CAP5_MULTI_TID_AGG_TX_QOS_B40		0x01
 #define IEEE80211_HE_MAC_CAP5_MULTI_TID_AGG_TX_QOS_B41		0x02
-#define IEEE80211_HE_MAC_CAP5_SUBCHAN_SELECVITE_TRANSMISSION	0x04
+#define IEEE80211_HE_MAC_CAP5_SUBCHAN_SELECTIVE_TRANSMISSION	0x04
 #define IEEE80211_HE_MAC_CAP5_UL_2x996_TONE_RU			0x08
 #define IEEE80211_HE_MAC_CAP5_OM_CTRL_UL_MU_DATA_DIS_RX		0x10
 #define IEEE80211_HE_MAC_CAP5_HE_DYNAMIC_SM_PS			0x20
@@ -2089,7 +2089,7 @@ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap,
 #define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_MASK			0x18
 #define IEEE80211_HE_PHY_CAP3_DCM_MAX_RX_NSS_1				0x00
 #define IEEE80211_HE_PHY_CAP3_DCM_MAX_RX_NSS_2				0x20
-#define IEEE80211_HE_PHY_CAP3_RX_HE_MU_PPDU_FROM_NON_AP_STA		0x40
+#define IEEE80211_HE_PHY_CAP3_RX_PARTIAL_BW_SU_IN_20MHZ_MU		0x40
 #define IEEE80211_HE_PHY_CAP3_SU_BEAMFORMER				0x80
 
 #define IEEE80211_HE_PHY_CAP4_SU_BEAMFORMEE				0x01
@@ -2136,15 +2136,15 @@ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap,
 
 #define IEEE80211_HE_PHY_CAP6_CODEBOOK_SIZE_42_SU			0x01
 #define IEEE80211_HE_PHY_CAP6_CODEBOOK_SIZE_75_MU			0x02
-#define IEEE80211_HE_PHY_CAP6_TRIG_SU_BEAMFORMER_FB			0x04
-#define IEEE80211_HE_PHY_CAP6_TRIG_MU_BEAMFORMER_FB			0x08
+#define IEEE80211_HE_PHY_CAP6_TRIG_SU_BEAMFORMING_FB			0x04
+#define IEEE80211_HE_PHY_CAP6_TRIG_MU_BEAMFORMING_PARTIAL_BW_FB		0x08
 #define IEEE80211_HE_PHY_CAP6_TRIG_CQI_FB				0x10
 #define IEEE80211_HE_PHY_CAP6_PARTIAL_BW_EXT_RANGE			0x20
 #define IEEE80211_HE_PHY_CAP6_PARTIAL_BANDWIDTH_DL_MUMIMO		0x40
 #define IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT			0x80
 
-#define IEEE80211_HE_PHY_CAP7_SRP_BASED_SR				0x01
-#define IEEE80211_HE_PHY_CAP7_POWER_BOOST_FACTOR_AR			0x02
+#define IEEE80211_HE_PHY_CAP7_PSR_BASED_SR				0x01
+#define IEEE80211_HE_PHY_CAP7_POWER_BOOST_FACTOR_SUPP			0x02
 #define IEEE80211_HE_PHY_CAP7_HE_SU_MU_PPDU_4XLTF_AND_08_US_GI		0x04
 #define IEEE80211_HE_PHY_CAP7_MAX_NC_1					0x08
 #define IEEE80211_HE_PHY_CAP7_MAX_NC_2					0x10
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index 5a27c61a7b38..d350224d45e8 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -732,15 +732,15 @@ static ssize_t sta_he_capa_read(struct file *file, char __user *userbuf,
 	PFLAG(MAC, 4, BSRP_BQRP_A_MPDU_AGG, "BSRP-BQRP-A-MPDU-AGG");
 	PFLAG(MAC, 4, QTP, "QTP");
 	PFLAG(MAC, 4, BQR, "BQR");
-	PFLAG(MAC, 4, SRP_RESP, "SRP-RESP");
+	PFLAG(MAC, 4, PSR_RESP, "PSR-RESP");
 	PFLAG(MAC, 4, NDP_FB_REP, "NDP-FB-REP");
 	PFLAG(MAC, 4, OPS, "OPS");
 	PFLAG(MAC, 4, AMDSU_IN_AMPDU, "AMSDU-IN-AMPDU");
 
 	PRINT("MULTI-TID-AGG-TX-QOS-%d", ((cap[5] << 1) | (cap[4] >> 7)) & 0x7);
 
-	PFLAG(MAC, 5, SUBCHAN_SELECVITE_TRANSMISSION,
-	      "SUBCHAN-SELECVITE-TRANSMISSION");
+	PFLAG(MAC, 5, SUBCHAN_SELECTIVE_TRANSMISSION,
+	      "SUBCHAN-SELECTIVE-TRANSMISSION");
 	PFLAG(MAC, 5, UL_2x996_TONE_RU, "UL-2x996-TONE-RU");
 	PFLAG(MAC, 5, OM_CTRL_UL_MU_DATA_DIS_RX, "OM-CTRL-UL-MU-DATA-DIS-RX");
 	PFLAG(MAC, 5, HE_DYNAMIC_SM_PS, "HE-DYNAMIC-SM-PS");
@@ -832,8 +832,8 @@ static ssize_t sta_he_capa_read(struct file *file, char __user *userbuf,
 
 	PFLAG(PHY, 3, DCM_MAX_RX_NSS_1, "DCM-MAX-RX-NSS-1");
 	PFLAG(PHY, 3, DCM_MAX_RX_NSS_2, "DCM-MAX-RX-NSS-2");
-	PFLAG(PHY, 3, RX_HE_MU_PPDU_FROM_NON_AP_STA,
-	      "RX-HE-MU-PPDU-FROM-NON-AP-STA");
+	PFLAG(PHY, 3, RX_PARTIAL_BW_SU_IN_20MHZ_MU,
+	      "RX-PARTIAL-BW-SU-IN-20MHZ-MU");
 	PFLAG(PHY, 3, SU_BEAMFORMER, "SU-BEAMFORMER");
 
 	PFLAG(PHY, 4, SU_BEAMFORMEE, "SU-BEAMFORMEE");
@@ -853,16 +853,17 @@ static ssize_t sta_he_capa_read(struct file *file, char __user *userbuf,
 
 	PFLAG(PHY, 6, CODEBOOK_SIZE_42_SU, "CODEBOOK-SIZE-42-SU");
 	PFLAG(PHY, 6, CODEBOOK_SIZE_75_MU, "CODEBOOK-SIZE-75-MU");
-	PFLAG(PHY, 6, TRIG_SU_BEAMFORMER_FB, "TRIG-SU-BEAMFORMER-FB");
-	PFLAG(PHY, 6, TRIG_MU_BEAMFORMER_FB, "TRIG-MU-BEAMFORMER-FB");
+	PFLAG(PHY, 6, TRIG_SU_BEAMFORMING_FB, "TRIG-SU-BEAMFORMING-FB");
+	PFLAG(PHY, 6, TRIG_MU_BEAMFORMING_PARTIAL_BW_FB,
+	      "MU-BEAMFORMING-PARTIAL-BW-FB");
 	PFLAG(PHY, 6, TRIG_CQI_FB, "TRIG-CQI-FB");
 	PFLAG(PHY, 6, PARTIAL_BW_EXT_RANGE, "PARTIAL-BW-EXT-RANGE");
 	PFLAG(PHY, 6, PARTIAL_BANDWIDTH_DL_MUMIMO,
 	      "PARTIAL-BANDWIDTH-DL-MUMIMO");
 	PFLAG(PHY, 6, PPE_THRESHOLD_PRESENT, "PPE-THRESHOLD-PRESENT");
 
-	PFLAG(PHY, 7, SRP_BASED_SR, "SRP-BASED-SR");
-	PFLAG(PHY, 7, POWER_BOOST_FACTOR_AR, "POWER-BOOST-FACTOR-AR");
+	PFLAG(PHY, 7, PSR_BASED_SR, "PSR-BASED-SR");
+	PFLAG(PHY, 7, POWER_BOOST_FACTOR_SUPP, "POWER-BOOST-FACTOR-SUPP");
 	PFLAG(PHY, 7, HE_SU_MU_PPDU_4XLTF_AND_08_US_GI,
 	      "HE-SU-MU-PPDU-4XLTF-AND-08-US-GI");
 	PFLAG_RANGE(PHY, 7, MAX_NC, 0, 1, 1, "MAX-NC-%d");
-- 
cgit v1.2.3


From 1f851b8dfd76a0e91560247802dd25a4754753c7 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 9 Apr 2021 12:40:20 +0300
Subject: wireless: align HE capabilities A-MPDU Length Exponent Extension

The A-MPDU length exponent extension is defined differently in
802.11ax D6.1, align with that.

Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Link: https://lore.kernel.org/r/iwlwifi.20210409123755.c2a257d3e2df.I3455245d388c52c61dace7e7958dbed7e807cfb6@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/ath11k/mac.c              |  5 ++---
 drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c |  4 ++--
 drivers/net/wireless/mac80211_hwsim.c              |  8 ++++----
 drivers/net/wireless/mediatek/mt76/mt7915/init.c   |  2 +-
 drivers/net/wireless/mediatek/mt76/mt7921/main.c   |  2 +-
 include/linux/ieee80211.h                          | 10 ++++------
 net/mac80211/debugfs_sta.c                         | 16 ++++++++--------
 7 files changed, 22 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c
index 343768afedc4..a44ca32e9f72 100644
--- a/drivers/net/wireless/ath/ath11k/mac.c
+++ b/drivers/net/wireless/ath/ath11k/mac.c
@@ -1265,9 +1265,8 @@ static void ath11k_peer_assoc_h_he(struct ath11k *ar,
 	 * request, then use MAX_AMPDU_LEN_FACTOR as 16 to calculate max_ampdu
 	 * length.
 	 */
-	ampdu_factor = (he_cap->he_cap_elem.mac_cap_info[3] &
-			IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_MASK) >>
-			IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_SHIFT;
+	ampdu_factor = u8_get_bits(he_cap->he_cap_elem.mac_cap_info[3],
+				   IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_MASK);
 
 	if (ampdu_factor) {
 		if (sta->vht_cap.vht_supported)
diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c
index 632f20d4027d..d2058cdcb0d8 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c
@@ -550,7 +550,7 @@ static const struct ieee80211_sband_iftype_data iwl_he_capa[] = {
 					IEEE80211_HE_MAC_CAP2_32BIT_BA_BITMAP,
 				.mac_cap_info[3] =
 					IEEE80211_HE_MAC_CAP3_OMI_CONTROL |
-					IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_VHT_2,
+					IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_2,
 				.mac_cap_info[4] =
 					IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU |
 					IEEE80211_HE_MAC_CAP4_MULTI_TID_AGG_TX_QOS_B39,
@@ -636,7 +636,7 @@ static const struct ieee80211_sband_iftype_data iwl_he_capa[] = {
 					IEEE80211_HE_MAC_CAP2_BSR,
 				.mac_cap_info[3] =
 					IEEE80211_HE_MAC_CAP3_OMI_CONTROL |
-					IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_VHT_2,
+					IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_2,
 				.mac_cap_info[4] =
 					IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU,
 				.mac_cap_info[5] =
diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index d56d2095a0d4..9630324c535e 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -2795,7 +2795,7 @@ static const struct ieee80211_sband_iftype_data he_capa_2ghz[] = {
 					IEEE80211_HE_MAC_CAP2_ACK_EN,
 				.mac_cap_info[3] =
 					IEEE80211_HE_MAC_CAP3_OMI_CONTROL |
-					IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_VHT_2,
+					IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3,
 				.mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU,
 				.phy_cap_info[1] =
 					IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK |
@@ -2839,7 +2839,7 @@ static const struct ieee80211_sband_iftype_data he_capa_2ghz[] = {
 					IEEE80211_HE_MAC_CAP2_ACK_EN,
 				.mac_cap_info[3] =
 					IEEE80211_HE_MAC_CAP3_OMI_CONTROL |
-					IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_VHT_2,
+					IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3,
 				.mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU,
 				.phy_cap_info[1] =
 					IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK |
@@ -2885,7 +2885,7 @@ static const struct ieee80211_sband_iftype_data he_capa_5ghz[] = {
 					IEEE80211_HE_MAC_CAP2_ACK_EN,
 				.mac_cap_info[3] =
 					IEEE80211_HE_MAC_CAP3_OMI_CONTROL |
-					IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_VHT_2,
+					IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3,
 				.mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU,
 				.phy_cap_info[0] =
 					IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G |
@@ -2933,7 +2933,7 @@ static const struct ieee80211_sband_iftype_data he_capa_5ghz[] = {
 					IEEE80211_HE_MAC_CAP2_ACK_EN,
 				.mac_cap_info[3] =
 					IEEE80211_HE_MAC_CAP3_OMI_CONTROL |
-					IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_VHT_2,
+					IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3,
 				.mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU,
 				.phy_cap_info[0] =
 					IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G |
diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/init.c b/drivers/net/wireless/mediatek/mt76/mt7915/init.c
index d9a0587f234b..82b9e15dc6e3 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7915/init.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7915/init.c
@@ -476,7 +476,7 @@ mt7915_init_he_caps(struct mt7915_phy *phy, enum nl80211_band band,
 			IEEE80211_HE_MAC_CAP0_HTC_HE;
 		he_cap_elem->mac_cap_info[3] =
 			IEEE80211_HE_MAC_CAP3_OMI_CONTROL |
-			IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_RESERVED;
+			IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3;
 		he_cap_elem->mac_cap_info[4] =
 			IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU;
 
diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
index fdd93926b516..23149fcdf413 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
@@ -65,7 +65,7 @@ mt7921_init_he_caps(struct mt7921_phy *phy, enum nl80211_band band,
 			IEEE80211_HE_MAC_CAP0_HTC_HE;
 		he_cap_elem->mac_cap_info[3] =
 			IEEE80211_HE_MAC_CAP3_OMI_CONTROL |
-			IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_RESERVED;
+			IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3;
 		he_cap_elem->mac_cap_info[4] =
 			IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU;
 
diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 687db25eb85f..c74033aca726 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -2006,17 +2006,15 @@ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap,
  * A-MDPU Length Exponent field in the HT capabilities, VHT capabilities and the
  * same field in the HE capabilities.
  */
-#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_USE_VHT	0x00
-#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_VHT_1		0x08
-#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_VHT_2		0x10
-#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_RESERVED	0x18
+#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_0		0x00
+#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_1		0x08
+#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_2		0x10
+#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3		0x18
 #define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_MASK		0x18
 #define IEEE80211_HE_MAC_CAP3_AMSDU_FRAG			0x20
 #define IEEE80211_HE_MAC_CAP3_FLEX_TWT_SCHED			0x40
 #define IEEE80211_HE_MAC_CAP3_RX_CTRL_FRAME_TO_MULTIBSS		0x80
 
-#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_SHIFT		3
-
 #define IEEE80211_HE_MAC_CAP4_BSRP_BQRP_A_MPDU_AGG		0x01
 #define IEEE80211_HE_MAC_CAP4_QTP				0x02
 #define IEEE80211_HE_MAC_CAP4_BQR				0x04
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index d350224d45e8..25b3d4822aed 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -711,17 +711,17 @@ static ssize_t sta_he_capa_read(struct file *file, char __user *userbuf,
 	PFLAG(MAC, 3, OFDMA_RA, "OFDMA-RA");
 
 	switch (cap[3] & IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_MASK) {
-	case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_USE_VHT:
-		PRINT("MAX-AMPDU-LEN-EXP-USE-VHT");
+	case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_0:
+		PRINT("MAX-AMPDU-LEN-EXP-USE-EXT-0");
 		break;
-	case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_VHT_1:
-		PRINT("MAX-AMPDU-LEN-EXP-VHT-1");
+	case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_1:
+		PRINT("MAX-AMPDU-LEN-EXP-VHT-EXT-1");
 		break;
-	case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_VHT_2:
-		PRINT("MAX-AMPDU-LEN-EXP-VHT-2");
+	case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_2:
+		PRINT("MAX-AMPDU-LEN-EXP-VHT-EXT-2");
 		break;
-	case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_RESERVED:
-		PRINT("MAX-AMPDU-LEN-EXP-RESERVED");
+	case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3:
+		PRINT("MAX-AMPDU-LEN-EXP-VHT-EXT-3");
 		break;
 	}
 
-- 
cgit v1.2.3


From 2f5164447cdab6419edddde3a214f93a53aa4e60 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 9 Apr 2021 12:40:24 +0300
Subject: wireless: fix spelling of A-MSDU in HE capabilities

In the HE capabilities, spell A-MSDU correctly, not "A-MDSU".

Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Link: https://lore.kernel.org/r/iwlwifi.20210409123755.9e6ff1af1181.If6868bc6902ccd9a95c74c78f716c4b41473ef14@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c   | 4 ++--
 drivers/net/wireless/mac80211_hwsim.c                | 8 ++++----
 drivers/net/wireless/mediatek/mt76/mt76_connac_mcu.c | 2 +-
 drivers/net/wireless/mediatek/mt76/mt7915/init.c     | 2 +-
 drivers/net/wireless/mediatek/mt76/mt7915/mcu.c      | 2 +-
 drivers/net/wireless/mediatek/mt76/mt7921/main.c     | 2 +-
 include/linux/ieee80211.h                            | 2 +-
 net/mac80211/debugfs_sta.c                           | 2 +-
 8 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c
index d2058cdcb0d8..4e7da2e32354 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c
@@ -552,7 +552,7 @@ static const struct ieee80211_sband_iftype_data iwl_he_capa[] = {
 					IEEE80211_HE_MAC_CAP3_OMI_CONTROL |
 					IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_2,
 				.mac_cap_info[4] =
-					IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU |
+					IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU |
 					IEEE80211_HE_MAC_CAP4_MULTI_TID_AGG_TX_QOS_B39,
 				.mac_cap_info[5] =
 					IEEE80211_HE_MAC_CAP5_MULTI_TID_AGG_TX_QOS_B40 |
@@ -638,7 +638,7 @@ static const struct ieee80211_sband_iftype_data iwl_he_capa[] = {
 					IEEE80211_HE_MAC_CAP3_OMI_CONTROL |
 					IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_2,
 				.mac_cap_info[4] =
-					IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU,
+					IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU,
 				.mac_cap_info[5] =
 					IEEE80211_HE_MAC_CAP5_UL_2x996_TONE_RU,
 				.phy_cap_info[0] =
diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index 9630324c535e..51ce767eaf88 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -2796,7 +2796,7 @@ static const struct ieee80211_sband_iftype_data he_capa_2ghz[] = {
 				.mac_cap_info[3] =
 					IEEE80211_HE_MAC_CAP3_OMI_CONTROL |
 					IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3,
-				.mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU,
+				.mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU,
 				.phy_cap_info[1] =
 					IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK |
 					IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A |
@@ -2840,7 +2840,7 @@ static const struct ieee80211_sband_iftype_data he_capa_2ghz[] = {
 				.mac_cap_info[3] =
 					IEEE80211_HE_MAC_CAP3_OMI_CONTROL |
 					IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3,
-				.mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU,
+				.mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU,
 				.phy_cap_info[1] =
 					IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK |
 					IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A |
@@ -2886,7 +2886,7 @@ static const struct ieee80211_sband_iftype_data he_capa_5ghz[] = {
 				.mac_cap_info[3] =
 					IEEE80211_HE_MAC_CAP3_OMI_CONTROL |
 					IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3,
-				.mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU,
+				.mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU,
 				.phy_cap_info[0] =
 					IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G |
 					IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G |
@@ -2934,7 +2934,7 @@ static const struct ieee80211_sband_iftype_data he_capa_5ghz[] = {
 				.mac_cap_info[3] =
 					IEEE80211_HE_MAC_CAP3_OMI_CONTROL |
 					IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3,
-				.mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU,
+				.mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU,
 				.phy_cap_info[0] =
 					IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G |
 					IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G |
diff --git a/drivers/net/wireless/mediatek/mt76/mt76_connac_mcu.c b/drivers/net/wireless/mediatek/mt76/mt76_connac_mcu.c
index 6cbccfb05f8b..e7c23f9f0ea5 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76_connac_mcu.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76_connac_mcu.c
@@ -496,7 +496,7 @@ mt76_connac_mcu_sta_he_tlv(struct sk_buff *skb, struct ieee80211_sta *sta)
 	if (elem->mac_cap_info[3] & IEEE80211_HE_MAC_CAP3_OMI_CONTROL)
 		cap |= STA_REC_HE_CAP_OM;
 
-	if (elem->mac_cap_info[4] & IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU)
+	if (elem->mac_cap_info[4] & IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU)
 		cap |= STA_REC_HE_CAP_AMSDU_IN_AMPDU;
 
 	if (elem->mac_cap_info[4] & IEEE80211_HE_MAC_CAP4_BQR)
diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/init.c b/drivers/net/wireless/mediatek/mt76/mt7915/init.c
index 82b9e15dc6e3..152ac7192163 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7915/init.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7915/init.c
@@ -478,7 +478,7 @@ mt7915_init_he_caps(struct mt7915_phy *phy, enum nl80211_band band,
 			IEEE80211_HE_MAC_CAP3_OMI_CONTROL |
 			IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3;
 		he_cap_elem->mac_cap_info[4] =
-			IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU;
+			IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU;
 
 		if (band == NL80211_BAND_2GHZ)
 			he_cap_elem->phy_cap_info[0] =
diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c
index 97ef0265c516..c44091754a98 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c
@@ -1330,7 +1330,7 @@ mt7915_mcu_sta_he_tlv(struct sk_buff *skb, struct ieee80211_sta *sta)
 	if (elem->mac_cap_info[3] & IEEE80211_HE_MAC_CAP3_OMI_CONTROL)
 		cap |= STA_REC_HE_CAP_OM;
 
-	if (elem->mac_cap_info[4] & IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU)
+	if (elem->mac_cap_info[4] & IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU)
 		cap |= STA_REC_HE_CAP_AMSDU_IN_AMPDU;
 
 	if (elem->mac_cap_info[4] & IEEE80211_HE_MAC_CAP4_BQR)
diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
index 23149fcdf413..07141e98a077 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
@@ -67,7 +67,7 @@ mt7921_init_he_caps(struct mt7921_phy *phy, enum nl80211_band band,
 			IEEE80211_HE_MAC_CAP3_OMI_CONTROL |
 			IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3;
 		he_cap_elem->mac_cap_info[4] =
-			IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU;
+			IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU;
 
 		if (band == NL80211_BAND_2GHZ)
 			he_cap_elem->phy_cap_info[0] =
diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index c74033aca726..2967437f1b11 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -2021,7 +2021,7 @@ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap,
 #define IEEE80211_HE_MAC_CAP4_PSR_RESP				0x08
 #define IEEE80211_HE_MAC_CAP4_NDP_FB_REP			0x10
 #define IEEE80211_HE_MAC_CAP4_OPS				0x20
-#define IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU			0x40
+#define IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU			0x40
 /* Multi TID agg TX is split between byte #4 and #5
  * The value is a combination of B39,B40,B41
  */
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index 25b3d4822aed..936c9dfa86c8 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -735,7 +735,7 @@ static ssize_t sta_he_capa_read(struct file *file, char __user *userbuf,
 	PFLAG(MAC, 4, PSR_RESP, "PSR-RESP");
 	PFLAG(MAC, 4, NDP_FB_REP, "NDP-FB-REP");
 	PFLAG(MAC, 4, OPS, "OPS");
-	PFLAG(MAC, 4, AMDSU_IN_AMPDU, "AMSDU-IN-AMPDU");
+	PFLAG(MAC, 4, AMSDU_IN_AMPDU, "AMSDU-IN-AMPDU");
 
 	PRINT("MULTI-TID-AGG-TX-QOS-%d", ((cap[5] << 1) | (cap[4] >> 7)) & 0x7);
 
-- 
cgit v1.2.3


From 8ca6f063b73d3754213d009efb3df486c8fe52d2 Mon Sep 17 00:00:00 2001
From: Ben Gardon <bgardon@google.com>
Date: Thu, 1 Apr 2021 16:37:24 -0700
Subject: KVM: x86/mmu: Re-add const qualifier in
 kvm_tdp_mmu_zap_collapsible_sptes

kvm_tdp_mmu_zap_collapsible_sptes unnecessarily removes the const
qualifier from its memslot argument, leading to a compiler warning. Add
the const annotation and pass it to subsequent functions.

Signed-off-by: Ben Gardon <bgardon@google.com>
Message-Id: <20210401233736.638171-2-bgardon@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu/mmu.c          | 10 +++++-----
 arch/x86/kvm/mmu/mmu_internal.h |  5 +++--
 arch/x86/kvm/mmu/tdp_mmu.c      |  5 +++--
 arch/x86/kvm/mmu/tdp_mmu.h      |  3 ++-
 include/linux/kvm_host.h        |  2 +-
 5 files changed, 14 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index d0f53189c257..6395ab9450af 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -715,8 +715,7 @@ static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
  * handling slots that are not large page aligned.
  */
 static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
-					      struct kvm_memory_slot *slot,
-					      int level)
+		const struct kvm_memory_slot *slot, int level)
 {
 	unsigned long idx;
 
@@ -2702,7 +2701,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
 }
 
 static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
-				  struct kvm_memory_slot *slot)
+				  const struct kvm_memory_slot *slot)
 {
 	unsigned long hva;
 	pte_t *pte;
@@ -2728,8 +2727,9 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
 	return level;
 }
 
-int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot,
-			      gfn_t gfn, kvm_pfn_t pfn, int max_level)
+int kvm_mmu_max_mapping_level(struct kvm *kvm,
+			      const struct kvm_memory_slot *slot, gfn_t gfn,
+			      kvm_pfn_t pfn, int max_level)
 {
 	struct kvm_lpage_info *linfo;
 
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index f296b8a258ff..9f513822b568 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -161,8 +161,9 @@ enum {
 #define SET_SPTE_NEED_REMOTE_TLB_FLUSH	BIT(1)
 #define SET_SPTE_SPURIOUS		BIT(2)
 
-int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot,
-			      gfn_t gfn, kvm_pfn_t pfn, int max_level);
+int kvm_mmu_max_mapping_level(struct kvm *kvm,
+			      const struct kvm_memory_slot *slot, gfn_t gfn,
+			      kvm_pfn_t pfn, int max_level);
 int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
 			    int max_level, kvm_pfn_t *pfnp,
 			    bool huge_page_disallowed, int *req_level);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 10cb16c55dd0..5f517ef06e61 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1190,7 +1190,7 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
  */
 static bool zap_collapsible_spte_range(struct kvm *kvm,
 				       struct kvm_mmu_page *root,
-				       struct kvm_memory_slot *slot,
+				       const struct kvm_memory_slot *slot,
 				       bool flush)
 {
 	gfn_t start = slot->base_gfn;
@@ -1231,7 +1231,8 @@ static bool zap_collapsible_spte_range(struct kvm *kvm,
  * be replaced by large mappings, for GFNs within the slot.
  */
 bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
-				       struct kvm_memory_slot *slot, bool flush)
+				       const struct kvm_memory_slot *slot,
+				       bool flush)
 {
 	struct kvm_mmu_page *root;
 
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index ee8efa58902f..11050994e7e4 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -53,7 +53,8 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
 				       gfn_t gfn, unsigned long mask,
 				       bool wrprot);
 bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
-				       struct kvm_memory_slot *slot, bool flush);
+				       const struct kvm_memory_slot *slot,
+				       bool flush);
 
 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
 				   struct kvm_memory_slot *slot, gfn_t gfn);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 6b4dd9500d70..d17e3ff1138d 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1130,7 +1130,7 @@ __gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn)
 }
 
 static inline unsigned long
-__gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
+__gfn_to_hva_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
 {
 	return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
 }
-- 
cgit v1.2.3


From 9591c3a34f7722bd77f42c98d76fd5a5bad465f0 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Mon, 22 Mar 2021 19:39:43 +0200
Subject: fs: introduce a wrapper uuid_to_fsid()

Some filesystems use a digest of their uuid for f_fsid.
Create a simple wrapper for this open coded folding.

Filesystems that have a non null uuid but use the block device
number for f_fsid may also consider using this helper.

[JK: Added missing asm/byteorder.h include]
Link: https://lore.kernel.org/r/20210322173944.449469-2-amir73il@gmail.com
Acked-by: Damien Le Moal <damien.lemoal@wdc.com>
Reviewed-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext2/super.c        | 5 +----
 fs/ext4/super.c        | 5 +----
 fs/zonefs/super.c      | 5 +----
 include/linux/statfs.h | 8 ++++++++
 4 files changed, 11 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 6c4753277916..0d679451657c 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1399,7 +1399,6 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
 	struct super_block *sb = dentry->d_sb;
 	struct ext2_sb_info *sbi = EXT2_SB(sb);
 	struct ext2_super_block *es = sbi->s_es;
-	u64 fsid;
 
 	spin_lock(&sbi->s_lock);
 
@@ -1453,9 +1452,7 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
 	buf->f_ffree = ext2_count_free_inodes(sb);
 	es->s_free_inodes_count = cpu_to_le32(buf->f_ffree);
 	buf->f_namelen = EXT2_NAME_LEN;
-	fsid = le64_to_cpup((void *)es->s_uuid) ^
-	       le64_to_cpup((void *)es->s_uuid + sizeof(u64));
-	buf->f_fsid = u64_to_fsid(fsid);
+	buf->f_fsid = uuid_to_fsid(es->s_uuid);
 	spin_unlock(&sbi->s_lock);
 	return 0;
 }
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ad34a37278cd..3581c1cdc19e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -6148,7 +6148,6 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_super_block *es = sbi->s_es;
 	ext4_fsblk_t overhead = 0, resv_blocks;
-	u64 fsid;
 	s64 bfree;
 	resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
 
@@ -6169,9 +6168,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_files = le32_to_cpu(es->s_inodes_count);
 	buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
 	buf->f_namelen = EXT4_NAME_LEN;
-	fsid = le64_to_cpup((void *)es->s_uuid) ^
-	       le64_to_cpup((void *)es->s_uuid + sizeof(u64));
-	buf->f_fsid = u64_to_fsid(fsid);
+	buf->f_fsid = uuid_to_fsid(es->s_uuid);
 
 #ifdef CONFIG_QUOTA
 	if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) &&
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 0fe76f376dee..e09810311162 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -1104,7 +1104,6 @@ static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	struct super_block *sb = dentry->d_sb;
 	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
 	enum zonefs_ztype t;
-	u64 fsid;
 
 	buf->f_type = ZONEFS_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
@@ -1127,9 +1126,7 @@ static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf)
 
 	spin_unlock(&sbi->s_lock);
 
-	fsid = le64_to_cpup((void *)sbi->s_uuid.b) ^
-		le64_to_cpup((void *)sbi->s_uuid.b + sizeof(u64));
-	buf->f_fsid = u64_to_fsid(fsid);
+	buf->f_fsid = uuid_to_fsid(sbi->s_uuid.b);
 
 	return 0;
 }
diff --git a/include/linux/statfs.h b/include/linux/statfs.h
index 20f695b90aab..02c862686ea3 100644
--- a/include/linux/statfs.h
+++ b/include/linux/statfs.h
@@ -4,6 +4,7 @@
 
 #include <linux/types.h>
 #include <asm/statfs.h>
+#include <asm/byteorder.h>
 
 struct kstatfs {
 	long f_type;
@@ -50,4 +51,11 @@ static inline __kernel_fsid_t u64_to_fsid(u64 v)
 	return (__kernel_fsid_t){.val = {(u32)v, (u32)(v>>32)}};
 }
 
+/* Fold 16 bytes uuid to 64 bit fsid */
+static inline __kernel_fsid_t uuid_to_fsid(__u8 *uuid)
+{
+	return u64_to_fsid(le64_to_cpup((void *)uuid) ^
+		le64_to_cpup((void *)(uuid + sizeof(u64))));
+}
+
 #endif
-- 
cgit v1.2.3


From 32c0a6bcaa1f57e80f67bf3ae2b35d00cea8361a Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Sun, 21 Mar 2021 21:03:11 +0000
Subject: btrfs: add and use readahead_batch_length

Implement readahead_batch_length() to determine the number of bytes in
the current batch of readahead pages and use it in btrfs. Also use the
readahead_pos to get the offset.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c    | 6 ++----
 include/linux/pagemap.h | 9 +++++++++
 2 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 18e69e1a5f9c..a50adbd8808d 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4650,10 +4650,8 @@ void extent_readahead(struct readahead_control *rac)
 	int nr;
 
 	while ((nr = readahead_page_batch(rac, pagepool))) {
-		u64 contig_start = page_offset(pagepool[0]);
-		u64 contig_end = page_offset(pagepool[nr - 1]) + PAGE_SIZE - 1;
-
-		ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end);
+		u64 contig_start = readahead_pos(rac);
+		u64 contig_end = contig_start + readahead_batch_length(rac) - 1;
 
 		contiguous_readpages(pagepool, nr, contig_start, contig_end,
 				&em_cached, &bio, &bio_flags, &prev_em_start);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 8c9947fd62f3..4c1ffe1d5dac 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -981,6 +981,15 @@ static inline unsigned int readahead_count(struct readahead_control *rac)
 	return rac->_nr_pages;
 }
 
+/**
+ * readahead_batch_length - The number of bytes in the current batch.
+ * @rac: The readahead request.
+ */
+static inline loff_t readahead_batch_length(struct readahead_control *rac)
+{
+	return rac->_batch_count * PAGE_SIZE;
+}
+
 static inline unsigned long dir_pages(struct inode *inode)
 {
 	return (unsigned long)(inode->i_size + PAGE_SIZE - 1) >>
-- 
cgit v1.2.3


From 7a35693adcd38664b852ad10e3742782b3e87987 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 7 Apr 2021 14:25:22 +0100
Subject: dm: replace dm_vcalloc()

Use kvcalloc or kvmalloc_array instead (depending whether zeroing is
useful).

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-snap-persistent.c |  6 +++---
 drivers/md/dm-snap.c            |  5 +++--
 drivers/md/dm-table.c           | 30 ++++++------------------------
 include/linux/device-mapper.h   |  5 -----
 4 files changed, 12 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 8e329c3f3a78..9ab4bf651ca9 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -596,7 +596,7 @@ static void persistent_dtr(struct dm_exception_store *store)
 	free_area(ps);
 
 	/* Allocated in persistent_read_metadata */
-	vfree(ps->callbacks);
+	kvfree(ps->callbacks);
 
 	kfree(ps);
 }
@@ -621,8 +621,8 @@ static int persistent_read_metadata(struct dm_exception_store *store,
 	 */
 	ps->exceptions_per_area = (ps->store->chunk_size << SECTOR_SHIFT) /
 				  sizeof(struct disk_exception);
-	ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
-				   sizeof(*ps->callbacks));
+	ps->callbacks = kvcalloc(ps->exceptions_per_area,
+				 sizeof(*ps->callbacks), GFP_KERNEL);
 	if (!ps->callbacks)
 		return -ENOMEM;
 
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 11890db71f3f..a2acb014c13a 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -663,7 +663,8 @@ static int dm_exception_table_init(struct dm_exception_table *et,
 
 	et->hash_shift = hash_shift;
 	et->hash_mask = size - 1;
-	et->table = dm_vcalloc(size, sizeof(struct hlist_bl_head));
+	et->table = kvmalloc_array(size, sizeof(struct hlist_bl_head),
+				   GFP_KERNEL);
 	if (!et->table)
 		return -ENOMEM;
 
@@ -689,7 +690,7 @@ static void dm_exception_table_exit(struct dm_exception_table *et,
 			kmem_cache_free(mem, ex);
 	}
 
-	vfree(et->table);
+	kvfree(et->table);
 }
 
 static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk)
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index e5f0f1703c5d..ee47a332b462 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -94,24 +94,6 @@ static int setup_btree_index(unsigned int l, struct dm_table *t)
 	return 0;
 }
 
-void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size)
-{
-	unsigned long size;
-	void *addr;
-
-	/*
-	 * Check that we're not going to overflow.
-	 */
-	if (nmemb > (ULONG_MAX / elem_size))
-		return NULL;
-
-	size = nmemb * elem_size;
-	addr = vzalloc(size);
-
-	return addr;
-}
-EXPORT_SYMBOL(dm_vcalloc);
-
 /*
  * highs, and targets are managed as dynamic arrays during a
  * table load.
@@ -124,15 +106,15 @@ static int alloc_targets(struct dm_table *t, unsigned int num)
 	/*
 	 * Allocate both the target array and offset array at once.
 	 */
-	n_highs = (sector_t *) dm_vcalloc(num, sizeof(struct dm_target) +
-					  sizeof(sector_t));
+	n_highs = kvcalloc(num, sizeof(struct dm_target) + sizeof(sector_t),
+			   GFP_KERNEL);
 	if (!n_highs)
 		return -ENOMEM;
 
 	n_targets = (struct dm_target *) (n_highs + num);
 
 	memset(n_highs, -1, sizeof(*n_highs) * num);
-	vfree(t->highs);
+	kvfree(t->highs);
 
 	t->num_allocated = num;
 	t->highs = n_highs;
@@ -198,7 +180,7 @@ void dm_table_destroy(struct dm_table *t)
 
 	/* free the indexes */
 	if (t->depth >= 2)
-		vfree(t->index[t->depth - 2]);
+		kvfree(t->index[t->depth - 2]);
 
 	/* free the targets */
 	for (i = 0; i < t->num_targets; i++) {
@@ -210,7 +192,7 @@ void dm_table_destroy(struct dm_table *t)
 		dm_put_target_type(tgt->type);
 	}
 
-	vfree(t->highs);
+	kvfree(t->highs);
 
 	/* free the device list */
 	free_devices(&t->devices, t->md);
@@ -1077,7 +1059,7 @@ static int setup_indexes(struct dm_table *t)
 		total += t->counts[i];
 	}
 
-	indexes = (sector_t *) dm_vcalloc(total, (unsigned long) NODE_SIZE);
+	indexes = kvcalloc(total, NODE_SIZE, GFP_KERNEL);
 	if (!indexes)
 		return -ENOMEM;
 
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 5c641f930caf..ff700fb6ce1d 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -574,11 +574,6 @@ struct dm_table *dm_swap_table(struct mapped_device *md,
  */
 void dm_destroy_keyslot_manager(struct blk_keyslot_manager *ksm);
 
-/*
- * A wrapper around vmalloc.
- */
-void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size);
-
 /*-----------------------------------------------------------------
  * Macros.
  *---------------------------------------------------------------*/
-- 
cgit v1.2.3


From a9c81ccdf52dd73a20178c40bca34cf52991fdea Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Mon, 12 Apr 2021 07:30:57 -0700
Subject: perf/x86: Add structures for the attributes of Hybrid PMUs

Hybrid PMUs have different events and formats. In theory, Hybrid PMU
specific attributes should be maintained in the dedicated struct
x86_hybrid_pmu, but it wastes space because the events and formats are
similar among Hybrid PMUs.

To reduce duplication, all hybrid PMUs will share a group of attributes
in the following patch. To distinguish an attribute from different
Hybrid PMUs, a PMU aware attribute structure is introduced. A PMU type
is required for the attribute structure. The type is internal usage. It
is not visible in the sysfs API.

Hybrid PMUs may support the same event name, but with different event
encoding, e.g., the mem-loads event on an Atom PMU has different event
encoding from a Core PMU. This causes an issue if two attributes are
created for them. Current sysfs_update_group finds an attribute by
searching the attr name (aka event name). If two attributes have the
same event name, the first attribute will be replaced.
To address the issue, only one attribute is created for the event. The
event_str is extended and stores event encodings from all Hybrid PMUs.
Each event encoding is divided by ";". The order of the event encodings
must follow the order of the hybrid PMU index. The event_str is internal
usage as well. When a user wants to show the attribute of a Hybrid PMU,
only the corresponding part of the string is displayed.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
Link: https://lkml.kernel.org/r/1618237865-33448-18-git-send-email-kan.liang@linux.intel.com
---
 arch/x86/events/core.c       | 43 +++++++++++++++++++++++++++++++++++++++++++
 arch/x86/events/perf_event.h | 19 +++++++++++++++++++
 include/linux/perf_event.h   | 12 ++++++++++++
 3 files changed, 74 insertions(+)

(limited to 'include/linux')

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index bd465a806638..37ab109a4108 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1860,6 +1860,49 @@ ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr,
 			pmu_attr->event_str_noht);
 }
 
+ssize_t events_hybrid_sysfs_show(struct device *dev,
+				 struct device_attribute *attr,
+				 char *page)
+{
+	struct perf_pmu_events_hybrid_attr *pmu_attr =
+		container_of(attr, struct perf_pmu_events_hybrid_attr, attr);
+	struct x86_hybrid_pmu *pmu;
+	const char *str, *next_str;
+	int i;
+
+	if (hweight64(pmu_attr->pmu_type) == 1)
+		return sprintf(page, "%s", pmu_attr->event_str);
+
+	/*
+	 * Hybrid PMUs may support the same event name, but with different
+	 * event encoding, e.g., the mem-loads event on an Atom PMU has
+	 * different event encoding from a Core PMU.
+	 *
+	 * The event_str includes all event encodings. Each event encoding
+	 * is divided by ";". The order of the event encodings must follow
+	 * the order of the hybrid PMU index.
+	 */
+	pmu = container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu);
+
+	str = pmu_attr->event_str;
+	for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
+		if (!(x86_pmu.hybrid_pmu[i].cpu_type & pmu_attr->pmu_type))
+			continue;
+		if (x86_pmu.hybrid_pmu[i].cpu_type & pmu->cpu_type) {
+			next_str = strchr(str, ';');
+			if (next_str)
+				return scnprintf(page, next_str - str + 1, "%s", str);
+			else
+				return sprintf(page, "%s", str);
+		}
+		str = strchr(str, ';');
+		str++;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(events_hybrid_sysfs_show);
+
 EVENT_ATTR(cpu-cycles,			CPU_CYCLES		);
 EVENT_ATTR(instructions,		INSTRUCTIONS		);
 EVENT_ATTR(cache-references,		CACHE_REFERENCES	);
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 4282ce48c063..e2be927158d5 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -979,6 +979,22 @@ static struct perf_pmu_events_ht_attr event_attr_##v = {		\
 	.event_str_ht	= ht,						\
 }
 
+#define EVENT_ATTR_STR_HYBRID(_name, v, str, _pmu)			\
+static struct perf_pmu_events_hybrid_attr event_attr_##v = {		\
+	.attr		= __ATTR(_name, 0444, events_hybrid_sysfs_show, NULL),\
+	.id		= 0,						\
+	.event_str	= str,						\
+	.pmu_type	= _pmu,						\
+}
+
+#define FORMAT_HYBRID_PTR(_id) (&format_attr_hybrid_##_id.attr.attr)
+
+#define FORMAT_ATTR_HYBRID(_name, _pmu)					\
+static struct perf_pmu_format_hybrid_attr format_attr_hybrid_##_name = {\
+	.attr		= __ATTR_RO(_name),				\
+	.pmu_type	= _pmu,						\
+}
+
 struct pmu *x86_get_pmu(unsigned int cpu);
 extern struct x86_pmu x86_pmu __read_mostly;
 
@@ -1149,6 +1165,9 @@ ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
 			  char *page);
 ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr,
 			  char *page);
+ssize_t events_hybrid_sysfs_show(struct device *dev,
+				 struct device_attribute *attr,
+				 char *page);
 
 static inline bool fixed_counter_disabled(int i, struct pmu *pmu)
 {
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 8989b2b7268d..61b385154c4a 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1549,6 +1549,18 @@ struct perf_pmu_events_ht_attr {
 	const char				*event_str_noht;
 };
 
+struct perf_pmu_events_hybrid_attr {
+	struct device_attribute			attr;
+	u64					id;
+	const char				*event_str;
+	u64					pmu_type;
+};
+
+struct perf_pmu_format_hybrid_attr {
+	struct device_attribute			attr;
+	u64					pmu_type;
+};
+
 ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
 			      char *page);
 
-- 
cgit v1.2.3


From 55bcf6ef314ae8ba81bcd74aa760247b635ed47b Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Mon, 12 Apr 2021 07:31:01 -0700
Subject: perf: Extend PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE

Current Hardware events and Hardware cache events have special perf
types, PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE. The two types don't
pass the PMU type in the user interface. For a hybrid system, the perf
subsystem doesn't know which PMU the events belong to. The first capable
PMU will always be assigned to the events. The events never get a chance
to run on the other capable PMUs.

Extend the two types to become PMU aware types. The PMU type ID is
stored at attr.config[63:32].

Add a new PMU capability, PERF_PMU_CAP_EXTENDED_HW_TYPE, to indicate a
PMU which supports the extended PERF_TYPE_HARDWARE and
PERF_TYPE_HW_CACHE.

The PMU type is only required when searching a specific PMU. The PMU
specific codes will only be interested in the 'real' config value, which
is stored in the low 32 bit of the event->attr.config. Update the
event->attr.config in the generic code, so the PMU specific codes don't
need to calculate it separately.

If a user specifies a PMU type, but the PMU doesn't support the extended
type, error out.

If an event cannot be initialized in a PMU specified by a user, error
out immediately. Perf should not try to open it on other PMUs.

The new PMU capability is only set for the X86 hybrid PMUs for now.
Other architectures, e.g., ARM, may need it as well. The support on ARM
may be implemented later separately.

Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1618237865-33448-22-git-send-email-kan.liang@linux.intel.com
---
 arch/x86/events/core.c          |  1 +
 include/linux/perf_event.h      | 19 ++++++++++---------
 include/uapi/linux/perf_event.h | 15 +++++++++++++++
 kernel/events/core.c            | 19 ++++++++++++++++---
 4 files changed, 42 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 4f6595ee6359..3fe66b7aa721 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2173,6 +2173,7 @@ static int __init init_hw_perf_events(void)
 			hybrid_pmu->pmu.type = -1;
 			hybrid_pmu->pmu.attr_update = x86_pmu.attr_update;
 			hybrid_pmu->pmu.capabilities |= PERF_PMU_CAP_HETEROGENEOUS_CPUS;
+			hybrid_pmu->pmu.capabilities |= PERF_PMU_CAP_EXTENDED_HW_TYPE;
 
 			err = perf_pmu_register(&hybrid_pmu->pmu, hybrid_pmu->name,
 						(hybrid_pmu->cpu_type == hybrid_big) ? PERF_TYPE_RAW : -1);
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 61b385154c4a..a763928a0e41 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -260,15 +260,16 @@ struct perf_event;
 /**
  * pmu::capabilities flags
  */
-#define PERF_PMU_CAP_NO_INTERRUPT		0x01
-#define PERF_PMU_CAP_NO_NMI			0x02
-#define PERF_PMU_CAP_AUX_NO_SG			0x04
-#define PERF_PMU_CAP_EXTENDED_REGS		0x08
-#define PERF_PMU_CAP_EXCLUSIVE			0x10
-#define PERF_PMU_CAP_ITRACE			0x20
-#define PERF_PMU_CAP_HETEROGENEOUS_CPUS		0x40
-#define PERF_PMU_CAP_NO_EXCLUDE			0x80
-#define PERF_PMU_CAP_AUX_OUTPUT			0x100
+#define PERF_PMU_CAP_NO_INTERRUPT		0x0001
+#define PERF_PMU_CAP_NO_NMI			0x0002
+#define PERF_PMU_CAP_AUX_NO_SG			0x0004
+#define PERF_PMU_CAP_EXTENDED_REGS		0x0008
+#define PERF_PMU_CAP_EXCLUSIVE			0x0010
+#define PERF_PMU_CAP_ITRACE			0x0020
+#define PERF_PMU_CAP_HETEROGENEOUS_CPUS		0x0040
+#define PERF_PMU_CAP_NO_EXCLUDE			0x0080
+#define PERF_PMU_CAP_AUX_OUTPUT			0x0100
+#define PERF_PMU_CAP_EXTENDED_HW_TYPE		0x0200
 
 struct perf_output_handle;
 
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 0b58970bab6b..e54e639248c8 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -37,6 +37,21 @@ enum perf_type_id {
 	PERF_TYPE_MAX,				/* non-ABI */
 };
 
+/*
+ * attr.config layout for type PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE
+ * PERF_TYPE_HARDWARE:			0xEEEEEEEE000000AA
+ *					AA: hardware event ID
+ *					EEEEEEEE: PMU type ID
+ * PERF_TYPE_HW_CACHE:			0xEEEEEEEE00DDCCBB
+ *					BB: hardware cache ID
+ *					CC: hardware cache op ID
+ *					DD: hardware cache op result ID
+ *					EEEEEEEE: PMU type ID
+ * If the PMU type ID is 0, the PERF_TYPE_RAW will be applied.
+ */
+#define PERF_PMU_TYPE_SHIFT		32
+#define PERF_HW_EVENT_MASK		0xffffffff
+
 /*
  * Generalized performance event event_id types, used by the
  * attr.event_id parameter of the sys_perf_event_open()
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6f0723c711a9..928b166d888e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11220,6 +11220,7 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
 
 static struct pmu *perf_init_event(struct perf_event *event)
 {
+	bool extended_type = false;
 	int idx, type, ret;
 	struct pmu *pmu;
 
@@ -11238,16 +11239,27 @@ static struct pmu *perf_init_event(struct perf_event *event)
 	 * are often aliases for PERF_TYPE_RAW.
 	 */
 	type = event->attr.type;
-	if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE)
-		type = PERF_TYPE_RAW;
+	if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE) {
+		type = event->attr.config >> PERF_PMU_TYPE_SHIFT;
+		if (!type) {
+			type = PERF_TYPE_RAW;
+		} else {
+			extended_type = true;
+			event->attr.config &= PERF_HW_EVENT_MASK;
+		}
+	}
 
 again:
 	rcu_read_lock();
 	pmu = idr_find(&pmu_idr, type);
 	rcu_read_unlock();
 	if (pmu) {
+		if (event->attr.type != type && type != PERF_TYPE_RAW &&
+		    !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE))
+			goto fail;
+
 		ret = perf_try_init_event(pmu, event);
-		if (ret == -ENOENT && event->attr.type != type) {
+		if (ret == -ENOENT && event->attr.type != type && !extended_type) {
 			type = event->attr.type;
 			goto again;
 		}
@@ -11268,6 +11280,7 @@ again:
 			goto unlock;
 		}
 	}
+fail:
 	pmu = ERR_PTR(-ENOENT);
 unlock:
 	srcu_read_unlock(&pmus_srcu, idx);
-- 
cgit v1.2.3


From d9c9e4db186ab4d81f84e6f22b225d333b9424e3 Mon Sep 17 00:00:00 2001
From: Florent Revest <revest@chromium.org>
Date: Mon, 19 Apr 2021 17:52:38 +0200
Subject: bpf: Factorize bpf_trace_printk and bpf_seq_printf

Two helpers (trace_printk and seq_printf) have very similar
implementations of format string parsing and a third one is coming
(snprintf). To avoid code duplication and make the code easier to
maintain, this moves the operations associated with format string
parsing (validation and argument sanitization) into one generic
function.

The implementation of the two existing helpers already drifted quite a
bit so unifying them entailed a lot of changes:

- bpf_trace_printk always expected fmt[fmt_size] to be the terminating
  NULL character, this is no longer true, the first 0 is terminating.
- bpf_trace_printk now supports %% (which produces the percentage char).
- bpf_trace_printk now skips width formatting fields.
- bpf_trace_printk now supports the X modifier (capital hexadecimal).
- bpf_trace_printk now supports %pK, %px, %pB, %pi4, %pI4, %pi6 and %pI6
- argument casting on 32 bit has been simplified into one macro and
  using an enum instead of obscure int increments.

- bpf_seq_printf now uses bpf_trace_copy_string instead of
  strncpy_from_kernel_nofault and handles the %pks %pus specifiers.
- bpf_seq_printf now prints longs correctly on 32 bit architectures.

- both were changed to use a global per-cpu tmp buffer instead of one
  stack buffer for trace_printk and 6 small buffers for seq_printf.
- to avoid per-cpu buffer usage conflict, these helpers disable
  preemption while the per-cpu buffer is in use.
- both helpers now support the %ps and %pS specifiers to print symbols.

The implementation is also moved from bpf_trace.c to helpers.c because
the upcoming bpf_snprintf helper will be made available to all BPF
programs and will need it.

Signed-off-by: Florent Revest <revest@chromium.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210419155243.1632274-2-revest@chromium.org
---
 include/linux/bpf.h      |  20 +++
 kernel/bpf/helpers.c     | 256 ++++++++++++++++++++++++++++++++
 kernel/trace/bpf_trace.c | 371 +++++------------------------------------------
 3 files changed, 313 insertions(+), 334 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index ff8cd68c01b3..77d1d8c65b81 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2077,4 +2077,24 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
 struct btf_id_set;
 bool btf_id_set_contains(const struct btf_id_set *set, u32 id);
 
+enum bpf_printf_mod_type {
+	BPF_PRINTF_INT,
+	BPF_PRINTF_LONG,
+	BPF_PRINTF_LONG_LONG,
+};
+
+/* Workaround for getting va_list handling working with different argument type
+ * combinations generically for 32 and 64 bit archs.
+ */
+#define BPF_CAST_FMT_ARG(arg_nb, args, mod)				\
+	(mod[arg_nb] == BPF_PRINTF_LONG_LONG ||				\
+	 (mod[arg_nb] == BPF_PRINTF_LONG && __BITS_PER_LONG == 64)	\
+	  ? (u64)args[arg_nb]						\
+	  : (u32)args[arg_nb])
+
+int bpf_printf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
+		       u64 *final_args, enum bpf_printf_mod_type *mod,
+		       u32 num_args);
+void bpf_printf_cleanup(void);
+
 #endif /* _LINUX_BPF_H */
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index f306611c4ddf..9ca57eb1fc0d 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -669,6 +669,262 @@ const struct bpf_func_proto bpf_this_cpu_ptr_proto = {
 	.arg1_type	= ARG_PTR_TO_PERCPU_BTF_ID,
 };
 
+static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype,
+		size_t bufsz)
+{
+	void __user *user_ptr = (__force void __user *)unsafe_ptr;
+
+	buf[0] = 0;
+
+	switch (fmt_ptype) {
+	case 's':
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+		if ((unsigned long)unsafe_ptr < TASK_SIZE)
+			return strncpy_from_user_nofault(buf, user_ptr, bufsz);
+		fallthrough;
+#endif
+	case 'k':
+		return strncpy_from_kernel_nofault(buf, unsafe_ptr, bufsz);
+	case 'u':
+		return strncpy_from_user_nofault(buf, user_ptr, bufsz);
+	}
+
+	return -EINVAL;
+}
+
+/* Per-cpu temp buffers which can be used by printf-like helpers for %s or %p
+ */
+#define MAX_PRINTF_BUF_LEN	512
+
+struct bpf_printf_buf {
+	char tmp_buf[MAX_PRINTF_BUF_LEN];
+};
+static DEFINE_PER_CPU(struct bpf_printf_buf, bpf_printf_buf);
+static DEFINE_PER_CPU(int, bpf_printf_buf_used);
+
+static int try_get_fmt_tmp_buf(char **tmp_buf)
+{
+	struct bpf_printf_buf *bufs;
+	int used;
+
+	if (*tmp_buf)
+		return 0;
+
+	preempt_disable();
+	used = this_cpu_inc_return(bpf_printf_buf_used);
+	if (WARN_ON_ONCE(used > 1)) {
+		this_cpu_dec(bpf_printf_buf_used);
+		preempt_enable();
+		return -EBUSY;
+	}
+	bufs = this_cpu_ptr(&bpf_printf_buf);
+	*tmp_buf = bufs->tmp_buf;
+
+	return 0;
+}
+
+void bpf_printf_cleanup(void)
+{
+	if (this_cpu_read(bpf_printf_buf_used)) {
+		this_cpu_dec(bpf_printf_buf_used);
+		preempt_enable();
+	}
+}
+
+/*
+ * bpf_printf_prepare - Generic pass on format strings for printf-like helpers
+ *
+ * Returns a negative value if fmt is an invalid format string or 0 otherwise.
+ *
+ * This can be used in two ways:
+ * - Format string verification only: when final_args and mod are NULL
+ * - Arguments preparation: in addition to the above verification, it writes in
+ *   final_args a copy of raw_args where pointers from BPF have been sanitized
+ *   into pointers safe to use by snprintf. This also writes in the mod array
+ *   the size requirement of each argument, usable by BPF_CAST_FMT_ARG for ex.
+ *
+ * In argument preparation mode, if 0 is returned, safe temporary buffers are
+ * allocated and bpf_printf_cleanup should be called to free them after use.
+ */
+int bpf_printf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
+			u64 *final_args, enum bpf_printf_mod_type *mod,
+			u32 num_args)
+{
+	char *unsafe_ptr = NULL, *tmp_buf = NULL, *fmt_end;
+	size_t tmp_buf_len = MAX_PRINTF_BUF_LEN;
+	int err, i, num_spec = 0, copy_size;
+	enum bpf_printf_mod_type cur_mod;
+	u64 cur_arg;
+	char fmt_ptype;
+
+	if (!!final_args != !!mod)
+		return -EINVAL;
+
+	fmt_end = strnchr(fmt, fmt_size, 0);
+	if (!fmt_end)
+		return -EINVAL;
+	fmt_size = fmt_end - fmt;
+
+	for (i = 0; i < fmt_size; i++) {
+		if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) {
+			err = -EINVAL;
+			goto cleanup;
+		}
+
+		if (fmt[i] != '%')
+			continue;
+
+		if (fmt[i + 1] == '%') {
+			i++;
+			continue;
+		}
+
+		if (num_spec >= num_args) {
+			err = -EINVAL;
+			goto cleanup;
+		}
+
+		/* The string is zero-terminated so if fmt[i] != 0, we can
+		 * always access fmt[i + 1], in the worst case it will be a 0
+		 */
+		i++;
+
+		/* skip optional "[0 +-][num]" width formatting field */
+		while (fmt[i] == '0' || fmt[i] == '+'  || fmt[i] == '-' ||
+		       fmt[i] == ' ')
+			i++;
+		if (fmt[i] >= '1' && fmt[i] <= '9') {
+			i++;
+			while (fmt[i] >= '0' && fmt[i] <= '9')
+				i++;
+		}
+
+		if (fmt[i] == 'p') {
+			cur_mod = BPF_PRINTF_LONG;
+
+			if ((fmt[i + 1] == 'k' || fmt[i + 1] == 'u') &&
+			    fmt[i + 2] == 's') {
+				fmt_ptype = fmt[i + 1];
+				i += 2;
+				goto fmt_str;
+			}
+
+			if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) ||
+			    ispunct(fmt[i + 1]) || fmt[i + 1] == 'K' ||
+			    fmt[i + 1] == 'x' || fmt[i + 1] == 'B' ||
+			    fmt[i + 1] == 's' || fmt[i + 1] == 'S') {
+				/* just kernel pointers */
+				if (final_args)
+					cur_arg = raw_args[num_spec];
+				goto fmt_next;
+			}
+
+			/* only support "%pI4", "%pi4", "%pI6" and "%pi6". */
+			if ((fmt[i + 1] != 'i' && fmt[i + 1] != 'I') ||
+			    (fmt[i + 2] != '4' && fmt[i + 2] != '6')) {
+				err = -EINVAL;
+				goto cleanup;
+			}
+
+			i += 2; /* fmt[i] is now the '4' or '6' */
+			if (!final_args)
+				goto fmt_next;
+
+			if (try_get_fmt_tmp_buf(&tmp_buf)) {
+				err = -EBUSY;
+				goto out;
+			}
+
+			copy_size = (fmt[i] == '4') ? 4 : 16;
+			if (tmp_buf_len < copy_size) {
+				err = -ENOSPC;
+				goto cleanup;
+			}
+
+			unsafe_ptr = (char *)(long)raw_args[num_spec];
+			err = copy_from_kernel_nofault(tmp_buf, unsafe_ptr,
+						       copy_size);
+			if (err < 0)
+				memset(tmp_buf, 0, copy_size);
+			cur_arg = (u64)(long)tmp_buf;
+			tmp_buf += copy_size;
+			tmp_buf_len -= copy_size;
+
+			goto fmt_next;
+		} else if (fmt[i] == 's') {
+			cur_mod = BPF_PRINTF_LONG;
+			fmt_ptype = fmt[i];
+fmt_str:
+			if (fmt[i + 1] != 0 &&
+			    !isspace(fmt[i + 1]) &&
+			    !ispunct(fmt[i + 1])) {
+				err = -EINVAL;
+				goto cleanup;
+			}
+
+			if (!final_args)
+				goto fmt_next;
+
+			if (try_get_fmt_tmp_buf(&tmp_buf)) {
+				err = -EBUSY;
+				goto out;
+			}
+
+			if (!tmp_buf_len) {
+				err = -ENOSPC;
+				goto cleanup;
+			}
+
+			unsafe_ptr = (char *)(long)raw_args[num_spec];
+			err = bpf_trace_copy_string(tmp_buf, unsafe_ptr,
+						    fmt_ptype, tmp_buf_len);
+			if (err < 0) {
+				tmp_buf[0] = '\0';
+				err = 1;
+			}
+
+			cur_arg = (u64)(long)tmp_buf;
+			tmp_buf += err;
+			tmp_buf_len -= err;
+
+			goto fmt_next;
+		}
+
+		cur_mod = BPF_PRINTF_INT;
+
+		if (fmt[i] == 'l') {
+			cur_mod = BPF_PRINTF_LONG;
+			i++;
+		}
+		if (fmt[i] == 'l') {
+			cur_mod = BPF_PRINTF_LONG_LONG;
+			i++;
+		}
+
+		if (fmt[i] != 'i' && fmt[i] != 'd' && fmt[i] != 'u' &&
+		    fmt[i] != 'x' && fmt[i] != 'X') {
+			err = -EINVAL;
+			goto cleanup;
+		}
+
+		if (final_args)
+			cur_arg = raw_args[num_spec];
+fmt_next:
+		if (final_args) {
+			mod[num_spec] = cur_mod;
+			final_args[num_spec] = cur_arg;
+		}
+		num_spec++;
+	}
+
+	err = 0;
+cleanup:
+	if (err)
+		bpf_printf_cleanup();
+out:
+	return err;
+}
+
 const struct bpf_func_proto bpf_get_current_task_proto __weak;
 const struct bpf_func_proto bpf_probe_read_user_proto __weak;
 const struct bpf_func_proto bpf_probe_read_user_str_proto __weak;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 0d23755c2747..a13f8644b357 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -372,188 +372,38 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
 	return &bpf_probe_write_user_proto;
 }
 
-static void bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype,
-		size_t bufsz)
-{
-	void __user *user_ptr = (__force void __user *)unsafe_ptr;
-
-	buf[0] = 0;
-
-	switch (fmt_ptype) {
-	case 's':
-#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
-		if ((unsigned long)unsafe_ptr < TASK_SIZE) {
-			strncpy_from_user_nofault(buf, user_ptr, bufsz);
-			break;
-		}
-		fallthrough;
-#endif
-	case 'k':
-		strncpy_from_kernel_nofault(buf, unsafe_ptr, bufsz);
-		break;
-	case 'u':
-		strncpy_from_user_nofault(buf, user_ptr, bufsz);
-		break;
-	}
-}
-
 static DEFINE_RAW_SPINLOCK(trace_printk_lock);
 
-#define BPF_TRACE_PRINTK_SIZE   1024
+#define MAX_TRACE_PRINTK_VARARGS	3
+#define BPF_TRACE_PRINTK_SIZE		1024
 
-static __printf(1, 0) int bpf_do_trace_printk(const char *fmt, ...)
+BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
+	   u64, arg2, u64, arg3)
 {
+	u64 args[MAX_TRACE_PRINTK_VARARGS] = { arg1, arg2, arg3 };
+	enum bpf_printf_mod_type mod[MAX_TRACE_PRINTK_VARARGS];
 	static char buf[BPF_TRACE_PRINTK_SIZE];
 	unsigned long flags;
-	va_list ap;
 	int ret;
 
-	raw_spin_lock_irqsave(&trace_printk_lock, flags);
-	va_start(ap, fmt);
-	ret = vsnprintf(buf, sizeof(buf), fmt, ap);
-	va_end(ap);
-	/* vsnprintf() will not append null for zero-length strings */
+	ret = bpf_printf_prepare(fmt, fmt_size, args, args, mod,
+				 MAX_TRACE_PRINTK_VARARGS);
+	if (ret < 0)
+		return ret;
+
+	ret = snprintf(buf, sizeof(buf), fmt, BPF_CAST_FMT_ARG(0, args, mod),
+		BPF_CAST_FMT_ARG(1, args, mod), BPF_CAST_FMT_ARG(2, args, mod));
+	/* snprintf() will not append null for zero-length strings */
 	if (ret == 0)
 		buf[0] = '\0';
+
+	raw_spin_lock_irqsave(&trace_printk_lock, flags);
 	trace_bpf_trace_printk(buf);
 	raw_spin_unlock_irqrestore(&trace_printk_lock, flags);
 
-	return ret;
-}
-
-/*
- * Only limited trace_printk() conversion specifiers allowed:
- * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %pB %pks %pus %s
- */
-BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
-	   u64, arg2, u64, arg3)
-{
-	int i, mod[3] = {}, fmt_cnt = 0;
-	char buf[64], fmt_ptype;
-	void *unsafe_ptr = NULL;
-	bool str_seen = false;
+	bpf_printf_cleanup();
 
-	/*
-	 * bpf_check()->check_func_arg()->check_stack_boundary()
-	 * guarantees that fmt points to bpf program stack,
-	 * fmt_size bytes of it were initialized and fmt_size > 0
-	 */
-	if (fmt[--fmt_size] != 0)
-		return -EINVAL;
-
-	/* check format string for allowed specifiers */
-	for (i = 0; i < fmt_size; i++) {
-		if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i]))
-			return -EINVAL;
-
-		if (fmt[i] != '%')
-			continue;
-
-		if (fmt_cnt >= 3)
-			return -EINVAL;
-
-		/* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */
-		i++;
-		if (fmt[i] == 'l') {
-			mod[fmt_cnt]++;
-			i++;
-		} else if (fmt[i] == 'p') {
-			mod[fmt_cnt]++;
-			if ((fmt[i + 1] == 'k' ||
-			     fmt[i + 1] == 'u') &&
-			    fmt[i + 2] == 's') {
-				fmt_ptype = fmt[i + 1];
-				i += 2;
-				goto fmt_str;
-			}
-
-			if (fmt[i + 1] == 'B') {
-				i++;
-				goto fmt_next;
-			}
-
-			/* disallow any further format extensions */
-			if (fmt[i + 1] != 0 &&
-			    !isspace(fmt[i + 1]) &&
-			    !ispunct(fmt[i + 1]))
-				return -EINVAL;
-
-			goto fmt_next;
-		} else if (fmt[i] == 's') {
-			mod[fmt_cnt]++;
-			fmt_ptype = fmt[i];
-fmt_str:
-			if (str_seen)
-				/* allow only one '%s' per fmt string */
-				return -EINVAL;
-			str_seen = true;
-
-			if (fmt[i + 1] != 0 &&
-			    !isspace(fmt[i + 1]) &&
-			    !ispunct(fmt[i + 1]))
-				return -EINVAL;
-
-			switch (fmt_cnt) {
-			case 0:
-				unsafe_ptr = (void *)(long)arg1;
-				arg1 = (long)buf;
-				break;
-			case 1:
-				unsafe_ptr = (void *)(long)arg2;
-				arg2 = (long)buf;
-				break;
-			case 2:
-				unsafe_ptr = (void *)(long)arg3;
-				arg3 = (long)buf;
-				break;
-			}
-
-			bpf_trace_copy_string(buf, unsafe_ptr, fmt_ptype,
-					sizeof(buf));
-			goto fmt_next;
-		}
-
-		if (fmt[i] == 'l') {
-			mod[fmt_cnt]++;
-			i++;
-		}
-
-		if (fmt[i] != 'i' && fmt[i] != 'd' &&
-		    fmt[i] != 'u' && fmt[i] != 'x')
-			return -EINVAL;
-fmt_next:
-		fmt_cnt++;
-	}
-
-/* Horrid workaround for getting va_list handling working with different
- * argument type combinations generically for 32 and 64 bit archs.
- */
-#define __BPF_TP_EMIT()	__BPF_ARG3_TP()
-#define __BPF_TP(...)							\
-	bpf_do_trace_printk(fmt, ##__VA_ARGS__)
-
-#define __BPF_ARG1_TP(...)						\
-	((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64))	\
-	  ? __BPF_TP(arg1, ##__VA_ARGS__)				\
-	  : ((mod[0] == 1 || (mod[0] == 0 && __BITS_PER_LONG == 32))	\
-	      ? __BPF_TP((long)arg1, ##__VA_ARGS__)			\
-	      : __BPF_TP((u32)arg1, ##__VA_ARGS__)))
-
-#define __BPF_ARG2_TP(...)						\
-	((mod[1] == 2 || (mod[1] == 1 && __BITS_PER_LONG == 64))	\
-	  ? __BPF_ARG1_TP(arg2, ##__VA_ARGS__)				\
-	  : ((mod[1] == 1 || (mod[1] == 0 && __BITS_PER_LONG == 32))	\
-	      ? __BPF_ARG1_TP((long)arg2, ##__VA_ARGS__)		\
-	      : __BPF_ARG1_TP((u32)arg2, ##__VA_ARGS__)))
-
-#define __BPF_ARG3_TP(...)						\
-	((mod[2] == 2 || (mod[2] == 1 && __BITS_PER_LONG == 64))	\
-	  ? __BPF_ARG2_TP(arg3, ##__VA_ARGS__)				\
-	  : ((mod[2] == 1 || (mod[2] == 0 && __BITS_PER_LONG == 32))	\
-	      ? __BPF_ARG2_TP((long)arg3, ##__VA_ARGS__)		\
-	      : __BPF_ARG2_TP((u32)arg3, ##__VA_ARGS__)))
-
-	return __BPF_TP_EMIT();
+	return ret;
 }
 
 static const struct bpf_func_proto bpf_trace_printk_proto = {
@@ -581,184 +431,37 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
 }
 
 #define MAX_SEQ_PRINTF_VARARGS		12
-#define MAX_SEQ_PRINTF_MAX_MEMCPY	6
-#define MAX_SEQ_PRINTF_STR_LEN		128
-
-struct bpf_seq_printf_buf {
-	char buf[MAX_SEQ_PRINTF_MAX_MEMCPY][MAX_SEQ_PRINTF_STR_LEN];
-};
-static DEFINE_PER_CPU(struct bpf_seq_printf_buf, bpf_seq_printf_buf);
-static DEFINE_PER_CPU(int, bpf_seq_printf_buf_used);
 
 BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size,
 	   const void *, data, u32, data_len)
 {
-	int err = -EINVAL, fmt_cnt = 0, memcpy_cnt = 0;
-	int i, buf_used, copy_size, num_args;
-	u64 params[MAX_SEQ_PRINTF_VARARGS];
-	struct bpf_seq_printf_buf *bufs;
-	const u64 *args = data;
-
-	buf_used = this_cpu_inc_return(bpf_seq_printf_buf_used);
-	if (WARN_ON_ONCE(buf_used > 1)) {
-		err = -EBUSY;
-		goto out;
-	}
-
-	bufs = this_cpu_ptr(&bpf_seq_printf_buf);
-
-	/*
-	 * bpf_check()->check_func_arg()->check_stack_boundary()
-	 * guarantees that fmt points to bpf program stack,
-	 * fmt_size bytes of it were initialized and fmt_size > 0
-	 */
-	if (fmt[--fmt_size] != 0)
-		goto out;
-
-	if (data_len & 7)
-		goto out;
-
-	for (i = 0; i < fmt_size; i++) {
-		if (fmt[i] == '%') {
-			if (fmt[i + 1] == '%')
-				i++;
-			else if (!data || !data_len)
-				goto out;
-		}
-	}
+	enum bpf_printf_mod_type mod[MAX_SEQ_PRINTF_VARARGS];
+	u64 args[MAX_SEQ_PRINTF_VARARGS];
+	int err, num_args;
 
+	if (data_len & 7 || data_len > MAX_SEQ_PRINTF_VARARGS * 8 ||
+	    (data_len && !data))
+		return -EINVAL;
 	num_args = data_len / 8;
 
-	/* check format string for allowed specifiers */
-	for (i = 0; i < fmt_size; i++) {
-		/* only printable ascii for now. */
-		if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) {
-			err = -EINVAL;
-			goto out;
-		}
-
-		if (fmt[i] != '%')
-			continue;
-
-		if (fmt[i + 1] == '%') {
-			i++;
-			continue;
-		}
-
-		if (fmt_cnt >= MAX_SEQ_PRINTF_VARARGS) {
-			err = -E2BIG;
-			goto out;
-		}
-
-		if (fmt_cnt >= num_args) {
-			err = -EINVAL;
-			goto out;
-		}
-
-		/* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */
-		i++;
-
-		/* skip optional "[0 +-][num]" width formating field */
-		while (fmt[i] == '0' || fmt[i] == '+'  || fmt[i] == '-' ||
-		       fmt[i] == ' ')
-			i++;
-		if (fmt[i] >= '1' && fmt[i] <= '9') {
-			i++;
-			while (fmt[i] >= '0' && fmt[i] <= '9')
-				i++;
-		}
-
-		if (fmt[i] == 's') {
-			void *unsafe_ptr;
-
-			/* try our best to copy */
-			if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) {
-				err = -E2BIG;
-				goto out;
-			}
-
-			unsafe_ptr = (void *)(long)args[fmt_cnt];
-			err = strncpy_from_kernel_nofault(bufs->buf[memcpy_cnt],
-					unsafe_ptr, MAX_SEQ_PRINTF_STR_LEN);
-			if (err < 0)
-				bufs->buf[memcpy_cnt][0] = '\0';
-			params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt];
-
-			fmt_cnt++;
-			memcpy_cnt++;
-			continue;
-		}
-
-		if (fmt[i] == 'p') {
-			if (fmt[i + 1] == 0 ||
-			    fmt[i + 1] == 'K' ||
-			    fmt[i + 1] == 'x' ||
-			    fmt[i + 1] == 'B') {
-				/* just kernel pointers */
-				params[fmt_cnt] = args[fmt_cnt];
-				fmt_cnt++;
-				continue;
-			}
-
-			/* only support "%pI4", "%pi4", "%pI6" and "%pi6". */
-			if (fmt[i + 1] != 'i' && fmt[i + 1] != 'I') {
-				err = -EINVAL;
-				goto out;
-			}
-			if (fmt[i + 2] != '4' && fmt[i + 2] != '6') {
-				err = -EINVAL;
-				goto out;
-			}
-
-			if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) {
-				err = -E2BIG;
-				goto out;
-			}
-
-
-			copy_size = (fmt[i + 2] == '4') ? 4 : 16;
-
-			err = copy_from_kernel_nofault(bufs->buf[memcpy_cnt],
-						(void *) (long) args[fmt_cnt],
-						copy_size);
-			if (err < 0)
-				memset(bufs->buf[memcpy_cnt], 0, copy_size);
-			params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt];
-
-			i += 2;
-			fmt_cnt++;
-			memcpy_cnt++;
-			continue;
-		}
-
-		if (fmt[i] == 'l') {
-			i++;
-			if (fmt[i] == 'l')
-				i++;
-		}
-
-		if (fmt[i] != 'i' && fmt[i] != 'd' &&
-		    fmt[i] != 'u' && fmt[i] != 'x' &&
-		    fmt[i] != 'X') {
-			err = -EINVAL;
-			goto out;
-		}
-
-		params[fmt_cnt] = args[fmt_cnt];
-		fmt_cnt++;
-	}
+	err = bpf_printf_prepare(fmt, fmt_size, data, args, mod, num_args);
+	if (err < 0)
+		return err;
 
 	/* Maximumly we can have MAX_SEQ_PRINTF_VARARGS parameter, just give
 	 * all of them to seq_printf().
 	 */
-	seq_printf(m, fmt, params[0], params[1], params[2], params[3],
-		   params[4], params[5], params[6], params[7], params[8],
-		   params[9], params[10], params[11]);
+	seq_printf(m, fmt, BPF_CAST_FMT_ARG(0, args, mod),
+		BPF_CAST_FMT_ARG(1, args, mod), BPF_CAST_FMT_ARG(2, args, mod),
+		BPF_CAST_FMT_ARG(3, args, mod), BPF_CAST_FMT_ARG(4, args, mod),
+		BPF_CAST_FMT_ARG(5, args, mod), BPF_CAST_FMT_ARG(6, args, mod),
+		BPF_CAST_FMT_ARG(7, args, mod), BPF_CAST_FMT_ARG(8, args, mod),
+		BPF_CAST_FMT_ARG(9, args, mod), BPF_CAST_FMT_ARG(10, args, mod),
+		BPF_CAST_FMT_ARG(11, args, mod));
 
-	err = seq_has_overflowed(m) ? -EOVERFLOW : 0;
-out:
-	this_cpu_dec(bpf_seq_printf_buf_used);
-	return err;
+	bpf_printf_cleanup();
+
+	return seq_has_overflowed(m) ? -EOVERFLOW : 0;
 }
 
 BTF_ID_LIST_SINGLE(btf_seq_file_ids, struct, seq_file)
-- 
cgit v1.2.3


From fff13c4bb646ef849fd74ced87eef54340d28c21 Mon Sep 17 00:00:00 2001
From: Florent Revest <revest@chromium.org>
Date: Mon, 19 Apr 2021 17:52:39 +0200
Subject: bpf: Add a ARG_PTR_TO_CONST_STR argument type

This type provides the guarantee that an argument is going to be a const
pointer to somewhere in a read-only map value. It also checks that this
pointer is followed by a zero character before the end of the map value.

Signed-off-by: Florent Revest <revest@chromium.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210419155243.1632274-3-revest@chromium.org
---
 include/linux/bpf.h   |  1 +
 kernel/bpf/verifier.c | 41 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 77d1d8c65b81..c160526fc8bf 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -309,6 +309,7 @@ enum bpf_arg_type {
 	ARG_PTR_TO_PERCPU_BTF_ID,	/* pointer to in-kernel percpu type */
 	ARG_PTR_TO_FUNC,	/* pointer to a bpf program function */
 	ARG_PTR_TO_STACK_OR_NULL,	/* pointer to stack or NULL */
+	ARG_PTR_TO_CONST_STR,	/* pointer to a null terminated read-only string */
 	__BPF_ARG_TYPE_MAX,
 };
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 852541a435ef..5f46dd6f3383 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4787,6 +4787,7 @@ static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALU
 static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } };
 static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } };
 static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } };
+static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } };
 
 static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
 	[ARG_PTR_TO_MAP_KEY]		= &map_key_value_types,
@@ -4817,6 +4818,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
 	[ARG_PTR_TO_PERCPU_BTF_ID]	= &percpu_btf_ptr_types,
 	[ARG_PTR_TO_FUNC]		= &func_ptr_types,
 	[ARG_PTR_TO_STACK_OR_NULL]	= &stack_ptr_types,
+	[ARG_PTR_TO_CONST_STR]		= &const_str_ptr_types,
 };
 
 static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
@@ -5067,6 +5069,45 @@ skip_type_check:
 		if (err)
 			return err;
 		err = check_ptr_alignment(env, reg, 0, size, true);
+	} else if (arg_type == ARG_PTR_TO_CONST_STR) {
+		struct bpf_map *map = reg->map_ptr;
+		int map_off;
+		u64 map_addr;
+		char *str_ptr;
+
+		if (reg->type != PTR_TO_MAP_VALUE || !map ||
+		    !bpf_map_is_rdonly(map)) {
+			verbose(env, "R%d does not point to a readonly map'\n", regno);
+			return -EACCES;
+		}
+
+		if (!tnum_is_const(reg->var_off)) {
+			verbose(env, "R%d is not a constant address'\n", regno);
+			return -EACCES;
+		}
+
+		if (!map->ops->map_direct_value_addr) {
+			verbose(env, "no direct value access support for this map type\n");
+			return -EACCES;
+		}
+
+		err = check_map_access(env, regno, reg->off,
+				       map->value_size - reg->off, false);
+		if (err)
+			return err;
+
+		map_off = reg->off + reg->var_off.value;
+		err = map->ops->map_direct_value_addr(map, &map_addr, map_off);
+		if (err) {
+			verbose(env, "direct value access on string failed\n");
+			return err;
+		}
+
+		str_ptr = (char *)(long)(map_addr);
+		if (!strnchr(str_ptr + map_off, map->value_size - map_off, 0)) {
+			verbose(env, "string is not zero-terminated\n");
+			return -EINVAL;
+		}
 	}
 
 	return err;
-- 
cgit v1.2.3


From 7b15523a989b63927c2bb08e9b5b0bbc10b58bef Mon Sep 17 00:00:00 2001
From: Florent Revest <revest@chromium.org>
Date: Mon, 19 Apr 2021 17:52:40 +0200
Subject: bpf: Add a bpf_snprintf helper

The implementation takes inspiration from the existing bpf_trace_printk
helper but there are a few differences:

To allow for a large number of format-specifiers, parameters are
provided in an array, like in bpf_seq_printf.

Because the output string takes two arguments and the array of
parameters also takes two arguments, the format string needs to fit in
one argument. Thankfully, ARG_PTR_TO_CONST_STR is guaranteed to point to
a zero-terminated read-only map so we don't need a format string length
arg.

Because the format-string is known at verification time, we also do
a first pass of format string validation in the verifier logic. This
makes debugging easier.

Signed-off-by: Florent Revest <revest@chromium.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210419155243.1632274-4-revest@chromium.org
---
 include/linux/bpf.h            |  1 +
 include/uapi/linux/bpf.h       | 28 +++++++++++++++++++++++
 kernel/bpf/helpers.c           | 50 ++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c          | 41 ++++++++++++++++++++++++++++++++++
 kernel/trace/bpf_trace.c       |  2 ++
 tools/include/uapi/linux/bpf.h | 28 +++++++++++++++++++++++
 6 files changed, 150 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c160526fc8bf..f8a45f109e96 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1953,6 +1953,7 @@ extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto;
 extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto;
 extern const struct bpf_func_proto bpf_copy_from_user_proto;
 extern const struct bpf_func_proto bpf_snprintf_btf_proto;
+extern const struct bpf_func_proto bpf_snprintf_proto;
 extern const struct bpf_func_proto bpf_per_cpu_ptr_proto;
 extern const struct bpf_func_proto bpf_this_cpu_ptr_proto;
 extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index df164a44bb41..ec6d85a81744 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -4708,6 +4708,33 @@ union bpf_attr {
  *	Return
  *		The number of traversed map elements for success, **-EINVAL** for
  *		invalid **flags**.
+ *
+ * long bpf_snprintf(char *str, u32 str_size, const char *fmt, u64 *data, u32 data_len)
+ *	Description
+ *		Outputs a string into the **str** buffer of size **str_size**
+ *		based on a format string stored in a read-only map pointed by
+ *		**fmt**.
+ *
+ *		Each format specifier in **fmt** corresponds to one u64 element
+ *		in the **data** array. For strings and pointers where pointees
+ *		are accessed, only the pointer values are stored in the *data*
+ *		array. The *data_len* is the size of *data* in bytes.
+ *
+ *		Formats **%s** and **%p{i,I}{4,6}** require to read kernel
+ *		memory. Reading kernel memory may fail due to either invalid
+ *		address or valid address but requiring a major memory fault. If
+ *		reading kernel memory fails, the string for **%s** will be an
+ *		empty string, and the ip address for **%p{i,I}{4,6}** will be 0.
+ *		Not returning error to bpf program is consistent with what
+ *		**bpf_trace_printk**\ () does for now.
+ *
+ *	Return
+ *		The strictly positive length of the formatted string, including
+ *		the trailing zero character. If the return value is greater than
+ *		**str_size**, **str** contains a truncated string, guaranteed to
+ *		be zero-terminated except when **str_size** is 0.
+ *
+ *		Or **-EBUSY** if the per-CPU memory copy buffer is busy.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -4875,6 +4902,7 @@ union bpf_attr {
 	FN(sock_from_file),		\
 	FN(check_mtu),			\
 	FN(for_each_map_elem),		\
+	FN(snprintf),			\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 9ca57eb1fc0d..85b26ca5aacd 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -925,6 +925,54 @@ out:
 	return err;
 }
 
+#define MAX_SNPRINTF_VARARGS		12
+
+BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt,
+	   const void *, data, u32, data_len)
+{
+	enum bpf_printf_mod_type mod[MAX_SNPRINTF_VARARGS];
+	u64 args[MAX_SNPRINTF_VARARGS];
+	int err, num_args;
+
+	if (data_len % 8 || data_len > MAX_SNPRINTF_VARARGS * 8 ||
+	    (data_len && !data))
+		return -EINVAL;
+	num_args = data_len / 8;
+
+	/* ARG_PTR_TO_CONST_STR guarantees that fmt is zero-terminated so we
+	 * can safely give an unbounded size.
+	 */
+	err = bpf_printf_prepare(fmt, UINT_MAX, data, args, mod, num_args);
+	if (err < 0)
+		return err;
+
+	/* We can have at most MAX_SNPRINTF_VARARGS parameters, so just give
+	 * all of them to snprintf().
+	 */
+	err = snprintf(str, str_size, fmt, BPF_CAST_FMT_ARG(0, args, mod),
+		BPF_CAST_FMT_ARG(1, args, mod), BPF_CAST_FMT_ARG(2, args, mod),
+		BPF_CAST_FMT_ARG(3, args, mod), BPF_CAST_FMT_ARG(4, args, mod),
+		BPF_CAST_FMT_ARG(5, args, mod), BPF_CAST_FMT_ARG(6, args, mod),
+		BPF_CAST_FMT_ARG(7, args, mod), BPF_CAST_FMT_ARG(8, args, mod),
+		BPF_CAST_FMT_ARG(9, args, mod), BPF_CAST_FMT_ARG(10, args, mod),
+		BPF_CAST_FMT_ARG(11, args, mod));
+
+	bpf_printf_cleanup();
+
+	return err + 1;
+}
+
+const struct bpf_func_proto bpf_snprintf_proto = {
+	.func		= bpf_snprintf,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_MEM_OR_NULL,
+	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg3_type	= ARG_PTR_TO_CONST_STR,
+	.arg4_type	= ARG_PTR_TO_MEM_OR_NULL,
+	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
+};
+
 const struct bpf_func_proto bpf_get_current_task_proto __weak;
 const struct bpf_func_proto bpf_probe_read_user_proto __weak;
 const struct bpf_func_proto bpf_probe_read_user_str_proto __weak;
@@ -1013,6 +1061,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 		return &bpf_probe_read_kernel_str_proto;
 	case BPF_FUNC_snprintf_btf:
 		return &bpf_snprintf_btf_proto;
+	case BPF_FUNC_snprintf:
+		return &bpf_snprintf_proto;
 	default:
 		return NULL;
 	}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5f46dd6f3383..994ef36c5f60 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5918,6 +5918,41 @@ static int check_reference_leak(struct bpf_verifier_env *env)
 	return state->acquired_refs ? -EINVAL : 0;
 }
 
+static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
+				   struct bpf_reg_state *regs)
+{
+	struct bpf_reg_state *fmt_reg = &regs[BPF_REG_3];
+	struct bpf_reg_state *data_len_reg = &regs[BPF_REG_5];
+	struct bpf_map *fmt_map = fmt_reg->map_ptr;
+	int err, fmt_map_off, num_args;
+	u64 fmt_addr;
+	char *fmt;
+
+	/* data must be an array of u64 */
+	if (data_len_reg->var_off.value % 8)
+		return -EINVAL;
+	num_args = data_len_reg->var_off.value / 8;
+
+	/* fmt being ARG_PTR_TO_CONST_STR guarantees that var_off is const
+	 * and map_direct_value_addr is set.
+	 */
+	fmt_map_off = fmt_reg->off + fmt_reg->var_off.value;
+	err = fmt_map->ops->map_direct_value_addr(fmt_map, &fmt_addr,
+						  fmt_map_off);
+	if (err)
+		return err;
+	fmt = (char *)(long)fmt_addr + fmt_map_off;
+
+	/* fmt, i.e. fmt_addr + fmt_map_off, is also guaranteed to be
+	 * NUL-terminated, so we can focus on validating the format specifiers.
+	 */
+	err = bpf_printf_prepare(fmt, UINT_MAX, NULL, NULL, NULL, num_args);
+	if (err < 0)
+		verbose(env, "Invalid format string\n");
+
+	return err;
+}
+
 static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			     int *insn_idx_p)
 {
@@ -6032,6 +6067,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 			return -EINVAL;
 	}
 
+	if (func_id == BPF_FUNC_snprintf) {
+		err = check_bpf_snprintf_call(env, regs);
+		if (err < 0)
+			return err;
+	}
+
 	/* reset caller saved regs */
 	for (i = 0; i < CALLER_SAVED_REGS; i++) {
 		mark_reg_not_init(env, regs, caller_saved[i]);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index a13f8644b357..2a8bcdc927c7 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1076,6 +1076,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_task_storage_delete_proto;
 	case BPF_FUNC_for_each_map_elem:
 		return &bpf_for_each_map_elem_proto;
+	case BPF_FUNC_snprintf:
+		return &bpf_snprintf_proto;
 	default:
 		return NULL;
 	}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index df164a44bb41..ec6d85a81744 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -4708,6 +4708,33 @@ union bpf_attr {
  *	Return
  *		The number of traversed map elements for success, **-EINVAL** for
  *		invalid **flags**.
+ *
+ * long bpf_snprintf(char *str, u32 str_size, const char *fmt, u64 *data, u32 data_len)
+ *	Description
+ *		Outputs a string into the **str** buffer of size **str_size**
+ *		based on a format string stored in a read-only map pointed by
+ *		**fmt**.
+ *
+ *		Each format specifier in **fmt** corresponds to one u64 element
+ *		in the **data** array. For strings and pointers where pointees
+ *		are accessed, only the pointer values are stored in the *data*
+ *		array. The *data_len* is the size of *data* in bytes.
+ *
+ *		Formats **%s** and **%p{i,I}{4,6}** require to read kernel
+ *		memory. Reading kernel memory may fail due to either invalid
+ *		address or valid address but requiring a major memory fault. If
+ *		reading kernel memory fails, the string for **%s** will be an
+ *		empty string, and the ip address for **%p{i,I}{4,6}** will be 0.
+ *		Not returning error to bpf program is consistent with what
+ *		**bpf_trace_printk**\ () does for now.
+ *
+ *	Return
+ *		The strictly positive length of the formatted string, including
+ *		the trailing zero character. If the return value is greater than
+ *		**str_size**, **str** contains a truncated string, guaranteed to
+ *		be zero-terminated except when **str_size** is 0.
+ *
+ *		Or **-EBUSY** if the per-CPU memory copy buffer is busy.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -4875,6 +4902,7 @@ union bpf_attr {
 	FN(sock_from_file),		\
 	FN(check_mtu),			\
 	FN(for_each_map_elem),		\
+	FN(snprintf),			\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
-- 
cgit v1.2.3


From c6400e3fc3fa821a26a58cf867331e0877a4c56b Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sat, 17 Apr 2021 14:38:07 +0300
Subject: netlink: simplify nl_set_extack_cookie_u64(),
 nl_set_extack_cookie_u32()

Taking address of a function argument directly works just fine.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netlink.h | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 0bcf98098c5a..61b1c7fcc401 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -129,23 +129,19 @@ struct netlink_ext_ack {
 static inline void nl_set_extack_cookie_u64(struct netlink_ext_ack *extack,
 					    u64 cookie)
 {
-	u64 __cookie = cookie;
-
 	if (!extack)
 		return;
-	memcpy(extack->cookie, &__cookie, sizeof(__cookie));
-	extack->cookie_len = sizeof(__cookie);
+	memcpy(extack->cookie, &cookie, sizeof(cookie));
+	extack->cookie_len = sizeof(cookie);
 }
 
 static inline void nl_set_extack_cookie_u32(struct netlink_ext_ack *extack,
 					    u32 cookie)
 {
-	u32 __cookie = cookie;
-
 	if (!extack)
 		return;
-	memcpy(extack->cookie, &__cookie, sizeof(__cookie));
-	extack->cookie_len = sizeof(__cookie);
+	memcpy(extack->cookie, &cookie, sizeof(cookie));
+	extack->cookie_len = sizeof(cookie);
 }
 
 void netlink_kernel_release(struct sock *sk);
-- 
cgit v1.2.3


From da702f34e3cc4b6b87ed2d63c17d65d841fa81c6 Mon Sep 17 00:00:00 2001
From: "Radu Pirea (NXP OSS)" <radu-nicolae.pirea@oss.nxp.com>
Date: Mon, 19 Apr 2021 19:13:59 +0300
Subject: net: phy: add genphy_c45_pma_suspend/resume

Add generic PMA suspend and resume callback functions for C45 PHYs.

Signed-off-by: Radu Pirea (NXP OSS) <radu-nicolae.pirea@oss.nxp.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy-c45.c | 43 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/phy.h       |  2 ++
 2 files changed, 45 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phy-c45.c b/drivers/net/phy/phy-c45.c
index 91e3acb9e397..f4816b7d31b3 100644
--- a/drivers/net/phy/phy-c45.c
+++ b/drivers/net/phy/phy-c45.c
@@ -8,6 +8,49 @@
 #include <linux/mii.h>
 #include <linux/phy.h>
 
+/**
+ * genphy_c45_pma_can_sleep - checks if the PMA has sleep support
+ * @phydev: target phy_device struct
+ */
+static bool genphy_c45_pma_can_sleep(struct phy_device *phydev)
+{
+	int stat1;
+
+	stat1 = phy_read_mmd(phydev, MDIO_MMD_PMAPMD, MDIO_STAT1);
+	if (stat1 < 0)
+		return false;
+
+	return !!(stat1 & MDIO_STAT1_LPOWERABLE);
+}
+
+/**
+ * genphy_c45_pma_resume - wakes up the PMA module
+ * @phydev: target phy_device struct
+ */
+int genphy_c45_pma_resume(struct phy_device *phydev)
+{
+	if (!genphy_c45_pma_can_sleep(phydev))
+		return -EOPNOTSUPP;
+
+	return phy_clear_bits_mmd(phydev, MDIO_MMD_PMAPMD, MDIO_CTRL1,
+				  MDIO_CTRL1_LPOWER);
+}
+EXPORT_SYMBOL_GPL(genphy_c45_pma_resume);
+
+/**
+ * genphy_c45_pma_suspend - suspends the PMA module
+ * @phydev: target phy_device struct
+ */
+int genphy_c45_pma_suspend(struct phy_device *phydev)
+{
+	if (!genphy_c45_pma_can_sleep(phydev))
+		return -EOPNOTSUPP;
+
+	return phy_set_bits_mmd(phydev, MDIO_MMD_PMAPMD, MDIO_CTRL1,
+				MDIO_CTRL1_LPOWER);
+}
+EXPORT_SYMBOL_GPL(genphy_c45_pma_suspend);
+
 /**
  * genphy_c45_pma_setup_forced - configures a forced speed
  * @phydev: target phy_device struct
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 98fb441dd72e..e3d4d583463b 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1535,6 +1535,8 @@ int genphy_c45_pma_read_abilities(struct phy_device *phydev);
 int genphy_c45_read_status(struct phy_device *phydev);
 int genphy_c45_config_aneg(struct phy_device *phydev);
 int genphy_c45_loopback(struct phy_device *phydev, bool enable);
+int genphy_c45_pma_resume(struct phy_device *phydev);
+int genphy_c45_pma_suspend(struct phy_device *phydev);
 
 /* Generic C45 PHY driver */
 extern struct phy_driver genphy_c45_driver;
-- 
cgit v1.2.3


From 6980ffa0c5a8e65d53ff803d2cafdba3e2022714 Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@nvidia.com>
Date: Sun, 21 Jun 2020 21:35:34 +0300
Subject: net/mlx5e: RX, Add checks for calculated Striding RQ attributes

Striding RQ attributes below are mutually dependent. An unaware
change to one might take the others out of the valid range derived
by the HW caps:
- The MPWQE size in bytes
- The number of strides in a MPWQE
- The stride size

Add checks to verify they are valid and comply with the HW spec
and SW assumptions/requirements.
This is not a fix, no particular issue exists today.

Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 .../net/ethernet/mellanox/mlx5/core/en/params.c    | 86 ++++++++++++++--------
 .../net/ethernet/mellanox/mlx5/core/en/params.h    | 20 ++---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  5 +-
 include/linux/mlx5/device.h                        |  7 +-
 4 files changed, 76 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
index 69f1f41b2b83..f410c1268422 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
@@ -90,30 +90,39 @@ bool mlx5e_rx_is_linear_skb(struct mlx5e_params *params,
 	return !params->lro_en && linear_frag_sz <= PAGE_SIZE;
 }
 
-#define MLX5_MAX_MPWQE_LOG_WQE_STRIDE_SZ ((BIT(__mlx5_bit_sz(wq, log_wqe_stride_size)) - 1) + \
-					  MLX5_MPWQE_LOG_STRIDE_SZ_BASE)
-bool mlx5e_rx_mpwqe_is_linear_skb(struct mlx5_core_dev *mdev,
-				  struct mlx5e_params *params,
-				  struct mlx5e_xsk_param *xsk)
+bool mlx5e_verify_rx_mpwqe_strides(struct mlx5_core_dev *mdev,
+				   u8 log_stride_sz, u8 log_num_strides)
 {
-	u32 linear_frag_sz = mlx5e_rx_get_linear_frag_sz(params, xsk);
-	s8 signed_log_num_strides_param;
-	u8 log_num_strides;
+	if (log_stride_sz + log_num_strides != MLX5_MPWRQ_LOG_WQE_SZ)
+		return false;
 
-	if (!mlx5e_rx_is_linear_skb(params, xsk))
+	if (log_stride_sz < MLX5_MPWQE_LOG_STRIDE_SZ_BASE ||
+	    log_stride_sz > MLX5_MPWQE_LOG_STRIDE_SZ_MAX)
 		return false;
 
-	if (order_base_2(linear_frag_sz) > MLX5_MAX_MPWQE_LOG_WQE_STRIDE_SZ)
+	if (log_num_strides > MLX5_MPWQE_LOG_NUM_STRIDES_MAX)
 		return false;
 
 	if (MLX5_CAP_GEN(mdev, ext_stride_num_range))
-		return true;
+		return log_num_strides >= MLX5_MPWQE_LOG_NUM_STRIDES_EXT_BASE;
 
-	log_num_strides = MLX5_MPWRQ_LOG_WQE_SZ - order_base_2(linear_frag_sz);
-	signed_log_num_strides_param =
-		(s8)log_num_strides - MLX5_MPWQE_LOG_NUM_STRIDES_BASE;
+	return log_num_strides >= MLX5_MPWQE_LOG_NUM_STRIDES_BASE;
+}
 
-	return signed_log_num_strides_param >= 0;
+bool mlx5e_rx_mpwqe_is_linear_skb(struct mlx5_core_dev *mdev,
+				  struct mlx5e_params *params,
+				  struct mlx5e_xsk_param *xsk)
+{
+	s8 log_num_strides;
+	u8 log_stride_sz;
+
+	if (!mlx5e_rx_is_linear_skb(params, xsk))
+		return false;
+
+	log_stride_sz = order_base_2(mlx5e_rx_get_linear_frag_sz(params, xsk));
+	log_num_strides = MLX5_MPWRQ_LOG_WQE_SZ - log_stride_sz;
+
+	return mlx5e_verify_rx_mpwqe_strides(mdev, log_stride_sz, log_num_strides);
 }
 
 u8 mlx5e_mpwqe_get_log_rq_size(struct mlx5e_params *params,
@@ -462,26 +471,36 @@ static void mlx5e_build_rx_cq_param(struct mlx5_core_dev *mdev,
 	param->cq_period_mode = params->rx_cq_moderation.cq_period_mode;
 }
 
-void mlx5e_build_rq_param(struct mlx5_core_dev *mdev,
-			  struct mlx5e_params *params,
-			  struct mlx5e_xsk_param *xsk,
-			  u16 q_counter,
-			  struct mlx5e_rq_param *param)
+int mlx5e_build_rq_param(struct mlx5_core_dev *mdev,
+			 struct mlx5e_params *params,
+			 struct mlx5e_xsk_param *xsk,
+			 u16 q_counter,
+			 struct mlx5e_rq_param *param)
 {
 	void *rqc = param->rqc;
 	void *wq = MLX5_ADDR_OF(rqc, rqc, wq);
 	int ndsegs = 1;
 
 	switch (params->rq_wq_type) {
-	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ: {
+		u8 log_wqe_num_of_strides = mlx5e_mpwqe_get_log_num_strides(mdev, params, xsk);
+		u8 log_wqe_stride_size = mlx5e_mpwqe_get_log_stride_size(mdev, params, xsk);
+
+		if (!mlx5e_verify_rx_mpwqe_strides(mdev, log_wqe_stride_size,
+						   log_wqe_num_of_strides)) {
+			mlx5_core_err(mdev,
+				      "Bad RX MPWQE params: log_stride_size %u, log_num_strides %u\n",
+				      log_wqe_stride_size, log_wqe_num_of_strides);
+			return -EINVAL;
+		}
+
 		MLX5_SET(wq, wq, log_wqe_num_of_strides,
-			 mlx5e_mpwqe_get_log_num_strides(mdev, params, xsk) -
-			 MLX5_MPWQE_LOG_NUM_STRIDES_BASE);
+			 log_wqe_num_of_strides - MLX5_MPWQE_LOG_NUM_STRIDES_BASE);
 		MLX5_SET(wq, wq, log_wqe_stride_size,
-			 mlx5e_mpwqe_get_log_stride_size(mdev, params, xsk) -
-			 MLX5_MPWQE_LOG_STRIDE_SZ_BASE);
+			 log_wqe_stride_size - MLX5_MPWQE_LOG_STRIDE_SZ_BASE);
 		MLX5_SET(wq, wq, log_wq_sz, mlx5e_mpwqe_get_log_rq_size(params, xsk));
 		break;
+	}
 	default: /* MLX5_WQ_TYPE_CYCLIC */
 		MLX5_SET(wq, wq, log_wq_sz, params->log_rq_mtu_frames);
 		mlx5e_build_rq_frags_info(mdev, params, xsk, &param->frags_info);
@@ -499,6 +518,8 @@ void mlx5e_build_rq_param(struct mlx5_core_dev *mdev,
 
 	param->wq.buf_numa_node = dev_to_node(mlx5_core_dma_dev(mdev));
 	mlx5e_build_rx_cq_param(mdev, params, xsk, &param->cqp);
+
+	return 0;
 }
 
 void mlx5e_build_drop_rq_param(struct mlx5_core_dev *mdev,
@@ -643,14 +664,17 @@ void mlx5e_build_xdpsq_param(struct mlx5_core_dev *mdev,
 	mlx5e_build_tx_cq_param(mdev, params, &param->cqp);
 }
 
-void mlx5e_build_channel_param(struct mlx5_core_dev *mdev,
-			       struct mlx5e_params *params,
-			       u16 q_counter,
-			       struct mlx5e_channel_param *cparam)
+int mlx5e_build_channel_param(struct mlx5_core_dev *mdev,
+			      struct mlx5e_params *params,
+			      u16 q_counter,
+			      struct mlx5e_channel_param *cparam)
 {
 	u8 icosq_log_wq_sz, async_icosq_log_wq_sz;
+	int err;
 
-	mlx5e_build_rq_param(mdev, params, NULL, q_counter, &cparam->rq);
+	err = mlx5e_build_rq_param(mdev, params, NULL, q_counter, &cparam->rq);
+	if (err)
+		return err;
 
 	icosq_log_wq_sz = mlx5e_build_icosq_log_wq_sz(params, &cparam->rq);
 	async_icosq_log_wq_sz = mlx5e_build_async_icosq_log_wq_sz(mdev);
@@ -659,4 +683,6 @@ void mlx5e_build_channel_param(struct mlx5_core_dev *mdev,
 	mlx5e_build_xdpsq_param(mdev, params, &cparam->xdp_sq);
 	mlx5e_build_icosq_param(mdev, icosq_log_wq_sz, &cparam->icosq);
 	mlx5e_build_async_icosq_param(mdev, async_icosq_log_wq_sz, &cparam->async_icosq);
+
+	return 0;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.h b/drivers/net/ethernet/mellanox/mlx5/core/en/params.h
index fcc51ec6084e..e9593f5f0661 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.h
@@ -96,6 +96,8 @@ void mlx5e_build_rq_params(struct mlx5_core_dev *mdev, struct mlx5e_params *para
 void mlx5e_set_rq_type(struct mlx5_core_dev *mdev, struct mlx5e_params *params);
 void mlx5e_init_rq_type_params(struct mlx5_core_dev *mdev, struct mlx5e_params *params);
 
+bool mlx5e_verify_rx_mpwqe_strides(struct mlx5_core_dev *mdev,
+				   u8 log_stride_sz, u8 log_num_strides);
 u16 mlx5e_get_linear_rq_headroom(struct mlx5e_params *params,
 				 struct mlx5e_xsk_param *xsk);
 u32 mlx5e_rx_get_min_frag_sz(struct mlx5e_params *params,
@@ -122,11 +124,11 @@ u16 mlx5e_get_rq_headroom(struct mlx5_core_dev *mdev,
 /* Build queue parameters */
 
 void mlx5e_build_create_cq_param(struct mlx5e_create_cq_param *ccp, struct mlx5e_channel *c);
-void mlx5e_build_rq_param(struct mlx5_core_dev *mdev,
-			  struct mlx5e_params *params,
-			  struct mlx5e_xsk_param *xsk,
-			  u16 q_counter,
-			  struct mlx5e_rq_param *param);
+int mlx5e_build_rq_param(struct mlx5_core_dev *mdev,
+			 struct mlx5e_params *params,
+			 struct mlx5e_xsk_param *xsk,
+			 u16 q_counter,
+			 struct mlx5e_rq_param *param);
 void mlx5e_build_drop_rq_param(struct mlx5_core_dev *mdev,
 			       u16 q_counter,
 			       struct mlx5e_rq_param *param);
@@ -141,10 +143,10 @@ void mlx5e_build_tx_cq_param(struct mlx5_core_dev *mdev,
 void mlx5e_build_xdpsq_param(struct mlx5_core_dev *mdev,
 			     struct mlx5e_params *params,
 			     struct mlx5e_sq_param *param);
-void mlx5e_build_channel_param(struct mlx5_core_dev *mdev,
-			       struct mlx5e_params *params,
-			       u16 q_counter,
-			       struct mlx5e_channel_param *cparam);
+int mlx5e_build_channel_param(struct mlx5_core_dev *mdev,
+			      struct mlx5e_params *params,
+			      u16 q_counter,
+			      struct mlx5e_channel_param *cparam);
 
 u16 mlx5e_calc_sq_stop_room(struct mlx5_core_dev *mdev, struct mlx5e_params *params);
 int mlx5e_validate_params(struct mlx5_core_dev *mdev, struct mlx5e_params *params);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index bc2d37d2806f..bca832cdc4cb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -2086,7 +2086,10 @@ int mlx5e_open_channels(struct mlx5e_priv *priv,
 	if (!chs->c || !cparam)
 		goto err_free;
 
-	mlx5e_build_channel_param(priv->mdev, &chs->params, priv->q_counter, cparam);
+	err = mlx5e_build_channel_param(priv->mdev, &chs->params, priv->q_counter, cparam);
+	if (err)
+		goto err_free;
+
 	for (i = 0; i < chs->num; i++) {
 		struct xsk_buff_pool *xsk_pool = NULL;
 
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 92a029a800a0..578c4ccae91c 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -911,8 +911,11 @@ static inline u16 get_cqe_flow_tag(struct mlx5_cqe64 *cqe)
 	return be32_to_cpu(cqe->sop_drop_qpn) & 0xFFF;
 }
 
-#define MLX5_MPWQE_LOG_NUM_STRIDES_BASE	(9)
-#define MLX5_MPWQE_LOG_STRIDE_SZ_BASE	(6)
+#define MLX5_MPWQE_LOG_NUM_STRIDES_EXT_BASE	3
+#define MLX5_MPWQE_LOG_NUM_STRIDES_BASE		9
+#define MLX5_MPWQE_LOG_NUM_STRIDES_MAX		16
+#define MLX5_MPWQE_LOG_STRIDE_SZ_BASE		6
+#define MLX5_MPWQE_LOG_STRIDE_SZ_MAX		13
 
 struct mpwrq_cqe_bc {
 	__be16	filler_consumed_strides;
-- 
cgit v1.2.3


From 704cfecdd03d7b84403ed96ba0009ea07270e74e Mon Sep 17 00:00:00 2001
From: Yevgeny Kliteynik <kliteyn@nvidia.com>
Date: Sun, 28 Feb 2021 23:48:27 +0200
Subject: net/mlx5: mlx5_ifc updates for flex parser

Add the required definitions for supporting more protocols by flex parsers
(GTP-U, Geneve TLV options), and for using the flex parser that was
configured for each protocol.

Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 32 ++++++++++++++++++++++++++++----
 1 file changed, 28 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index f2c51d6833c6..aa6effe1dd6d 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -622,7 +622,19 @@ struct mlx5_ifc_fte_match_set_misc3_bits {
 
 	u8         geneve_tlv_option_0_data[0x20];
 
-	u8         reserved_at_140[0xc0];
+	u8	   gtpu_teid[0x20];
+
+	u8	   gtpu_msg_type[0x8];
+	u8	   gtpu_msg_flags[0x8];
+	u8	   reserved_at_170[0x10];
+
+	u8	   gtpu_dw_2[0x20];
+
+	u8	   gtpu_first_ext_dw_0[0x20];
+
+	u8	   gtpu_dw_0[0x20];
+
+	u8	   reserved_at_1e0[0x20];
 };
 
 struct mlx5_ifc_fte_match_set_misc4_bits {
@@ -1237,9 +1249,17 @@ enum {
 
 enum {
 	MLX5_FLEX_PARSER_GENEVE_ENABLED		= 1 << 3,
+	MLX5_FLEX_PARSER_MPLS_OVER_GRE_ENABLED	= 1 << 4,
+	mlx5_FLEX_PARSER_MPLS_OVER_UDP_ENABLED	= 1 << 5,
 	MLX5_FLEX_PARSER_VXLAN_GPE_ENABLED	= 1 << 7,
 	MLX5_FLEX_PARSER_ICMP_V4_ENABLED	= 1 << 8,
 	MLX5_FLEX_PARSER_ICMP_V6_ENABLED	= 1 << 9,
+	MLX5_FLEX_PARSER_GENEVE_TLV_OPTION_0_ENABLED = 1 << 10,
+	MLX5_FLEX_PARSER_GTPU_ENABLED		= 1 << 11,
+	MLX5_FLEX_PARSER_GTPU_DW_2_ENABLED	= 1 << 16,
+	MLX5_FLEX_PARSER_GTPU_FIRST_EXT_DW_0_ENABLED = 1 << 17,
+	MLX5_FLEX_PARSER_GTPU_DW_0_ENABLED	= 1 << 18,
+	MLX5_FLEX_PARSER_GTPU_TEID_ENABLED	= 1 << 19,
 };
 
 enum {
@@ -1637,7 +1657,9 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         cqe_compression_timeout[0x10];
 	u8         cqe_compression_max_num[0x10];
 
-	u8         reserved_at_5e0[0x10];
+	u8         reserved_at_5e0[0x8];
+	u8         flex_parser_id_gtpu_dw_0[0x4];
+	u8         reserved_at_5ec[0x4];
 	u8         tag_matching[0x1];
 	u8         rndv_offload_rc[0x1];
 	u8         rndv_offload_dc[0x1];
@@ -1648,7 +1670,8 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8	   affiliate_nic_vport_criteria[0x8];
 	u8	   native_port_num[0x8];
 	u8	   num_vhca_ports[0x8];
-	u8	   reserved_at_618[0x6];
+	u8         flex_parser_id_gtpu_teid[0x4];
+	u8         reserved_at_61c[0x2];
 	u8	   sw_owner_id[0x1];
 	u8         reserved_at_61f[0x1];
 
@@ -1683,7 +1706,8 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8	   reserved_at_6e0[0x10];
 	u8	   sf_base_id[0x10];
 
-	u8	   reserved_at_700[0x8];
+	u8         flex_parser_id_gtpu_dw_2[0x4];
+	u8         flex_parser_id_gtpu_first_ext_dw_0[0x4];
 	u8	   num_total_dynamic_vf_msix[0x18];
 	u8	   reserved_at_720[0x14];
 	u8	   dynamic_msix_table_size[0xc];
-- 
cgit v1.2.3


From 7304d603a57a1edecfecfbcc26f85edcda4cae81 Mon Sep 17 00:00:00 2001
From: Yevgeny Kliteynik <kliteyn@nvidia.com>
Date: Mon, 2 Nov 2020 23:57:13 +0200
Subject: net/mlx5: DR, Add support for force-loopback QP

When supported by the device, the SW steering RoCE RC QP that is used to
write/read to/from ICM will be created with the force-loopback attribute.
Such a QP doesn't require a GID index upon creation.

Signed-off-by: Erez Shitrit <erezsh@mellanox.com>
Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 .../ethernet/mellanox/mlx5/core/steering/dr_cmd.c  | 36 ++++++++++++++++++++++
 .../ethernet/mellanox/mlx5/core/steering/dr_send.c | 34 +++++++++++++++++---
 .../mellanox/mlx5/core/steering/dr_types.h         |  7 +++++
 include/linux/mlx5/mlx5_ifc.h                      |  7 +++--
 4 files changed, 77 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c
index 6f9d7aa9fb4c..68d898e144fb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c
@@ -85,15 +85,51 @@ int mlx5dr_cmd_query_esw_caps(struct mlx5_core_dev *mdev,
 	return 0;
 }
 
+static int dr_cmd_query_nic_vport_roce_en(struct mlx5_core_dev *mdev,
+					  u16 vport, bool *roce_en)
+{
+	u32 out[MLX5_ST_SZ_DW(query_nic_vport_context_out)] = {};
+	u32 in[MLX5_ST_SZ_DW(query_nic_vport_context_in)] = {};
+	int err;
+
+	MLX5_SET(query_nic_vport_context_in, in, opcode,
+		 MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT);
+	MLX5_SET(query_nic_vport_context_in, in, vport_number, vport);
+	MLX5_SET(query_nic_vport_context_in, in, other_vport, !!vport);
+
+	err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
+	if (err)
+		return err;
+
+	*roce_en = MLX5_GET(query_nic_vport_context_out, out,
+			    nic_vport_context.roce_en);
+	return 0;
+}
+
 int mlx5dr_cmd_query_device(struct mlx5_core_dev *mdev,
 			    struct mlx5dr_cmd_caps *caps)
 {
+	bool roce_en;
+	int err;
+
 	caps->prio_tag_required	= MLX5_CAP_GEN(mdev, prio_tag_required);
 	caps->eswitch_manager	= MLX5_CAP_GEN(mdev, eswitch_manager);
 	caps->gvmi		= MLX5_CAP_GEN(mdev, vhca_id);
 	caps->flex_protocols	= MLX5_CAP_GEN(mdev, flex_parser_protocols);
 	caps->sw_format_ver	= MLX5_CAP_GEN(mdev, steering_format_version);
 
+	if (MLX5_CAP_GEN(mdev, roce)) {
+		err = dr_cmd_query_nic_vport_roce_en(mdev, 0, &roce_en);
+		if (err)
+			return err;
+
+		caps->roce_caps.roce_en = roce_en;
+		caps->roce_caps.fl_rc_qp_when_roce_disabled =
+			MLX5_CAP_ROCE(mdev, fl_rc_qp_when_roce_disabled);
+		caps->roce_caps.fl_rc_qp_when_roce_enabled =
+			MLX5_CAP_ROCE(mdev, fl_rc_qp_when_roce_enabled);
+	}
+
 	if (caps->flex_protocols & MLX5_FLEX_PARSER_ICMP_V4_ENABLED) {
 		caps->flex_parser_id_icmp_dw0 = MLX5_CAP_GEN(mdev, flex_parser_id_icmp_dw0);
 		caps->flex_parser_id_icmp_dw1 = MLX5_CAP_GEN(mdev, flex_parser_id_icmp_dw1);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c
index 37377d668057..69d623bedefe 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c
@@ -32,6 +32,7 @@ struct dr_qp_rtr_attr {
 	u8 min_rnr_timer;
 	u8 sgid_index;
 	u16 udp_src_port;
+	u8 fl:1;
 };
 
 struct dr_qp_rts_attr {
@@ -650,6 +651,7 @@ static int dr_cmd_modify_qp_init2rtr(struct mlx5_core_dev *mdev,
 			 attr->udp_src_port);
 
 	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, attr->port_num);
+	MLX5_SET(qpc, qpc, primary_address_path.fl, attr->fl);
 	MLX5_SET(qpc, qpc, min_rnr_nak, 1);
 
 	MLX5_SET(init2rtr_qp_in, in, opcode, MLX5_CMD_OP_INIT2RTR_QP);
@@ -658,6 +660,19 @@ static int dr_cmd_modify_qp_init2rtr(struct mlx5_core_dev *mdev,
 	return mlx5_cmd_exec_in(mdev, init2rtr_qp, in);
 }
 
+static bool dr_send_allow_fl(struct mlx5dr_cmd_caps *caps)
+{
+	/* Check whether RC RoCE QP creation with force loopback is allowed.
+	 * There are two separate capability bits for this:
+	 *  - force loopback when RoCE is enabled
+	 *  - force loopback when RoCE is disabled
+	 */
+	return ((caps->roce_caps.roce_en &&
+		 caps->roce_caps.fl_rc_qp_when_roce_enabled) ||
+		(!caps->roce_caps.roce_en &&
+		 caps->roce_caps.fl_rc_qp_when_roce_disabled));
+}
+
 static int dr_prepare_qp_to_rts(struct mlx5dr_domain *dmn)
 {
 	struct mlx5dr_qp *dr_qp = dmn->send_ring->qp;
@@ -676,17 +691,26 @@ static int dr_prepare_qp_to_rts(struct mlx5dr_domain *dmn)
 	}
 
 	/* RTR */
-	ret = mlx5dr_cmd_query_gid(dmn->mdev, port, gid_index, &rtr_attr.dgid_attr);
-	if (ret)
-		return ret;
-
 	rtr_attr.mtu		= mtu;
 	rtr_attr.qp_num		= dr_qp->qpn;
 	rtr_attr.min_rnr_timer	= 12;
 	rtr_attr.port_num	= port;
-	rtr_attr.sgid_index	= gid_index;
 	rtr_attr.udp_src_port	= dmn->info.caps.roce_min_src_udp;
 
+	/* If QP creation with force loopback is allowed, then there
+	 * is no need for GID index when creating the QP.
+	 * Otherwise we query GID attributes and use GID index.
+	 */
+	rtr_attr.fl = dr_send_allow_fl(&dmn->info.caps);
+	if (!rtr_attr.fl) {
+		ret = mlx5dr_cmd_query_gid(dmn->mdev, port, gid_index,
+					   &rtr_attr.dgid_attr);
+		if (ret)
+			return ret;
+
+		rtr_attr.sgid_index = gid_index;
+	}
+
 	ret = dr_cmd_modify_qp_init2rtr(dmn->mdev, dr_qp, &rtr_attr);
 	if (ret) {
 		mlx5dr_err(dmn, "Failed modify QP init2rtr\n");
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h
index 7c1ab0b6417e..8de70566f85b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h
@@ -747,6 +747,12 @@ struct mlx5dr_cmd_vport_cap {
 	u32 num;
 };
 
+struct mlx5dr_roce_cap {
+	u8 roce_en:1;
+	u8 fl_rc_qp_when_roce_disabled:1;
+	u8 fl_rc_qp_when_roce_enabled:1;
+};
+
 struct mlx5dr_cmd_caps {
 	u16 gvmi;
 	u64 nic_rx_drop_address;
@@ -783,6 +789,7 @@ struct mlx5dr_cmd_caps {
 	struct mlx5dr_esw_caps esw_caps;
 	struct mlx5dr_cmd_vport_cap *vports_caps;
 	bool prio_tag_required;
+	struct mlx5dr_roce_cap roce_caps;
 };
 
 struct mlx5dr_domain_rx_tx {
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index aa6effe1dd6d..4d9569c4b96c 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -961,7 +961,9 @@ struct mlx5_ifc_roce_cap_bits {
 	u8         roce_apm[0x1];
 	u8         reserved_at_1[0x3];
 	u8         sw_r_roce_src_udp_port[0x1];
-	u8         reserved_at_5[0x19];
+	u8         fl_rc_qp_when_roce_disabled[0x1];
+	u8         fl_rc_qp_when_roce_enabled[0x1];
+	u8         reserved_at_7[0x17];
 	u8	   qp_ts_format[0x2];
 
 	u8         reserved_at_20[0x60];
@@ -2942,7 +2944,8 @@ struct mlx5_ifc_qpc_bits {
 	u8         state[0x4];
 	u8         lag_tx_port_affinity[0x4];
 	u8         st[0x8];
-	u8         reserved_at_10[0x3];
+	u8         reserved_at_10[0x2];
+	u8	   isolate_vl_tc[0x1];
 	u8         pm_state[0x2];
 	u8         reserved_at_15[0x1];
 	u8         req_e2e_credit_mode[0x2];
-- 
cgit v1.2.3


From aeacb52a8de7046be5399ba311f49bce96e1b269 Mon Sep 17 00:00:00 2001
From: Yevgeny Kliteynik <kliteyn@nvidia.com>
Date: Tue, 3 Nov 2020 01:31:53 +0200
Subject: net/mlx5: DR, Add support for isolate_vl_tc QP

When using SW steering, the rule insertion rate depends on the performance
of the RDMA RC QP used for writing to the ICM. Under stress this QP competes
for HW resources with all the other QPs that are used to send data.
To protect the SW steering QP's performance in such cases, we set this QP
to use an isolated VL. The VL number is reserved by FW and is not exposed to
the driver.
Support for this QP on isolated VL exists only when both force-loopback and
isolate_vl_tc capabilities are set.

Signed-off-by: Alex Vesker <valex@mellanox.com>
Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c   | 2 ++
 drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c  | 7 +++++++
 drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h | 2 ++
 include/linux/mlx5/mlx5_ifc.h                               | 4 +++-
 4 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c
index 68d898e144fb..5970cb8fc0c0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c
@@ -130,6 +130,8 @@ int mlx5dr_cmd_query_device(struct mlx5_core_dev *mdev,
 			MLX5_CAP_ROCE(mdev, fl_rc_qp_when_roce_enabled);
 	}
 
+	caps->isolate_vl_tc = MLX5_CAP_GEN(mdev, isolate_vl_tc_new);
+
 	if (caps->flex_protocols & MLX5_FLEX_PARSER_ICMP_V4_ENABLED) {
 		caps->flex_parser_id_icmp_dw0 = MLX5_CAP_GEN(mdev, flex_parser_id_icmp_dw0);
 		caps->flex_parser_id_icmp_dw1 = MLX5_CAP_GEN(mdev, flex_parser_id_icmp_dw1);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c
index 69d623bedefe..12cf323a5943 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c
@@ -46,6 +46,7 @@ struct dr_qp_init_attr {
 	u32 pdn;
 	u32 max_send_wr;
 	struct mlx5_uars_page *uar;
+	u8 isolate_vl_tc:1;
 };
 
 static int dr_parse_cqe(struct mlx5dr_cq *dr_cq, struct mlx5_cqe64 *cqe64)
@@ -158,6 +159,7 @@ static struct mlx5dr_qp *dr_create_rc_qp(struct mlx5_core_dev *mdev,
 	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
 	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
 	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
+	MLX5_SET(qpc, qpc, isolate_vl_tc, attr->isolate_vl_tc);
 	MLX5_SET(qpc, qpc, pd, attr->pdn);
 	MLX5_SET(qpc, qpc, uar_page, attr->uar->index);
 	MLX5_SET(qpc, qpc, log_page_size,
@@ -924,6 +926,11 @@ int mlx5dr_send_ring_alloc(struct mlx5dr_domain *dmn)
 	init_attr.pdn = dmn->pdn;
 	init_attr.uar = dmn->uar;
 	init_attr.max_send_wr = QUEUE_SIZE;
+
+	/* Isolated VL is applicable only if force loopback is supported */
+	if (dr_send_allow_fl(&dmn->info.caps))
+		init_attr.isolate_vl_tc = dmn->info.caps.isolate_vl_tc;
+
 	spin_lock_init(&dmn->send_ring->lock);
 
 	dmn->send_ring->qp = dr_create_rc_qp(dmn->mdev, &init_attr);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h
index 8de70566f85b..67460c42a99b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h
@@ -790,6 +790,7 @@ struct mlx5dr_cmd_caps {
 	struct mlx5dr_cmd_vport_cap *vports_caps;
 	bool prio_tag_required;
 	struct mlx5dr_roce_cap roce_caps;
+	u8 isolate_vl_tc:1;
 };
 
 struct mlx5dr_domain_rx_tx {
@@ -1164,6 +1165,7 @@ struct mlx5dr_cmd_qp_create_attr {
 	u32 sq_wqe_cnt;
 	u32 rq_wqe_cnt;
 	u32 rq_wqe_shift;
+	u8 isolate_vl_tc:1;
 };
 
 int mlx5dr_cmd_query_gid(struct mlx5_core_dev *mdev, u8 vhca_port_num,
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 4d9569c4b96c..52b7cabcde08 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1319,7 +1319,9 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         log_max_srq_sz[0x8];
 	u8         log_max_qp_sz[0x8];
 	u8         event_cap[0x1];
-	u8         reserved_at_91[0x7];
+	u8         reserved_at_91[0x2];
+	u8         isolate_vl_tc_new[0x1];
+	u8         reserved_at_94[0x4];
 	u8         prio_tag_required[0x1];
 	u8         reserved_at_99[0x2];
 	u8         log_max_qp[0x5];
-- 
cgit v1.2.3


From 5d3c4c79384af06e3c8e25b7770b6247496b4417 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Mon, 12 Apr 2021 15:20:49 -0700
Subject: KVM: Stop looking for coalesced MMIO zones if the bus is destroyed

Abort the walk of coalesced MMIO zones if kvm_io_bus_unregister_dev()
fails to allocate memory for the new instance of the bus.  If it can't
instantiate a new bus, unregister_dev() destroys all devices _except_ the
target device.   But, it doesn't tell the caller that it obliterated the
bus and invoked the destructor for all devices that were on the bus.  In
the coalesced MMIO case, this can result in a deleted list entry
dereference due to attempting to continue iterating on coalesced_zones
after future entries (in the walk) have been deleted.

Opportunistically add curly braces to the for-loop, which encompasses
many lines but sneaks by without braces due to the guts being a single
if statement.

Fixes: f65886606c2d ("KVM: fix memory leak in kvm_io_bus_unregister_dev()")
Cc: stable@vger.kernel.org
Reported-by: Hao Sun <sunhao.th@gmail.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20210412222050.876100-3-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h  |  4 ++--
 virt/kvm/coalesced_mmio.c | 19 +++++++++++++++++--
 virt/kvm/kvm_main.c       | 10 +++++-----
 3 files changed, 24 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index d17e3ff1138d..82b066db37cb 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -192,8 +192,8 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
 		    int len, void *val);
 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 			    int len, struct kvm_io_device *dev);
-void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
-			       struct kvm_io_device *dev);
+int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+			      struct kvm_io_device *dev);
 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
 					 gpa_t addr);
 
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
index 62bd908ecd58..f08f5e82460b 100644
--- a/virt/kvm/coalesced_mmio.c
+++ b/virt/kvm/coalesced_mmio.c
@@ -174,21 +174,36 @@ int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm,
 					   struct kvm_coalesced_mmio_zone *zone)
 {
 	struct kvm_coalesced_mmio_dev *dev, *tmp;
+	int r;
 
 	if (zone->pio != 1 && zone->pio != 0)
 		return -EINVAL;
 
 	mutex_lock(&kvm->slots_lock);
 
-	list_for_each_entry_safe(dev, tmp, &kvm->coalesced_zones, list)
+	list_for_each_entry_safe(dev, tmp, &kvm->coalesced_zones, list) {
 		if (zone->pio == dev->zone.pio &&
 		    coalesced_mmio_in_range(dev, zone->addr, zone->size)) {
-			kvm_io_bus_unregister_dev(kvm,
+			r = kvm_io_bus_unregister_dev(kvm,
 				zone->pio ? KVM_PIO_BUS : KVM_MMIO_BUS, &dev->dev);
 			kvm_iodevice_destructor(&dev->dev);
+
+			/*
+			 * On failure, unregister destroys all devices on the
+			 * bus _except_ the target device, i.e. coalesced_zones
+			 * has been modified.  No need to restart the walk as
+			 * there aren't any zones left.
+			 */
+			if (r)
+				break;
 		}
+	}
 
 	mutex_unlock(&kvm->slots_lock);
 
+	/*
+	 * Ignore the result of kvm_io_bus_unregister_dev(), from userspace's
+	 * perspective, the coalesced MMIO is most definitely unregistered.
+	 */
 	return 0;
 }
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index c771d40737c9..f84b126c32d6 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4621,15 +4621,15 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 }
 
 /* Caller must hold slots_lock. */
-void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
-			       struct kvm_io_device *dev)
+int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+			      struct kvm_io_device *dev)
 {
 	int i, j;
 	struct kvm_io_bus *new_bus, *bus;
 
 	bus = kvm_get_bus(kvm, bus_idx);
 	if (!bus)
-		return;
+		return 0;
 
 	for (i = 0; i < bus->dev_count; i++)
 		if (bus->range[i].dev == dev) {
@@ -4637,7 +4637,7 @@ void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
 		}
 
 	if (i == bus->dev_count)
-		return;
+		return 0;
 
 	new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
 			  GFP_KERNEL_ACCOUNT);
@@ -4662,7 +4662,7 @@ void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
 	}
 
 	kfree(bus);
-	return;
+	return new_bus ? 0 : -ENOMEM;
 }
 
 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
-- 
cgit v1.2.3


From 3a05d08f6cc75b74079290c33d6127b2857226fa Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Tue, 30 Mar 2021 16:11:38 +0100
Subject: PCI/MSI: Drop use of msi_controller from core code

As there is no driver using msi_controller, we can now safely
remove its use from the PCI probe code.

Link: https://lore.kernel.org/r/20210330151145.997953-8-maz@kernel.org
Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/msi.c   | 23 +----------------------
 drivers/pci/probe.c |  2 --
 include/linux/pci.h |  2 --
 3 files changed, 1 insertion(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 3162f88fe940..79b5a995bd02 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -64,39 +64,18 @@ static void pci_msi_teardown_msi_irqs(struct pci_dev *dev)
 /* Arch hooks */
 int __weak arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
 {
-	struct msi_controller *chip = dev->bus->msi;
-	int err;
-
-	if (!chip || !chip->setup_irq)
-		return -EINVAL;
-
-	err = chip->setup_irq(chip, dev, desc);
-	if (err < 0)
-		return err;
-
-	irq_set_chip_data(desc->irq, chip);
-
-	return 0;
+	return -EINVAL;
 }
 
 void __weak arch_teardown_msi_irq(unsigned int irq)
 {
-	struct msi_controller *chip = irq_get_chip_data(irq);
-
-	if (!chip || !chip->teardown_irq)
-		return;
-
-	chip->teardown_irq(chip, irq);
 }
 
 int __weak arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 {
-	struct msi_controller *chip = dev->bus->msi;
 	struct msi_desc *entry;
 	int ret;
 
-	if (chip && chip->setup_irqs)
-		return chip->setup_irqs(chip, dev, nvec, type);
 	/*
 	 * If an architecture wants to support multiple MSI, it needs to
 	 * override arch_setup_msi_irqs()
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 953f15abc850..fb04fc81a8bd 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -895,7 +895,6 @@ static int pci_register_host_bridge(struct pci_host_bridge *bridge)
 	/* Temporarily move resources off the list */
 	list_splice_init(&bridge->windows, &resources);
 	bus->sysdata = bridge->sysdata;
-	bus->msi = bridge->msi;
 	bus->ops = bridge->ops;
 	bus->number = bus->busn_res.start = bridge->busnr;
 #ifdef CONFIG_PCI_DOMAINS_GENERIC
@@ -1053,7 +1052,6 @@ static struct pci_bus *pci_alloc_child_bus(struct pci_bus *parent,
 		return NULL;
 
 	child->parent = parent;
-	child->msi = parent->msi;
 	child->sysdata = parent->sysdata;
 	child->bus_flags = parent->bus_flags;
 
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 86c799c97b77..ebf557e59d87 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -540,7 +540,6 @@ struct pci_host_bridge {
 	int (*map_irq)(const struct pci_dev *, u8, u8);
 	void (*release_fn)(struct pci_host_bridge *);
 	void		*release_data;
-	struct msi_controller *msi;
 	unsigned int	ignore_reset_delay:1;	/* For entire hierarchy */
 	unsigned int	no_ext_tags:1;		/* No Extended Tags */
 	unsigned int	native_aer:1;		/* OS may use PCIe AER */
@@ -621,7 +620,6 @@ struct pci_bus {
 	struct resource busn_res;	/* Bus numbers routed to this bus */
 
 	struct pci_ops	*ops;		/* Configuration access functions */
-	struct msi_controller *msi;	/* MSI controller */
 	void		*sysdata;	/* Hook for sys-specific extension */
 	struct proc_dir_entry *procdir;	/* Directory entry in /proc/bus/pci */
 
-- 
cgit v1.2.3


From b227be0d7314d0869d4e28c199ac1fc7075cf06e Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Tue, 30 Mar 2021 16:11:39 +0100
Subject: PCI/MSI: Kill msi_controller structure

msi_controller had a good, long life as the abstraction for
a driver providing MSIs to PCI devices. But it has been replaced
in all drivers by the more expressive generic MSI framework.

Farewell, struct msi_controller.

Link: https://lore.kernel.org/r/20210330151145.997953-9-maz@kernel.org
Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
---
 include/linux/msi.h | 16 +---------------
 1 file changed, 1 insertion(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/msi.h b/include/linux/msi.h
index aef35fd1cf11..3f21e77b57b7 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -240,8 +240,7 @@ void pci_msi_unmask_irq(struct irq_data *data);
 /*
  * The arch hooks to setup up msi irqs. Default functions are implemented
  * as weak symbols so that they /can/ be overriden by architecture specific
- * code if needed. These hooks must be enabled by the architecture or by
- * drivers which depend on them via msi_controller based MSI handling.
+ * code if needed. These hooks can only be enabled by the architecture.
  *
  * If CONFIG_PCI_MSI_ARCH_FALLBACKS is not selected they are replaced by
  * stubs with warnings.
@@ -272,19 +271,6 @@ static inline void arch_teardown_msi_irqs(struct pci_dev *dev)
 void arch_restore_msi_irqs(struct pci_dev *dev);
 void default_restore_msi_irqs(struct pci_dev *dev);
 
-struct msi_controller {
-	struct module *owner;
-	struct device *dev;
-	struct device_node *of_node;
-	struct list_head list;
-
-	int (*setup_irq)(struct msi_controller *chip, struct pci_dev *dev,
-			 struct msi_desc *desc);
-	int (*setup_irqs)(struct msi_controller *chip, struct pci_dev *dev,
-			  int nvec, int type);
-	void (*teardown_irq)(struct msi_controller *chip, unsigned int irq);
-};
-
 #ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
 
 #include <linux/irqhandler.h>
-- 
cgit v1.2.3


From f8bcf249d9cf292c6ceb3d9f5bd90815090f5286 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Tue, 30 Mar 2021 16:11:40 +0100
Subject: PCI/MSI: Kill default_teardown_msi_irqs()

It doesn't have any caller left.

Link: https://lore.kernel.org/r/20210330151145.997953-10-maz@kernel.org
Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/msi.c   | 11 +----------
 include/linux/msi.h |  1 -
 2 files changed, 1 insertion(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 79b5a995bd02..d9c73c173c14 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -94,11 +94,7 @@ int __weak arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 	return 0;
 }
 
-/*
- * We have a default implementation available as a separate non-weak
- * function, as it is used by the Xen x86 PCI code
- */
-void default_teardown_msi_irqs(struct pci_dev *dev)
+void __weak arch_teardown_msi_irqs(struct pci_dev *dev)
 {
 	int i;
 	struct msi_desc *entry;
@@ -108,11 +104,6 @@ void default_teardown_msi_irqs(struct pci_dev *dev)
 			for (i = 0; i < entry->nvec_used; i++)
 				arch_teardown_msi_irq(entry->irq + i);
 }
-
-void __weak arch_teardown_msi_irqs(struct pci_dev *dev)
-{
-	return default_teardown_msi_irqs(dev);
-}
 #endif /* CONFIG_PCI_MSI_ARCH_FALLBACKS */
 
 static void default_restore_msi_irq(struct pci_dev *dev, int irq)
diff --git a/include/linux/msi.h b/include/linux/msi.h
index 3f21e77b57b7..6aff469e511d 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -250,7 +250,6 @@ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc);
 void arch_teardown_msi_irq(unsigned int irq);
 int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
 void arch_teardown_msi_irqs(struct pci_dev *dev);
-void default_teardown_msi_irqs(struct pci_dev *dev);
 #else
 static inline int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 {
-- 
cgit v1.2.3


From 94e89b145371b68fa0ea294855adebcd03e0522e Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Tue, 30 Mar 2021 16:11:41 +0100
Subject: PCI/MSI: Let PCI host bridges declare their reliance on MSI domains

There is a whole class of host bridges that cannot know whether
MSIs will be provided or not, as they rely on other blocks
to provide the MSI functionality, using MSI domains.  This is
the case for example on systems that use the ARM GIC architecture.

Introduce a new attribute ('msi_domain') indicating that implicit
dependency, and use this property to set the NO_MSI flag when
no MSI domain is found at probe time.

Link: https://lore.kernel.org/r/20210330151145.997953-11-maz@kernel.org
Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/probe.c | 2 ++
 include/linux/pci.h | 1 +
 2 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index fb04fc81a8bd..aa6fba35f5d1 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -925,6 +925,8 @@ static int pci_register_host_bridge(struct pci_host_bridge *bridge)
 	device_enable_async_suspend(bus->bridge);
 	pci_set_bus_of_node(bus);
 	pci_set_bus_msi_domain(bus);
+	if (bridge->msi_domain && !dev_get_msi_domain(&bus->dev))
+		bus->bus_flags |= PCI_BUS_FLAGS_NO_MSI;
 
 	if (!parent)
 		set_dev_node(bus->bridge, pcibus_to_node(bus));
diff --git a/include/linux/pci.h b/include/linux/pci.h
index ebf557e59d87..ede0aef2cfd4 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -550,6 +550,7 @@ struct pci_host_bridge {
 	unsigned int	native_dpc:1;		/* OS may use PCIe DPC */
 	unsigned int	preserve_config:1;	/* Preserve FW resource setup */
 	unsigned int	size_windows:1;		/* Enable root bus sizing */
+	unsigned int	msi_domain:1;		/* Bridge wants MSI domain */
 
 	/* Resource alignment requirements */
 	resource_size_t (*align_resource)(struct pci_dev *dev,
-- 
cgit v1.2.3


From 67880f1bc342ed4c94e72cad7f8ca76e5121aae3 Mon Sep 17 00:00:00 2001
From: Prashant Malani <pmalani@chromium.org>
Date: Tue, 20 Apr 2021 10:16:11 -0700
Subject: platform/chrome: cros_ec: Add Type C hard reset

Update the EC command header to include the new event bit. This bit
is included in the latest version of the Chrome EC headers[1].

[1] https://chromium.googlesource.com/chromiumos/platform/ec/+/refs/heads/main/include/ec_commands.h

Signed-off-by: Prashant Malani <pmalani@chromium.org>
Signed-off-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Link: https://lore.kernel.org/r/20210420171617.3830902-1-pmalani@chromium.org
---
 include/linux/platform_data/cros_ec_commands.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h
index 5ff8597ceabd..9156078c6fc6 100644
--- a/include/linux/platform_data/cros_ec_commands.h
+++ b/include/linux/platform_data/cros_ec_commands.h
@@ -5678,6 +5678,7 @@ enum tcpc_cc_polarity {
 
 #define PD_STATUS_EVENT_SOP_DISC_DONE		BIT(0)
 #define PD_STATUS_EVENT_SOP_PRIME_DISC_DONE	BIT(1)
+#define PD_STATUS_EVENT_HARD_RESET		BIT(2)
 
 struct ec_params_typec_status {
 	uint8_t port;
-- 
cgit v1.2.3


From db2e718a47984b9d71ed890eb2ea36ecf150de18 Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serge@hallyn.com>
Date: Tue, 20 Apr 2021 08:43:34 -0500
Subject: capabilities: require CAP_SETFCAP to map uid 0

cap_setfcap is required to create file capabilities.

Since commit 8db6c34f1dbc ("Introduce v3 namespaced file capabilities"),
a process running as uid 0 but without cap_setfcap is able to work
around this as follows: unshare a new user namespace which maps parent
uid 0 into the child namespace.

While this task will not have new capabilities against the parent
namespace, there is a loophole due to the way namespaced file
capabilities are represented as xattrs.  File capabilities valid in
userns 1 are distinguished from file capabilities valid in userns 2 by
the kuid which underlies uid 0.  Therefore the restricted root process
can unshare a new self-mapping namespace, add a namespaced file
capability onto a file, then use that file capability in the parent
namespace.

To prevent that, do not allow mapping parent uid 0 if the process which
opened the uid_map file does not have CAP_SETFCAP, which is the
capability for setting file capabilities.

As a further wrinkle: a task can unshare its user namespace, then open
its uid_map file itself, and map (only) its own uid.  In this case we do
not have the credential from before unshare, which was potentially more
restricted.  So, when creating a user namespace, we record whether the
creator had CAP_SETFCAP.  Then we can use that during map_write().

With this patch:

1. Unprivileged user can still unshare -Ur

   ubuntu@caps:~$ unshare -Ur
   root@caps:~# logout

2. Root user can still unshare -Ur

   ubuntu@caps:~$ sudo bash
   root@caps:/home/ubuntu# unshare -Ur
   root@caps:/home/ubuntu# logout

3. Root user without CAP_SETFCAP cannot unshare -Ur:

   root@caps:/home/ubuntu# /sbin/capsh --drop=cap_setfcap --
   root@caps:/home/ubuntu# /sbin/setcap cap_setfcap=p /sbin/setcap
   unable to set CAP_SETFCAP effective capability: Operation not permitted
   root@caps:/home/ubuntu# unshare -Ur
   unshare: write failed /proc/self/uid_map: Operation not permitted

Note: an alternative solution would be to allow uid 0 mappings by
processes without CAP_SETFCAP, but to prevent such a namespace from
writing any file capabilities.  This approach can be seen at [1].

Background history: commit 95ebabde382c ("capabilities: Don't allow
writing ambiguous v3 file capabilities") tried to fix the issue by
preventing v3 fscaps from being written to disk when the root uid would
map to the same uid in nested user namespaces.  This led to regressions
for various workloads.  For example, see [2].  Ultimately this is a valid
use-case we have to support, meaning we had to revert this change in
3b0c2d3eaa83 ("Revert 95ebabde382c ("capabilities: Don't allow writing
ambiguous v3 file capabilities")").

Link: https://git.kernel.org/pub/scm/linux/kernel/git/sergeh/linux.git/log/?h=2021-04-15/setfcap-nsfscaps-v4 [1]
Link: https://github.com/containers/buildah/issues/3071 [2]
Signed-off-by: Serge Hallyn <serge@hallyn.com>
Reviewed-by: Andrew G. Morgan <morgan@kernel.org>
Tested-by: Christian Brauner <christian.brauner@ubuntu.com>
Reviewed-by: Christian Brauner <christian.brauner@ubuntu.com>
Tested-by: Giuseppe Scrivano <gscrivan@redhat.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/user_namespace.h  |  3 ++
 include/uapi/linux/capability.h |  3 +-
 kernel/user_namespace.c         | 65 +++++++++++++++++++++++++++++++++++++++--
 3 files changed, 67 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 64cf8ebdc4ec..f6c5f784be5a 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -63,6 +63,9 @@ struct user_namespace {
 	kgid_t			group;
 	struct ns_common	ns;
 	unsigned long		flags;
+	/* parent_could_setfcap: true if the creator if this ns had CAP_SETFCAP
+	 * in its effective capability set at the child ns creation time. */
+	bool			parent_could_setfcap;
 
 #ifdef CONFIG_KEYS
 	/* List of joinable keyrings in this namespace.  Modification access of
diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
index c6ca33034147..2ddb4226cd23 100644
--- a/include/uapi/linux/capability.h
+++ b/include/uapi/linux/capability.h
@@ -335,7 +335,8 @@ struct vfs_ns_cap_data {
 
 #define CAP_AUDIT_CONTROL    30
 
-/* Set or remove capabilities on files */
+/* Set or remove capabilities on files.
+   Map uid=0 into a child user namespace. */
 
 #define CAP_SETFCAP	     31
 
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index af612945a4d0..9a4b980d695b 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -106,6 +106,7 @@ int create_user_ns(struct cred *new)
 	if (!ns)
 		goto fail_dec;
 
+	ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP);
 	ret = ns_alloc_inum(&ns->ns);
 	if (ret)
 		goto fail_free;
@@ -841,6 +842,60 @@ static int sort_idmaps(struct uid_gid_map *map)
 	return 0;
 }
 
+/**
+ * verify_root_map() - check the uid 0 mapping
+ * @file: idmapping file
+ * @map_ns: user namespace of the target process
+ * @new_map: requested idmap
+ *
+ * If a process requests mapping parent uid 0 into the new ns, verify that the
+ * process writing the map had the CAP_SETFCAP capability as the target process
+ * will be able to write fscaps that are valid in ancestor user namespaces.
+ *
+ * Return: true if the mapping is allowed, false if not.
+ */
+static bool verify_root_map(const struct file *file,
+			    struct user_namespace *map_ns,
+			    struct uid_gid_map *new_map)
+{
+	int idx;
+	const struct user_namespace *file_ns = file->f_cred->user_ns;
+	struct uid_gid_extent *extent0 = NULL;
+
+	for (idx = 0; idx < new_map->nr_extents; idx++) {
+		if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+			extent0 = &new_map->extent[idx];
+		else
+			extent0 = &new_map->forward[idx];
+		if (extent0->lower_first == 0)
+			break;
+
+		extent0 = NULL;
+	}
+
+	if (!extent0)
+		return true;
+
+	if (map_ns == file_ns) {
+		/* The process unshared its ns and is writing to its own
+		 * /proc/self/uid_map.  User already has full capabilites in
+		 * the new namespace.  Verify that the parent had CAP_SETFCAP
+		 * when it unshared.
+		 * */
+		if (!file_ns->parent_could_setfcap)
+			return false;
+	} else {
+		/* Process p1 is writing to uid_map of p2, who is in a child
+		 * user namespace to p1's.  Verify that the opener of the map
+		 * file has CAP_SETFCAP against the parent of the new map
+		 * namespace */
+		if (!file_ns_capable(file, map_ns->parent, CAP_SETFCAP))
+			return false;
+	}
+
+	return true;
+}
+
 static ssize_t map_write(struct file *file, const char __user *buf,
 			 size_t count, loff_t *ppos,
 			 int cap_setid,
@@ -848,7 +903,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
 			 struct uid_gid_map *parent_map)
 {
 	struct seq_file *seq = file->private_data;
-	struct user_namespace *ns = seq->private;
+	struct user_namespace *map_ns = seq->private;
 	struct uid_gid_map new_map;
 	unsigned idx;
 	struct uid_gid_extent extent;
@@ -895,7 +950,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
 	/*
 	 * Adjusting namespace settings requires capabilities on the target.
 	 */
-	if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN))
+	if (cap_valid(cap_setid) && !file_ns_capable(file, map_ns, CAP_SYS_ADMIN))
 		goto out;
 
 	/* Parse the user data */
@@ -965,7 +1020,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
 
 	ret = -EPERM;
 	/* Validate the user is allowed to use user id's mapped to. */
-	if (!new_idmap_permitted(file, ns, cap_setid, &new_map))
+	if (!new_idmap_permitted(file, map_ns, cap_setid, &new_map))
 		goto out;
 
 	ret = -EPERM;
@@ -1086,6 +1141,10 @@ static bool new_idmap_permitted(const struct file *file,
 				struct uid_gid_map *new_map)
 {
 	const struct cred *cred = file->f_cred;
+
+	if (cap_setid == CAP_SETUID && !verify_root_map(file, ns, new_map))
+		return false;
+
 	/* Don't allow mappings that would allow anything that wouldn't
 	 * be allowed without the establishment of unprivileged mappings.
 	 */
-- 
cgit v1.2.3


From 014068dcb5b17dae110354c4de241833124edba1 Mon Sep 17 00:00:00 2001
From: Oleksij Rempel <o.rempel@pengutronix.de>
Date: Mon, 19 Apr 2021 15:01:02 +0200
Subject: net: phy: genphy_loopback: add link speed configuration

In case of loopback, in most cases we need to disable autoneg support
and force some speed configuration. Otherwise, depending on the currently
active auto-negotiated link speed, the loopback may or may not work.

This patch was tested with the following PHYs: TJA1102, KSZ8081, KSZ9031,
AT8035, AR9331.

Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy.c        |  3 ++-
 drivers/net/phy/phy_device.c | 28 ++++++++++++++++++++++++++--
 include/linux/phy.h          |  1 +
 3 files changed, 29 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index fc2e7cb5b2e5..1f0512e39c65 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -701,7 +701,7 @@ out:
 }
 EXPORT_SYMBOL(phy_start_cable_test_tdr);
 
-static int phy_config_aneg(struct phy_device *phydev)
+int phy_config_aneg(struct phy_device *phydev)
 {
 	if (phydev->drv->config_aneg)
 		return phydev->drv->config_aneg(phydev);
@@ -714,6 +714,7 @@ static int phy_config_aneg(struct phy_device *phydev)
 
 	return genphy_config_aneg(phydev);
 }
+EXPORT_SYMBOL(phy_config_aneg);
 
 /**
  * phy_check_link_status - check link status and set state accordingly
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 320a3e5cd10a..0a2d8bedf73d 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -2565,8 +2565,32 @@ EXPORT_SYMBOL(genphy_resume);
 
 int genphy_loopback(struct phy_device *phydev, bool enable)
 {
-	return phy_modify(phydev, MII_BMCR, BMCR_LOOPBACK,
-			  enable ? BMCR_LOOPBACK : 0);
+	if (enable) {
+		u16 val, ctl = BMCR_LOOPBACK;
+		int ret;
+
+		if (phydev->speed == SPEED_1000)
+			ctl |= BMCR_SPEED1000;
+		else if (phydev->speed == SPEED_100)
+			ctl |= BMCR_SPEED100;
+
+		if (phydev->duplex == DUPLEX_FULL)
+			ctl |= BMCR_FULLDPLX;
+
+		phy_modify(phydev, MII_BMCR, ~0, ctl);
+
+		ret = phy_read_poll_timeout(phydev, MII_BMSR, val,
+					    val & BMSR_LSTATUS,
+				    5000, 500000, true);
+		if (ret)
+			return ret;
+	} else {
+		phy_modify(phydev, MII_BMCR, BMCR_LOOPBACK, 0);
+
+		phy_config_aneg(phydev);
+	}
+
+	return 0;
 }
 EXPORT_SYMBOL(genphy_loopback);
 
diff --git a/include/linux/phy.h b/include/linux/phy.h
index e3d4d583463b..60d2b26026a2 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1410,6 +1410,7 @@ void phy_disconnect(struct phy_device *phydev);
 void phy_detach(struct phy_device *phydev);
 void phy_start(struct phy_device *phydev);
 void phy_stop(struct phy_device *phydev);
+int phy_config_aneg(struct phy_device *phydev);
 int phy_start_aneg(struct phy_device *phydev);
 int phy_aneg_done(struct phy_device *phydev);
 int phy_speed_down(struct phy_device *phydev, bool sync);
-- 
cgit v1.2.3


From a978f7c479ea68d68a6267a37cbd44362bdd9811 Mon Sep 17 00:00:00 2001
From: Marek Behún <kabel@kernel.org>
Date: Tue, 20 Apr 2021 09:54:03 +0200
Subject: net: phy: marvell: add support for Amethyst internal PHY
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add support for Amethyst internal PHY.

The only difference from Peridot is HWMON.

Signed-off-by: Marek Behún <kabel@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/marvell.c   | 117 ++++++++++++++++++++++++++++++++++++++++++--
 include/linux/marvell_phy.h |   1 +
 2 files changed, 115 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c
index e505060d0743..1cce86b280af 100644
--- a/drivers/net/phy/marvell.c
+++ b/drivers/net/phy/marvell.c
@@ -118,10 +118,21 @@
 #define MII_88E6390_MISC_TEST_TEMP_SENSOR_ENABLE_ONESHOT	(0x2 << 14)
 #define MII_88E6390_MISC_TEST_TEMP_SENSOR_DISABLE		(0x3 << 14)
 #define MII_88E6390_MISC_TEST_TEMP_SENSOR_MASK			(0x3 << 14)
+#define MII_88E6393_MISC_TEST_SAMPLES_2048	(0x0 << 11)
+#define MII_88E6393_MISC_TEST_SAMPLES_4096	(0x1 << 11)
+#define MII_88E6393_MISC_TEST_SAMPLES_8192	(0x2 << 11)
+#define MII_88E6393_MISC_TEST_SAMPLES_16384	(0x3 << 11)
+#define MII_88E6393_MISC_TEST_SAMPLES_MASK	(0x3 << 11)
+#define MII_88E6393_MISC_TEST_RATE_2_3MS	(0x5 << 8)
+#define MII_88E6393_MISC_TEST_RATE_6_4MS	(0x6 << 8)
+#define MII_88E6393_MISC_TEST_RATE_11_9MS	(0x7 << 8)
+#define MII_88E6393_MISC_TEST_RATE_MASK		(0x7 << 8)
 
 #define MII_88E6390_TEMP_SENSOR		0x1c
-#define MII_88E6390_TEMP_SENSOR_MASK	0xff
-#define MII_88E6390_TEMP_SENSOR_SAMPLES 10
+#define MII_88E6393_TEMP_SENSOR_THRESHOLD_MASK	0xff00
+#define MII_88E6393_TEMP_SENSOR_THRESHOLD_SHIFT	8
+#define MII_88E6390_TEMP_SENSOR_MASK		0xff
+#define MII_88E6390_TEMP_SENSOR_SAMPLES		10
 
 #define MII_88E1318S_PHY_MSCR1_REG	16
 #define MII_88E1318S_PHY_MSCR1_PAD_ODD	BIT(6)
@@ -2217,6 +2228,7 @@ static int marvell_vct7_cable_test_get_status(struct phy_device *phydev,
 
 #ifdef CONFIG_HWMON
 struct marvell_hwmon_ops {
+	int (*config)(struct phy_device *phydev);
 	int (*get_temp)(struct phy_device *phydev, long *temp);
 	int (*get_temp_critical)(struct phy_device *phydev, long *temp);
 	int (*set_temp_critical)(struct phy_device *phydev, long temp);
@@ -2391,6 +2403,65 @@ error:
 	return ret;
 }
 
+static int m88e6393_get_temp(struct phy_device *phydev, long *temp)
+{
+	int err;
+
+	err = m88e1510_get_temp(phydev, temp);
+
+	/* 88E1510 measures T + 25, while the PHY on 88E6393X switch
+	 * T + 75, so we have to subtract another 50
+	 */
+	*temp -= 50000;
+
+	return err;
+}
+
+static int m88e6393_get_temp_critical(struct phy_device *phydev, long *temp)
+{
+	int ret;
+
+	*temp = 0;
+
+	ret = phy_read_paged(phydev, MII_MARVELL_MISC_TEST_PAGE,
+			     MII_88E6390_TEMP_SENSOR);
+	if (ret < 0)
+		return ret;
+
+	*temp = (((ret & MII_88E6393_TEMP_SENSOR_THRESHOLD_MASK) >>
+		  MII_88E6393_TEMP_SENSOR_THRESHOLD_SHIFT) - 75) * 1000;
+
+	return 0;
+}
+
+static int m88e6393_set_temp_critical(struct phy_device *phydev, long temp)
+{
+	temp = (temp / 1000) + 75;
+
+	return phy_modify_paged(phydev, MII_MARVELL_MISC_TEST_PAGE,
+				MII_88E6390_TEMP_SENSOR,
+				MII_88E6393_TEMP_SENSOR_THRESHOLD_MASK,
+				temp << MII_88E6393_TEMP_SENSOR_THRESHOLD_SHIFT);
+}
+
+static int m88e6393_hwmon_config(struct phy_device *phydev)
+{
+	int err;
+
+	err = m88e6393_set_temp_critical(phydev, 100000);
+	if (err)
+		return err;
+
+	return phy_modify_paged(phydev, MII_MARVELL_MISC_TEST_PAGE,
+				MII_88E6390_MISC_TEST,
+				MII_88E6390_MISC_TEST_TEMP_SENSOR_MASK |
+				MII_88E6393_MISC_TEST_SAMPLES_MASK |
+				MII_88E6393_MISC_TEST_RATE_MASK,
+				MII_88E6390_MISC_TEST_TEMP_SENSOR_ENABLE |
+				MII_88E6393_MISC_TEST_SAMPLES_2048 |
+				MII_88E6393_MISC_TEST_RATE_2_3MS);
+}
+
 static int marvell_hwmon_read(struct device *dev, enum hwmon_sensor_types type,
 			      u32 attr, int channel, long *temp)
 {
@@ -2535,8 +2606,13 @@ static int marvell_hwmon_probe(struct phy_device *phydev)
 
 	priv->hwmon_dev = devm_hwmon_device_register_with_info(
 		dev, priv->hwmon_name, phydev, &marvell_hwmon_chip_info, NULL);
+	if (IS_ERR(priv->hwmon_dev))
+		return PTR_ERR(priv->hwmon_dev);
 
-	return PTR_ERR_OR_ZERO(priv->hwmon_dev);
+	if (ops->config)
+		err = ops->config(phydev);
+
+	return err;
 }
 
 static const struct marvell_hwmon_ops m88e1121_hwmon_ops = {
@@ -2554,6 +2630,14 @@ static const struct marvell_hwmon_ops m88e6390_hwmon_ops = {
 	.get_temp = m88e6390_get_temp,
 };
 
+static const struct marvell_hwmon_ops m88e6393_hwmon_ops = {
+	.config = m88e6393_hwmon_config,
+	.get_temp = m88e6393_get_temp,
+	.get_temp_critical = m88e6393_get_temp_critical,
+	.set_temp_critical = m88e6393_set_temp_critical,
+	.get_temp_alarm = m88e1510_get_temp_alarm,
+};
+
 #define DEF_MARVELL_HWMON_OPS(s) (&(s))
 
 #else
@@ -2948,6 +3032,32 @@ static struct phy_driver marvell_drivers[] = {
 		.cable_test_tdr_start = marvell_vct5_cable_test_tdr_start,
 		.cable_test_get_status = marvell_vct7_cable_test_get_status,
 	},
+	{
+		.phy_id = MARVELL_PHY_ID_88E6393_FAMILY,
+		.phy_id_mask = MARVELL_PHY_ID_MASK,
+		.name = "Marvell 88E6393 Family",
+		.driver_data = DEF_MARVELL_HWMON_OPS(m88e6393_hwmon_ops),
+		/* PHY_GBIT_FEATURES */
+		.flags = PHY_POLL_CABLE_TEST,
+		.probe = marvell_probe,
+		.config_init = marvell_config_init,
+		.config_aneg = m88e1510_config_aneg,
+		.read_status = marvell_read_status,
+		.config_intr = marvell_config_intr,
+		.handle_interrupt = marvell_handle_interrupt,
+		.resume = genphy_resume,
+		.suspend = genphy_suspend,
+		.read_page = marvell_read_page,
+		.write_page = marvell_write_page,
+		.get_sset_count = marvell_get_sset_count,
+		.get_strings = marvell_get_strings,
+		.get_stats = marvell_get_stats,
+		.get_tunable = m88e1540_get_tunable,
+		.set_tunable = m88e1540_set_tunable,
+		.cable_test_start = marvell_vct7_cable_test_start,
+		.cable_test_tdr_start = marvell_vct5_cable_test_tdr_start,
+		.cable_test_get_status = marvell_vct7_cable_test_get_status,
+	},
 	{
 		.phy_id = MARVELL_PHY_ID_88E1340S,
 		.phy_id_mask = MARVELL_PHY_ID_MASK,
@@ -3014,6 +3124,7 @@ static struct mdio_device_id __maybe_unused marvell_tbl[] = {
 	{ MARVELL_PHY_ID_88E3016, MARVELL_PHY_ID_MASK },
 	{ MARVELL_PHY_ID_88E6341_FAMILY, MARVELL_PHY_ID_MASK },
 	{ MARVELL_PHY_ID_88E6390_FAMILY, MARVELL_PHY_ID_MASK },
+	{ MARVELL_PHY_ID_88E6393_FAMILY, MARVELL_PHY_ID_MASK },
 	{ MARVELL_PHY_ID_88E1340S, MARVELL_PHY_ID_MASK },
 	{ MARVELL_PHY_ID_88E1548P, MARVELL_PHY_ID_MASK },
 	{ }
diff --git a/include/linux/marvell_phy.h b/include/linux/marvell_phy.h
index f61d82c53f30..acee44b9db26 100644
--- a/include/linux/marvell_phy.h
+++ b/include/linux/marvell_phy.h
@@ -39,6 +39,7 @@
  */
 #define MARVELL_PHY_ID_88E6341_FAMILY	0x01410f41
 #define MARVELL_PHY_ID_88E6390_FAMILY	0x01410f90
+#define MARVELL_PHY_ID_88E6393_FAMILY	0x002b0b9b
 
 #define MARVELL_PHY_FAMILY_ID(id)	((id) >> 4)
 
-- 
cgit v1.2.3


From d99f2487e1de23a2e902d1a359a85a48bfd21fe7 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Wed, 21 Apr 2021 07:48:43 -0400
Subject: NFS: The 'fattr_valid' field in struct nfs_server should be unsigned
 int

Fix up a static compiler warning:
"fs/nfs/nfs4proc.c:3882 _nfs4_server_capabilities() warn: was expecting
a 64 bit value instead of '(1 << 11)'"

The fix is to convert the fattr_valid field to match the type of the
'valid' field in struct nfs_fattr.

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/nfs_fs_sb.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index d28d7a62864f..70057b2e606e 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -156,6 +156,7 @@ struct nfs_server {
 #define NFS_MOUNT_WRITE_EAGER		0x01000000
 #define NFS_MOUNT_WRITE_WAIT		0x02000000
 
+	unsigned int		fattr_valid;	/* Valid attributes */
 	unsigned int		caps;		/* server capabilities */
 	unsigned int		rsize;		/* read size */
 	unsigned int		rpages;		/* read size (in pages) */
@@ -191,8 +192,6 @@ struct nfs_server {
 	dev_t			s_dev;		/* superblock dev numbers */
 	struct nfs_auth_info	auth_info;	/* parsed auth flavors */
 
-	__u64			fattr_valid;	/* Valid attributes */
-
 #ifdef CONFIG_NFS_FSCACHE
 	struct nfs_fscache_key	*fscache_key;	/* unique key for superblock */
 	struct fscache_cookie	*fscache;	/* superblock cookie */
-- 
cgit v1.2.3


From c006fac556e401a62054d065da168099ea5a5b10 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Fri, 16 Apr 2021 14:29:36 -0700
Subject: sched: Warn on long periods of pending need_resched

CPU scheduler marks need_resched flag to signal a schedule() on a
particular CPU. But, schedule() may not happen immediately in cases
where the current task is executing in the kernel mode (no
preemption state) for extended periods of time.

This patch adds a warn_on if need_resched is pending for more than the
time specified in sysctl resched_latency_warn_ms. If it goes off, it is
likely that there is a missing cond_resched() somewhere. Monitoring is
done via the tick and the accuracy is hence limited to jiffy scale. This
also means that we won't trigger the warning if the tick is disabled.

This feature (LATENCY_WARN) is disabled by default.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Josh Don <joshdon@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20210416212936.390566-1-joshdon@google.com
---
 include/linux/sched/sysctl.h |  3 ++
 kernel/sched/core.c          | 70 +++++++++++++++++++++++++++++++++++++++++++-
 kernel/sched/debug.c         | 13 ++++++++
 kernel/sched/features.h      |  2 ++
 kernel/sched/sched.h         | 10 +++++++
 5 files changed, 97 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 0a3f34638cf5..db2c0f34aaaf 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -48,6 +48,9 @@ extern unsigned int sysctl_numa_balancing_scan_size;
 #ifdef CONFIG_SCHED_DEBUG
 extern __read_mostly unsigned int sysctl_sched_migration_cost;
 extern __read_mostly unsigned int sysctl_sched_nr_migrate;
+
+extern int sysctl_resched_latency_warn_ms;
+extern int sysctl_resched_latency_warn_once;
 #endif
 
 /*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e6c714b9b211..fcb35ae15619 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -58,7 +58,17 @@ const_debug unsigned int sysctl_sched_features =
 #include "features.h"
 	0;
 #undef SCHED_FEAT
-#endif
+
+/*
+ * Print a warning if need_resched is set for the given duration (if
+ * LATENCY_WARN is enabled).
+ *
+ * If sysctl_resched_latency_warn_once is set, only one warning will be shown
+ * per boot.
+ */
+__read_mostly int sysctl_resched_latency_warn_ms = 100;
+__read_mostly int sysctl_resched_latency_warn_once = 1;
+#endif /* CONFIG_SCHED_DEBUG */
 
 /*
  * Number of tasks to iterate in a single balance run.
@@ -4527,6 +4537,55 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 	return ns;
 }
 
+#ifdef CONFIG_SCHED_DEBUG
+static u64 cpu_resched_latency(struct rq *rq)
+{
+	int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms);
+	u64 resched_latency, now = rq_clock(rq);
+	static bool warned_once;
+
+	if (sysctl_resched_latency_warn_once && warned_once)
+		return 0;
+
+	if (!need_resched() || !latency_warn_ms)
+		return 0;
+
+	if (system_state == SYSTEM_BOOTING)
+		return 0;
+
+	if (!rq->last_seen_need_resched_ns) {
+		rq->last_seen_need_resched_ns = now;
+		rq->ticks_without_resched = 0;
+		return 0;
+	}
+
+	rq->ticks_without_resched++;
+	resched_latency = now - rq->last_seen_need_resched_ns;
+	if (resched_latency <= latency_warn_ms * NSEC_PER_MSEC)
+		return 0;
+
+	warned_once = true;
+
+	return resched_latency;
+}
+
+static int __init setup_resched_latency_warn_ms(char *str)
+{
+	long val;
+
+	if ((kstrtol(str, 0, &val))) {
+		pr_warn("Unable to set resched_latency_warn_ms\n");
+		return 1;
+	}
+
+	sysctl_resched_latency_warn_ms = val;
+	return 1;
+}
+__setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms);
+#else
+static inline u64 cpu_resched_latency(struct rq *rq) { return 0; }
+#endif /* CONFIG_SCHED_DEBUG */
+
 /*
  * This function gets called by the timer code, with HZ frequency.
  * We call it with interrupts disabled.
@@ -4538,6 +4597,7 @@ void scheduler_tick(void)
 	struct task_struct *curr = rq->curr;
 	struct rq_flags rf;
 	unsigned long thermal_pressure;
+	u64 resched_latency;
 
 	arch_scale_freq_tick();
 	sched_clock_tick();
@@ -4548,10 +4608,15 @@ void scheduler_tick(void)
 	thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
 	update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
 	curr->sched_class->task_tick(rq, curr, 0);
+	if (sched_feat(LATENCY_WARN))
+		resched_latency = cpu_resched_latency(rq);
 	calc_global_load_tick(rq);
 
 	rq_unlock(rq, &rf);
 
+	if (sched_feat(LATENCY_WARN) && resched_latency)
+		resched_latency_warn(cpu, resched_latency);
+
 	perf_event_task_tick();
 
 #ifdef CONFIG_SMP
@@ -5046,6 +5111,9 @@ static void __sched notrace __schedule(bool preempt)
 	next = pick_next_task(rq, prev, &rf);
 	clear_tsk_need_resched(prev);
 	clear_preempt_need_resched();
+#ifdef CONFIG_SCHED_DEBUG
+	rq->last_seen_need_resched_ns = 0;
+#endif
 
 	if (likely(prev != next)) {
 		rq->nr_switches++;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 461342f598ef..7251fc477a11 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -309,6 +309,9 @@ static __init int sched_init_debug(void)
 	debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity);
 	debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity);
 
+	debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms);
+	debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once);
+
 #ifdef CONFIG_SMP
 	debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops);
 	debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost);
@@ -1027,3 +1030,13 @@ void proc_sched_set_task(struct task_struct *p)
 	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
 #endif
 }
+
+void resched_latency_warn(int cpu, u64 latency)
+{
+	static DEFINE_RATELIMIT_STATE(latency_check_ratelimit, 60 * 60 * HZ, 1);
+
+	WARN(__ratelimit(&latency_check_ratelimit),
+	     "sched: CPU %d need_resched set for > %llu ns (%d ticks) "
+	     "without schedule\n",
+	     cpu, latency, cpu_rq(cpu)->ticks_without_resched);
+}
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 011c5ec7b7b5..7f8dace0964c 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -91,5 +91,7 @@ SCHED_FEAT(WA_BIAS, true)
 SCHED_FEAT(UTIL_EST, true)
 SCHED_FEAT(UTIL_EST_FASTUP, true)
 
+SCHED_FEAT(LATENCY_WARN, false)
+
 SCHED_FEAT(ALT_PERIOD, true)
 SCHED_FEAT(BASE_SLICE, true)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index bde72482cbb8..a189bec13729 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -58,6 +58,7 @@
 #include <linux/prefetch.h>
 #include <linux/profile.h>
 #include <linux/psi.h>
+#include <linux/ratelimit.h>
 #include <linux/rcupdate_wait.h>
 #include <linux/security.h>
 #include <linux/stop_machine.h>
@@ -971,6 +972,11 @@ struct rq {
 
 	atomic_t		nr_iowait;
 
+#ifdef CONFIG_SCHED_DEBUG
+	u64 last_seen_need_resched_ns;
+	int ticks_without_resched;
+#endif
+
 #ifdef CONFIG_MEMBARRIER
 	int membarrier_state;
 #endif
@@ -2371,6 +2377,8 @@ extern void print_dl_stats(struct seq_file *m, int cpu);
 extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
 extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
 extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
+
+extern void resched_latency_warn(int cpu, u64 latency);
 #ifdef CONFIG_NUMA_BALANCING
 extern void
 show_numa_stats(struct task_struct *p, struct seq_file *m);
@@ -2378,6 +2386,8 @@ extern void
 print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
 	unsigned long tpf, unsigned long gsf, unsigned long gpf);
 #endif /* CONFIG_NUMA_BALANCING */
+#else
+static inline void resched_latency_warn(int cpu, u64 latency) {}
 #endif /* CONFIG_SCHED_DEBUG */
 
 extern void init_cfs_rq(struct cfs_rq *cfs_rq);
-- 
cgit v1.2.3


From ddd8d94ca31e768c76cf8bfe34ba7b10136b3694 Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Sat, 17 Apr 2021 11:38:39 +0300
Subject: gpio: omap: Save and restore sysconfig

As we are using cpu_pm to save and restore context, we must also save and
restore the GPIO sysconfig register. This is needed because we are not
calling PM runtime functions at all with cpu_pm.

We need to save the sysconfig on idle as its value can get reconfigured by
PM runtime and can be different from the init time value. Device specific
flags like "ti,no-idle-on-init" can affect the init value.

Fixes: b764a5863fd8 ("gpio: omap: Remove custom PM calls and use cpu_pm instead")
Cc: Aaro Koskinen <aaro.koskinen@iki.fi>
Cc: Adam Ford <aford173@gmail.com>
Cc: Andreas Kemnade <andreas@kemnade.info>
Cc: Grygorii Strashko <grygorii.strashko@ti.com>
Cc: Peter Ujfalusi <peter.ujfalusi@gmail.com>
Signed-off-by: Tony Lindgren <tony@atomide.com>
Acked-by: Grygorii Strashko <grygorii.strashko@ti.com>
Signed-off-by: Bartosz Golaszewski <bgolaszewski@baylibre.com>
---
 drivers/gpio/gpio-omap.c                | 9 +++++++++
 include/linux/platform_data/gpio-omap.h | 3 +++
 2 files changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpio-omap.c b/drivers/gpio/gpio-omap.c
index 41952bb818ad..56152263ab38 100644
--- a/drivers/gpio/gpio-omap.c
+++ b/drivers/gpio/gpio-omap.c
@@ -29,6 +29,7 @@
 #define OMAP4_GPIO_DEBOUNCINGTIME_MASK 0xFF
 
 struct gpio_regs {
+	u32 sysconfig;
 	u32 irqenable1;
 	u32 irqenable2;
 	u32 wake_en;
@@ -1069,6 +1070,7 @@ static void omap_gpio_init_context(struct gpio_bank *p)
 	const struct omap_gpio_reg_offs *regs = p->regs;
 	void __iomem *base = p->base;
 
+	p->context.sysconfig	= readl_relaxed(base + regs->sysconfig);
 	p->context.ctrl		= readl_relaxed(base + regs->ctrl);
 	p->context.oe		= readl_relaxed(base + regs->direction);
 	p->context.wake_en	= readl_relaxed(base + regs->wkup_en);
@@ -1088,6 +1090,7 @@ static void omap_gpio_restore_context(struct gpio_bank *bank)
 	const struct omap_gpio_reg_offs *regs = bank->regs;
 	void __iomem *base = bank->base;
 
+	writel_relaxed(bank->context.sysconfig, base + regs->sysconfig);
 	writel_relaxed(bank->context.wake_en, base + regs->wkup_en);
 	writel_relaxed(bank->context.ctrl, base + regs->ctrl);
 	writel_relaxed(bank->context.leveldetect0, base + regs->leveldetect0);
@@ -1115,6 +1118,10 @@ static void omap_gpio_idle(struct gpio_bank *bank, bool may_lose_context)
 
 	bank->saved_datain = readl_relaxed(base + bank->regs->datain);
 
+	/* Save syconfig, it's runtime value can be different from init value */
+	if (bank->loses_context)
+		bank->context.sysconfig = readl_relaxed(base + bank->regs->sysconfig);
+
 	if (!bank->enabled_non_wakeup_gpios)
 		goto update_gpio_context_count;
 
@@ -1279,6 +1286,7 @@ out_unlock:
 
 static const struct omap_gpio_reg_offs omap2_gpio_regs = {
 	.revision =		OMAP24XX_GPIO_REVISION,
+	.sysconfig =		OMAP24XX_GPIO_SYSCONFIG,
 	.direction =		OMAP24XX_GPIO_OE,
 	.datain =		OMAP24XX_GPIO_DATAIN,
 	.dataout =		OMAP24XX_GPIO_DATAOUT,
@@ -1302,6 +1310,7 @@ static const struct omap_gpio_reg_offs omap2_gpio_regs = {
 
 static const struct omap_gpio_reg_offs omap4_gpio_regs = {
 	.revision =		OMAP4_GPIO_REVISION,
+	.sysconfig =		OMAP4_GPIO_SYSCONFIG,
 	.direction =		OMAP4_GPIO_OE,
 	.datain =		OMAP4_GPIO_DATAIN,
 	.dataout =		OMAP4_GPIO_DATAOUT,
diff --git a/include/linux/platform_data/gpio-omap.h b/include/linux/platform_data/gpio-omap.h
index 8b30b14b47d3..f377817ce75c 100644
--- a/include/linux/platform_data/gpio-omap.h
+++ b/include/linux/platform_data/gpio-omap.h
@@ -85,6 +85,7 @@
  * omap2+ specific GPIO registers
  */
 #define OMAP24XX_GPIO_REVISION		0x0000
+#define OMAP24XX_GPIO_SYSCONFIG		0x0010
 #define OMAP24XX_GPIO_IRQSTATUS1	0x0018
 #define OMAP24XX_GPIO_IRQSTATUS2	0x0028
 #define OMAP24XX_GPIO_IRQENABLE2	0x002c
@@ -108,6 +109,7 @@
 #define OMAP24XX_GPIO_SETDATAOUT	0x0094
 
 #define OMAP4_GPIO_REVISION		0x0000
+#define OMAP4_GPIO_SYSCONFIG		0x0010
 #define OMAP4_GPIO_EOI			0x0020
 #define OMAP4_GPIO_IRQSTATUSRAW0	0x0024
 #define OMAP4_GPIO_IRQSTATUSRAW1	0x0028
@@ -148,6 +150,7 @@
 #ifndef __ASSEMBLER__
 struct omap_gpio_reg_offs {
 	u16 revision;
+	u16 sysconfig;
 	u16 direction;
 	u16 datain;
 	u16 dataout;
-- 
cgit v1.2.3


From c968b89a1d446ec4a1ed3022ebd79d36de5ea1eb Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Fri, 16 Apr 2021 23:12:44 -0700
Subject: of: linux/of.h: fix kernel-doc warnings

Correct kernel-doc notation warnings:

../include/linux/of.h:1211: warning: Function parameter or member 'output' not described in 'of_property_read_string_index'
../include/linux/of.h:1211: warning: Excess function parameter 'out_string' description in 'of_property_read_string_index'
../include/linux/of.h:1477: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst
 * Overlay support

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20210417061244.2262-1-rdunlap@infradead.org
---
 include/linux/of.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/of.h b/include/linux/of.h
index ef6b161d1f91..d8db8d3592fd 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -1192,7 +1192,7 @@ static inline int of_property_count_strings(const struct device_node *np,
  * @np:		device node from which the property value is to be read.
  * @propname:	name of the property to be searched.
  * @index:	index of the string in the list of strings
- * @out_string:	pointer to null terminated return string, modified only if
+ * @output:	pointer to null terminated return string, modified only if
  *		return value is 0.
  *
  * Search for a property in a device tree node and retrieve a null
@@ -1473,7 +1473,7 @@ static inline bool of_device_is_system_power_controller(const struct device_node
 	return of_property_read_bool(np, "system-power-controller");
 }
 
-/**
+/*
  * Overlay support
  */
 
-- 
cgit v1.2.3


From 54526d1fd59338fd6a381dbd806b7ccbae3aa4aa Mon Sep 17 00:00:00 2001
From: Nathan Tempelman <natet@google.com>
Date: Thu, 8 Apr 2021 22:32:14 +0000
Subject: KVM: x86: Support KVM VMs sharing SEV context

Add a capability for userspace to mirror SEV encryption context from
one vm to another. On our side, this is intended to support a
Migration Helper vCPU, but it can also be used generically to support
other in-guest workloads scheduled by the host. The intention is for
the primary guest and the mirror to have nearly identical memslots.

The primary benefits of this are that:
1) The VMs do not share KVM contexts (think APIC/MSRs/etc), so they
can't accidentally clobber each other.
2) The VMs can have different memory-views, which is necessary for post-copy
migration (the migration vCPUs on the target need to read and write to
pages, when the primary guest would VMEXIT).

This does not change the threat model for AMD SEV. Any memory involved
is still owned by the primary guest and its initial state is still
attested to through the normal SEV_LAUNCH_* flows. If userspace wanted
to circumvent SEV, they could achieve the same effect by simply attaching
a vCPU to the primary VM.
This patch deliberately leaves userspace in charge of the memslots for the
mirror, as it already has the power to mess with them in the primary guest.

This patch does not support SEV-ES (much less SNP), as it does not
handle handing off attested VMSAs to the mirror.

For additional context, we need a Migration Helper because SEV PSP
migration is far too slow for our live migration on its own. Using
an in-guest migrator lets us speed this up significantly.

Signed-off-by: Nathan Tempelman <natet@google.com>
Message-Id: <20210408223214.2582277-1-natet@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virt/kvm/api.rst  | 18 ++++++++-
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/svm/sev.c          | 90 +++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/svm/svm.c          |  2 +
 arch/x86/kvm/svm/svm.h          |  2 +
 arch/x86/kvm/x86.c              |  7 +++-
 include/linux/kvm_host.h        |  1 +
 include/uapi/linux/kvm.h        |  1 +
 virt/kvm/kvm_main.c             |  6 +++
 9 files changed, 126 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 414b7fe1cf7b..fd4a84911355 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6207,6 +6207,22 @@ KVM_RUN_BUS_LOCK flag is used to distinguish between them.
 This capability can be used to check / enable 2nd DAWR feature provided
 by POWER10 processor.
 
+7.24 KVM_CAP_VM_COPY_ENC_CONTEXT_FROM
+-------------------------------------
+
+Architectures: x86 SEV enabled
+Type: vm
+Parameters: args[0] is the fd of the source vm
+Returns: 0 on success; ENOTTY on error
+
+This capability enables userspace to copy encryption context from the vm
+indicated by the fd to the vm this is called on.
+
+This is intended to support in-guest workloads scheduled by the host. This
+allows the in-guest workload to maintain its own NPTs and keeps the two vms
+from accidentally clobbering each other with interrupts and the like (separate
+APIC/MSRs/etc).
+
 7.25 KVM_CAP_SGX_ATTRIBUTE
 ----------------------
 
@@ -6749,4 +6765,4 @@ in the kernel based fast path. If they can not be handled by the kernel,
 they will get passed on to user space. So user space still has to have
 an implementation for these despite the in kernel acceleration.
 
-This capability is always enabled.
\ No newline at end of file
+This capability is always enabled.
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c5c96386cc76..6e195f7df4f0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1349,6 +1349,7 @@ struct kvm_x86_ops {
 	int (*mem_enc_op)(struct kvm *kvm, void __user *argp);
 	int (*mem_enc_reg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
 	int (*mem_enc_unreg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
+	int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
 
 	int (*get_msr_feature)(struct kvm_msr_entry *entry);
 
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index bb4bf5ffb104..4bf79dbc3eeb 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -66,6 +66,11 @@ static int sev_flush_asids(void)
 	return ret;
 }
 
+static inline bool is_mirroring_enc_context(struct kvm *kvm)
+{
+	return !!to_kvm_svm(kvm)->sev_info.enc_context_owner;
+}
+
 /* Must be called with the sev_bitmap_lock held */
 static bool __sev_recycle_asids(int min_asid, int max_asid)
 {
@@ -1122,6 +1127,12 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 
 	mutex_lock(&kvm->lock);
 
+	/* enc_context_owner handles all memory enc operations */
+	if (is_mirroring_enc_context(kvm)) {
+		r = -EINVAL;
+		goto out;
+	}
+
 	switch (sev_cmd.id) {
 	case KVM_SEV_ES_INIT:
 		if (!sev_es) {
@@ -1185,6 +1196,10 @@ int svm_register_enc_region(struct kvm *kvm,
 	if (!sev_guest(kvm))
 		return -ENOTTY;
 
+	/* If kvm is mirroring encryption context it isn't responsible for it */
+	if (is_mirroring_enc_context(kvm))
+		return -EINVAL;
+
 	if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
 		return -EINVAL;
 
@@ -1251,6 +1266,10 @@ int svm_unregister_enc_region(struct kvm *kvm,
 	struct enc_region *region;
 	int ret;
 
+	/* If kvm is mirroring encryption context it isn't responsible for it */
+	if (is_mirroring_enc_context(kvm))
+		return -EINVAL;
+
 	mutex_lock(&kvm->lock);
 
 	if (!sev_guest(kvm)) {
@@ -1281,6 +1300,71 @@ failed:
 	return ret;
 }
 
+int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
+{
+	struct file *source_kvm_file;
+	struct kvm *source_kvm;
+	struct kvm_sev_info *mirror_sev;
+	unsigned int asid;
+	int ret;
+
+	source_kvm_file = fget(source_fd);
+	if (!file_is_kvm(source_kvm_file)) {
+		ret = -EBADF;
+		goto e_source_put;
+	}
+
+	source_kvm = source_kvm_file->private_data;
+	mutex_lock(&source_kvm->lock);
+
+	if (!sev_guest(source_kvm)) {
+		ret = -EINVAL;
+		goto e_source_unlock;
+	}
+
+	/* Mirrors of mirrors should work, but let's not get silly */
+	if (is_mirroring_enc_context(source_kvm) || source_kvm == kvm) {
+		ret = -EINVAL;
+		goto e_source_unlock;
+	}
+
+	asid = to_kvm_svm(source_kvm)->sev_info.asid;
+
+	/*
+	 * The mirror kvm holds an enc_context_owner ref so its asid can't
+	 * disappear until we're done with it
+	 */
+	kvm_get_kvm(source_kvm);
+
+	fput(source_kvm_file);
+	mutex_unlock(&source_kvm->lock);
+	mutex_lock(&kvm->lock);
+
+	if (sev_guest(kvm)) {
+		ret = -EINVAL;
+		goto e_mirror_unlock;
+	}
+
+	/* Set enc_context_owner and copy its encryption context over */
+	mirror_sev = &to_kvm_svm(kvm)->sev_info;
+	mirror_sev->enc_context_owner = source_kvm;
+	mirror_sev->asid = asid;
+	mirror_sev->active = true;
+
+	mutex_unlock(&kvm->lock);
+	return 0;
+
+e_mirror_unlock:
+	mutex_unlock(&kvm->lock);
+	kvm_put_kvm(source_kvm);
+	return ret;
+e_source_unlock:
+	mutex_unlock(&source_kvm->lock);
+e_source_put:
+	fput(source_kvm_file);
+	return ret;
+}
+
 void sev_vm_destroy(struct kvm *kvm)
 {
 	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
@@ -1290,6 +1374,12 @@ void sev_vm_destroy(struct kvm *kvm)
 	if (!sev_guest(kvm))
 		return;
 
+	/* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */
+	if (is_mirroring_enc_context(kvm)) {
+		kvm_put_kvm(sev->enc_context_owner);
+		return;
+	}
+
 	mutex_lock(&kvm->lock);
 
 	/*
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 86e42d42637e..cd8c333ed2dc 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4589,6 +4589,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
 	.mem_enc_reg_region = svm_register_enc_region,
 	.mem_enc_unreg_region = svm_unregister_enc_region,
 
+	.vm_copy_enc_context_from = svm_vm_copy_asid_from,
+
 	.can_emulate_instruction = svm_can_emulate_instruction,
 
 	.apic_init_signal_blocked = svm_apic_init_signal_blocked,
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 7deb7a057004..454da1c1d9b7 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -68,6 +68,7 @@ struct kvm_sev_info {
 	unsigned long pages_locked; /* Number of pages locked */
 	struct list_head regions_list;  /* List of registered regions */
 	u64 ap_jump_table;	/* SEV-ES AP Jump Table address */
+	struct kvm *enc_context_owner; /* Owner of copied encryption context */
 };
 
 struct kvm_svm {
@@ -580,6 +581,7 @@ int svm_register_enc_region(struct kvm *kvm,
 			    struct kvm_enc_region *range);
 int svm_unregister_enc_region(struct kvm *kvm,
 			      struct kvm_enc_region *range);
+int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd);
 void pre_sev_run(struct vcpu_svm *svm, int cpu);
 void __init sev_hardware_setup(void);
 void sev_hardware_teardown(void);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d4cdab15238b..a5eeca55810f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3808,6 +3808,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 #ifdef CONFIG_X86_SGX_KVM
 	case KVM_CAP_SGX_ATTRIBUTE:
 #endif
+	case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
 		r = 1;
 		break;
 	case KVM_CAP_SET_GUEST_DEBUG2:
@@ -4714,7 +4715,6 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 			kvm_update_pv_runtime(vcpu);
 
 		return 0;
-
 	default:
 		return -EINVAL;
 	}
@@ -5413,6 +5413,11 @@ split_irqchip_unlock:
 		break;
 	}
 #endif
+	case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
+		r = -EINVAL;
+		if (kvm_x86_ops.vm_copy_enc_context_from)
+			r = kvm_x86_ops.vm_copy_enc_context_from(kvm, cap->args[0]);
+		return r;
 	default:
 		r = -EINVAL;
 		break;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 82b066db37cb..c306b4c3674e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -654,6 +654,7 @@ void kvm_exit(void);
 
 void kvm_get_kvm(struct kvm *kvm);
 void kvm_put_kvm(struct kvm *kvm);
+bool file_is_kvm(struct file *file);
 void kvm_put_kvm_no_destroy(struct kvm *kvm);
 
 static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id)
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 162b7e044c8b..37f0a329da6a 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1080,6 +1080,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_PPC_DAWR1 194
 #define KVM_CAP_SET_GUEST_DEBUG2 195
 #define KVM_CAP_SGX_ATTRIBUTE 196
+#define KVM_CAP_VM_COPY_ENC_CONTEXT_FROM 197
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 545a28387c33..48c1e4842908 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4197,6 +4197,12 @@ static struct file_operations kvm_vm_fops = {
 	KVM_COMPAT(kvm_vm_compat_ioctl),
 };
 
+bool file_is_kvm(struct file *file)
+{
+	return file && file->f_op == &kvm_vm_fops;
+}
+EXPORT_SYMBOL_GPL(file_is_kvm);
+
 static int kvm_dev_ioctl_create_vm(unsigned long type)
 {
 	int r;
-- 
cgit v1.2.3


From 52acd22faa1af8a0514ccd075a6978ac97986425 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpengli@tencent.com>
Date: Fri, 16 Apr 2021 11:08:10 +0800
Subject: KVM: Boost vCPU candidate in user mode which is delivering interrupt

Both lock holder vCPU and IPI receiver that has halted are candidates for
boost. However, the PLE handler was originally designed to deal with the
lock holder preemption problem. The Intel PLE occurs when the spinlock
waiter is in kernel mode. This assumption doesn't hold for IPI receiver,
they can be in either kernel or user mode. The vCPU candidate in user mode
will not be boosted even if they should respond to IPIs. Some benchmarks
like pbzip2, swaptions etc do the TLB shootdown in kernel mode and most
of the time they are running in user mode. It can lead to a large number
of continuous PLE events because the IPI sender causes PLE events
repeatedly until the receiver is scheduled, while the receiver is not
a candidate for a boost.

This patch boosts the vCPU candidate in user mode which is delivering an
interrupt. We can observe the speed of pbzip2 improves 10% in 96 vCPUs
VM in over-subscribe scenario (The host machine is 2 socket, 48 cores,
96 HTs Intel CLX box). There is no performance regression for other
benchmarks like Unixbench spawn (most of the time contend read/write
lock in kernel mode), ebizzy (most of the time contend read/write sem
and TLB shootdown in kernel mode).

Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
Message-Id: <1618542490-14756-1-git-send-email-wanpengli@tencent.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c       | 8 ++++++++
 include/linux/kvm_host.h | 1 +
 virt/kvm/kvm_main.c      | 8 +++++++-
 3 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a5eeca55810f..c9ba6f2d9bcd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -11093,6 +11093,14 @@ bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
 	return false;
 }
 
+bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->arch.apicv_active && static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu))
+		return true;
+
+	return false;
+}
+
 bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
 {
 	return vcpu->arch.preempted_in_kernel;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c306b4c3674e..8895b95b6a22 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -960,6 +960,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
 bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu);
 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu);
 bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu);
+bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu);
 int kvm_arch_post_init_vm(struct kvm *kvm);
 void kvm_arch_pre_destroy_vm(struct kvm *kvm);
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 48c1e4842908..2799c6660cce 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3137,6 +3137,11 @@ static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
 	return false;
 }
 
+bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
+{
+	return false;
+}
+
 void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
 {
 	struct kvm *kvm = me->kvm;
@@ -3170,7 +3175,8 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
 			    !vcpu_dy_runnable(vcpu))
 				continue;
 			if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
-				!kvm_arch_vcpu_in_kernel(vcpu))
+			    !kvm_arch_dy_has_pending_interrupt(vcpu) &&
+			    !kvm_arch_vcpu_in_kernel(vcpu))
 				continue;
 			if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
 				continue;
-- 
cgit v1.2.3


From 4cfdd47d6d95aca4fb8d6cfbe73392472d353f82 Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Thu, 15 Apr 2021 15:53:14 +0000
Subject: KVM: SVM: Add KVM_SEV SEND_START command

The command is used to create an outgoing SEV guest encryption context.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: x86@kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Reviewed-by: Steve Rutherford <srutherford@google.com>
Reviewed-by: Venu Busireddy <venu.busireddy@oracle.com>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
Message-Id: <2f1686d0164e0f1b3d6a41d620408393e0a48376.1618498113.git.ashish.kalra@amd.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virt/kvm/amd-memory-encryption.rst |  30 ++++++
 arch/x86/kvm/svm/sev.c                           | 128 +++++++++++++++++++++++
 include/linux/psp-sev.h                          |   8 +-
 include/uapi/linux/kvm.h                         |  12 +++
 4 files changed, 174 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/virt/kvm/amd-memory-encryption.rst b/Documentation/virt/kvm/amd-memory-encryption.rst
index 34ce2d1fcb89..db5c3fb2bab5 100644
--- a/Documentation/virt/kvm/amd-memory-encryption.rst
+++ b/Documentation/virt/kvm/amd-memory-encryption.rst
@@ -290,6 +290,36 @@ Returns: 0 on success, -negative on error
                 __u32 len;
         };
 
+11. KVM_SEV_SEND_START
+----------------------
+
+The KVM_SEV_SEND_START command can be used by the hypervisor to create an
+outgoing guest encryption context.
+
+If session_len is zero on entry, the length of the guest session information is
+written to session_len and all other fields are not used.
+
+Parameters (in): struct kvm_sev_send_start
+
+Returns: 0 on success, -negative on error
+
+::
+        struct kvm_sev_send_start {
+                __u32 policy;                 /* guest policy */
+
+                __u64 pdh_cert_uaddr;         /* platform Diffie-Hellman certificate */
+                __u32 pdh_cert_len;
+
+                __u64 plat_certs_uaddr;        /* platform certificate chain */
+                __u32 plat_certs_len;
+
+                __u64 amd_certs_uaddr;        /* AMD certificate */
+                __u32 amd_certs_len;
+
+                __u64 session_uaddr;          /* Guest session information */
+                __u32 session_len;
+        };
+
 References
 ==========
 
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 4bf79dbc3eeb..9f5312956d5e 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -1111,6 +1111,131 @@ e_free:
 	return ret;
 }
 
+/* Userspace wants to query session length. */
+static int
+__sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp,
+				      struct kvm_sev_send_start *params)
+{
+	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+	struct sev_data_send_start *data;
+	int ret;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
+	if (data == NULL)
+		return -ENOMEM;
+
+	data->handle = sev->handle;
+	ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, data, &argp->error);
+	if (ret < 0)
+		goto out;
+
+	params->session_len = data->session_len;
+	if (copy_to_user((void __user *)(uintptr_t)argp->data, params,
+				sizeof(struct kvm_sev_send_start)))
+		ret = -EFAULT;
+
+out:
+	kfree(data);
+	return ret;
+}
+
+static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+	struct sev_data_send_start *data;
+	struct kvm_sev_send_start params;
+	void *amd_certs, *session_data;
+	void *pdh_cert, *plat_certs;
+	int ret;
+
+	if (!sev_guest(kvm))
+		return -ENOTTY;
+
+	if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+				sizeof(struct kvm_sev_send_start)))
+		return -EFAULT;
+
+	/* if session_len is zero, userspace wants to query the session length */
+	if (!params.session_len)
+		return __sev_send_start_query_session_length(kvm, argp,
+				&params);
+
+	/* some sanity checks */
+	if (!params.pdh_cert_uaddr || !params.pdh_cert_len ||
+	    !params.session_uaddr || params.session_len > SEV_FW_BLOB_MAX_SIZE)
+		return -EINVAL;
+
+	/* allocate the memory to hold the session data blob */
+	session_data = kmalloc(params.session_len, GFP_KERNEL_ACCOUNT);
+	if (!session_data)
+		return -ENOMEM;
+
+	/* copy the certificate blobs from userspace */
+	pdh_cert = psp_copy_user_blob(params.pdh_cert_uaddr,
+				params.pdh_cert_len);
+	if (IS_ERR(pdh_cert)) {
+		ret = PTR_ERR(pdh_cert);
+		goto e_free_session;
+	}
+
+	plat_certs = psp_copy_user_blob(params.plat_certs_uaddr,
+				params.plat_certs_len);
+	if (IS_ERR(plat_certs)) {
+		ret = PTR_ERR(plat_certs);
+		goto e_free_pdh;
+	}
+
+	amd_certs = psp_copy_user_blob(params.amd_certs_uaddr,
+				params.amd_certs_len);
+	if (IS_ERR(amd_certs)) {
+		ret = PTR_ERR(amd_certs);
+		goto e_free_plat_cert;
+	}
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
+	if (data == NULL) {
+		ret = -ENOMEM;
+		goto e_free_amd_cert;
+	}
+
+	/* populate the FW SEND_START field with system physical address */
+	data->pdh_cert_address = __psp_pa(pdh_cert);
+	data->pdh_cert_len = params.pdh_cert_len;
+	data->plat_certs_address = __psp_pa(plat_certs);
+	data->plat_certs_len = params.plat_certs_len;
+	data->amd_certs_address = __psp_pa(amd_certs);
+	data->amd_certs_len = params.amd_certs_len;
+	data->session_address = __psp_pa(session_data);
+	data->session_len = params.session_len;
+	data->handle = sev->handle;
+
+	ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, data, &argp->error);
+
+	if (!ret && copy_to_user((void __user *)(uintptr_t)params.session_uaddr,
+			session_data, params.session_len)) {
+		ret = -EFAULT;
+		goto e_free;
+	}
+
+	params.policy = data->policy;
+	params.session_len = data->session_len;
+	if (copy_to_user((void __user *)(uintptr_t)argp->data, &params,
+				sizeof(struct kvm_sev_send_start)))
+		ret = -EFAULT;
+
+e_free:
+	kfree(data);
+e_free_amd_cert:
+	kfree(amd_certs);
+e_free_plat_cert:
+	kfree(plat_certs);
+e_free_pdh:
+	kfree(pdh_cert);
+e_free_session:
+	kfree(session_data);
+	return ret;
+}
+
 int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 {
 	struct kvm_sev_cmd sev_cmd;
@@ -1173,6 +1298,9 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 	case KVM_SEV_GET_ATTESTATION_REPORT:
 		r = sev_get_attestation_report(kvm, &sev_cmd);
 		break;
+	case KVM_SEV_SEND_START:
+		r = sev_send_start(kvm, &sev_cmd);
+		break;
 	default:
 		r = -EINVAL;
 		goto out;
diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h
index b801ead1e2bb..73da511b9423 100644
--- a/include/linux/psp-sev.h
+++ b/include/linux/psp-sev.h
@@ -326,11 +326,11 @@ struct sev_data_send_start {
 	u64 pdh_cert_address;			/* In */
 	u32 pdh_cert_len;			/* In */
 	u32 reserved1;
-	u64 plat_cert_address;			/* In */
-	u32 plat_cert_len;			/* In */
+	u64 plat_certs_address;			/* In */
+	u32 plat_certs_len;			/* In */
 	u32 reserved2;
-	u64 amd_cert_address;			/* In */
-	u32 amd_cert_len;			/* In */
+	u64 amd_certs_address;			/* In */
+	u32 amd_certs_len;			/* In */
 	u32 reserved3;
 	u64 session_address;			/* In */
 	u32 session_len;			/* In/Out */
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 37f0a329da6a..11b3e528d35d 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1732,6 +1732,18 @@ struct kvm_sev_attestation_report {
 	__u32 len;
 };
 
+struct kvm_sev_send_start {
+	__u32 policy;
+	__u64 pdh_cert_uaddr;
+	__u32 pdh_cert_len;
+	__u64 plat_certs_uaddr;
+	__u32 plat_certs_len;
+	__u64 amd_certs_uaddr;
+	__u32 amd_certs_len;
+	__u64 session_uaddr;
+	__u32 session_len;
+};
+
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 #define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
 #define KVM_DEV_ASSIGN_MASK_INTX	(1 << 2)
-- 
cgit v1.2.3


From 5569e2e7a650dfffd4df7635662b2f92162d6501 Mon Sep 17 00:00:00 2001
From: Steve Rutherford <srutherford@google.com>
Date: Tue, 20 Apr 2021 05:01:20 -0400
Subject: KVM: SVM: Add support for KVM_SEV_SEND_CANCEL command

After completion of SEND_START, but before SEND_FINISH, the source VMM can
issue the SEND_CANCEL command to stop a migration. This is necessary so
that a cancelled migration can restart with a new target later.

Reviewed-by: Nathan Tempelman <natet@google.com>
Reviewed-by: Brijesh Singh <brijesh.singh@amd.com>
Signed-off-by: Steve Rutherford <srutherford@google.com>
Message-Id: <20210412194408.2458827-1-srutherford@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virt/kvm/amd-memory-encryption.rst |  9 +++++++++
 arch/x86/kvm/svm/sev.c                           | 23 +++++++++++++++++++++++
 drivers/crypto/ccp/sev-dev.c                     |  1 +
 include/linux/psp-sev.h                          | 10 ++++++++++
 include/uapi/linux/kvm.h                         |  2 ++
 5 files changed, 45 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/virt/kvm/amd-memory-encryption.rst b/Documentation/virt/kvm/amd-memory-encryption.rst
index 03f2518cfbeb..c36a12975763 100644
--- a/Documentation/virt/kvm/amd-memory-encryption.rst
+++ b/Documentation/virt/kvm/amd-memory-encryption.rst
@@ -356,6 +356,15 @@ issued by the hypervisor to delete the encryption context.
 
 Returns: 0 on success, -negative on error
 
+14. KVM_SEV_SEND_CANCEL
+------------------------
+
+After completion of SEND_START, but before SEND_FINISH, the source VMM can issue the
+SEND_CANCEL command to stop a migration. This is necessary so that a cancelled
+migration can restart with a new target later.
+
+Returns: 0 on success, -negative on error
+
 References
 ==========
 
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index f25c52a2dc14..552f47787143 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -1377,6 +1377,26 @@ static int sev_send_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
 	return ret;
 }
 
+static int sev_send_cancel(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+	struct sev_data_send_cancel *data;
+	int ret;
+
+	if (!sev_guest(kvm))
+		return -ENOTTY;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	data->handle = sev->handle;
+	ret = sev_issue_cmd(kvm, SEV_CMD_SEND_CANCEL, data, &argp->error);
+
+	kfree(data);
+	return ret;
+}
+
 int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 {
 	struct kvm_sev_cmd sev_cmd;
@@ -1448,6 +1468,9 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 	case KVM_SEV_SEND_FINISH:
 		r = sev_send_finish(kvm, &sev_cmd);
 		break;
+	case KVM_SEV_SEND_CANCEL:
+		r = sev_send_cancel(kvm, &sev_cmd);
+		break;
 	default:
 		r = -EINVAL;
 		goto out;
diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c
index cb9b4c4e371e..4172a1afa0db 100644
--- a/drivers/crypto/ccp/sev-dev.c
+++ b/drivers/crypto/ccp/sev-dev.c
@@ -129,6 +129,7 @@ static int sev_cmd_buffer_len(int cmd)
 	case SEV_CMD_DOWNLOAD_FIRMWARE:		return sizeof(struct sev_data_download_firmware);
 	case SEV_CMD_GET_ID:			return sizeof(struct sev_data_get_id);
 	case SEV_CMD_ATTESTATION_REPORT:	return sizeof(struct sev_data_attestation_report);
+	case SEV_CMD_SEND_CANCEL:			return sizeof(struct sev_data_send_cancel);
 	default:				return 0;
 	}
 
diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h
index 73da511b9423..d48a7192e881 100644
--- a/include/linux/psp-sev.h
+++ b/include/linux/psp-sev.h
@@ -73,6 +73,7 @@ enum sev_cmd {
 	SEV_CMD_SEND_UPDATE_DATA	= 0x041,
 	SEV_CMD_SEND_UPDATE_VMSA	= 0x042,
 	SEV_CMD_SEND_FINISH		= 0x043,
+	SEV_CMD_SEND_CANCEL		= 0x044,
 
 	/* Guest migration commands (incoming) */
 	SEV_CMD_RECEIVE_START		= 0x050,
@@ -392,6 +393,15 @@ struct sev_data_send_finish {
 	u32 handle;				/* In */
 } __packed;
 
+/**
+ * struct sev_data_send_cancel - SEND_CANCEL command parameters
+ *
+ * @handle: handle of the VM to process
+ */
+struct sev_data_send_cancel {
+	u32 handle;				/* In */
+} __packed;
+
 /**
  * struct sev_data_receive_start - RECEIVE_START command parameters
  *
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index d07be69f402f..5d1adeb13739 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1674,6 +1674,8 @@ enum sev_cmd_id {
 	KVM_SEV_CERT_EXPORT,
 	/* Attestation report */
 	KVM_SEV_GET_ATTESTATION_REPORT,
+	/* Guest Migration Extension */
+	KVM_SEV_SEND_CANCEL,
 
 	KVM_SEV_NR_MAX,
 };
-- 
cgit v1.2.3


From 1de15e99a242a66ef4f803fe1ad357f86b3a75f8 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 12 Apr 2021 17:07:39 +0300
Subject: pinctrl: Keep enum pin_config_param ordered by name

It seems the ordering is by name. Keep it that way.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20210412140741.39946-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/pinctrl/pinconf-generic.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h
index 6aeb711f7cd1..188db8d84162 100644
--- a/include/linux/pinctrl/pinconf-generic.h
+++ b/include/linux/pinctrl/pinconf-generic.h
@@ -90,6 +90,7 @@ struct pinctrl_map;
  * 	value on the line. Use argument 1 to indicate high level, argument 0 to
  *	indicate low level. (Please see Documentation/driver-api/pinctl.rst,
  *	section "GPIO mode pitfalls" for a discussion around this parameter.)
+ * @PIN_CONFIG_PERSIST_STATE: retain pin state across sleep or controller reset
  * @PIN_CONFIG_POWER_SOURCE: if the pin can select between different power
  *	supplies, the argument to this parameter (on a custom format) tells
  *	the driver which alternative power source to use.
@@ -101,7 +102,6 @@ struct pinctrl_map;
  *	or latch delay (on outputs) this parameter (in a custom format)
  *	specifies the clock skew or latch delay. It typically controls how
  *	many double inverters are put in front of the line.
- * @PIN_CONFIG_PERSIST_STATE: retain pin state across sleep or controller reset
  * @PIN_CONFIG_END: this is the last enumerator for pin configurations, if
  *	you need to pass in custom configurations to the pin controller, use
  *	PIN_CONFIG_END+1 as the base offset.
@@ -127,11 +127,11 @@ enum pin_config_param {
 	PIN_CONFIG_LOW_POWER_MODE,
 	PIN_CONFIG_OUTPUT_ENABLE,
 	PIN_CONFIG_OUTPUT,
+	PIN_CONFIG_PERSIST_STATE,
 	PIN_CONFIG_POWER_SOURCE,
 	PIN_CONFIG_SLEEP_HARDWARE_STATE,
 	PIN_CONFIG_SLEW_RATE,
 	PIN_CONFIG_SKEW_DELAY,
-	PIN_CONFIG_PERSIST_STATE,
 	PIN_CONFIG_END = 0x7F,
 	PIN_CONFIG_MAX = 0xFF,
 };
-- 
cgit v1.2.3


From 31f9a421a1d01538776db37ec9c5419a3a49d650 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 12 Apr 2021 17:07:40 +0300
Subject: pinctrl: Introduce MODE group in enum pin_config_param

Better to have a MODE group of settings to keep them together
when ordered alphabetically. Hence, rename PIN_CONFIG_LOW_POWER_MODE
to PIN_CONFIG_MODE_LOW_POWER.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20210412140741.39946-2-andriy.shevchenko@linux.intel.com
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/pinctrl/pinconf-generic.c       | 6 +++---
 drivers/pinctrl/pinctrl-lpc18xx.c       | 4 ++--
 drivers/pinctrl/pinctrl-single.c        | 6 +++---
 drivers/pinctrl/pinctrl-zynq.c          | 4 ++--
 drivers/pinctrl/pxa/pinctrl-pxa2xx.c    | 4 ++--
 drivers/soc/tegra/pmc.c                 | 4 ++--
 include/linux/pinctrl/pinconf-generic.h | 4 ++--
 7 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pinctrl/pinconf-generic.c b/drivers/pinctrl/pinconf-generic.c
index 1e225d513988..22e8d4c4040e 100644
--- a/drivers/pinctrl/pinconf-generic.c
+++ b/drivers/pinctrl/pinconf-generic.c
@@ -43,7 +43,7 @@ static const struct pin_config_item conf_items[] = {
 	PCONFDUMP(PIN_CONFIG_INPUT_ENABLE, "input enabled", NULL, false),
 	PCONFDUMP(PIN_CONFIG_INPUT_SCHMITT, "input schmitt trigger", NULL, false),
 	PCONFDUMP(PIN_CONFIG_INPUT_SCHMITT_ENABLE, "input schmitt enabled", NULL, false),
-	PCONFDUMP(PIN_CONFIG_LOW_POWER_MODE, "pin low power", "mode", true),
+	PCONFDUMP(PIN_CONFIG_MODE_LOW_POWER, "pin low power", "mode", true),
 	PCONFDUMP(PIN_CONFIG_OUTPUT_ENABLE, "output enabled", NULL, false),
 	PCONFDUMP(PIN_CONFIG_OUTPUT, "pin output", "level", true),
 	PCONFDUMP(PIN_CONFIG_POWER_SOURCE, "pin power source", "selector", true),
@@ -174,8 +174,8 @@ static const struct pinconf_generic_params dt_params[] = {
 	{ "input-schmitt", PIN_CONFIG_INPUT_SCHMITT, 0 },
 	{ "input-schmitt-disable", PIN_CONFIG_INPUT_SCHMITT_ENABLE, 0 },
 	{ "input-schmitt-enable", PIN_CONFIG_INPUT_SCHMITT_ENABLE, 1 },
-	{ "low-power-disable", PIN_CONFIG_LOW_POWER_MODE, 0 },
-	{ "low-power-enable", PIN_CONFIG_LOW_POWER_MODE, 1 },
+	{ "low-power-disable", PIN_CONFIG_MODE_LOW_POWER, 0 },
+	{ "low-power-enable", PIN_CONFIG_MODE_LOW_POWER, 1 },
 	{ "output-disable", PIN_CONFIG_OUTPUT_ENABLE, 0 },
 	{ "output-enable", PIN_CONFIG_OUTPUT_ENABLE, 1 },
 	{ "output-high", PIN_CONFIG_OUTPUT, 1, },
diff --git a/drivers/pinctrl/pinctrl-lpc18xx.c b/drivers/pinctrl/pinctrl-lpc18xx.c
index 7b2f885e68bd..ed9bf2c89998 100644
--- a/drivers/pinctrl/pinctrl-lpc18xx.c
+++ b/drivers/pinctrl/pinctrl-lpc18xx.c
@@ -646,7 +646,7 @@ static const struct pin_config_item lpc18xx_conf_items[ARRAY_SIZE(lpc18xx_params
 static int lpc18xx_pconf_get_usb1(enum pin_config_param param, int *arg, u32 reg)
 {
 	switch (param) {
-	case PIN_CONFIG_LOW_POWER_MODE:
+	case PIN_CONFIG_MODE_LOW_POWER:
 		if (reg & LPC18XX_SCU_USB1_EPWR)
 			*arg = 0;
 		else
@@ -904,7 +904,7 @@ static int lpc18xx_pconf_set_usb1(struct pinctrl_dev *pctldev,
 				  u32 param_val, u32 *reg)
 {
 	switch (param) {
-	case PIN_CONFIG_LOW_POWER_MODE:
+	case PIN_CONFIG_MODE_LOW_POWER:
 		if (param_val)
 			*reg &= ~LPC18XX_SCU_USB1_EPWR;
 		else
diff --git a/drivers/pinctrl/pinctrl-single.c b/drivers/pinctrl/pinctrl-single.c
index 39aac32ed09c..2c9c9835f375 100644
--- a/drivers/pinctrl/pinctrl-single.c
+++ b/drivers/pinctrl/pinctrl-single.c
@@ -533,7 +533,7 @@ static int pcs_pinconf_get(struct pinctrl_dev *pctldev,
 			break;
 		case PIN_CONFIG_DRIVE_STRENGTH:
 		case PIN_CONFIG_SLEW_RATE:
-		case PIN_CONFIG_LOW_POWER_MODE:
+		case PIN_CONFIG_MODE_LOW_POWER:
 		default:
 			*config = data;
 			break;
@@ -571,7 +571,7 @@ static int pcs_pinconf_set(struct pinctrl_dev *pctldev,
 			case PIN_CONFIG_INPUT_SCHMITT:
 			case PIN_CONFIG_DRIVE_STRENGTH:
 			case PIN_CONFIG_SLEW_RATE:
-			case PIN_CONFIG_LOW_POWER_MODE:
+			case PIN_CONFIG_MODE_LOW_POWER:
 				shift = ffs(func->conf[i].mask) - 1;
 				data &= ~func->conf[i].mask;
 				data |= (arg << shift) & func->conf[i].mask;
@@ -919,7 +919,7 @@ static int pcs_parse_pinconf(struct pcs_device *pcs, struct device_node *np,
 		{ "pinctrl-single,drive-strength", PIN_CONFIG_DRIVE_STRENGTH, },
 		{ "pinctrl-single,slew-rate", PIN_CONFIG_SLEW_RATE, },
 		{ "pinctrl-single,input-schmitt", PIN_CONFIG_INPUT_SCHMITT, },
-		{ "pinctrl-single,low-power-mode", PIN_CONFIG_LOW_POWER_MODE, },
+		{ "pinctrl-single,low-power-mode", PIN_CONFIG_MODE_LOW_POWER, },
 	};
 	static const struct pcs_conf_type prop4[] = {
 		{ "pinctrl-single,bias-pullup", PIN_CONFIG_BIAS_PULL_UP, },
diff --git a/drivers/pinctrl/pinctrl-zynq.c b/drivers/pinctrl/pinctrl-zynq.c
index c6052a0e827a..5fb924a2eedd 100644
--- a/drivers/pinctrl/pinctrl-zynq.c
+++ b/drivers/pinctrl/pinctrl-zynq.c
@@ -1016,7 +1016,7 @@ static int zynq_pinconf_cfg_get(struct pinctrl_dev *pctldev,
 	case PIN_CONFIG_SLEW_RATE:
 		arg = !!(reg & ZYNQ_PINCONF_SPEED);
 		break;
-	case PIN_CONFIG_LOW_POWER_MODE:
+	case PIN_CONFIG_MODE_LOW_POWER:
 	{
 		enum zynq_io_standards iostd = zynq_pinconf_iostd_get(reg);
 
@@ -1087,7 +1087,7 @@ static int zynq_pinconf_cfg_set(struct pinctrl_dev *pctldev,
 			reg &= ~ZYNQ_PINCONF_IOTYPE_MASK;
 			reg |= arg << ZYNQ_PINCONF_IOTYPE_SHIFT;
 			break;
-		case PIN_CONFIG_LOW_POWER_MODE:
+		case PIN_CONFIG_MODE_LOW_POWER:
 			if (arg)
 				reg |= ZYNQ_PINCONF_DISABLE_RECVR;
 			else
diff --git a/drivers/pinctrl/pxa/pinctrl-pxa2xx.c b/drivers/pinctrl/pxa/pinctrl-pxa2xx.c
index eab029a21643..d2568dab8c78 100644
--- a/drivers/pinctrl/pxa/pinctrl-pxa2xx.c
+++ b/drivers/pinctrl/pxa/pinctrl-pxa2xx.c
@@ -194,7 +194,7 @@ static int pxa2xx_pconf_group_get(struct pinctrl_dev *pctldev,
 
 	spin_lock_irqsave(&pctl->lock, flags);
 	val = readl_relaxed(pgsr) & BIT(pin % 32);
-	*config = val ? PIN_CONFIG_LOW_POWER_MODE : 0;
+	*config = val ? PIN_CONFIG_MODE_LOW_POWER : 0;
 	spin_unlock_irqrestore(&pctl->lock, flags);
 
 	dev_dbg(pctl->dev, "get sleep gpio state(pin=%d) %d\n",
@@ -217,7 +217,7 @@ static int pxa2xx_pconf_group_set(struct pinctrl_dev *pctldev,
 
 	for (i = 0; i < num_configs; i++) {
 		switch (pinconf_to_config_param(configs[i])) {
-		case PIN_CONFIG_LOW_POWER_MODE:
+		case PIN_CONFIG_MODE_LOW_POWER:
 			is_set = pinconf_to_config_argument(configs[i]);
 			break;
 		default:
diff --git a/drivers/soc/tegra/pmc.c b/drivers/soc/tegra/pmc.c
index df9a5ca8c99c..46bcdbef1675 100644
--- a/drivers/soc/tegra/pmc.c
+++ b/drivers/soc/tegra/pmc.c
@@ -1738,7 +1738,7 @@ static int tegra_io_pad_pinconf_get(struct pinctrl_dev *pctl_dev,
 		arg = ret;
 		break;
 
-	case PIN_CONFIG_LOW_POWER_MODE:
+	case PIN_CONFIG_MODE_LOW_POWER:
 		ret = tegra_io_pad_is_powered(pmc, pad->id);
 		if (ret < 0)
 			return ret;
@@ -1775,7 +1775,7 @@ static int tegra_io_pad_pinconf_set(struct pinctrl_dev *pctl_dev,
 		arg = pinconf_to_config_argument(configs[i]);
 
 		switch (param) {
-		case PIN_CONFIG_LOW_POWER_MODE:
+		case PIN_CONFIG_MODE_LOW_POWER:
 			if (arg)
 				err = tegra_io_pad_power_disable(pad->id);
 			else
diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h
index 188db8d84162..189e701832ea 100644
--- a/include/linux/pinctrl/pinconf-generic.h
+++ b/include/linux/pinctrl/pinconf-generic.h
@@ -76,7 +76,7 @@ struct pinctrl_map;
  * @PIN_CONFIG_INPUT_SCHMITT_ENABLE: control schmitt-trigger mode on the pin.
  *      If the argument != 0, schmitt-trigger mode is enabled. If it's 0,
  *      schmitt-trigger mode is disabled.
- * @PIN_CONFIG_LOW_POWER_MODE: this will configure the pin for low power
+ * @PIN_CONFIG_MODE_LOW_POWER: this will configure the pin for low power
  *	operation, if several modes of operation are supported these can be
  *	passed in the argument on a custom form, else just use argument 1
  *	to indicate low power mode, argument 0 turns low power mode off.
@@ -124,7 +124,7 @@ enum pin_config_param {
 	PIN_CONFIG_INPUT_ENABLE,
 	PIN_CONFIG_INPUT_SCHMITT,
 	PIN_CONFIG_INPUT_SCHMITT_ENABLE,
-	PIN_CONFIG_LOW_POWER_MODE,
+	PIN_CONFIG_MODE_LOW_POWER,
 	PIN_CONFIG_OUTPUT_ENABLE,
 	PIN_CONFIG_OUTPUT,
 	PIN_CONFIG_PERSIST_STATE,
-- 
cgit v1.2.3


From 09e11caaa4cffac681963688b774e1aa3063b3a9 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 12 Apr 2021 17:07:41 +0300
Subject: pinctrl: Add PIN_CONFIG_MODE_PWM to enum pin_config_param

It seems that we will have more and more pin controllers that support
PWM function on the (selected) pins. Due to it being a part of pin
controller IP the idea is to have some code that will switch the mode
and attach the corresponding driver, for example, via using it as
a library. Meanwhile, put a corresponding item to the pin_config_param
enumerator.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20210412140741.39946-3-andriy.shevchenko@linux.intel.com
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/pinctrl/pinconf-generic.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h
index 189e701832ea..e18ab3d5908f 100644
--- a/include/linux/pinctrl/pinconf-generic.h
+++ b/include/linux/pinctrl/pinconf-generic.h
@@ -80,6 +80,7 @@ struct pinctrl_map;
  *	operation, if several modes of operation are supported these can be
  *	passed in the argument on a custom form, else just use argument 1
  *	to indicate low power mode, argument 0 turns low power mode off.
+ * @PIN_CONFIG_MODE_PWM: this will configure the pin for PWM
  * @PIN_CONFIG_OUTPUT_ENABLE: this will enable the pin's output mode
  * 	without driving a value there. For most platforms this reduces to
  * 	enable the output buffers and then let the pin controller current
@@ -125,6 +126,7 @@ enum pin_config_param {
 	PIN_CONFIG_INPUT_SCHMITT,
 	PIN_CONFIG_INPUT_SCHMITT_ENABLE,
 	PIN_CONFIG_MODE_LOW_POWER,
+	PIN_CONFIG_MODE_PWM,
 	PIN_CONFIG_OUTPUT_ENABLE,
 	PIN_CONFIG_OUTPUT,
 	PIN_CONFIG_PERSIST_STATE,
-- 
cgit v1.2.3


From 85367040511f8402d7e4054d8c17b053c75e33ff Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Wed, 21 Apr 2021 23:45:26 +0800
Subject: scsi: blk-mq: Fix build warning when making htmldocs

Fixes the following warning when running 'make htmldocs':

  include/linux/blk-mq.h:395: warning: Function parameter or member
  'set_rq_budget_token' not described in 'blk_mq_ops'
  include/linux/blk-mq.h:395: warning: Function parameter or member
  'get_rq_budget_token' not described in 'blk_mq_ops'

[mkp: added warning messages]

Link: https://lore.kernel.org/r/20210421154526.1954174-1-ming.lei@redhat.com
Fixes: d022d18c045f ("scsi: blk-mq: Add callbacks for storing & retrieving budget token")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 include/linux/blk-mq.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 3bd3ee651143..359486940fa0 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -313,12 +313,12 @@ struct blk_mq_ops {
 	 */
 	void (*put_budget)(struct request_queue *, int);
 
-	/*
-	 * @set_rq_budget_toekn: store rq's budget token
+	/**
+	 * @set_rq_budget_token: store rq's budget token
 	 */
 	void (*set_rq_budget_token)(struct request *, int);
-	/*
-	 * @get_rq_budget_toekn: retrieve rq's budget token
+	/**
+	 * @get_rq_budget_token: retrieve rq's budget token
 	 */
 	int (*get_rq_budget_token)(struct request *);
 
-- 
cgit v1.2.3


From a943d76352dbb4707a5e5537bbe696c00f5ddd36 Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Date: Wed, 21 Apr 2021 21:11:32 +0300
Subject: devm-helpers: Fix devm_delayed_work_autocancel() kerneldoc

The kerneldoc for devm_delayed_work_autocancel() contains an invalid
parameter description.

Fix the parameter description. And while at it - make it more obvious that
this function operates on delayed_work. That helps differentiate it from the
resource-managed INIT_WORK description (which should follow in the near future).

Fixes: 0341ce544394 ("workqueue: Add resource managed version of delayed work init")
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
Link: https://lore.kernel.org/r/db3a8b4b8899fdf109a0cc760807de12d3b4f09b.1619028482.git.matti.vaittinen@fi.rohmeurope.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/devm-helpers.h | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/devm-helpers.h b/include/linux/devm-helpers.h
index f64e0c9f3763..f40f77717a24 100644
--- a/include/linux/devm-helpers.h
+++ b/include/linux/devm-helpers.h
@@ -32,13 +32,14 @@ static inline void devm_delayed_work_drop(void *res)
 }
 
 /**
- * devm_delayed_work_autocancel - Resource-managed work allocation
- * @dev: Device which lifetime work is bound to
- * @pdata: work to be cancelled when driver is detached
+ * devm_delayed_work_autocancel - Resource-managed delayed work allocation
+ * @dev:	Device which lifetime work is bound to
+ * @w:		Work item to be queued
+ * @worker:	Worker function
  *
- * Initialize work which is automatically cancelled when driver is detached.
- * A few drivers need delayed work which must be cancelled before driver
- * is detached to avoid accessing removed resources.
+ * Initialize delayed work which is automatically cancelled when driver is
+ * detached. A few drivers need delayed work which must be cancelled before
+ * driver is detached to avoid accessing removed resources.
  * devm_delayed_work_autocancel() can be used to omit the explicit
  * cancelleation when driver is detached.
  */
-- 
cgit v1.2.3


From fa989ae7c7b38efbc6c3370571fb8a6f7350029a Mon Sep 17 00:00:00 2001
From: Sai Krishna Potthuri <lakshmi.sai.krishna.potthuri@xilinx.com>
Date: Thu, 22 Apr 2021 14:00:00 +0530
Subject: firmware: xilinx: Add pinctrl support

Add pinctrl support to query platform-specific information (pins)
from firmware.

Signed-off-by: Sai Krishna Potthuri <lakshmi.sai.krishna.potthuri@xilinx.com>
Acked-by: Michal Simek <michal.simek@xilinx.com>
Link: https://lore.kernel.org/r/1619080202-31924-2-git-send-email-lakshmi.sai.krishna.potthuri@xilinx.com
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/firmware/xilinx/zynqmp.c     | 114 +++++++++++++++++++++++++++++++++++
 include/linux/firmware/xlnx-zynqmp.h |  90 +++++++++++++++++++++++++++
 2 files changed, 204 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c
index 7eb9958662dd..fc01a3632815 100644
--- a/drivers/firmware/xilinx/zynqmp.c
+++ b/drivers/firmware/xilinx/zynqmp.c
@@ -811,6 +811,120 @@ int zynqmp_pm_fpga_get_status(u32 *value)
 }
 EXPORT_SYMBOL_GPL(zynqmp_pm_fpga_get_status);
 
+/**
+ * zynqmp_pm_pinctrl_request - Request Pin from firmware
+ * @pin: Pin number to request
+ *
+ * This function requests pin from firmware.
+ *
+ * Return: Returns status, either success or error+reason.
+ */
+int zynqmp_pm_pinctrl_request(const u32 pin)
+{
+	return zynqmp_pm_invoke_fn(PM_PINCTRL_REQUEST, pin, 0, 0, 0, NULL);
+}
+EXPORT_SYMBOL_GPL(zynqmp_pm_pinctrl_request);
+
+/**
+ * zynqmp_pm_pinctrl_release - Inform firmware that Pin control is released
+ * @pin: Pin number to release
+ *
+ * This function release pin from firmware.
+ *
+ * Return: Returns status, either success or error+reason.
+ */
+int zynqmp_pm_pinctrl_release(const u32 pin)
+{
+	return zynqmp_pm_invoke_fn(PM_PINCTRL_RELEASE, pin, 0, 0, 0, NULL);
+}
+EXPORT_SYMBOL_GPL(zynqmp_pm_pinctrl_release);
+
+/**
+ * zynqmp_pm_pinctrl_get_function - Read function id set for the given pin
+ * @pin: Pin number
+ * @id: Buffer to store function ID
+ *
+ * This function provides the function currently set for the given pin.
+ *
+ * Return: Returns status, either success or error+reason
+ */
+int zynqmp_pm_pinctrl_get_function(const u32 pin, u32 *id)
+{
+	u32 ret_payload[PAYLOAD_ARG_CNT];
+	int ret;
+
+	if (!id)
+		return -EINVAL;
+
+	ret = zynqmp_pm_invoke_fn(PM_PINCTRL_GET_FUNCTION, pin, 0,
+				  0, 0, ret_payload);
+	*id = ret_payload[1];
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(zynqmp_pm_pinctrl_get_function);
+
+/**
+ * zynqmp_pm_pinctrl_set_function - Set requested function for the pin
+ * @pin: Pin number
+ * @id: Function ID to set
+ *
+ * This function sets requested function for the given pin.
+ *
+ * Return: Returns status, either success or error+reason.
+ */
+int zynqmp_pm_pinctrl_set_function(const u32 pin, const u32 id)
+{
+	return zynqmp_pm_invoke_fn(PM_PINCTRL_SET_FUNCTION, pin, id,
+				   0, 0, NULL);
+}
+EXPORT_SYMBOL_GPL(zynqmp_pm_pinctrl_set_function);
+
+/**
+ * zynqmp_pm_pinctrl_get_config - Get configuration parameter for the pin
+ * @pin: Pin number
+ * @param: Parameter to get
+ * @value: Buffer to store parameter value
+ *
+ * This function gets requested configuration parameter for the given pin.
+ *
+ * Return: Returns status, either success or error+reason.
+ */
+int zynqmp_pm_pinctrl_get_config(const u32 pin, const u32 param,
+				 u32 *value)
+{
+	u32 ret_payload[PAYLOAD_ARG_CNT];
+	int ret;
+
+	if (!value)
+		return -EINVAL;
+
+	ret = zynqmp_pm_invoke_fn(PM_PINCTRL_CONFIG_PARAM_GET, pin, param,
+				  0, 0, ret_payload);
+	*value = ret_payload[1];
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(zynqmp_pm_pinctrl_get_config);
+
+/**
+ * zynqmp_pm_pinctrl_set_config - Set configuration parameter for the pin
+ * @pin: Pin number
+ * @param: Parameter to set
+ * @value: Parameter value to set
+ *
+ * This function sets requested configuration parameter for the given pin.
+ *
+ * Return: Returns status, either success or error+reason.
+ */
+int zynqmp_pm_pinctrl_set_config(const u32 pin, const u32 param,
+				 u32 value)
+{
+	return zynqmp_pm_invoke_fn(PM_PINCTRL_CONFIG_PARAM_SET, pin,
+				   param, value, 0, NULL);
+}
+EXPORT_SYMBOL_GPL(zynqmp_pm_pinctrl_set_config);
+
 /**
  * zynqmp_pm_init_finalize() - PM call to inform firmware that the caller
  *			       master has initialized its own power management
diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index 71177b17eee5..8285a4bcfc2d 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -72,6 +72,12 @@ enum pm_api_id {
 	PM_FPGA_LOAD = 22,
 	PM_FPGA_GET_STATUS = 23,
 	PM_GET_CHIPID = 24,
+	PM_PINCTRL_REQUEST = 28,
+	PM_PINCTRL_RELEASE = 29,
+	PM_PINCTRL_GET_FUNCTION = 30,
+	PM_PINCTRL_SET_FUNCTION = 31,
+	PM_PINCTRL_CONFIG_PARAM_GET = 32,
+	PM_PINCTRL_CONFIG_PARAM_SET = 33,
 	PM_IOCTL = 34,
 	PM_QUERY_DATA = 35,
 	PM_CLOCK_ENABLE = 36,
@@ -122,6 +128,12 @@ enum pm_query_id {
 	PM_QID_CLOCK_GET_FIXEDFACTOR_PARAMS = 3,
 	PM_QID_CLOCK_GET_PARENTS = 4,
 	PM_QID_CLOCK_GET_ATTRIBUTES = 5,
+	PM_QID_PINCTRL_GET_NUM_PINS = 6,
+	PM_QID_PINCTRL_GET_NUM_FUNCTIONS = 7,
+	PM_QID_PINCTRL_GET_NUM_FUNCTION_GROUPS = 8,
+	PM_QID_PINCTRL_GET_FUNCTION_NAME = 9,
+	PM_QID_PINCTRL_GET_FUNCTION_GROUPS = 10,
+	PM_QID_PINCTRL_GET_PIN_GROUPS = 11,
 	PM_QID_CLOCK_GET_NUM_CLOCKS = 12,
 	PM_QID_CLOCK_GET_MAX_DIVISOR = 13,
 };
@@ -285,6 +297,44 @@ enum dll_reset_type {
 	PM_DLL_RESET_PULSE = 2,
 };
 
+enum pm_pinctrl_config_param {
+	PM_PINCTRL_CONFIG_SLEW_RATE = 0,
+	PM_PINCTRL_CONFIG_BIAS_STATUS = 1,
+	PM_PINCTRL_CONFIG_PULL_CTRL = 2,
+	PM_PINCTRL_CONFIG_SCHMITT_CMOS = 3,
+	PM_PINCTRL_CONFIG_DRIVE_STRENGTH = 4,
+	PM_PINCTRL_CONFIG_VOLTAGE_STATUS = 5,
+	PM_PINCTRL_CONFIG_TRI_STATE = 6,
+	PM_PINCTRL_CONFIG_MAX = 7,
+};
+
+enum pm_pinctrl_slew_rate {
+	PM_PINCTRL_SLEW_RATE_FAST = 0,
+	PM_PINCTRL_SLEW_RATE_SLOW = 1,
+};
+
+enum pm_pinctrl_bias_status {
+	PM_PINCTRL_BIAS_DISABLE = 0,
+	PM_PINCTRL_BIAS_ENABLE = 1,
+};
+
+enum pm_pinctrl_pull_ctrl {
+	PM_PINCTRL_BIAS_PULL_DOWN = 0,
+	PM_PINCTRL_BIAS_PULL_UP = 1,
+};
+
+enum pm_pinctrl_schmitt_cmos {
+	PM_PINCTRL_INPUT_TYPE_CMOS = 0,
+	PM_PINCTRL_INPUT_TYPE_SCHMITT = 1,
+};
+
+enum pm_pinctrl_drive_strength {
+	PM_PINCTRL_DRIVE_STRENGTH_2MA = 0,
+	PM_PINCTRL_DRIVE_STRENGTH_4MA = 1,
+	PM_PINCTRL_DRIVE_STRENGTH_8MA = 2,
+	PM_PINCTRL_DRIVE_STRENGTH_12MA = 3,
+};
+
 enum zynqmp_pm_shutdown_type {
 	ZYNQMP_PM_SHUTDOWN_TYPE_SHUTDOWN = 0,
 	ZYNQMP_PM_SHUTDOWN_TYPE_RESET = 1,
@@ -353,6 +403,14 @@ int zynqmp_pm_write_pggs(u32 index, u32 value);
 int zynqmp_pm_read_pggs(u32 index, u32 *value);
 int zynqmp_pm_system_shutdown(const u32 type, const u32 subtype);
 int zynqmp_pm_set_boot_health_status(u32 value);
+int zynqmp_pm_pinctrl_request(const u32 pin);
+int zynqmp_pm_pinctrl_release(const u32 pin);
+int zynqmp_pm_pinctrl_get_function(const u32 pin, u32 *id);
+int zynqmp_pm_pinctrl_set_function(const u32 pin, const u32 id);
+int zynqmp_pm_pinctrl_get_config(const u32 pin, const u32 param,
+				 u32 *value);
+int zynqmp_pm_pinctrl_set_config(const u32 pin, const u32 param,
+				 u32 value);
 #else
 static inline struct zynqmp_eemi_ops *zynqmp_pm_get_eemi_ops(void)
 {
@@ -537,6 +595,38 @@ static inline int zynqmp_pm_set_boot_health_status(u32 value)
 {
 	return -ENODEV;
 }
+
+static inline int zynqmp_pm_pinctrl_request(const u32 pin)
+{
+	return -ENODEV;
+}
+
+static inline int zynqmp_pm_pinctrl_release(const u32 pin)
+{
+	return -ENODEV;
+}
+
+static inline int zynqmp_pm_pinctrl_get_function(const u32 pin, u32 *id)
+{
+	return -ENODEV;
+}
+
+static inline int zynqmp_pm_pinctrl_set_function(const u32 pin, const u32 id)
+{
+	return -ENODEV;
+}
+
+static inline int zynqmp_pm_pinctrl_get_config(const u32 pin, const u32 param,
+					       u32 *value)
+{
+	return -ENODEV;
+}
+
+static inline int zynqmp_pm_pinctrl_set_config(const u32 pin, const u32 param,
+					       u32 value)
+{
+	return -ENODEV;
+}
 #endif
 
 #endif /* __FIRMWARE_ZYNQMP_H__ */
-- 
cgit v1.2.3


From 75f4e830fa9c47637054a3b7201765f2a314bda2 Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan@kernel.org>
Date: Fri, 16 Apr 2021 16:05:55 +0200
Subject: serial: do not restore interrupt state in sysrq helper

The uart_unlock_and_check_sysrq() helper can be used to defer processing
of sysrq until the interrupt handler has released the port lock and is
about to return.

Since commit 81e2073c175b ("genirq: Disable interrupts for force
threaded handlers") interrupt handlers that are not explicitly requested
as threaded are always called with interrupts disabled and there is no
need to save the interrupt state when taking the port lock.

Instead of adding another sysrq helper for when the interrupt state has
not needlessly been saved, drop the state parameter from
uart_unlock_and_check_sysrq() and update its callers to no longer
explicitly disable interrupts in their interrupt handlers.

Cc: Joel Stanley <joel@jms.id.au>
Cc: Andrew Jeffery <andrew@aj.id.au>
Cc: Andy Gross <agross@kernel.org>
Cc: Bjorn Andersson <bjorn.andersson@linaro.org>
Signed-off-by: Johan Hovold <johan@kernel.org>
Link: https://lore.kernel.org/r/20210416140557.25177-2-johan@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/8250/8250_aspeed_vuart.c |  5 ++---
 drivers/tty/serial/8250/8250_fsl.c          | 11 ++++++-----
 drivers/tty/serial/8250/8250_omap.c         |  6 +++---
 drivers/tty/serial/8250/8250_port.c         |  6 +++---
 drivers/tty/serial/qcom_geni_serial.c       |  6 +++---
 include/linux/serial_core.h                 | 10 +++++-----
 6 files changed, 22 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/serial/8250/8250_aspeed_vuart.c b/drivers/tty/serial/8250/8250_aspeed_vuart.c
index 3c239d98747f..61550f24a2d3 100644
--- a/drivers/tty/serial/8250/8250_aspeed_vuart.c
+++ b/drivers/tty/serial/8250/8250_aspeed_vuart.c
@@ -320,7 +320,6 @@ static int aspeed_vuart_handle_irq(struct uart_port *port)
 {
 	struct uart_8250_port *up = up_to_u8250p(port);
 	unsigned int iir, lsr;
-	unsigned long flags;
 	int space, count;
 
 	iir = serial_port_in(port, UART_IIR);
@@ -328,7 +327,7 @@ static int aspeed_vuart_handle_irq(struct uart_port *port)
 	if (iir & UART_IIR_NO_INT)
 		return 0;
 
-	spin_lock_irqsave(&port->lock, flags);
+	spin_lock(&port->lock);
 
 	lsr = serial_port_in(port, UART_LSR);
 
@@ -364,7 +363,7 @@ static int aspeed_vuart_handle_irq(struct uart_port *port)
 	if (lsr & UART_LSR_THRE)
 		serial8250_tx_chars(up);
 
-	uart_unlock_and_check_sysrq(port, flags);
+	uart_unlock_and_check_sysrq(port);
 
 	return 1;
 }
diff --git a/drivers/tty/serial/8250/8250_fsl.c b/drivers/tty/serial/8250/8250_fsl.c
index cd19400b65ae..4e75d2e4f87c 100644
--- a/drivers/tty/serial/8250/8250_fsl.c
+++ b/drivers/tty/serial/8250/8250_fsl.c
@@ -30,15 +30,14 @@ struct fsl8250_data {
 int fsl8250_handle_irq(struct uart_port *port)
 {
 	unsigned char lsr, orig_lsr;
-	unsigned long flags;
 	unsigned int iir;
 	struct uart_8250_port *up = up_to_u8250p(port);
 
-	spin_lock_irqsave(&up->port.lock, flags);
+	spin_lock(&up->port.lock);
 
 	iir = port->serial_in(port, UART_IIR);
 	if (iir & UART_IIR_NO_INT) {
-		spin_unlock_irqrestore(&up->port.lock, flags);
+		spin_unlock(&up->port.lock);
 		return 0;
 	}
 
@@ -46,7 +45,7 @@ int fsl8250_handle_irq(struct uart_port *port)
 	if (unlikely(up->lsr_saved_flags & UART_LSR_BI)) {
 		up->lsr_saved_flags &= ~UART_LSR_BI;
 		port->serial_in(port, UART_RX);
-		spin_unlock_irqrestore(&up->port.lock, flags);
+		spin_unlock(&up->port.lock);
 		return 1;
 	}
 
@@ -82,7 +81,9 @@ int fsl8250_handle_irq(struct uart_port *port)
 		serial8250_tx_chars(up);
 
 	up->lsr_saved_flags = orig_lsr;
-	uart_unlock_and_check_sysrq(&up->port, flags);
+
+	uart_unlock_and_check_sysrq(&up->port);
+
 	return 1;
 }
 EXPORT_SYMBOL_GPL(fsl8250_handle_irq);
diff --git a/drivers/tty/serial/8250/8250_omap.c b/drivers/tty/serial/8250/8250_omap.c
index 23e0decde33e..8ac11eaeca51 100644
--- a/drivers/tty/serial/8250/8250_omap.c
+++ b/drivers/tty/serial/8250/8250_omap.c
@@ -1143,7 +1143,6 @@ static int omap_8250_dma_handle_irq(struct uart_port *port)
 	struct uart_8250_port *up = up_to_u8250p(port);
 	struct omap8250_priv *priv = up->port.private_data;
 	unsigned char status;
-	unsigned long flags;
 	u8 iir;
 
 	serial8250_rpm_get(up);
@@ -1154,7 +1153,7 @@ static int omap_8250_dma_handle_irq(struct uart_port *port)
 		return IRQ_HANDLED;
 	}
 
-	spin_lock_irqsave(&port->lock, flags);
+	spin_lock(&port->lock);
 
 	status = serial_port_in(port, UART_LSR);
 
@@ -1179,7 +1178,8 @@ static int omap_8250_dma_handle_irq(struct uart_port *port)
 		}
 	}
 
-	uart_unlock_and_check_sysrq(port, flags);
+	uart_unlock_and_check_sysrq(port);
+
 	serial8250_rpm_put(up);
 	return 1;
 }
diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
index 9019f8f626bb..d45dab1ab316 100644
--- a/drivers/tty/serial/8250/8250_port.c
+++ b/drivers/tty/serial/8250/8250_port.c
@@ -1879,14 +1879,13 @@ static bool handle_rx_dma(struct uart_8250_port *up, unsigned int iir)
 int serial8250_handle_irq(struct uart_port *port, unsigned int iir)
 {
 	unsigned char status;
-	unsigned long flags;
 	struct uart_8250_port *up = up_to_u8250p(port);
 	bool skip_rx = false;
 
 	if (iir & UART_IIR_NO_INT)
 		return 0;
 
-	spin_lock_irqsave(&port->lock, flags);
+	spin_lock(&port->lock);
 
 	status = serial_port_in(port, UART_LSR);
 
@@ -1912,7 +1911,8 @@ int serial8250_handle_irq(struct uart_port *port, unsigned int iir)
 		(up->ier & UART_IER_THRI))
 		serial8250_tx_chars(up);
 
-	uart_unlock_and_check_sysrq(port, flags);
+	uart_unlock_and_check_sysrq(port);
+
 	return 1;
 }
 EXPORT_SYMBOL_GPL(serial8250_handle_irq);
diff --git a/drivers/tty/serial/qcom_geni_serial.c b/drivers/tty/serial/qcom_geni_serial.c
index 0d85b55ea823..00bb88a71606 100644
--- a/drivers/tty/serial/qcom_geni_serial.c
+++ b/drivers/tty/serial/qcom_geni_serial.c
@@ -818,7 +818,6 @@ static irqreturn_t qcom_geni_serial_isr(int isr, void *dev)
 	u32 s_irq_status;
 	u32 geni_status;
 	struct uart_port *uport = dev;
-	unsigned long flags;
 	bool drop_rx = false;
 	struct tty_port *tport = &uport->state->port;
 	struct qcom_geni_serial_port *port = to_dev_port(uport, uport);
@@ -826,7 +825,8 @@ static irqreturn_t qcom_geni_serial_isr(int isr, void *dev)
 	if (uport->suspended)
 		return IRQ_NONE;
 
-	spin_lock_irqsave(&uport->lock, flags);
+	spin_lock(&uport->lock);
+
 	m_irq_status = readl(uport->membase + SE_GENI_M_IRQ_STATUS);
 	s_irq_status = readl(uport->membase + SE_GENI_S_IRQ_STATUS);
 	geni_status = readl(uport->membase + SE_GENI_STATUS);
@@ -861,7 +861,7 @@ static irqreturn_t qcom_geni_serial_isr(int isr, void *dev)
 		qcom_geni_serial_handle_rx(uport, drop_rx);
 
 out_unlock:
-	uart_unlock_and_check_sysrq(uport, flags);
+	uart_unlock_and_check_sysrq(uport);
 
 	return IRQ_HANDLED;
 }
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index e1b684e33841..d7ed00f1594e 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -500,19 +500,19 @@ static inline int uart_prepare_sysrq_char(struct uart_port *port, unsigned int c
 	return 0;
 }
 
-static inline void uart_unlock_and_check_sysrq(struct uart_port *port, unsigned long irqflags)
+static inline void uart_unlock_and_check_sysrq(struct uart_port *port)
 {
 	int sysrq_ch;
 
 	if (!port->has_sysrq) {
-		spin_unlock_irqrestore(&port->lock, irqflags);
+		spin_unlock(&port->lock);
 		return;
 	}
 
 	sysrq_ch = port->sysrq_ch;
 	port->sysrq_ch = 0;
 
-	spin_unlock_irqrestore(&port->lock, irqflags);
+	spin_unlock(&port->lock);
 
 	if (sysrq_ch)
 		handle_sysrq(sysrq_ch);
@@ -526,9 +526,9 @@ static inline int uart_prepare_sysrq_char(struct uart_port *port, unsigned int c
 {
 	return 0;
 }
-static inline void uart_unlock_and_check_sysrq(struct uart_port *port, unsigned long irqflags)
+static inline void uart_unlock_and_check_sysrq(struct uart_port *port)
 {
-	spin_unlock_irqrestore(&port->lock, irqflags);
+	spin_unlock(&port->lock);
 }
 #endif	/* CONFIG_MAGIC_SYSRQ_SERIAL */
 
-- 
cgit v1.2.3


From d60d6e7adfc3814f6de03c978ff1daab21478f87 Mon Sep 17 00:00:00 2001
From: Thara Gopinath <thara.gopinath@linaro.org>
Date: Thu, 21 Jan 2021 21:34:05 -0500
Subject: thermal/core: Remove thermal_notify_framework

thermal_notify_framework only handles a single trip point, whereas
thermal_zone_device_update also does other bookkeeping, like updating the
temperature of the thermal zone and setting the next trip point. The only
driver that was using thermal_notify_framework was updated in the previous
patch to use thermal_zone_device_update instead. Since there are no users
of thermal_notify_framework left, remove it.

Signed-off-by: Thara Gopinath <thara.gopinath@linaro.org>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Link: https://lore.kernel.org/r/20210122023406.3500424-3-thara.gopinath@linaro.org
---
 drivers/thermal/thermal_core.c | 18 ------------------
 include/linux/thermal.h        |  5 -----
 2 files changed, 23 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 3566fd291399..d20b25f40d19 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -561,24 +561,6 @@ void thermal_zone_device_update(struct thermal_zone_device *tz,
 }
 EXPORT_SYMBOL_GPL(thermal_zone_device_update);
 
-/**
- * thermal_notify_framework - Sensor drivers use this API to notify framework
- * @tz:		thermal zone device
- * @trip:	indicates which trip point has been crossed
- *
- * This function handles the trip events from sensor drivers. It starts
- * throttling the cooling devices according to the policy configured.
- * For CRITICAL and HOT trip points, this notifies the respective drivers,
- * and does actual throttling for other trip points i.e ACTIVE and PASSIVE.
- * The throttling policy is based on the configured platform data; if no
- * platform data is provided, this uses the step_wise throttling policy.
- */
-void thermal_notify_framework(struct thermal_zone_device *tz, int trip)
-{
-	handle_thermal_trip(tz, trip);
-}
-EXPORT_SYMBOL_GPL(thermal_notify_framework);
-
 static void thermal_zone_device_check(struct work_struct *work)
 {
 	struct thermal_zone_device *tz = container_of(work, struct
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 169502164364..d296f3b88fb9 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -390,7 +390,6 @@ int thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp);
 int thermal_zone_get_slope(struct thermal_zone_device *tz);
 int thermal_zone_get_offset(struct thermal_zone_device *tz);
 
-void thermal_notify_framework(struct thermal_zone_device *, int);
 int thermal_zone_device_enable(struct thermal_zone_device *tz);
 int thermal_zone_device_disable(struct thermal_zone_device *tz);
 void thermal_zone_device_critical(struct thermal_zone_device *tz);
@@ -436,10 +435,6 @@ static inline int thermal_zone_get_offset(
 		struct thermal_zone_device *tz)
 { return -ENODEV; }
 
-static inline void thermal_notify_framework(struct thermal_zone_device *tz,
-	int trip)
-{ }
-
 static inline int thermal_zone_device_enable(struct thermal_zone_device *tz)
 { return -ENODEV; }
 
-- 
cgit v1.2.3


From 7f318847a0f37b96d8927e8d30ae7b8f149b11f1 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Wed, 14 Apr 2021 14:44:09 +0100
Subject: perf: Get rid of oprofile leftovers

perf_pmu_name() and perf_num_counters() are unused. Drop them.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20210414134409.1266357-6-maz@kernel.org
---
 include/linux/perf_event.h | 2 --
 kernel/events/core.c       | 5 -----
 2 files changed, 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 3f7f89ea5e51..51154ed9a206 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -951,8 +951,6 @@ extern void perf_event_itrace_started(struct perf_event *event);
 extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
 extern void perf_pmu_unregister(struct pmu *pmu);
 
-extern int perf_num_counters(void);
-extern const char *perf_pmu_name(void);
 extern void __perf_event_task_sched_in(struct task_struct *prev,
 				       struct task_struct *task);
 extern void __perf_event_task_sched_out(struct task_struct *prev,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 03db40f6cba9..88cb0ba5690b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -580,11 +580,6 @@ static u64 perf_event_time(struct perf_event *event);
 
 void __weak perf_event_print_debug(void)	{ }
 
-extern __weak const char *perf_pmu_name(void)
-{
-	return "pmu";
-}
-
 static inline u64 perf_clock(void)
 {
 	return local_clock();
-- 
cgit v1.2.3


From 46135d6f878ab00261d4a2082d620bfb41019aab Mon Sep 17 00:00:00 2001
From: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Date: Wed, 17 Mar 2021 10:07:19 +0000
Subject: irqchip/gic-v4.1: Disable vSGI upon (GIC CPUIF < v4.1) detection

GIC CPU interface versions predating GIC v4.1 were not built to
accommodate vINTID within the vSGI range; as reported in the GIC
specifications (8.2 "Changes to the CPU interface"), it is
CONSTRAINED UNPREDICTABLE to deliver a vSGI to a PE with
ID_AA64PFR0_EL1.GIC < b0011.

Check the GIC CPUIF version by reading SYS_ID_AA64PFR0_EL1.

Disable vSGIs if a CPUIF version < 4.1 is detected to prevent using
vSGIs on systems where they may misbehave.

Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Cc: Marc Zyngier <maz@kernel.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20210317100719.3331-2-lorenzo.pieralisi@arm.com
---
 arch/arm64/kvm/vgic/vgic-mmio-v3.c |  4 ++--
 drivers/irqchip/irq-gic-v4.c       | 27 +++++++++++++++++++++++++--
 include/linux/irqchip/arm-gic-v4.h |  2 ++
 3 files changed, 29 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
index 15a6c98ee92f..2f1b156021a6 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
@@ -86,7 +86,7 @@ static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu,
 		}
 		break;
 	case GICD_TYPER2:
-		if (kvm_vgic_global_state.has_gicv4_1)
+		if (kvm_vgic_global_state.has_gicv4_1 && gic_cpuif_has_vsgi())
 			value = GICD_TYPER2_nASSGIcap;
 		break;
 	case GICD_IIDR:
@@ -119,7 +119,7 @@ static void vgic_mmio_write_v3_misc(struct kvm_vcpu *vcpu,
 		dist->enabled = val & GICD_CTLR_ENABLE_SS_G1;
 
 		/* Not a GICv4.1? No HW SGIs */
-		if (!kvm_vgic_global_state.has_gicv4_1)
+		if (!kvm_vgic_global_state.has_gicv4_1 || !gic_cpuif_has_vsgi())
 			val &= ~GICD_CTLR_nASSGIreq;
 
 		/* Dist stays enabled? nASSGIreq is RO */
diff --git a/drivers/irqchip/irq-gic-v4.c b/drivers/irqchip/irq-gic-v4.c
index 5d1dc9915272..4ea71b28f9f5 100644
--- a/drivers/irqchip/irq-gic-v4.c
+++ b/drivers/irqchip/irq-gic-v4.c
@@ -87,17 +87,40 @@ static struct irq_domain *gic_domain;
 static const struct irq_domain_ops *vpe_domain_ops;
 static const struct irq_domain_ops *sgi_domain_ops;
 
+#ifdef CONFIG_ARM64
+#include <asm/cpufeature.h>
+
+bool gic_cpuif_has_vsgi(void)
+{
+	unsigned long fld, reg = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
+
+	fld = cpuid_feature_extract_unsigned_field(reg, ID_AA64PFR0_GIC_SHIFT);
+
+	return fld >= 0x3;
+}
+#else
+bool gic_cpuif_has_vsgi(void)
+{
+	return false;
+}
+#endif
+
 static bool has_v4_1(void)
 {
 	return !!sgi_domain_ops;
 }
 
+static bool has_v4_1_sgi(void)
+{
+	return has_v4_1() && gic_cpuif_has_vsgi();
+}
+
 static int its_alloc_vcpu_sgis(struct its_vpe *vpe, int idx)
 {
 	char *name;
 	int sgi_base;
 
-	if (!has_v4_1())
+	if (!has_v4_1_sgi())
 		return 0;
 
 	name = kasprintf(GFP_KERNEL, "GICv4-sgi-%d", task_pid_nr(current));
@@ -182,7 +205,7 @@ static void its_free_sgi_irqs(struct its_vm *vm)
 {
 	int i;
 
-	if (!has_v4_1())
+	if (!has_v4_1_sgi())
 		return;
 
 	for (i = 0; i < vm->nr_vpes; i++) {
diff --git a/include/linux/irqchip/arm-gic-v4.h b/include/linux/irqchip/arm-gic-v4.h
index 943c3411ca10..2c63375bbd43 100644
--- a/include/linux/irqchip/arm-gic-v4.h
+++ b/include/linux/irqchip/arm-gic-v4.h
@@ -145,4 +145,6 @@ int its_init_v4(struct irq_domain *domain,
 		const struct irq_domain_ops *vpe_ops,
 		const struct irq_domain_ops *sgi_ops);
 
+bool gic_cpuif_has_vsgi(void);
+
 #endif
-- 
cgit v1.2.3


From 1a0b05e435544cd53cd3936bdab425d88784b71a Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Fri, 2 Apr 2021 16:02:37 +0100
Subject: irqdomain: Get rid of irq_create_strict_mappings()

No user of this helper is left, remove it.

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 include/linux/irqdomain.h |  3 ---
 kernel/irq/irqdomain.c    | 32 --------------------------------
 2 files changed, 35 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index d2c61de208a8..7a1dd7b969b6 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -415,9 +415,6 @@ static inline unsigned int irq_linear_revmap(struct irq_domain *domain,
 extern unsigned int irq_find_mapping(struct irq_domain *host,
 				     irq_hw_number_t hwirq);
 extern unsigned int irq_create_direct_mapping(struct irq_domain *host);
-extern int irq_create_strict_mappings(struct irq_domain *domain,
-				      unsigned int irq_base,
-				      irq_hw_number_t hwirq_base, int count);
 
 extern const struct irq_domain_ops irq_domain_simple_ops;
 
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 35c5a99f8884..24a3cefb3afe 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -703,38 +703,6 @@ unsigned int irq_create_mapping_affinity(struct irq_domain *domain,
 }
 EXPORT_SYMBOL_GPL(irq_create_mapping_affinity);
 
-/**
- * irq_create_strict_mappings() - Map a range of hw irqs to fixed linux irqs
- * @domain: domain owning the interrupt range
- * @irq_base: beginning of linux IRQ range
- * @hwirq_base: beginning of hardware IRQ range
- * @count: Number of interrupts to map
- *
- * This routine is used for allocating and mapping a range of hardware
- * irqs to linux irqs where the linux irq numbers are at pre-defined
- * locations. For use by controllers that already have static mappings
- * to insert in to the domain.
- *
- * 0 is returned upon success, while any failure to establish a static
- * mapping is treated as an error.
- */
-int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base,
-			       irq_hw_number_t hwirq_base, int count)
-{
-	struct device_node *of_node;
-	int ret;
-
-	of_node = irq_domain_get_of_node(domain);
-	ret = irq_alloc_descs(irq_base, irq_base, count,
-			      of_node_to_nid(of_node));
-	if (unlikely(ret < 0))
-		return ret;
-
-	irq_domain_associate_many(domain, irq_base, hwirq_base, count);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(irq_create_strict_mappings);
-
 static int irq_domain_translate(struct irq_domain *d,
 				struct irq_fwspec *fwspec,
 				irq_hw_number_t *hwirq, unsigned int *type)
-- 
cgit v1.2.3


From c0dcaa55f91d925c9ac2c950ff84138534337a6c Mon Sep 17 00:00:00 2001
From: Michal Swiatkowski <michal.swiatkowski@intel.com>
Date: Tue, 2 Mar 2021 10:12:01 -0800
Subject: ice: Allow ignoring opcodes on specific VF

Declare a bitmap of allowed commands on each VF. Initialize the default
list of opcodes that should always be supported. Declare an array of
supported opcodes for each capability used in the virtchnl code.

Change the allowed bitmap by setting or clearing the corresponding bit to
allowlist (bit set) or denylist (bit clear) an opcode.

Signed-off-by: Michal Swiatkowski <michal.swiatkowski@intel.com>
Tested-by: Konrad Jankowski <konrad0.jankowski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/ice/Makefile            |   1 +
 .../ethernet/intel/ice/ice_virtchnl_allowlist.c    | 165 +++++++++++++++++++++
 .../ethernet/intel/ice/ice_virtchnl_allowlist.h    |  13 ++
 drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c   |  18 +++
 drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h   |   1 +
 include/linux/avf/virtchnl.h                       |   1 +
 6 files changed, 199 insertions(+)
 create mode 100644 drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.c
 create mode 100644 drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.h

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/intel/ice/Makefile b/drivers/net/ethernet/intel/ice/Makefile
index f391691e2c7e..07fe857e9e3a 100644
--- a/drivers/net/ethernet/intel/ice/Makefile
+++ b/drivers/net/ethernet/intel/ice/Makefile
@@ -26,6 +26,7 @@ ice-y := ice_main.o	\
 	 ice_fw_update.o \
 	 ice_lag.o	\
 	 ice_ethtool.o
+ice-$(CONFIG_PCI_IOV) += ice_virtchnl_allowlist.o
 ice-$(CONFIG_PCI_IOV) += ice_virtchnl_pf.o ice_sriov.o ice_virtchnl_fdir.o
 ice-$(CONFIG_DCB) += ice_dcb.o ice_dcb_nl.o ice_dcb_lib.o
 ice-$(CONFIG_RFS_ACCEL) += ice_arfs.o
diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.c
new file mode 100644
index 000000000000..5a0fbb47346f
--- /dev/null
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.c
@@ -0,0 +1,165 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2021, Intel Corporation. */
+
+#include "ice_virtchnl_allowlist.h"
+
+/* Purpose of this file is to share functionality to allowlist or denylist
+ * opcodes used in PF <-> VF communication. Group of opcodes:
+ * - default -> should be always allowed after creating VF,
+ *   default_allowlist_opcodes
+ * - opcodes needed by VF to work correctly, but not associated with caps ->
+ *   should be allowed after successful VF resources allocation,
+ *   working_allowlist_opcodes
+ * - opcodes needed by VF when caps are activated
+ *
+ * Caps that don't use new opcodes (no opcodes should be allowed):
+ * - VIRTCHNL_VF_OFFLOAD_RSS_AQ
+ * - VIRTCHNL_VF_OFFLOAD_RSS_REG
+ * - VIRTCHNL_VF_OFFLOAD_WB_ON_ITR
+ * - VIRTCHNL_VF_OFFLOAD_CRC
+ * - VIRTCHNL_VF_OFFLOAD_RX_POLLING
+ * - VIRTCHNL_VF_OFFLOAD_RSS_PCTYPE_V2
+ * - VIRTCHNL_VF_OFFLOAD_ENCAP
+ * - VIRTCHNL_VF_OFFLOAD_ENCAP_CSUM
+ * - VIRTCHNL_VF_OFFLOAD_RX_ENCAP_CSUM
+ * - VIRTCHNL_VF_OFFLOAD_USO
+ */
+
+/* default opcodes to communicate with VF */
+static const u32 default_allowlist_opcodes[] = {
+	VIRTCHNL_OP_GET_VF_RESOURCES, VIRTCHNL_OP_VERSION, VIRTCHNL_OP_RESET_VF,
+};
+
+/* opcodes supported after successful VIRTCHNL_OP_GET_VF_RESOURCES */
+static const u32 working_allowlist_opcodes[] = {
+	VIRTCHNL_OP_CONFIG_TX_QUEUE, VIRTCHNL_OP_CONFIG_RX_QUEUE,
+	VIRTCHNL_OP_CONFIG_VSI_QUEUES, VIRTCHNL_OP_CONFIG_IRQ_MAP,
+	VIRTCHNL_OP_ENABLE_QUEUES, VIRTCHNL_OP_DISABLE_QUEUES,
+	VIRTCHNL_OP_GET_STATS, VIRTCHNL_OP_EVENT,
+};
+
+/* VIRTCHNL_VF_OFFLOAD_L2 */
+static const u32 l2_allowlist_opcodes[] = {
+	VIRTCHNL_OP_ADD_ETH_ADDR, VIRTCHNL_OP_DEL_ETH_ADDR,
+	VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE,
+};
+
+/* VIRTCHNL_VF_OFFLOAD_REQ_QUEUES */
+static const u32 req_queues_allowlist_opcodes[] = {
+	VIRTCHNL_OP_REQUEST_QUEUES,
+};
+
+/* VIRTCHNL_VF_OFFLOAD_VLAN */
+static const u32 vlan_allowlist_opcodes[] = {
+	VIRTCHNL_OP_ADD_VLAN, VIRTCHNL_OP_DEL_VLAN,
+	VIRTCHNL_OP_ENABLE_VLAN_STRIPPING, VIRTCHNL_OP_DISABLE_VLAN_STRIPPING,
+};
+
+/* VIRTCHNL_VF_OFFLOAD_RSS_PF */
+static const u32 rss_pf_allowlist_opcodes[] = {
+	VIRTCHNL_OP_CONFIG_RSS_KEY, VIRTCHNL_OP_CONFIG_RSS_LUT,
+	VIRTCHNL_OP_GET_RSS_HENA_CAPS, VIRTCHNL_OP_SET_RSS_HENA,
+};
+
+/* VIRTCHNL_VF_OFFLOAD_FDIR_PF */
+static const u32 fdir_pf_allowlist_opcodes[] = {
+	VIRTCHNL_OP_ADD_FDIR_FILTER, VIRTCHNL_OP_DEL_FDIR_FILTER,
+};
+
+struct allowlist_opcode_info {
+	const u32 *opcodes;
+	size_t size;
+};
+
+#define BIT_INDEX(caps) (HWEIGHT((caps) - 1))
+#define ALLOW_ITEM(caps, list) \
+	[BIT_INDEX(caps)] = { \
+		.opcodes = list, \
+		.size = ARRAY_SIZE(list) \
+	}
+static const struct allowlist_opcode_info allowlist_opcodes[] = {
+	ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_L2, l2_allowlist_opcodes),
+	ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_REQ_QUEUES, req_queues_allowlist_opcodes),
+	ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_VLAN, vlan_allowlist_opcodes),
+	ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_RSS_PF, rss_pf_allowlist_opcodes),
+	ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_FDIR_PF, fdir_pf_allowlist_opcodes),
+};
+
+/**
+ * ice_vc_is_opcode_allowed - check if this opcode is allowed on this VF
+ * @vf: pointer to VF structure
+ * @opcode: virtchnl opcode
+ *
+ * Return true if message is allowed on this VF
+ */
+bool ice_vc_is_opcode_allowed(struct ice_vf *vf, u32 opcode)
+{
+	if (opcode >= VIRTCHNL_OP_MAX)
+		return false;
+
+	return test_bit(opcode, vf->opcodes_allowlist);
+}
+
+/**
+ * ice_vc_allowlist_opcodes - allowlist selected opcodes
+ * @vf: pointer to VF structure
+ * @opcodes: array of opocodes to allowlist
+ * @size: size of opcodes array
+ *
+ * Function should be called to allowlist opcodes on VF.
+ */
+static void
+ice_vc_allowlist_opcodes(struct ice_vf *vf, const u32 *opcodes, size_t size)
+{
+	unsigned int i;
+
+	for (i = 0; i < size; i++)
+		set_bit(opcodes[i], vf->opcodes_allowlist);
+}
+
+/**
+ * ice_vc_clear_allowlist - clear all allowlist opcodes
+ * @vf: pointer to VF structure
+ */
+static void ice_vc_clear_allowlist(struct ice_vf *vf)
+{
+	bitmap_zero(vf->opcodes_allowlist, VIRTCHNL_OP_MAX);
+}
+
+/**
+ * ice_vc_set_default_allowlist - allowlist default opcodes for VF
+ * @vf: pointer to VF structure
+ */
+void ice_vc_set_default_allowlist(struct ice_vf *vf)
+{
+	ice_vc_clear_allowlist(vf);
+	ice_vc_allowlist_opcodes(vf, default_allowlist_opcodes,
+				 ARRAY_SIZE(default_allowlist_opcodes));
+}
+
+/**
+ * ice_vc_set_working_allowlist - allowlist opcodes needed to by VF to work
+ * @vf: pointer to VF structure
+ *
+ * allowlist opcodes that aren't associated with specific caps, but
+ * are needed by VF to work.
+ */
+void ice_vc_set_working_allowlist(struct ice_vf *vf)
+{
+	ice_vc_allowlist_opcodes(vf, working_allowlist_opcodes,
+				 ARRAY_SIZE(working_allowlist_opcodes));
+}
+
+/**
+ * ice_vc_set_caps_allowlist - allowlist VF opcodes according caps
+ * @vf: pointer to VF structure
+ */
+void ice_vc_set_caps_allowlist(struct ice_vf *vf)
+{
+	unsigned long caps = vf->driver_caps;
+	unsigned int i;
+
+	for_each_set_bit(i, &caps, ARRAY_SIZE(allowlist_opcodes))
+		ice_vc_allowlist_opcodes(vf, allowlist_opcodes[i].opcodes,
+					 allowlist_opcodes[i].size);
+}
diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.h b/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.h
new file mode 100644
index 000000000000..d3ae86ded219
--- /dev/null
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2021, Intel Corporation. */
+
+#ifndef _ICE_VIRTCHNL_ALLOWLIST_H_
+#define _ICE_VIRTCHNL_ALLOWLIST_H_
+#include "ice.h"
+
+bool ice_vc_is_opcode_allowed(struct ice_vf *vf, u32 opcode);
+
+void ice_vc_set_default_allowlist(struct ice_vf *vf);
+void ice_vc_set_working_allowlist(struct ice_vf *vf);
+void ice_vc_set_caps_allowlist(struct ice_vf *vf);
+#endif /* _ICE_VIRTCHNL_ALLOWLIST_H_ */
diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
index a3ed4b84bba6..ccd6b3e8a5a9 100644
--- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
@@ -5,6 +5,7 @@
 #include "ice_base.h"
 #include "ice_lib.h"
 #include "ice_fltr.h"
+#include "ice_virtchnl_allowlist.h"
 
 /**
  * ice_validate_vf_id - helper to check if VF ID is valid
@@ -1314,6 +1315,9 @@ bool ice_reset_all_vfs(struct ice_pf *pf, bool is_vflr)
 	ice_for_each_vf(pf, v) {
 		vf = &pf->vf[v];
 
+		vf->driver_caps = 0;
+		ice_vc_set_default_allowlist(vf);
+
 		ice_vf_fdir_exit(vf);
 		/* clean VF control VSI when resetting VFs since it should be
 		 * setup only when VF creates its first FDIR rule.
@@ -1418,6 +1422,9 @@ bool ice_reset_vf(struct ice_vf *vf, bool is_vflr)
 		usleep_range(10, 20);
 	}
 
+	vf->driver_caps = 0;
+	ice_vc_set_default_allowlist(vf);
+
 	/* Display a warning if VF didn't manage to reset in time, but need to
 	 * continue on with the operation.
 	 */
@@ -1625,6 +1632,7 @@ static void ice_set_dflt_settings_vfs(struct ice_pf *pf)
 		set_bit(ICE_VIRTCHNL_VF_CAP_L2, &vf->vf_caps);
 		vf->spoofchk = true;
 		vf->num_vf_qs = pf->num_qps_per_vf;
+		ice_vc_set_default_allowlist(vf);
 
 		/* ctrl_vsi_idx will be set to a valid value only when VF
 		 * creates its first fdir rule.
@@ -2127,6 +2135,9 @@ static int ice_vc_get_vf_res_msg(struct ice_vf *vf, u8 *msg)
 	/* match guest capabilities */
 	vf->driver_caps = vfres->vf_cap_flags;
 
+	ice_vc_set_caps_allowlist(vf);
+	ice_vc_set_working_allowlist(vf);
+
 	set_bit(ICE_VF_STATE_ACTIVE, vf->vf_states);
 
 err:
@@ -3840,6 +3851,13 @@ void ice_vc_process_vf_msg(struct ice_pf *pf, struct ice_rq_event_info *event)
 			err = -EINVAL;
 	}
 
+	if (!ice_vc_is_opcode_allowed(vf, v_opcode)) {
+		ice_vc_send_msg_to_vf(vf, v_opcode,
+				      VIRTCHNL_STATUS_ERR_NOT_SUPPORTED, NULL,
+				      0);
+		return;
+	}
+
 error_handler:
 	if (err) {
 		ice_vc_send_msg_to_vf(vf, v_opcode, VIRTCHNL_STATUS_ERR_PARAM,
diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h
index bcc2890c930a..d800ed83d6c3 100644
--- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h
@@ -103,6 +103,7 @@ struct ice_vf {
 	u16 num_vf_qs;			/* num of queue configured per VF */
 	struct ice_mdd_vf_events mdd_rx_events;
 	struct ice_mdd_vf_events mdd_tx_events;
+	DECLARE_BITMAP(opcodes_allowlist, VIRTCHNL_OP_MAX);
 };
 
 #ifdef CONFIG_PCI_IOV
diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h
index 40dd6afbfd81..debdd196773b 100644
--- a/include/linux/avf/virtchnl.h
+++ b/include/linux/avf/virtchnl.h
@@ -139,6 +139,7 @@ enum virtchnl_ops {
 	/* opcode 34 - 46 are reserved */
 	VIRTCHNL_OP_ADD_FDIR_FILTER = 47,
 	VIRTCHNL_OP_DEL_FDIR_FILTER = 48,
+	VIRTCHNL_OP_MAX,
 };
 
 /* These macros are used to generate compilation errors if a structure/union
-- 
cgit v1.2.3


From 142da08c4dc0afd07f9136b4812d5386bd6e1717 Mon Sep 17 00:00:00 2001
From: Brett Creeley <brett.creeley@intel.com>
Date: Tue, 2 Mar 2021 10:12:12 -0800
Subject: ice: Advertise virtchnl UDP segmentation offload capability

As the hardware is capable of supporting UDP segmentation offload, add a
capability bit to virtchnl.h to communicate this and have the driver
advertise its support.

Suggested-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Brett Creeley <brett.creeley@intel.com>
Tested-by: Konrad Jankowski <konrad0.jankowski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c | 3 +++
 include/linux/avf/virtchnl.h                     | 1 +
 2 files changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
index ccd6b3e8a5a9..1292a0b06eb5 100644
--- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
@@ -2118,6 +2118,9 @@ static int ice_vc_get_vf_res_msg(struct ice_vf *vf, u8 *msg)
 	if (vf->driver_caps & VIRTCHNL_VF_CAP_ADV_LINK_SPEED)
 		vfres->vf_cap_flags |= VIRTCHNL_VF_CAP_ADV_LINK_SPEED;
 
+	if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_USO)
+		vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_USO;
+
 	vfres->num_vsis = 1;
 	/* Tx and Rx queue are equal for VF */
 	vfres->num_queue_pairs = vsi->num_txq;
diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h
index debdd196773b..9e0341cf2c36 100644
--- a/include/linux/avf/virtchnl.h
+++ b/include/linux/avf/virtchnl.h
@@ -251,6 +251,7 @@ VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vsi_resource);
 #define VIRTCHNL_VF_OFFLOAD_ENCAP_CSUM		0X00200000
 #define VIRTCHNL_VF_OFFLOAD_RX_ENCAP_CSUM	0X00400000
 #define VIRTCHNL_VF_OFFLOAD_ADQ			0X00800000
+#define VIRTCHNL_VF_OFFLOAD_USO			0X02000000
 #define VIRTCHNL_VF_OFFLOAD_FDIR_PF		0X10000000
 
 /* Define below the capability flags that are not offloads */
-- 
cgit v1.2.3


From 222a8ab01698148c00c271cda82d96f4e6e7b0a8 Mon Sep 17 00:00:00 2001
From: Qi Zhang <qi.z.zhang@intel.com>
Date: Tue, 13 Apr 2021 08:48:39 +0800
Subject: ice: Enable RSS configure for AVF

Currently, the RSS hash input cannot be configured for AVF through ethtool;
it is set by the PF directly.

Add RSS configuration support for AVF through new virtchnl messages, and
define the capability flag VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF to query this
new RSS offload support.

Signed-off-by: Jia Guo <jia.guo@intel.com>
Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
Signed-off-by: Haiyue Wang <haiyue.wang@intel.com>
Tested-by: Bo Chen <BoX.C.Chen@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_flow.h          |   3 +
 .../ethernet/intel/ice/ice_virtchnl_allowlist.c    |   6 +
 drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c   | 453 +++++++++++++++++++++
 include/linux/avf/virtchnl.h                       |  25 +-
 4 files changed, 486 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/intel/ice/ice_flow.h b/drivers/net/ethernet/intel/ice/ice_flow.h
index eec9def8ffca..2f68b59ace7e 100644
--- a/drivers/net/ethernet/intel/ice/ice_flow.h
+++ b/drivers/net/ethernet/intel/ice/ice_flow.h
@@ -8,6 +8,9 @@
 #define ICE_FLOW_FLD_OFF_INVAL		0xffff
 
 /* Generate flow hash field from flow field type(s) */
+#define ICE_FLOW_HASH_ETH	\
+	(BIT_ULL(ICE_FLOW_FIELD_IDX_ETH_DA) | \
+	 BIT_ULL(ICE_FLOW_FIELD_IDX_ETH_SA))
 #define ICE_FLOW_HASH_IPV4	\
 	(BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_SA) | \
 	 BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_DA))
diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.c
index 5a0fbb47346f..9feebe5f556c 100644
--- a/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.c
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.c
@@ -61,6 +61,11 @@ static const u32 rss_pf_allowlist_opcodes[] = {
 	VIRTCHNL_OP_GET_RSS_HENA_CAPS, VIRTCHNL_OP_SET_RSS_HENA,
 };
 
+/* VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF */
+static const u32 adv_rss_pf_allowlist_opcodes[] = {
+	VIRTCHNL_OP_ADD_RSS_CFG, VIRTCHNL_OP_DEL_RSS_CFG,
+};
+
 /* VIRTCHNL_VF_OFFLOAD_FDIR_PF */
 static const u32 fdir_pf_allowlist_opcodes[] = {
 	VIRTCHNL_OP_ADD_FDIR_FILTER, VIRTCHNL_OP_DEL_FDIR_FILTER,
@@ -82,6 +87,7 @@ static const struct allowlist_opcode_info allowlist_opcodes[] = {
 	ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_REQ_QUEUES, req_queues_allowlist_opcodes),
 	ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_VLAN, vlan_allowlist_opcodes),
 	ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_RSS_PF, rss_pf_allowlist_opcodes),
+	ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF, adv_rss_pf_allowlist_opcodes),
 	ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_FDIR_PF, fdir_pf_allowlist_opcodes),
 };
 
diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
index baada80c98ab..ca778a80d363 100644
--- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
@@ -5,8 +5,248 @@
 #include "ice_base.h"
 #include "ice_lib.h"
 #include "ice_fltr.h"
+#include "ice_flow.h"
 #include "ice_virtchnl_allowlist.h"
 
+#define FIELD_SELECTOR(proto_hdr_field) \
+		BIT((proto_hdr_field) & PROTO_HDR_FIELD_MASK)
+
+struct ice_vc_hdr_match_type {
+	u32 vc_hdr;	/* virtchnl headers (VIRTCHNL_PROTO_HDR_XXX) */
+	u32 ice_hdr;	/* ice headers (ICE_FLOW_SEG_HDR_XXX) */
+};
+
+static const struct ice_vc_hdr_match_type ice_vc_hdr_list_os[] = {
+	{VIRTCHNL_PROTO_HDR_NONE,	ICE_FLOW_SEG_HDR_NONE},
+	{VIRTCHNL_PROTO_HDR_IPV4,	ICE_FLOW_SEG_HDR_IPV4 |
+					ICE_FLOW_SEG_HDR_IPV_OTHER},
+	{VIRTCHNL_PROTO_HDR_IPV6,	ICE_FLOW_SEG_HDR_IPV6 |
+					ICE_FLOW_SEG_HDR_IPV_OTHER},
+	{VIRTCHNL_PROTO_HDR_TCP,	ICE_FLOW_SEG_HDR_TCP},
+	{VIRTCHNL_PROTO_HDR_UDP,	ICE_FLOW_SEG_HDR_UDP},
+	{VIRTCHNL_PROTO_HDR_SCTP,	ICE_FLOW_SEG_HDR_SCTP},
+};
+
+static const struct ice_vc_hdr_match_type ice_vc_hdr_list_comms[] = {
+	{VIRTCHNL_PROTO_HDR_NONE,	ICE_FLOW_SEG_HDR_NONE},
+	{VIRTCHNL_PROTO_HDR_ETH,	ICE_FLOW_SEG_HDR_ETH},
+	{VIRTCHNL_PROTO_HDR_S_VLAN,	ICE_FLOW_SEG_HDR_VLAN},
+	{VIRTCHNL_PROTO_HDR_C_VLAN,	ICE_FLOW_SEG_HDR_VLAN},
+	{VIRTCHNL_PROTO_HDR_IPV4,	ICE_FLOW_SEG_HDR_IPV4 |
+					ICE_FLOW_SEG_HDR_IPV_OTHER},
+	{VIRTCHNL_PROTO_HDR_IPV6,	ICE_FLOW_SEG_HDR_IPV6 |
+					ICE_FLOW_SEG_HDR_IPV_OTHER},
+	{VIRTCHNL_PROTO_HDR_TCP,	ICE_FLOW_SEG_HDR_TCP},
+	{VIRTCHNL_PROTO_HDR_UDP,	ICE_FLOW_SEG_HDR_UDP},
+	{VIRTCHNL_PROTO_HDR_SCTP,	ICE_FLOW_SEG_HDR_SCTP},
+	{VIRTCHNL_PROTO_HDR_PPPOE,	ICE_FLOW_SEG_HDR_PPPOE},
+	{VIRTCHNL_PROTO_HDR_GTPU_IP,	ICE_FLOW_SEG_HDR_GTPU_IP},
+	{VIRTCHNL_PROTO_HDR_GTPU_EH,	ICE_FLOW_SEG_HDR_GTPU_EH},
+	{VIRTCHNL_PROTO_HDR_GTPU_EH_PDU_DWN,
+					ICE_FLOW_SEG_HDR_GTPU_DWN},
+	{VIRTCHNL_PROTO_HDR_GTPU_EH_PDU_UP,
+					ICE_FLOW_SEG_HDR_GTPU_UP},
+	{VIRTCHNL_PROTO_HDR_L2TPV3,	ICE_FLOW_SEG_HDR_L2TPV3},
+	{VIRTCHNL_PROTO_HDR_ESP,	ICE_FLOW_SEG_HDR_ESP},
+	{VIRTCHNL_PROTO_HDR_AH,		ICE_FLOW_SEG_HDR_AH},
+	{VIRTCHNL_PROTO_HDR_PFCP,	ICE_FLOW_SEG_HDR_PFCP_SESSION},
+};
+
+struct ice_vc_hash_field_match_type {
+	u32 vc_hdr;		/* virtchnl header type
+				 * (VIRTCHNL_PROTO_HDR_XXX)
+				 */
+	u32 vc_hash_field;	/* virtchnl hash field selector: OR of
+				 * FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_XXX_YYY)
+				 */
+	u64 ice_hash_field;	/* ice hash fields: OR of
+				 * BIT_ULL(ICE_FLOW_FIELD_IDX_XXX)
+				 */
+};
+
+static const struct
+ice_vc_hash_field_match_type ice_vc_hash_field_list_os[] = {
+	{VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_SA)},
+	{VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_DA)},
+	{VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST),
+		ICE_FLOW_HASH_IPV4},
+	{VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_SA) |
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT)},
+	{VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_DA) |
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT)},
+	{VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT),
+		ICE_FLOW_HASH_IPV4 | BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT)},
+	{VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT)},
+	{VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_SRC),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_SA)},
+	{VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_DST),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_DA)},
+	{VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_SRC) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_DST),
+		ICE_FLOW_HASH_IPV6},
+	{VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_SRC) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PROT),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_SA) |
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PROT)},
+	{VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_DST) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PROT),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_DA) |
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PROT)},
+	{VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_SRC) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_DST) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PROT),
+		ICE_FLOW_HASH_IPV6 | BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PROT)},
+	{VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PROT),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PROT)},
+	{VIRTCHNL_PROTO_HDR_TCP,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_SRC_PORT),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_TCP_SRC_PORT)},
+	{VIRTCHNL_PROTO_HDR_TCP,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_DST_PORT),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_TCP_DST_PORT)},
+	{VIRTCHNL_PROTO_HDR_TCP,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_SRC_PORT) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_DST_PORT),
+		ICE_FLOW_HASH_TCP_PORT},
+	{VIRTCHNL_PROTO_HDR_UDP,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_SRC_PORT),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_SRC_PORT)},
+	{VIRTCHNL_PROTO_HDR_UDP,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_DST_PORT),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_DST_PORT)},
+	{VIRTCHNL_PROTO_HDR_UDP,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_SRC_PORT) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_DST_PORT),
+		ICE_FLOW_HASH_UDP_PORT},
+	{VIRTCHNL_PROTO_HDR_SCTP,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_SRC_PORT),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_SCTP_SRC_PORT)},
+	{VIRTCHNL_PROTO_HDR_SCTP,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_DST_PORT),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_SCTP_DST_PORT)},
+	{VIRTCHNL_PROTO_HDR_SCTP,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_SRC_PORT) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_DST_PORT),
+		ICE_FLOW_HASH_SCTP_PORT},
+};
+
+static const struct
+ice_vc_hash_field_match_type ice_vc_hash_field_list_comms[] = {
+	{VIRTCHNL_PROTO_HDR_ETH, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_ETH_SRC),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_ETH_SA)},
+	{VIRTCHNL_PROTO_HDR_ETH, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_ETH_DST),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_ETH_DA)},
+	{VIRTCHNL_PROTO_HDR_ETH, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_ETH_SRC) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_ETH_DST),
+		ICE_FLOW_HASH_ETH},
+	{VIRTCHNL_PROTO_HDR_ETH,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_ETH_ETHERTYPE),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_ETH_TYPE)},
+	{VIRTCHNL_PROTO_HDR_S_VLAN,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_S_VLAN_ID),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_S_VLAN)},
+	{VIRTCHNL_PROTO_HDR_C_VLAN,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_C_VLAN_ID),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_C_VLAN)},
+	{VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_SA)},
+	{VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_DA)},
+	{VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST),
+		ICE_FLOW_HASH_IPV4},
+	{VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_SA) |
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT)},
+	{VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_DA) |
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT)},
+	{VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT),
+		ICE_FLOW_HASH_IPV4 | BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT)},
+	{VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT)},
+	{VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_SRC),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_SA)},
+	{VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_DST),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_DA)},
+	{VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_SRC) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_DST),
+		ICE_FLOW_HASH_IPV6},
+	{VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_SRC) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PROT),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_SA) |
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PROT)},
+	{VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_DST) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PROT),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_DA) |
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PROT)},
+	{VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_SRC) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_DST) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PROT),
+		ICE_FLOW_HASH_IPV6 | BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PROT)},
+	{VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PROT),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PROT)},
+	{VIRTCHNL_PROTO_HDR_TCP,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_SRC_PORT),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_TCP_SRC_PORT)},
+	{VIRTCHNL_PROTO_HDR_TCP,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_DST_PORT),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_TCP_DST_PORT)},
+	{VIRTCHNL_PROTO_HDR_TCP,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_SRC_PORT) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_DST_PORT),
+		ICE_FLOW_HASH_TCP_PORT},
+	{VIRTCHNL_PROTO_HDR_UDP,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_SRC_PORT),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_SRC_PORT)},
+	{VIRTCHNL_PROTO_HDR_UDP,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_DST_PORT),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_DST_PORT)},
+	{VIRTCHNL_PROTO_HDR_UDP,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_SRC_PORT) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_DST_PORT),
+		ICE_FLOW_HASH_UDP_PORT},
+	{VIRTCHNL_PROTO_HDR_SCTP,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_SRC_PORT),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_SCTP_SRC_PORT)},
+	{VIRTCHNL_PROTO_HDR_SCTP,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_DST_PORT),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_SCTP_DST_PORT)},
+	{VIRTCHNL_PROTO_HDR_SCTP,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_SRC_PORT) |
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_DST_PORT),
+		ICE_FLOW_HASH_SCTP_PORT},
+	{VIRTCHNL_PROTO_HDR_PPPOE,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_PPPOE_SESS_ID),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_PPPOE_SESS_ID)},
+	{VIRTCHNL_PROTO_HDR_GTPU_IP,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_GTPU_IP_TEID),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_GTPU_IP_TEID)},
+	{VIRTCHNL_PROTO_HDR_L2TPV3,
+		FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_L2TPV3_SESS_ID),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_L2TPV3_SESS_ID)},
+	{VIRTCHNL_PROTO_HDR_ESP, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_ESP_SPI),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_ESP_SPI)},
+	{VIRTCHNL_PROTO_HDR_AH, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_AH_SPI),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_AH_SPI)},
+	{VIRTCHNL_PROTO_HDR_PFCP, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_PFCP_SEID),
+		BIT_ULL(ICE_FLOW_FIELD_IDX_PFCP_SEID)},
+};
+
 /**
  * ice_get_vf_vsi - get VF's VSI based on the stored index
  * @vf: VF used to get VSI
@@ -2121,6 +2361,9 @@ static int ice_vc_get_vf_res_msg(struct ice_vf *vf, u8 *msg)
 	if (vf->driver_caps & VIRTCHNL_VF_CAP_ADV_LINK_SPEED)
 		vfres->vf_cap_flags |= VIRTCHNL_VF_CAP_ADV_LINK_SPEED;
 
+	if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF)
+		vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF;
+
 	if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_USO)
 		vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_USO;
 
@@ -2234,6 +2477,210 @@ static bool ice_vc_isvalid_ring_len(u16 ring_len)
 		!(ring_len % ICE_REQ_DESC_MULTIPLE));
 }
 
+/**
+ * ice_vc_parse_rss_cfg - parses hash fields and headers from
+ * a specific virtchnl RSS cfg
+ * @hw: pointer to the hardware
+ * @rss_cfg: pointer to the virtchnl RSS cfg
+ * @addl_hdrs: protocol header flags (ICE_FLOW_SEG_HDR_*); matches are OR-ed
+ * into the value the caller passes in
+ * @hash_flds: hash bit fields (ICE_FLOW_HASH_*); matches are OR-ed in
+ *
+ * Return true if every protocol header in the RSS cfg has an ice match, else
+ * return false. A field selector with no table match adds no hash bits.
+ *
+ * The lookup tables depend on the active package name: the *_comms tables
+ * for "ICE COMMS Package", the *_os tables for any other package
+ */
+static bool
+ice_vc_parse_rss_cfg(struct ice_hw *hw, struct virtchnl_rss_cfg *rss_cfg,
+		     u32 *addl_hdrs, u64 *hash_flds)
+{
+	const struct ice_vc_hash_field_match_type *hf_list;
+	const struct ice_vc_hdr_match_type *hdr_list;
+	int i, hf_list_len, hdr_list_len;
+
+	if (!strncmp(hw->active_pkg_name, "ICE COMMS Package",
+		     sizeof(hw->active_pkg_name))) {
+		hf_list = ice_vc_hash_field_list_comms;
+		hf_list_len = ARRAY_SIZE(ice_vc_hash_field_list_comms);
+		hdr_list = ice_vc_hdr_list_comms;
+		hdr_list_len = ARRAY_SIZE(ice_vc_hdr_list_comms);
+	} else {
+		hf_list = ice_vc_hash_field_list_os;
+		hf_list_len = ARRAY_SIZE(ice_vc_hash_field_list_os);
+		hdr_list = ice_vc_hdr_list_os;
+		hdr_list_len = ARRAY_SIZE(ice_vc_hdr_list_os);
+	}
+
+	for (i = 0; i < rss_cfg->proto_hdrs.count; i++) {
+		struct virtchnl_proto_hdr *proto_hdr =
+					&rss_cfg->proto_hdrs.proto_hdr[i];
+		bool hdr_found = false;
+		int j;
+
+		/* OR in the ice header flags of every row matching this type */
+		for (j = 0; j < hdr_list_len; j++) {
+			struct ice_vc_hdr_match_type hdr_map = hdr_list[j];
+
+			if (proto_hdr->type == hdr_map.vc_hdr) {
+				*addl_hdrs |= hdr_map.ice_hdr;
+				hdr_found = true;
+			}
+		}
+
+		if (!hdr_found)
+			return false;
+
+		/* Use the first row whose type and exact field selector
+		 * match; a selector with no matching row is ignored.
+		 */
+		for (j = 0; j < hf_list_len; j++) {
+			struct ice_vc_hash_field_match_type hf_map = hf_list[j];
+
+			if (proto_hdr->type == hf_map.vc_hdr &&
+			    proto_hdr->field_selector == hf_map.vc_hash_field) {
+				*hash_flds |= hf_map.ice_hash_field;
+				break;
+			}
+		}
+	}
+
+	return true;
+}
+
+/**
+ * ice_vf_adv_rss_offload_ena - determine if capabilities support advanced
+ * RSS offloads
+ * @caps: VF driver negotiated capabilities
+ *
+ * Return true if VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF capability is set,
+ * else return false
+ */
+static bool ice_vf_adv_rss_offload_ena(u32 caps)
+{
+	return !!(caps & VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF);
+}
+
+/**
+ * ice_vc_handle_rss_cfg - add or delete an advanced RSS config for a VF
+ * @vf: pointer to the VF info
+ * @msg: pointer to the message buffer
+ * @add: add an RSS config if true, otherwise delete an RSS config
+ *
+ * Validate and apply the request; return the result of the reply to the VF
+ */
+static int ice_vc_handle_rss_cfg(struct ice_vf *vf, u8 *msg, bool add)
+{
+	u32 v_opcode = add ? VIRTCHNL_OP_ADD_RSS_CFG : VIRTCHNL_OP_DEL_RSS_CFG;
+	struct virtchnl_rss_cfg *rss_cfg = (struct virtchnl_rss_cfg *)msg;
+	enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS;
+	struct device *dev = ice_pf_to_dev(vf->pf);
+	struct ice_hw *hw = &vf->pf->hw;
+	struct ice_vsi *vsi;
+
+	if (!test_bit(ICE_FLAG_RSS_ENA, vf->pf->flags)) {
+		dev_dbg(dev, "VF %d attempting to configure RSS, but RSS is not supported by the PF\n",
+			vf->vf_id);
+		v_ret = VIRTCHNL_STATUS_ERR_NOT_SUPPORTED;
+		goto error_param;
+	}
+
+	if (!ice_vf_adv_rss_offload_ena(vf->driver_caps)) {
+		dev_dbg(dev, "VF %d attempting to configure RSS, but Advanced RSS offload is not supported\n",
+			vf->vf_id);
+		v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+		goto error_param;
+	}
+
+	if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) {
+		v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+		goto error_param;
+	}
+
+	if (rss_cfg->proto_hdrs.count > VIRTCHNL_MAX_NUM_PROTO_HDRS ||
+	    rss_cfg->rss_algorithm < VIRTCHNL_RSS_ALG_TOEPLITZ_ASYMMETRIC ||
+	    rss_cfg->rss_algorithm > VIRTCHNL_RSS_ALG_XOR_SYMMETRIC) {
+		dev_dbg(dev, "VF %d attempting to configure RSS, but RSS configuration is not valid\n",
+			vf->vf_id);
+		v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+		goto error_param;
+	}
+
+	vsi = ice_get_vf_vsi(vf);
+	if (!vsi) {
+		v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+		goto error_param;
+	}
+
+	if (rss_cfg->rss_algorithm == VIRTCHNL_RSS_ALG_R_ASYMMETRIC) {
+		struct ice_vsi_ctx *ctx;
+		enum ice_status status;
+		u8 lut_type, hash_type;
+
+		lut_type = ICE_AQ_VSI_Q_OPT_RSS_LUT_VSI;
+		hash_type = add ? ICE_AQ_VSI_Q_OPT_RSS_XOR :
+				ICE_AQ_VSI_Q_OPT_RSS_TPLZ;
+
+		ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+		if (!ctx) {
+			v_ret = VIRTCHNL_STATUS_ERR_NO_MEMORY;
+			goto error_param;
+		}
+
+		ctx->info.q_opt_rss = ((lut_type <<
+					ICE_AQ_VSI_Q_OPT_RSS_LUT_S) &
+				       ICE_AQ_VSI_Q_OPT_RSS_LUT_M) |
+				       (hash_type &
+					ICE_AQ_VSI_Q_OPT_RSS_HASH_M);
+
+		/* Preserve queueing options. NOTE(review): q_opt_flags <- q_opt_rss? */
+		ctx->info.q_opt_rss |= (vsi->info.q_opt_rss &
+					  ICE_AQ_VSI_Q_OPT_RSS_GBL_LUT_M);
+		ctx->info.q_opt_tc = vsi->info.q_opt_tc;
+		ctx->info.q_opt_flags = vsi->info.q_opt_rss;
+
+		ctx->info.valid_sections =
+				cpu_to_le16(ICE_AQ_VSI_PROP_Q_OPT_VALID);
+
+		status = ice_update_vsi(hw, vsi->idx, ctx, NULL);
+		if (status) {
+			dev_err(dev, "update VSI for RSS failed, err %s aq_err %s\n",
+				ice_stat_str(status),
+				ice_aq_str(hw->adminq.sq_last_status));
+			v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+		} else {
+			vsi->info.q_opt_rss = ctx->info.q_opt_rss;
+		}
+
+		kfree(ctx);
+	} else {
+		u32 addl_hdrs = ICE_FLOW_SEG_HDR_NONE;
+		u64 hash_flds = ICE_HASH_INVALID;
+
+		if (!ice_vc_parse_rss_cfg(hw, rss_cfg, &addl_hdrs,
+					  &hash_flds)) {
+			v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+			goto error_param;
+		}
+
+		if (add) {
+			if (ice_add_rss_cfg(hw, vsi->idx, hash_flds,
+					    addl_hdrs)) {
+				v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+				dev_err(dev, "ice_add_rss_cfg failed for vsi = %d, v_ret = %d\n",
+					vsi->vsi_num, v_ret);
+			}
+		} else {
+			v_ret = VIRTCHNL_STATUS_ERR_NOT_SUPPORTED;
+			dev_err(dev, "RSS removal not supported\n");
+		}
+	}
+
+error_param:
+	return ice_vc_send_msg_to_vf(vf, v_opcode, v_ret, NULL, 0);
+}
+
 /**
  * ice_vc_config_rss_key
  * @vf: pointer to the VF info
@@ -3931,6 +4378,12 @@ error_handler:
 	case VIRTCHNL_OP_DEL_FDIR_FILTER:
 		err = ice_vc_del_fdir_fltr(vf, msg);
 		break;
+	case VIRTCHNL_OP_ADD_RSS_CFG:
+		err = ice_vc_handle_rss_cfg(vf, msg, true);
+		break;
+	case VIRTCHNL_OP_DEL_RSS_CFG:
+		err = ice_vc_handle_rss_cfg(vf, msg, false);
+		break;
 	case VIRTCHNL_OP_UNKNOWN:
 	default:
 		dev_err(dev, "Unsupported opcode %d from VF %d\n", v_opcode,
diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h
index 9e0341cf2c36..565deea6ffe8 100644
--- a/include/linux/avf/virtchnl.h
+++ b/include/linux/avf/virtchnl.h
@@ -136,7 +136,9 @@ enum virtchnl_ops {
 	VIRTCHNL_OP_DISABLE_CHANNELS = 31,
 	VIRTCHNL_OP_ADD_CLOUD_FILTER = 32,
 	VIRTCHNL_OP_DEL_CLOUD_FILTER = 33,
-	/* opcode 34 - 46 are reserved */
+	/* opcode 34 - 44 are reserved */
+	VIRTCHNL_OP_ADD_RSS_CFG = 45,
+	VIRTCHNL_OP_DEL_RSS_CFG = 46,
 	VIRTCHNL_OP_ADD_FDIR_FILTER = 47,
 	VIRTCHNL_OP_DEL_FDIR_FILTER = 48,
 	VIRTCHNL_OP_MAX,
@@ -252,6 +254,7 @@ VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vsi_resource);
 #define VIRTCHNL_VF_OFFLOAD_RX_ENCAP_CSUM	0X00400000
 #define VIRTCHNL_VF_OFFLOAD_ADQ			0X00800000
 #define VIRTCHNL_VF_OFFLOAD_USO			0X02000000
+#define VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF		0X08000000
 #define VIRTCHNL_VF_OFFLOAD_FDIR_PF		0X10000000
 
 /* Define below the capability flags that are not offloads */
@@ -677,6 +680,14 @@ enum virtchnl_vfr_states {
 	VIRTCHNL_VFR_VFACTIVE,
 };
 
+/* Type of RSS algorithm */
+enum virtchnl_rss_algorithm {
+	VIRTCHNL_RSS_ALG_TOEPLITZ_ASYMMETRIC	= 0,
+	VIRTCHNL_RSS_ALG_R_ASYMMETRIC		= 1,
+	VIRTCHNL_RSS_ALG_TOEPLITZ_SYMMETRIC	= 2,
+	VIRTCHNL_RSS_ALG_XOR_SYMMETRIC		= 3,
+};
+
 #define VIRTCHNL_MAX_NUM_PROTO_HDRS	32
 #define PROTO_HDR_SHIFT			5
 #define PROTO_HDR_FIELD_START(proto_hdr_type) ((proto_hdr_type) << PROTO_HDR_SHIFT)
@@ -832,6 +843,14 @@ struct virtchnl_proto_hdrs {
 
 VIRTCHNL_CHECK_STRUCT_LEN(2312, virtchnl_proto_hdrs);
 
+struct virtchnl_rss_cfg {
+	struct virtchnl_proto_hdrs proto_hdrs;	   /* protocol headers */
+	enum virtchnl_rss_algorithm rss_algorithm; /* RSS algorithm type */
+	u8 reserved[128];			   /* reserve for future */
+};
+
+VIRTCHNL_CHECK_STRUCT_LEN(2444, virtchnl_rss_cfg);
+
 /* action configuration for FDIR */
 struct virtchnl_filter_action {
 	enum virtchnl_action type;
@@ -1100,6 +1119,10 @@ virtchnl_vc_validate_vf_msg(struct virtchnl_version_info *ver, u32 v_opcode,
 	case VIRTCHNL_OP_DEL_CLOUD_FILTER:
 		valid_len = sizeof(struct virtchnl_filter);
 		break;
+	case VIRTCHNL_OP_ADD_RSS_CFG:
+	case VIRTCHNL_OP_DEL_RSS_CFG:
+		valid_len = sizeof(struct virtchnl_rss_cfg);
+		break;
 	case VIRTCHNL_OP_ADD_FDIR_FILTER:
 		valid_len = sizeof(struct virtchnl_fdir_add);
 		break;
-- 
cgit v1.2.3


From 1aea7808372eee4ad01f98e064c88c57f1e94855 Mon Sep 17 00:00:00 2001
From: Casey Schaufler <casey@schaufler-ca.com>
Date: Thu, 22 Apr 2021 17:41:15 +0200
Subject: LSM: Infrastructure management of the superblock
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Move management of the superblock->s_security blob out of the
individual security modules and into the security infrastructure.
Instead of allocating the blobs from within the modules, the modules
tell the infrastructure how much space is required, and the space is
allocated there.

Cc: John Johansen <john.johansen@canonical.com>
Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Mickaël Salaün <mic@linux.microsoft.com>
Reviewed-by: Stephen Smalley <stephen.smalley.work@gmail.com>
Acked-by: Serge Hallyn <serge@hallyn.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20210422154123.13086-6-mic@digikod.net
Signed-off-by: James Morris <jamorris@linux.microsoft.com>
---
 include/linux/lsm_hooks.h         |  1 +
 security/security.c               | 46 ++++++++++++++++++++++++++-----
 security/selinux/hooks.c          | 58 +++++++++++++++------------------------
 security/selinux/include/objsec.h |  6 ++++
 security/selinux/ss/services.c    |  3 +-
 security/smack/smack.h            |  6 ++++
 security/smack/smack_lsm.c        | 35 ++++++-----------------
 7 files changed, 85 insertions(+), 70 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index fb7f3193753d..75715998a95f 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1573,6 +1573,7 @@ struct lsm_blob_sizes {
 	int	lbs_cred;
 	int	lbs_file;
 	int	lbs_inode;
+	int	lbs_superblock;
 	int	lbs_ipc;
 	int	lbs_msg_msg;
 	int	lbs_task;
diff --git a/security/security.c b/security/security.c
index 5ac96b16f8fa..e9c29480eb18 100644
--- a/security/security.c
+++ b/security/security.c
@@ -203,6 +203,7 @@ static void __init lsm_set_blob_sizes(struct lsm_blob_sizes *needed)
 	lsm_set_blob_size(&needed->lbs_inode, &blob_sizes.lbs_inode);
 	lsm_set_blob_size(&needed->lbs_ipc, &blob_sizes.lbs_ipc);
 	lsm_set_blob_size(&needed->lbs_msg_msg, &blob_sizes.lbs_msg_msg);
+	lsm_set_blob_size(&needed->lbs_superblock, &blob_sizes.lbs_superblock);
 	lsm_set_blob_size(&needed->lbs_task, &blob_sizes.lbs_task);
 }
 
@@ -333,12 +334,13 @@ static void __init ordered_lsm_init(void)
 	for (lsm = ordered_lsms; *lsm; lsm++)
 		prepare_lsm(*lsm);
 
-	init_debug("cred blob size     = %d\n", blob_sizes.lbs_cred);
-	init_debug("file blob size     = %d\n", blob_sizes.lbs_file);
-	init_debug("inode blob size    = %d\n", blob_sizes.lbs_inode);
-	init_debug("ipc blob size      = %d\n", blob_sizes.lbs_ipc);
-	init_debug("msg_msg blob size  = %d\n", blob_sizes.lbs_msg_msg);
-	init_debug("task blob size     = %d\n", blob_sizes.lbs_task);
+	init_debug("cred blob size       = %d\n", blob_sizes.lbs_cred);
+	init_debug("file blob size       = %d\n", blob_sizes.lbs_file);
+	init_debug("inode blob size      = %d\n", blob_sizes.lbs_inode);
+	init_debug("ipc blob size        = %d\n", blob_sizes.lbs_ipc);
+	init_debug("msg_msg blob size    = %d\n", blob_sizes.lbs_msg_msg);
+	init_debug("superblock blob size = %d\n", blob_sizes.lbs_superblock);
+	init_debug("task blob size       = %d\n", blob_sizes.lbs_task);
 
 	/*
 	 * Create any kmem_caches needed for blobs
@@ -670,6 +672,27 @@ static void __init lsm_early_task(struct task_struct *task)
 		panic("%s: Early task alloc failed.\n", __func__);
 }
 
+/**
+ * lsm_superblock_alloc - allocate a composite superblock blob
+ * @sb: the superblock that needs a blob
+ *
+ * Allocate one zeroed blob for all the modules; NULL if none needs space.
+ *
+ * Returns 0, or -ENOMEM if memory can't be allocated.
+ */
+static int lsm_superblock_alloc(struct super_block *sb)
+{
+	if (blob_sizes.lbs_superblock == 0) {
+		sb->s_security = NULL;
+		return 0;
+	}
+
+	sb->s_security = kzalloc(blob_sizes.lbs_superblock, GFP_KERNEL);
+	if (sb->s_security == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
 /*
  * The default value of the LSM hook is defined in linux/lsm_hook_defs.h and
  * can be accessed with:
@@ -867,12 +890,21 @@ int security_fs_context_parse_param(struct fs_context *fc, struct fs_parameter *
 
 int security_sb_alloc(struct super_block *sb)
 {
-	return call_int_hook(sb_alloc_security, 0, sb);
+	int rc = lsm_superblock_alloc(sb);
+
+	if (unlikely(rc))
+		return rc;
+	rc = call_int_hook(sb_alloc_security, 0, sb);
+	if (unlikely(rc))
+		security_sb_free(sb);
+	return rc;
 }
 
 void security_sb_free(struct super_block *sb)
 {
 	call_void_hook(sb_free_security, sb);
+	kfree(sb->s_security);
+	sb->s_security = NULL;
 }
 
 void security_free_mnt_opts(void **mnt_opts)
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index ddd097790d47..2ed9c995263a 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -322,7 +322,7 @@ static void inode_free_security(struct inode *inode)
 
 	if (!isec)
 		return;
-	sbsec = inode->i_sb->s_security;
+	sbsec = selinux_superblock(inode->i_sb);
 	/*
 	 * As not all inode security structures are in a list, we check for
 	 * empty list outside of the lock to make sure that we won't waste
@@ -340,13 +340,6 @@ static void inode_free_security(struct inode *inode)
 	}
 }
 
-static void superblock_free_security(struct super_block *sb)
-{
-	struct superblock_security_struct *sbsec = sb->s_security;
-	sb->s_security = NULL;
-	kfree(sbsec);
-}
-
 struct selinux_mnt_opts {
 	const char *fscontext, *context, *rootcontext, *defcontext;
 };
@@ -458,7 +451,7 @@ static int selinux_is_genfs_special_handling(struct super_block *sb)
 
 static int selinux_is_sblabel_mnt(struct super_block *sb)
 {
-	struct superblock_security_struct *sbsec = sb->s_security;
+	struct superblock_security_struct *sbsec = selinux_superblock(sb);
 
 	/*
 	 * IMPORTANT: Double-check logic in this function when adding a new
@@ -535,7 +528,7 @@ fallback:
 
 static int sb_finish_set_opts(struct super_block *sb)
 {
-	struct superblock_security_struct *sbsec = sb->s_security;
+	struct superblock_security_struct *sbsec = selinux_superblock(sb);
 	struct dentry *root = sb->s_root;
 	struct inode *root_inode = d_backing_inode(root);
 	int rc = 0;
@@ -626,7 +619,7 @@ static int selinux_set_mnt_opts(struct super_block *sb,
 				unsigned long *set_kern_flags)
 {
 	const struct cred *cred = current_cred();
-	struct superblock_security_struct *sbsec = sb->s_security;
+	struct superblock_security_struct *sbsec = selinux_superblock(sb);
 	struct dentry *root = sb->s_root;
 	struct selinux_mnt_opts *opts = mnt_opts;
 	struct inode_security_struct *root_isec;
@@ -863,8 +856,8 @@ out_double_mount:
 static int selinux_cmp_sb_context(const struct super_block *oldsb,
 				    const struct super_block *newsb)
 {
-	struct superblock_security_struct *old = oldsb->s_security;
-	struct superblock_security_struct *new = newsb->s_security;
+	struct superblock_security_struct *old = selinux_superblock(oldsb);
+	struct superblock_security_struct *new = selinux_superblock(newsb);
 	char oldflags = old->flags & SE_MNTMASK;
 	char newflags = new->flags & SE_MNTMASK;
 
@@ -896,8 +889,9 @@ static int selinux_sb_clone_mnt_opts(const struct super_block *oldsb,
 					unsigned long *set_kern_flags)
 {
 	int rc = 0;
-	const struct superblock_security_struct *oldsbsec = oldsb->s_security;
-	struct superblock_security_struct *newsbsec = newsb->s_security;
+	const struct superblock_security_struct *oldsbsec =
+						selinux_superblock(oldsb);
+	struct superblock_security_struct *newsbsec = selinux_superblock(newsb);
 
 	int set_fscontext =	(oldsbsec->flags & FSCONTEXT_MNT);
 	int set_context =	(oldsbsec->flags & CONTEXT_MNT);
@@ -1076,7 +1070,7 @@ static int show_sid(struct seq_file *m, u32 sid)
 
 static int selinux_sb_show_options(struct seq_file *m, struct super_block *sb)
 {
-	struct superblock_security_struct *sbsec = sb->s_security;
+	struct superblock_security_struct *sbsec = selinux_superblock(sb);
 	int rc;
 
 	if (!(sbsec->flags & SE_SBINITIALIZED))
@@ -1427,7 +1421,7 @@ static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dent
 	if (isec->sclass == SECCLASS_FILE)
 		isec->sclass = inode_mode_to_security_class(inode->i_mode);
 
-	sbsec = inode->i_sb->s_security;
+	sbsec = selinux_superblock(inode->i_sb);
 	if (!(sbsec->flags & SE_SBINITIALIZED)) {
 		/* Defer initialization until selinux_complete_init,
 		   after the initial policy is loaded and the security
@@ -1778,7 +1772,8 @@ selinux_determine_inode_label(const struct task_security_struct *tsec,
 				 const struct qstr *name, u16 tclass,
 				 u32 *_new_isid)
 {
-	const struct superblock_security_struct *sbsec = dir->i_sb->s_security;
+	const struct superblock_security_struct *sbsec =
+						selinux_superblock(dir->i_sb);
 
 	if ((sbsec->flags & SE_SBINITIALIZED) &&
 	    (sbsec->behavior == SECURITY_FS_USE_MNTPOINT)) {
@@ -1809,7 +1804,7 @@ static int may_create(struct inode *dir,
 	int rc;
 
 	dsec = inode_security(dir);
-	sbsec = dir->i_sb->s_security;
+	sbsec = selinux_superblock(dir->i_sb);
 
 	sid = tsec->sid;
 
@@ -1958,7 +1953,7 @@ static int superblock_has_perm(const struct cred *cred,
 	struct superblock_security_struct *sbsec;
 	u32 sid = cred_sid(cred);
 
-	sbsec = sb->s_security;
+	sbsec = selinux_superblock(sb);
 	return avc_has_perm(&selinux_state,
 			    sid, sbsec->sid, SECCLASS_FILESYSTEM, perms, ad);
 }
@@ -2587,11 +2582,7 @@ static void selinux_bprm_committed_creds(struct linux_binprm *bprm)
 
 static int selinux_sb_alloc_security(struct super_block *sb)
 {
-	struct superblock_security_struct *sbsec;
-
-	sbsec = kzalloc(sizeof(struct superblock_security_struct), GFP_KERNEL);
-	if (!sbsec)
-		return -ENOMEM;
+	struct superblock_security_struct *sbsec = selinux_superblock(sb);
 
 	mutex_init(&sbsec->lock);
 	INIT_LIST_HEAD(&sbsec->isec_head);
@@ -2599,16 +2590,10 @@ static int selinux_sb_alloc_security(struct super_block *sb)
 	sbsec->sid = SECINITSID_UNLABELED;
 	sbsec->def_sid = SECINITSID_FILE;
 	sbsec->mntpoint_sid = SECINITSID_UNLABELED;
-	sb->s_security = sbsec;
 
 	return 0;
 }
 
-static void selinux_sb_free_security(struct super_block *sb)
-{
-	superblock_free_security(sb);
-}
-
 static inline int opt_len(const char *s)
 {
 	bool open_quote = false;
@@ -2687,7 +2672,7 @@ free_opt:
 static int selinux_sb_remount(struct super_block *sb, void *mnt_opts)
 {
 	struct selinux_mnt_opts *opts = mnt_opts;
-	struct superblock_security_struct *sbsec = sb->s_security;
+	struct superblock_security_struct *sbsec = selinux_superblock(sb);
 	u32 sid;
 	int rc;
 
@@ -2925,7 +2910,7 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir,
 	int rc;
 	char *context;
 
-	sbsec = dir->i_sb->s_security;
+	sbsec = selinux_superblock(dir->i_sb);
 
 	newsid = tsec->create_sid;
 
@@ -3227,7 +3212,7 @@ static int selinux_inode_setxattr(struct user_namespace *mnt_userns,
 	if (!selinux_initialized(&selinux_state))
 		return (inode_owner_or_capable(mnt_userns, inode) ? 0 : -EPERM);
 
-	sbsec = inode->i_sb->s_security;
+	sbsec = selinux_superblock(inode->i_sb);
 	if (!(sbsec->flags & SBLABEL_MNT))
 		return -EOPNOTSUPP;
 
@@ -3472,13 +3457,14 @@ static int selinux_inode_setsecurity(struct inode *inode, const char *name,
 				     const void *value, size_t size, int flags)
 {
 	struct inode_security_struct *isec = inode_security_novalidate(inode);
-	struct superblock_security_struct *sbsec = inode->i_sb->s_security;
+	struct superblock_security_struct *sbsec;
 	u32 newsid;
 	int rc;
 
 	if (strcmp(name, XATTR_SELINUX_SUFFIX))
 		return -EOPNOTSUPP;
 
+	sbsec = selinux_superblock(inode->i_sb);
 	if (!(sbsec->flags & SBLABEL_MNT))
 		return -EOPNOTSUPP;
 
@@ -6975,6 +6961,7 @@ struct lsm_blob_sizes selinux_blob_sizes __lsm_ro_after_init = {
 	.lbs_inode = sizeof(struct inode_security_struct),
 	.lbs_ipc = sizeof(struct ipc_security_struct),
 	.lbs_msg_msg = sizeof(struct msg_security_struct),
+	.lbs_superblock = sizeof(struct superblock_security_struct),
 };
 
 #ifdef CONFIG_PERF_EVENTS
@@ -7075,7 +7062,6 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
 	LSM_HOOK_INIT(bprm_committing_creds, selinux_bprm_committing_creds),
 	LSM_HOOK_INIT(bprm_committed_creds, selinux_bprm_committed_creds),
 
-	LSM_HOOK_INIT(sb_free_security, selinux_sb_free_security),
 	LSM_HOOK_INIT(sb_free_mnt_opts, selinux_free_mnt_opts),
 	LSM_HOOK_INIT(sb_remount, selinux_sb_remount),
 	LSM_HOOK_INIT(sb_kern_mount, selinux_sb_kern_mount),
diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h
index ca4d7ab6a835..2953132408bf 100644
--- a/security/selinux/include/objsec.h
+++ b/security/selinux/include/objsec.h
@@ -188,4 +188,10 @@ static inline u32 current_sid(void)
 	return tsec->sid;
 }
 
+static inline struct superblock_security_struct *selinux_superblock(
+					const struct super_block *superblock)
+{
+	return superblock->s_security + selinux_blob_sizes.lbs_superblock;
+}
+
 #endif /* _SELINUX_OBJSEC_H_ */
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index 3438d0130378..9cea2e6c809f 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -47,6 +47,7 @@
 #include <linux/sched.h>
 #include <linux/audit.h>
 #include <linux/vmalloc.h>
+#include <linux/lsm_hooks.h>
 #include <net/netlabel.h>
 
 #include "flask.h"
@@ -2875,7 +2876,7 @@ int security_fs_use(struct selinux_state *state, struct super_block *sb)
 	struct sidtab *sidtab;
 	int rc = 0;
 	struct ocontext *c;
-	struct superblock_security_struct *sbsec = sb->s_security;
+	struct superblock_security_struct *sbsec = selinux_superblock(sb);
 	const char *fstype = sb->s_type->name;
 
 	if (!selinux_initialized(state)) {
diff --git a/security/smack/smack.h b/security/smack/smack.h
index a9768b12716b..7077b18c79ec 100644
--- a/security/smack/smack.h
+++ b/security/smack/smack.h
@@ -357,6 +357,12 @@ static inline struct smack_known **smack_ipc(const struct kern_ipc_perm *ipc)
 	return ipc->security + smack_blob_sizes.lbs_ipc;
 }
 
+static inline struct superblock_smack *smack_superblock(
+					const struct super_block *superblock)
+{
+	return superblock->s_security + smack_blob_sizes.lbs_superblock;
+}
+
 /*
  * Is the directory transmuting?
  */
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 12a45e61c1a5..ee3e29603c9c 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -535,12 +535,7 @@ static int smack_syslog(int typefrom_file)
  */
 static int smack_sb_alloc_security(struct super_block *sb)
 {
-	struct superblock_smack *sbsp;
-
-	sbsp = kzalloc(sizeof(struct superblock_smack), GFP_KERNEL);
-
-	if (sbsp == NULL)
-		return -ENOMEM;
+	struct superblock_smack *sbsp = smack_superblock(sb);
 
 	sbsp->smk_root = &smack_known_floor;
 	sbsp->smk_default = &smack_known_floor;
@@ -549,22 +544,10 @@ static int smack_sb_alloc_security(struct super_block *sb)
 	/*
 	 * SMK_SB_INITIALIZED will be zero from kzalloc.
 	 */
-	sb->s_security = sbsp;
 
 	return 0;
 }
 
-/**
- * smack_sb_free_security - free a superblock blob
- * @sb: the superblock getting the blob
- *
- */
-static void smack_sb_free_security(struct super_block *sb)
-{
-	kfree(sb->s_security);
-	sb->s_security = NULL;
-}
-
 struct smack_mnt_opts {
 	const char *fsdefault, *fsfloor, *fshat, *fsroot, *fstransmute;
 };
@@ -772,7 +755,7 @@ static int smack_set_mnt_opts(struct super_block *sb,
 {
 	struct dentry *root = sb->s_root;
 	struct inode *inode = d_backing_inode(root);
-	struct superblock_smack *sp = sb->s_security;
+	struct superblock_smack *sp = smack_superblock(sb);
 	struct inode_smack *isp;
 	struct smack_known *skp;
 	struct smack_mnt_opts *opts = mnt_opts;
@@ -871,7 +854,7 @@ static int smack_set_mnt_opts(struct super_block *sb,
  */
 static int smack_sb_statfs(struct dentry *dentry)
 {
-	struct superblock_smack *sbp = dentry->d_sb->s_security;
+	struct superblock_smack *sbp = smack_superblock(dentry->d_sb);
 	int rc;
 	struct smk_audit_info ad;
 
@@ -905,7 +888,7 @@ static int smack_bprm_creds_for_exec(struct linux_binprm *bprm)
 	if (isp->smk_task == NULL || isp->smk_task == bsp->smk_task)
 		return 0;
 
-	sbsp = inode->i_sb->s_security;
+	sbsp = smack_superblock(inode->i_sb);
 	if ((sbsp->smk_flags & SMK_SB_UNTRUSTED) &&
 	    isp->smk_task != sbsp->smk_root)
 		return 0;
@@ -1157,7 +1140,7 @@ static int smack_inode_rename(struct inode *old_inode,
  */
 static int smack_inode_permission(struct inode *inode, int mask)
 {
-	struct superblock_smack *sbsp = inode->i_sb->s_security;
+	struct superblock_smack *sbsp = smack_superblock(inode->i_sb);
 	struct smk_audit_info ad;
 	int no_block = mask & MAY_NOT_BLOCK;
 	int rc;
@@ -1400,7 +1383,7 @@ static int smack_inode_removexattr(struct user_namespace *mnt_userns,
 	 */
 	if (strcmp(name, XATTR_NAME_SMACK) == 0) {
 		struct super_block *sbp = dentry->d_sb;
-		struct superblock_smack *sbsp = sbp->s_security;
+		struct superblock_smack *sbsp = smack_superblock(sbp);
 
 		isp->smk_inode = sbsp->smk_default;
 	} else if (strcmp(name, XATTR_NAME_SMACKEXEC) == 0)
@@ -1670,7 +1653,7 @@ static int smack_mmap_file(struct file *file,
 	isp = smack_inode(file_inode(file));
 	if (isp->smk_mmap == NULL)
 		return 0;
-	sbsp = file_inode(file)->i_sb->s_security;
+	sbsp = smack_superblock(file_inode(file)->i_sb);
 	if (sbsp->smk_flags & SMK_SB_UNTRUSTED &&
 	    isp->smk_mmap != sbsp->smk_root)
 		return -EACCES;
@@ -3285,7 +3268,7 @@ static void smack_d_instantiate(struct dentry *opt_dentry, struct inode *inode)
 		return;
 
 	sbp = inode->i_sb;
-	sbsp = sbp->s_security;
+	sbsp = smack_superblock(sbp);
 	/*
 	 * We're going to use the superblock default label
 	 * if there's no label on the file.
@@ -4700,6 +4683,7 @@ struct lsm_blob_sizes smack_blob_sizes __lsm_ro_after_init = {
 	.lbs_inode = sizeof(struct inode_smack),
 	.lbs_ipc = sizeof(struct smack_known *),
 	.lbs_msg_msg = sizeof(struct smack_known *),
+	.lbs_superblock = sizeof(struct superblock_smack),
 };
 
 static struct security_hook_list smack_hooks[] __lsm_ro_after_init = {
@@ -4711,7 +4695,6 @@ static struct security_hook_list smack_hooks[] __lsm_ro_after_init = {
 	LSM_HOOK_INIT(fs_context_parse_param, smack_fs_context_parse_param),
 
 	LSM_HOOK_INIT(sb_alloc_security, smack_sb_alloc_security),
-	LSM_HOOK_INIT(sb_free_security, smack_sb_free_security),
 	LSM_HOOK_INIT(sb_free_mnt_opts, smack_free_mnt_opts),
 	LSM_HOOK_INIT(sb_eat_lsm_opts, smack_sb_eat_lsm_opts),
 	LSM_HOOK_INIT(sb_statfs, smack_sb_statfs),
-- 
cgit v1.2.3


From 83e804f0bfee2247b1c0aa64845c81a38562da7a Mon Sep 17 00:00:00 2001
From: Mickaël Salaün <mic@linux.microsoft.com>
Date: Thu, 22 Apr 2021 17:41:16 +0200
Subject: fs,security: Add sb_delete hook
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The sb_delete security hook is called when shutting down a superblock,
which may be useful to release kernel objects tied to the superblock's
lifetime (e.g. inodes).

This new hook is needed by Landlock to release (ephemerally) tagged
struct inodes.  This comes from the unprivileged nature of Landlock
described in the next commit.

Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: James Morris <jmorris@namei.org>
Signed-off-by: Mickaël Salaün <mic@linux.microsoft.com>
Reviewed-by: Jann Horn <jannh@google.com>
Acked-by: Serge Hallyn <serge@hallyn.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20210422154123.13086-7-mic@digikod.net
Signed-off-by: James Morris <jamorris@linux.microsoft.com>
---
 fs/super.c                    | 1 +
 include/linux/lsm_hook_defs.h | 1 +
 include/linux/lsm_hooks.h     | 3 +++
 include/linux/security.h      | 4 ++++
 security/security.c           | 5 +++++
 5 files changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/fs/super.c b/fs/super.c
index 8c1baca35c16..11b7e7213fd1 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -454,6 +454,7 @@ void generic_shutdown_super(struct super_block *sb)
 		evict_inodes(sb);
 		/* only nonzero refcount inodes can have marks */
 		fsnotify_sb_delete(sb);
+		security_sb_delete(sb);
 
 		if (sb->s_dio_done_wq) {
 			destroy_workqueue(sb->s_dio_done_wq);
diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 477a597db013..e8adadbf9581 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -59,6 +59,7 @@ LSM_HOOK(int, 0, fs_context_dup, struct fs_context *fc,
 LSM_HOOK(int, -ENOPARAM, fs_context_parse_param, struct fs_context *fc,
 	 struct fs_parameter *param)
 LSM_HOOK(int, 0, sb_alloc_security, struct super_block *sb)
+LSM_HOOK(void, LSM_RET_VOID, sb_delete, struct super_block *sb)
 LSM_HOOK(void, LSM_RET_VOID, sb_free_security, struct super_block *sb)
 LSM_HOOK(void, LSM_RET_VOID, sb_free_mnt_opts, void *mnt_opts)
 LSM_HOOK(int, 0, sb_eat_lsm_opts, char *orig, void **mnt_opts)
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 75715998a95f..cc2eaaaec0e4 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -108,6 +108,9 @@
  *	allocated.
  *	@sb contains the super_block structure to be modified.
  *	Return 0 if operation was successful.
+ * @sb_delete:
+ *	Release objects tied to a superblock (e.g. inodes).
+ *	@sb contains the super_block structure being released.
  * @sb_free_security:
  *	Deallocate and clear the sb->s_security field.
  *	@sb contains the super_block structure to be modified.
diff --git a/include/linux/security.h b/include/linux/security.h
index 8aeebd6646dc..90298baa4551 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -291,6 +291,7 @@ void security_bprm_committed_creds(struct linux_binprm *bprm);
 int security_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc);
 int security_fs_context_parse_param(struct fs_context *fc, struct fs_parameter *param);
 int security_sb_alloc(struct super_block *sb);
+void security_sb_delete(struct super_block *sb);
 void security_sb_free(struct super_block *sb);
 void security_free_mnt_opts(void **mnt_opts);
 int security_sb_eat_lsm_opts(char *options, void **mnt_opts);
@@ -631,6 +632,9 @@ static inline int security_sb_alloc(struct super_block *sb)
 	return 0;
 }
 
+static inline void security_sb_delete(struct super_block *sb)
+{ }
+
 static inline void security_sb_free(struct super_block *sb)
 { }
 
diff --git a/security/security.c b/security/security.c
index e9c29480eb18..bb666f992497 100644
--- a/security/security.c
+++ b/security/security.c
@@ -900,6 +900,11 @@ int security_sb_alloc(struct super_block *sb)
 	return rc;
 }
 
+void security_sb_delete(struct super_block *sb)
+{
+	call_void_hook(sb_delete, sb);
+}
+
 void security_sb_free(struct super_block *sb)
 {
 	call_void_hook(sb_free_security, sb);
-- 
cgit v1.2.3


From 265885daf3e5082eb9f6e2a23bdbf9ba4456a21b Mon Sep 17 00:00:00 2001
From: Mickaël Salaün <mic@linux.microsoft.com>
Date: Thu, 22 Apr 2021 17:41:18 +0200
Subject: landlock: Add syscall implementations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These 3 system calls are designed to be used by unprivileged processes
to sandbox themselves:
* landlock_create_ruleset(2): Creates a ruleset and returns its file
  descriptor.
* landlock_add_rule(2): Adds a rule (e.g. file hierarchy access) to a
  ruleset, identified by the dedicated file descriptor.
* landlock_restrict_self(2): Enforces a ruleset on the calling thread
  and its future children (similar to seccomp).  This syscall has the
  same usage restrictions as seccomp(2): the caller must have the
  no_new_privs attribute set or have CAP_SYS_ADMIN in the current user
  namespace.

All these syscalls have a "flags" argument (not currently used) to
enable extensibility.

Here are the motivations for these new syscalls:
* A sandboxed process may not have access to file systems, including
  /dev, /sys or /proc, but it should still be able to add more
  restrictions to itself.
* Neither prctl(2) nor seccomp(2) (which was used in a previous version)
  fit well with the current definition of a Landlock security policy.

All passed structs (attributes) are checked at build time to ensure that
they don't contain holes and that they are aligned the same way for each
architecture.

See the user and kernel documentation for more details (provided by a
following commit):
* Documentation/userspace-api/landlock.rst
* Documentation/security/landlock.rst

Cc: Arnd Bergmann <arnd@arndb.de>
Cc: James Morris <jmorris@namei.org>
Cc: Jann Horn <jannh@google.com>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: Mickaël Salaün <mic@linux.microsoft.com>
Acked-by: Serge Hallyn <serge@hallyn.com>
Link: https://lore.kernel.org/r/20210422154123.13086-9-mic@digikod.net
Signed-off-by: James Morris <jamorris@linux.microsoft.com>
---
 include/linux/syscalls.h      |   7 +
 include/uapi/linux/landlock.h |  53 +++++
 kernel/sys_ni.c               |   5 +
 security/landlock/Makefile    |   2 +-
 security/landlock/syscalls.c  | 442 ++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 508 insertions(+), 1 deletion(-)
 create mode 100644 security/landlock/syscalls.c

(limited to 'include/linux')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 2839dc9a7c01..fa3971012e1c 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -69,6 +69,8 @@ struct io_uring_params;
 struct clone_args;
 struct open_how;
 struct mount_attr;
+struct landlock_ruleset_attr;
+enum landlock_rule_type;
 
 #include <linux/types.h>
 #include <linux/aio_abi.h>
@@ -1041,6 +1043,11 @@ asmlinkage long sys_pidfd_send_signal(int pidfd, int sig,
 				       siginfo_t __user *info,
 				       unsigned int flags);
 asmlinkage long sys_pidfd_getfd(int pidfd, int fd, unsigned int flags);
+asmlinkage long sys_landlock_create_ruleset(const struct landlock_ruleset_attr __user *attr,
+		size_t size, __u32 flags);
+asmlinkage long sys_landlock_add_rule(int ruleset_fd, enum landlock_rule_type rule_type,
+		const void __user *rule_attr, __u32 flags);
+asmlinkage long sys_landlock_restrict_self(int ruleset_fd, __u32 flags);
 
 /*
  * Architecture-specific system calls
diff --git a/include/uapi/linux/landlock.h b/include/uapi/linux/landlock.h
index b1a81b5a8b86..ba946a1e40b2 100644
--- a/include/uapi/linux/landlock.h
+++ b/include/uapi/linux/landlock.h
@@ -9,6 +9,59 @@
 #ifndef _UAPI_LINUX_LANDLOCK_H
 #define _UAPI_LINUX_LANDLOCK_H
 
+#include <linux/types.h>
+
+/**
+ * struct landlock_ruleset_attr - Ruleset definition
+ *
+ * Argument of sys_landlock_create_ruleset().  This structure can grow in
+ * future versions.
+ */
+struct landlock_ruleset_attr {
+	/**
+	 * @handled_access_fs: Bitmask of actions (cf. `Filesystem flags`_)
+	 * that is handled by this ruleset and should then be forbidden if no
+	 * rule explicitly allow them.  This is needed for backward
+	 * compatibility reasons.
+	 */
+	__u64 handled_access_fs;
+};
+
+/**
+ * enum landlock_rule_type - Landlock rule type
+ *
+ * Argument of sys_landlock_add_rule().
+ */
+enum landlock_rule_type {
+	/**
+	 * @LANDLOCK_RULE_PATH_BENEATH: Type of a &struct
+	 * landlock_path_beneath_attr .
+	 */
+	LANDLOCK_RULE_PATH_BENEATH = 1,
+};
+
+/**
+ * struct landlock_path_beneath_attr - Path hierarchy definition
+ *
+ * Argument of sys_landlock_add_rule().
+ */
+struct landlock_path_beneath_attr {
+	/**
+	 * @allowed_access: Bitmask of allowed actions for this file hierarchy
+	 * (cf. `Filesystem flags`_).
+	 */
+	__u64 allowed_access;
+	/**
+	 * @parent_fd: File descriptor, open with ``O_PATH``, which identifies
+	 * the parent directory of a file hierarchy, or just a file.
+	 */
+	__s32 parent_fd;
+	/*
+	 * This struct is packed to avoid trailing reserved members.
+	 * Cf. security/landlock/syscalls.c:build_check_abi()
+	 */
+} __attribute__((packed));
+
 /**
  * DOC: fs_access
  *
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 19aa806890d5..cce430cf2ff2 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -266,6 +266,11 @@ COND_SYSCALL(request_key);
 COND_SYSCALL(keyctl);
 COND_SYSCALL_COMPAT(keyctl);
 
+/* security/landlock/syscalls.c */
+COND_SYSCALL(landlock_create_ruleset);
+COND_SYSCALL(landlock_add_rule);
+COND_SYSCALL(landlock_restrict_self);
+
 /* arch/example/kernel/sys_example.c */
 
 /* mm/fadvise.c */
diff --git a/security/landlock/Makefile b/security/landlock/Makefile
index 92e3d80ab8ed..7bbd2f413b3e 100644
--- a/security/landlock/Makefile
+++ b/security/landlock/Makefile
@@ -1,4 +1,4 @@
 obj-$(CONFIG_SECURITY_LANDLOCK) := landlock.o
 
-landlock-y := setup.o object.o ruleset.o \
+landlock-y := setup.o syscalls.o object.o ruleset.o \
 	cred.o ptrace.o fs.o
diff --git a/security/landlock/syscalls.c b/security/landlock/syscalls.c
new file mode 100644
index 000000000000..93620ad7593b
--- /dev/null
+++ b/security/landlock/syscalls.c
@@ -0,0 +1,442 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Landlock LSM - System call implementations and user space interfaces
+ *
+ * Copyright © 2016-2020 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2018-2020 ANSSI
+ */
+
+#include <asm/current.h>
+#include <linux/anon_inodes.h>
+#include <linux/build_bug.h>
+#include <linux/capability.h>
+#include <linux/compiler_types.h>
+#include <linux/dcache.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/limits.h>
+#include <linux/mount.h>
+#include <linux/path.h>
+#include <linux/sched.h>
+#include <linux/security.h>
+#include <linux/stddef.h>
+#include <linux/syscalls.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <uapi/linux/landlock.h>
+
+#include "cred.h"
+#include "fs.h"
+#include "limits.h"
+#include "ruleset.h"
+#include "setup.h"
+
+/**
+ * copy_min_struct_from_user - Safe future-proof argument copying
+ *
+ * Extend copy_struct_from_user() to check for consistent user buffer.
+ *
+ * @dst: Kernel space pointer or NULL.
+ * @ksize: Actual size of the data pointed to by @dst.
+ * @ksize_min: Minimal required size to be copied.
+ * @src: User space pointer or NULL.
+ * @usize: (Alleged) size of the data pointed to by @src.
+ */
+static __always_inline int copy_min_struct_from_user(void *const dst,
+		const size_t ksize, const size_t ksize_min,
+		const void __user *const src, const size_t usize)
+{
+	/* Checks buffer inconsistencies. */
+	BUILD_BUG_ON(!dst);
+	if (!src)
+		return -EFAULT;
+
+	/* Checks size ranges. */
+	BUILD_BUG_ON(ksize <= 0);
+	BUILD_BUG_ON(ksize < ksize_min);
+	if (usize < ksize_min)
+		return -EINVAL;
+	if (usize > PAGE_SIZE)
+		return -E2BIG;
+
+	/* Copies user buffer and fills with zeros. */
+	return copy_struct_from_user(dst, ksize, src, usize);
+}
+
+/*
+ * This function only contains arithmetic operations with constants, leading to
+ * BUILD_BUG_ON().  The related code is evaluated and checked at build time,
+ * but it is then ignored thanks to compiler optimizations.
+ */
+static void build_check_abi(void)
+{
+	struct landlock_ruleset_attr ruleset_attr;
+	struct landlock_path_beneath_attr path_beneath_attr;
+	size_t ruleset_size, path_beneath_size;
+
+	/*
+	 * For each user space ABI structures, first checks that there is no
+	 * hole in them, then checks that all architectures have the same
+	 * struct size.
+	 */
+	ruleset_size = sizeof(ruleset_attr.handled_access_fs);
+	BUILD_BUG_ON(sizeof(ruleset_attr) != ruleset_size);
+	BUILD_BUG_ON(sizeof(ruleset_attr) != 8);
+
+	path_beneath_size = sizeof(path_beneath_attr.allowed_access);
+	path_beneath_size += sizeof(path_beneath_attr.parent_fd);
+	BUILD_BUG_ON(sizeof(path_beneath_attr) != path_beneath_size);
+	BUILD_BUG_ON(sizeof(path_beneath_attr) != 12);
+}
+
+/* Ruleset handling */
+
+static int fop_ruleset_release(struct inode *const inode,
+		struct file *const filp)
+{
+	struct landlock_ruleset *ruleset = filp->private_data;
+
+	landlock_put_ruleset(ruleset);
+	return 0;
+}
+
+static ssize_t fop_dummy_read(struct file *const filp, char __user *const buf,
+		const size_t size, loff_t *const ppos)
+{
+	/* Dummy handler to enable FMODE_CAN_READ. */
+	return -EINVAL;
+}
+
+static ssize_t fop_dummy_write(struct file *const filp,
+		const char __user *const buf, const size_t size,
+		loff_t *const ppos)
+{
+	/* Dummy handler to enable FMODE_CAN_WRITE. */
+	return -EINVAL;
+}
+
+/*
+ * A ruleset file descriptor allows building a ruleset by adding (i.e.
+ * writing) rule after rule, without relying on the task's context.  This
+ * reentrant design is also used in a read way to enforce the ruleset on the
+ * current task.
+ */
+static const struct file_operations ruleset_fops = {
+	.release = fop_ruleset_release,
+	.read = fop_dummy_read,
+	.write = fop_dummy_write,
+};
+
+/**
+ * sys_landlock_create_ruleset - Create a new ruleset
+ *
+ * @attr: Pointer to a &struct landlock_ruleset_attr identifying the scope of
+ *        the new ruleset.
+ * @size: Size of the pointed &struct landlock_ruleset_attr (needed for
+ *        backward and forward compatibility).
+ * @flags: Must be 0.
+ *
+ * This system call creates a new Landlock ruleset and returns the related
+ * file descriptor on success.
+ *
+ * Possible returned errors are:
+ *
+ * - EOPNOTSUPP: Landlock is supported by the kernel but disabled at boot time;
+ * - EINVAL: @flags is not 0, or unknown access, or too small @size;
+ * - E2BIG or EFAULT: @attr or @size inconsistencies;
+ * - ENOMSG: empty &landlock_ruleset_attr.handled_access_fs.
+ */
+SYSCALL_DEFINE3(landlock_create_ruleset,
+		const struct landlock_ruleset_attr __user *const, attr,
+		const size_t, size, const __u32, flags)
+{
+	struct landlock_ruleset_attr ruleset_attr;
+	struct landlock_ruleset *ruleset;
+	int err, ruleset_fd;
+
+	/* Build-time checks. */
+	build_check_abi();
+
+	if (!landlock_initialized)
+		return -EOPNOTSUPP;
+
+	/* No flag for now. */
+	if (flags)
+		return -EINVAL;
+
+	/* Copies raw user space buffer. */
+	err = copy_min_struct_from_user(&ruleset_attr, sizeof(ruleset_attr),
+			offsetofend(typeof(ruleset_attr), handled_access_fs),
+			attr, size);
+	if (err)
+		return err;
+
+	/* Checks content (and 32-bits cast). */
+	if ((ruleset_attr.handled_access_fs | LANDLOCK_MASK_ACCESS_FS) !=
+			LANDLOCK_MASK_ACCESS_FS)
+		return -EINVAL;
+
+	/* Checks arguments and transforms to kernel struct. */
+	ruleset = landlock_create_ruleset(ruleset_attr.handled_access_fs);
+	if (IS_ERR(ruleset))
+		return PTR_ERR(ruleset);
+
+	/* Creates anonymous FD referring to the ruleset. */
+	ruleset_fd = anon_inode_getfd("landlock-ruleset", &ruleset_fops,
+			ruleset, O_RDWR | O_CLOEXEC);
+	if (ruleset_fd < 0)
+		landlock_put_ruleset(ruleset);
+	return ruleset_fd;
+}
+
+/*
+ * Returns an owned ruleset from a FD. It is thus needed to call
+ * landlock_put_ruleset() on the return value.
+ */
+static struct landlock_ruleset *get_ruleset_from_fd(const int fd,
+		const fmode_t mode)
+{
+	struct fd ruleset_f;
+	struct landlock_ruleset *ruleset;
+
+	ruleset_f = fdget(fd);
+	if (!ruleset_f.file)
+		return ERR_PTR(-EBADF);
+
+	/* Checks FD type and access right. */
+	if (ruleset_f.file->f_op != &ruleset_fops) {
+		ruleset = ERR_PTR(-EBADFD);
+		goto out_fdput;
+	}
+	if (!(ruleset_f.file->f_mode & mode)) {
+		ruleset = ERR_PTR(-EPERM);
+		goto out_fdput;
+	}
+	ruleset = ruleset_f.file->private_data;
+	if (WARN_ON_ONCE(ruleset->num_layers != 1)) {
+		ruleset = ERR_PTR(-EINVAL);
+		goto out_fdput;
+	}
+	landlock_get_ruleset(ruleset);
+
+out_fdput:
+	fdput(ruleset_f);
+	return ruleset;
+}
+
+/* Path handling */
+
+/*
+ * @path: Must call put_path(@path) after the call if it succeeded.
+ */
+static int get_path_from_fd(const s32 fd, struct path *const path)
+{
+	struct fd f;
+	int err = 0;
+
+	BUILD_BUG_ON(!__same_type(fd,
+		((struct landlock_path_beneath_attr *)NULL)->parent_fd));
+
+	/* Handles O_PATH. */
+	f = fdget_raw(fd);
+	if (!f.file)
+		return -EBADF;
+	/*
+	 * Forbids ruleset FDs, internal filesystems (e.g. nsfs), including
+	 * pseudo filesystems that will never be mountable (e.g. sockfs,
+	 * pipefs).
+	 */
+	if ((f.file->f_op == &ruleset_fops) ||
+			(f.file->f_path.mnt->mnt_flags & MNT_INTERNAL) ||
+			(f.file->f_path.dentry->d_sb->s_flags & SB_NOUSER) ||
+			d_is_negative(f.file->f_path.dentry) ||
+			IS_PRIVATE(d_backing_inode(f.file->f_path.dentry))) {
+		err = -EBADFD;
+		goto out_fdput;
+	}
+	*path = f.file->f_path;
+	path_get(path);
+
+out_fdput:
+	fdput(f);
+	return err;
+}
+
+/**
+ * sys_landlock_add_rule - Add a new rule to a ruleset
+ *
+ * @ruleset_fd: File descriptor tied to the ruleset that should be extended
+ *		with the new rule.
+ * @rule_type: Identify the structure type pointed to by @rule_attr (only
+ *             LANDLOCK_RULE_PATH_BENEATH for now).
+ * @rule_attr: Pointer to a rule (only of type &struct
+ *             landlock_path_beneath_attr for now).
+ * @flags: Must be 0.
+ *
+ * This system call defines a new rule and adds it to an existing
+ * ruleset.
+ *
+ * Possible returned errors are:
+ *
+ * - EOPNOTSUPP: Landlock is supported by the kernel but disabled at boot time;
+ * - EINVAL: @flags is not 0, or inconsistent access in the rule (i.e.
+ *   &landlock_path_beneath_attr.allowed_access is not a subset of the rule's
+ *   accesses);
+ * - ENOMSG: Empty accesses (e.g. &landlock_path_beneath_attr.allowed_access);
+ * - EBADF: @ruleset_fd is not a file descriptor for the current thread, or a
+ *   member of @rule_attr is not a file descriptor as expected;
+ * - EBADFD: @ruleset_fd is not a ruleset file descriptor, or a member of
+ *   @rule_attr is not the expected file descriptor type (e.g. file open
+ *   without O_PATH);
+ * - EPERM: @ruleset_fd has no write access to the underlying ruleset;
+ * - EFAULT: @rule_attr inconsistency.
+ */
+SYSCALL_DEFINE4(landlock_add_rule,
+		const int, ruleset_fd, const enum landlock_rule_type, rule_type,
+		const void __user *const, rule_attr, const __u32, flags)
+{
+	struct landlock_path_beneath_attr path_beneath_attr;
+	struct path path;
+	struct landlock_ruleset *ruleset;
+	int res, err;
+
+	if (!landlock_initialized)
+		return -EOPNOTSUPP;
+
+	/* No flag for now. */
+	if (flags)
+		return -EINVAL;
+
+	if (rule_type != LANDLOCK_RULE_PATH_BENEATH)
+		return -EINVAL;
+
+	/* Copies raw user space buffer, only one type for now. */
+	res = copy_from_user(&path_beneath_attr, rule_attr,
+			sizeof(path_beneath_attr));
+	if (res)
+		return -EFAULT;
+
+	/* Gets and checks the ruleset. */
+	ruleset = get_ruleset_from_fd(ruleset_fd, FMODE_CAN_WRITE);
+	if (IS_ERR(ruleset))
+		return PTR_ERR(ruleset);
+
+	/*
+	 * Informs about useless rule: empty allowed_access (i.e. deny rules)
+	 * are ignored in path walks.
+	 */
+	if (!path_beneath_attr.allowed_access) {
+		err = -ENOMSG;
+		goto out_put_ruleset;
+	}
+	/*
+	 * Checks that allowed_access matches the @ruleset constraints
+	 * (ruleset->fs_access_masks[0] is automatically upgraded to 64-bits).
+	 */
+	if ((path_beneath_attr.allowed_access | ruleset->fs_access_masks[0]) !=
+			ruleset->fs_access_masks[0]) {
+		err = -EINVAL;
+		goto out_put_ruleset;
+	}
+
+	/* Gets and checks the new rule. */
+	err = get_path_from_fd(path_beneath_attr.parent_fd, &path);
+	if (err)
+		goto out_put_ruleset;
+
+	/* Imports the new rule. */
+	err = landlock_append_fs_rule(ruleset, &path,
+			path_beneath_attr.allowed_access);
+	path_put(&path);
+
+out_put_ruleset:
+	landlock_put_ruleset(ruleset);
+	return err;
+}
+
+/* Enforcement */
+
+/**
+ * sys_landlock_restrict_self - Enforce a ruleset on the calling thread
+ *
+ * @ruleset_fd: File descriptor tied to the ruleset to merge with the target.
+ * @flags: Must be 0.
+ *
+ * This system call enforces a Landlock ruleset on the current
+ * thread.  Enforcing a ruleset requires that the task has CAP_SYS_ADMIN in its
+ * namespace or is running with no_new_privs.  This avoids scenarios where
+ * unprivileged tasks can affect the behavior of privileged children.
+ *
+ * Possible returned errors are:
+ *
+ * - EOPNOTSUPP: Landlock is supported by the kernel but disabled at boot time;
+ * - EINVAL: @flags is not 0.
+ * - EBADF: @ruleset_fd is not a file descriptor for the current thread;
+ * - EBADFD: @ruleset_fd is not a ruleset file descriptor;
+ * - EPERM: @ruleset_fd has no read access to the underlying ruleset, or the
+ *   current thread is not running with no_new_privs, or it doesn't have
+ *   CAP_SYS_ADMIN in its namespace.
+ * - E2BIG: The maximum number of stacked rulesets is reached for the current
+ *   thread.
+ */
+SYSCALL_DEFINE2(landlock_restrict_self,
+		const int, ruleset_fd, const __u32, flags)
+{
+	struct landlock_ruleset *new_dom, *ruleset;
+	struct cred *new_cred;
+	struct landlock_cred_security *new_llcred;
+	int err;
+
+	if (!landlock_initialized)
+		return -EOPNOTSUPP;
+
+	/* No flag for now. */
+	if (flags)
+		return -EINVAL;
+
+	/*
+	 * Similar checks as for seccomp(2), except that an -EPERM may be
+	 * returned.
+	 */
+	if (!task_no_new_privs(current) &&
+			!ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
+		return -EPERM;
+
+	/* Gets and checks the ruleset. */
+	ruleset = get_ruleset_from_fd(ruleset_fd, FMODE_CAN_READ);
+	if (IS_ERR(ruleset))
+		return PTR_ERR(ruleset);
+
+	/* Prepares new credentials. */
+	new_cred = prepare_creds();
+	if (!new_cred) {
+		err = -ENOMEM;
+		goto out_put_ruleset;
+	}
+	new_llcred = landlock_cred(new_cred);
+
+	/*
+	 * There is no possible race condition while copying and manipulating
+	 * the current credentials because they are dedicated per thread.
+	 */
+	new_dom = landlock_merge_ruleset(new_llcred->domain, ruleset);
+	if (IS_ERR(new_dom)) {
+		err = PTR_ERR(new_dom);
+		goto out_put_creds;
+	}
+
+	/* Replaces the old (prepared) domain. */
+	landlock_put_ruleset(new_llcred->domain);
+	new_llcred->domain = new_dom;
+
+	landlock_put_ruleset(ruleset);
+	return commit_creds(new_cred);
+
+out_put_creds:
+	abort_creds(new_cred);
+
+out_put_ruleset:
+	landlock_put_ruleset(ruleset);
+	return err;
+}
-- 
cgit v1.2.3


From 96874c619c200bc704ae2d8e34a3746350922135 Mon Sep 17 00:00:00 2001
From: Mohammad Athari Bin Ismail <mohammad.athari.ismail@intel.com>
Date: Thu, 22 Apr 2021 15:55:00 +0800
Subject: net: stmmac: Add HW descriptor prefetch setting for DWMAC Core 5.20
 onwards

DWMAC Core 5.20 onwards supports HW descriptor prefetching.
Additionally, it also depends on platform specific RTL configuration.
This capability could be enabled by setting DMA_Mode bit-19 (DCHE).

So, to enable this capability, the platform must set plat->dma_cfg->dche = true
and the DWMAC core version must be 5.20 onwards. Otherwise, this capability
won't be configured.

Signed-off-by: Mohammad Athari Bin Ismail <mohammad.athari.ismail@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/common.h      |  1 +
 drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c  | 10 ++++++++--
 drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h  |  1 +
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c |  5 +++++
 include/linux/stmmac.h                            |  1 +
 5 files changed, 16 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h
index c54a56b732b3..619e3c0760d6 100644
--- a/drivers/net/ethernet/stmicro/stmmac/common.h
+++ b/drivers/net/ethernet/stmicro/stmmac/common.h
@@ -33,6 +33,7 @@
 #define DWMAC_CORE_4_10		0x41
 #define DWMAC_CORE_5_00		0x50
 #define DWMAC_CORE_5_10		0x51
+#define DWMAC_CORE_5_20		0x52
 #define DWXGMAC_CORE_2_10	0x21
 #define DWXLGMAC_CORE_2_00	0x20
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
index cb17f6c35e54..a602d16b9e53 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
@@ -162,12 +162,18 @@ static void dwmac4_dma_init(void __iomem *ioaddr,
 
 	writel(value, ioaddr + DMA_SYS_BUS_MODE);
 
+	value = readl(ioaddr + DMA_BUS_MODE);
+
 	if (dma_cfg->multi_msi_en) {
-		value = readl(ioaddr + DMA_BUS_MODE);
 		value &= ~DMA_BUS_MODE_INTM_MASK;
 		value |= (DMA_BUS_MODE_INTM_MODE1 << DMA_BUS_MODE_INTM_SHIFT);
-		writel(value, ioaddr + DMA_BUS_MODE);
 	}
+
+	if (dma_cfg->dche)
+		value |= DMA_BUS_MODE_DCHE;
+
+	writel(value, ioaddr + DMA_BUS_MODE);
+
 }
 
 static void _dwmac4_dump_dma_regs(void __iomem *ioaddr, u32 channel,
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h
index 05481eb13ba6..9321879b599c 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h
@@ -25,6 +25,7 @@
 #define DMA_TBS_CTRL			0x00001050
 
 /* DMA Bus Mode bitmap */
+#define DMA_BUS_MODE_DCHE		BIT(19)
 #define DMA_BUS_MODE_INTM_MASK		GENMASK(17, 16)
 #define DMA_BUS_MODE_INTM_SHIFT		16
 #define DMA_BUS_MODE_INTM_MODE1		0x1
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index d1ca07c846e6..372090e8ee6f 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -6849,6 +6849,11 @@ int stmmac_dvr_probe(struct device *device,
 	if (ret)
 		goto error_hw_init;
 
+	/* Only DWMAC core version 5.20 onwards supports HW descriptor prefetch.
+	 */
+	if (priv->synopsys_id < DWMAC_CORE_5_20)
+		priv->plat->dma_cfg->dche = false;
+
 	stmmac_check_ether_addr(priv);
 
 	ndev->netdev_ops = &stmmac_netdev_ops;
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 97edb31d6310..0db36360ef21 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -97,6 +97,7 @@ struct stmmac_dma_cfg {
 	bool aal;
 	bool eame;
 	bool multi_msi_en;
+	bool dche;
 };
 
 #define AXI_BLEN	7
-- 
cgit v1.2.3


From 3ddb3fd8cdb0a6c11b7c8d91ba42d84c4ea3cc43 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Thu, 22 Apr 2021 21:18:22 +0200
Subject: signal, perf: Fix siginfo_t by avoiding u64 on 32-bit architectures

The alignment of a structure is that of its largest member. On
architectures like 32-bit Arm (but not e.g. 32-bit x86) 64-bit integers
will require 64-bit alignment rather than the natural word size.

This means that there is no portable way to add 64-bit integers to
siginfo_t on 32-bit architectures without breaking the ABI, because
siginfo_t does not yet (and therefore likely never will) contain 64-bit
fields on 32-bit architectures. Adding a 64-bit integer could change the
alignment of the union after the 3 initial int si_signo, si_errno,
si_code, thus introducing 4 bytes of padding shifting the entire union,
which would break the ABI.

One alternative would be to use the __packed attribute, however, it is
non-standard C. Given siginfo_t has definitions outside the Linux kernel
in various standard libraries that can be compiled with any number of
different compilers (not just those we rely on), using non-standard
attributes on siginfo_t should be avoided to ensure portability.

In the case of the si_perf field, word size is sufficient since there is
no exact requirement on size, given the data it contains is user-defined
via perf_event_attr::sig_data. On 32-bit architectures, any excess bits
of perf_event_attr::sig_data will therefore be truncated when copying
into si_perf.

Since si_perf is intended to disambiguate events (e.g. encoding relevant
information if there are more events of the same type), 32 bits should
provide enough entropy to do so on 32-bit architectures.

For 64-bit architectures, no change is intended.

Fixes: fb6cc127e0b6 ("signal: Introduce TRAP_PERF si_code and si_perf to siginfo")
Reported-by: Marek Szyprowski <m.szyprowski@samsung.com>
Reported-by: Jon Hunter <jonathanh@nvidia.com>
Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
Tested-by: Jon Hunter <jonathanh@nvidia.com>
Link: https://lkml.kernel.org/r/20210422191823.79012-1-elver@google.com
---
 include/linux/compat.h                                | 2 +-
 include/uapi/asm-generic/siginfo.h                    | 2 +-
 tools/testing/selftests/perf_events/sigtrap_threads.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compat.h b/include/linux/compat.h
index c8821d966812..f0d2dd35d408 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -237,7 +237,7 @@ typedef struct compat_siginfo {
 					u32 _pkey;
 				} _addr_pkey;
 				/* used when si_code=TRAP_PERF */
-				compat_u64 _perf;
+				compat_ulong_t _perf;
 			};
 		} _sigfault;
 
diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h
index d0bb9125c853..03d6f6d2c1fe 100644
--- a/include/uapi/asm-generic/siginfo.h
+++ b/include/uapi/asm-generic/siginfo.h
@@ -92,7 +92,7 @@ union __sifields {
 				__u32 _pkey;
 			} _addr_pkey;
 			/* used when si_code=TRAP_PERF */
-			__u64 _perf;
+			unsigned long _perf;
 		};
 	} _sigfault;
 
diff --git a/tools/testing/selftests/perf_events/sigtrap_threads.c b/tools/testing/selftests/perf_events/sigtrap_threads.c
index 9c0fd442da60..78ddf5e11625 100644
--- a/tools/testing/selftests/perf_events/sigtrap_threads.c
+++ b/tools/testing/selftests/perf_events/sigtrap_threads.c
@@ -44,7 +44,7 @@ static struct {
 } ctx;
 
 /* Unique value to check si_perf is correctly set from perf_event_attr::sig_data. */
-#define TEST_SIG_DATA(addr) (~(uint64_t)(addr))
+#define TEST_SIG_DATA(addr) (~(unsigned long)(addr))
 
 static struct perf_event_attr make_event_attr(bool enabled, volatile void *addr)
 {
-- 
cgit v1.2.3


From 6dab809bb5b183015e19d558bfa95107de660be0 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 19 Apr 2021 14:24:55 +0300
Subject: mmc: core: Convert mmc_of_parse_voltage() to use device property API

mmc_of_parse() for a few years has been using device property API.
Convert mmc_of_parse_voltage() as well.

At the same time switch users to new API.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20210419112459.25241-2-andriy.shevchenko@linux.intel.com
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/core/host.c            | 46 ++++++++++++++++++++++++++------------
 drivers/mmc/host/mmc_spi.c         |  8 +++----
 drivers/mmc/host/of_mmc_spi.c      |  3 ++-
 drivers/mmc/host/sdhci-esdhc-imx.c |  2 +-
 drivers/mmc/host/sdhci-of-esdhc.c  |  2 +-
 include/linux/mmc/host.h           |  2 +-
 6 files changed, 41 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c
index 324f4cfa92a7..0b0577990ddc 100644
--- a/drivers/mmc/core/host.c
+++ b/drivers/mmc/core/host.c
@@ -416,44 +416,62 @@ EXPORT_SYMBOL(mmc_of_parse);
 
 /**
  * mmc_of_parse_voltage - return mask of supported voltages
- * @np: The device node need to be parsed.
+ * @host: host whose properties should be parsed.
  * @mask: mask of voltages available for MMC/SD/SDIO
  *
- * Parse the "voltage-ranges" DT property, returning zero if it is not
+ * Parse the "voltage-ranges" property, returning zero if it is not
  * found, negative errno if the voltage-range specification is invalid,
  * or one if the voltage-range is specified and successfully parsed.
  */
-int mmc_of_parse_voltage(struct device_node *np, u32 *mask)
+int mmc_of_parse_voltage(struct mmc_host *host, u32 *mask)
 {
-	const u32 *voltage_ranges;
+	const char *prop = "voltage-ranges";
+	struct device *dev = host->parent;
+	u32 *voltage_ranges;
 	int num_ranges, i;
+	int ret;
 
-	voltage_ranges = of_get_property(np, "voltage-ranges", &num_ranges);
-	if (!voltage_ranges) {
-		pr_debug("%pOF: voltage-ranges unspecified\n", np);
+	if (!device_property_present(dev, prop)) {
+		dev_dbg(dev, "%s unspecified\n", prop);
 		return 0;
 	}
-	num_ranges = num_ranges / sizeof(*voltage_ranges) / 2;
+
+	ret = device_property_count_u32(dev, prop);
+	if (ret < 0)
+		return ret;
+
+	num_ranges = ret / 2;
 	if (!num_ranges) {
-		pr_err("%pOF: voltage-ranges empty\n", np);
+		dev_err(dev, "%s empty\n", prop);
 		return -EINVAL;
 	}
 
+	voltage_ranges = kcalloc(2 * num_ranges, sizeof(*voltage_ranges), GFP_KERNEL);
+	if (!voltage_ranges)
+		return -ENOMEM;
+
+	ret = device_property_read_u32_array(dev, prop, voltage_ranges, 2 * num_ranges);
+	if (ret) {
+		kfree(voltage_ranges);
+		return ret;
+	}
+
 	for (i = 0; i < num_ranges; i++) {
 		const int j = i * 2;
 		u32 ocr_mask;
 
-		ocr_mask = mmc_vddrange_to_ocrmask(
-				be32_to_cpu(voltage_ranges[j]),
-				be32_to_cpu(voltage_ranges[j + 1]));
+		ocr_mask = mmc_vddrange_to_ocrmask(voltage_ranges[j + 0],
+						   voltage_ranges[j + 1]);
 		if (!ocr_mask) {
-			pr_err("%pOF: voltage-range #%d is invalid\n",
-				np, i);
+			dev_err(dev, "range #%d in %s is invalid\n", i, prop);
+			kfree(voltage_ranges);
 			return -EINVAL;
 		}
 		*mask |= ocr_mask;
 	}
 
+	kfree(voltage_ranges);
+
 	return 1;
 }
 EXPORT_SYMBOL(mmc_of_parse_voltage);
diff --git a/drivers/mmc/host/mmc_spi.c b/drivers/mmc/host/mmc_spi.c
index 02f4fd26e76a..9776a03a10f5 100644
--- a/drivers/mmc/host/mmc_spi.c
+++ b/drivers/mmc/host/mmc_spi.c
@@ -1397,6 +1397,8 @@ static int mmc_spi_probe(struct spi_device *spi)
 
 	host->ones = ones;
 
+	dev_set_drvdata(&spi->dev, mmc);
+
 	/* Platform data is used to hook up things like card sensing
 	 * and power switching gpios.
 	 */
@@ -1413,8 +1415,6 @@ static int mmc_spi_probe(struct spi_device *spi)
 			host->powerup_msecs = 250;
 	}
 
-	dev_set_drvdata(&spi->dev, mmc);
-
 	/* preallocate dma buffers */
 	host->data = kmalloc(sizeof(*host->data), GFP_KERNEL);
 	if (!host->data)
@@ -1494,8 +1494,8 @@ fail_glue_init:
 fail_dma:
 	kfree(host->data);
 fail_nobuf1:
-	mmc_free_host(mmc);
 	mmc_spi_put_pdata(spi);
+	mmc_free_host(mmc);
 nomem:
 	kfree(ones);
 	return status;
@@ -1518,8 +1518,8 @@ static int mmc_spi_remove(struct spi_device *spi)
 	kfree(host->ones);
 
 	spi->max_speed_hz = mmc->f_max;
-	mmc_free_host(mmc);
 	mmc_spi_put_pdata(spi);
+	mmc_free_host(mmc);
 	return 0;
 }
 
diff --git a/drivers/mmc/host/of_mmc_spi.c b/drivers/mmc/host/of_mmc_spi.c
index 3c4d950a4755..acd96ea399b8 100644
--- a/drivers/mmc/host/of_mmc_spi.c
+++ b/drivers/mmc/host/of_mmc_spi.c
@@ -54,6 +54,7 @@ static void of_mmc_spi_exit(struct device *dev, void *mmc)
 
 struct mmc_spi_platform_data *mmc_spi_get_pdata(struct spi_device *spi)
 {
+	struct mmc_host *mmc = dev_get_drvdata(&spi->dev);
 	struct device *dev = &spi->dev;
 	struct device_node *np = dev->of_node;
 	struct of_mmc_spi *oms;
@@ -65,7 +66,7 @@ struct mmc_spi_platform_data *mmc_spi_get_pdata(struct spi_device *spi)
 	if (!oms)
 		return NULL;
 
-	if (mmc_of_parse_voltage(np, &oms->pdata.ocr_mask) <= 0)
+	if (mmc_of_parse_voltage(mmc, &oms->pdata.ocr_mask) <= 0)
 		goto err_ocr;
 
 	oms->detect_irq = irq_of_parse_and_map(np, 0);
diff --git a/drivers/mmc/host/sdhci-esdhc-imx.c b/drivers/mmc/host/sdhci-esdhc-imx.c
index a380244db521..b991cf0e60c5 100644
--- a/drivers/mmc/host/sdhci-esdhc-imx.c
+++ b/drivers/mmc/host/sdhci-esdhc-imx.c
@@ -1485,7 +1485,7 @@ sdhci_esdhc_imx_probe_dt(struct platform_device *pdev,
 	if (of_property_read_u32(np, "fsl,delay-line", &boarddata->delay_line))
 		boarddata->delay_line = 0;
 
-	mmc_of_parse_voltage(np, &host->ocr_mask);
+	mmc_of_parse_voltage(host->mmc, &host->ocr_mask);
 
 	if (esdhc_is_usdhc(imx_data) && !IS_ERR(imx_data->pinctrl)) {
 		imx_data->pins_100mhz = pinctrl_lookup_state(imx_data->pinctrl,
diff --git a/drivers/mmc/host/sdhci-of-esdhc.c b/drivers/mmc/host/sdhci-of-esdhc.c
index ab5ab969f711..a593b1fbd69e 100644
--- a/drivers/mmc/host/sdhci-of-esdhc.c
+++ b/drivers/mmc/host/sdhci-of-esdhc.c
@@ -1489,7 +1489,7 @@ static int sdhci_esdhc_probe(struct platform_device *pdev)
 	if (ret)
 		goto err;
 
-	mmc_of_parse_voltage(np, &host->ocr_mask);
+	mmc_of_parse_voltage(host->mmc, &host->ocr_mask);
 
 	ret = sdhci_add_host(host);
 	if (ret)
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 17d7b326af29..c7e7b43600e9 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -509,7 +509,7 @@ void mmc_free_host(struct mmc_host *);
 void mmc_of_parse_clk_phase(struct mmc_host *host,
 			    struct mmc_clk_phase_map *map);
 int mmc_of_parse(struct mmc_host *host);
-int mmc_of_parse_voltage(struct device_node *np, u32 *mask);
+int mmc_of_parse_voltage(struct mmc_host *host, u32 *mask);
 
 static inline void *mmc_priv(struct mmc_host *host)
 {
-- 
cgit v1.2.3


From edd602146507532c1714d8428f654b87205f492e Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 19 Apr 2021 14:24:59 +0300
Subject: mmc: mmc_spi: Make of_mmc_spi.c resource provider agnostic

In order to use the same driver on non-OF platforms, make
of_mmc_spi.c resource provider agnostic.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20210419112459.25241-6-andriy.shevchenko@linux.intel.com
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/host/Makefile     | 2 --
 drivers/mmc/host/of_mmc_spi.c | 6 ++----
 include/linux/spi/mmc_spi.h   | 9 ---------
 3 files changed, 2 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/host/Makefile b/drivers/mmc/host/Makefile
index 6df5c4774260..14004cc09aaa 100644
--- a/drivers/mmc/host/Makefile
+++ b/drivers/mmc/host/Makefile
@@ -34,9 +34,7 @@ obj-$(CONFIG_MMC_TIFM_SD)	+= tifm_sd.o
 obj-$(CONFIG_MMC_MVSDIO)	+= mvsdio.o
 obj-$(CONFIG_MMC_DAVINCI)       += davinci_mmc.o
 obj-$(CONFIG_MMC_SPI)		+= mmc_spi.o
-ifeq ($(CONFIG_OF),y)
 obj-$(CONFIG_MMC_SPI)		+= of_mmc_spi.o
-endif
 obj-$(CONFIG_MMC_S3C)   	+= s3cmci.o
 obj-$(CONFIG_MMC_SDRICOH_CS)	+= sdricoh_cs.o
 obj-$(CONFIG_MMC_TMIO)		+= tmio_mmc.o
diff --git a/drivers/mmc/host/of_mmc_spi.c b/drivers/mmc/host/of_mmc_spi.c
index 009c3885f6ba..9d480a05f655 100644
--- a/drivers/mmc/host/of_mmc_spi.c
+++ b/drivers/mmc/host/of_mmc_spi.c
@@ -51,10 +51,9 @@ struct mmc_spi_platform_data *mmc_spi_get_pdata(struct spi_device *spi)
 {
 	struct mmc_host *mmc = dev_get_drvdata(&spi->dev);
 	struct device *dev = &spi->dev;
-	struct device_node *np = dev->of_node;
 	struct of_mmc_spi *oms;
 
-	if (dev->platform_data || !np)
+	if (dev->platform_data || !dev_fwnode(dev))
 		return dev->platform_data;
 
 	oms = kzalloc(sizeof(*oms), GFP_KERNEL);
@@ -83,10 +82,9 @@ EXPORT_SYMBOL(mmc_spi_get_pdata);
 void mmc_spi_put_pdata(struct spi_device *spi)
 {
 	struct device *dev = &spi->dev;
-	struct device_node *np = dev->of_node;
 	struct of_mmc_spi *oms = to_of_mmc_spi(dev);
 
-	if (!dev->platform_data || !np)
+	if (!dev->platform_data || !dev_fwnode(dev))
 		return;
 
 	kfree(oms);
diff --git a/include/linux/spi/mmc_spi.h b/include/linux/spi/mmc_spi.h
index 778ae8eb1f3e..9ad9a06e488d 100644
--- a/include/linux/spi/mmc_spi.h
+++ b/include/linux/spi/mmc_spi.h
@@ -35,16 +35,7 @@ struct mmc_spi_platform_data {
 	void (*setpower)(struct device *, unsigned int maskval);
 };
 
-#ifdef CONFIG_OF
 extern struct mmc_spi_platform_data *mmc_spi_get_pdata(struct spi_device *spi);
 extern void mmc_spi_put_pdata(struct spi_device *spi);
-#else
-static inline struct mmc_spi_platform_data *
-mmc_spi_get_pdata(struct spi_device *spi)
-{
-	return spi->dev.platform_data;
-}
-static inline void mmc_spi_put_pdata(struct spi_device *spi) {}
-#endif /* CONFIG_OF */
 
 #endif /* __LINUX_SPI_MMC_SPI_H */
-- 
cgit v1.2.3


From 7ff5062079ef5c2f92af0c770dfc0ab925fa2128 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 10 Feb 2020 10:00:21 +0000
Subject: iov_iter: Add ITER_XARRAY

Add an iterator, ITER_XARRAY, that walks through a set of pages attached to
an xarray, starting at a given page and offset and walking for the
specified amount of bytes.  The iterator supports transparent huge pages.

The iterate_xarray() macro calls the helper function with rcu_read_lock()
held.  I think that this is only a problem for iov_iter_for_each_range()
- and that returns an error for ITER_XARRAY (also, this function does not
appear to be called).

The caller must guarantee that the pages are all present and they must be
locked using PG_locked, PG_writeback or PG_fscache to prevent them from
going away or being migrated whilst they're being accessed.

This is useful for copying data from socket buffers to inodes in network
filesystems and for transferring data between those inodes and the cache
using direct I/O.

Whilst it is true that ITER_BVEC could be used instead, that would require
a bio_vec array to be allocated to refer to all the pages - which should be
redundant if inode->i_pages also points to all these pages.

Note that older versions of this patch implemented an ITER_MAPPING instead,
which was almost the same.

Changes:
v7:
 - Rename iter_xarray_copy_pages() to iter_xarray_populate_pages()[1].

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-and-tested-by: Jeff Layton <jlayton@kernel.org>
Tested-by: Dave Wysochanski <dwysocha@redhat.com>
Tested-By: Marc Dionne <marc.dionne@auristor.com>
cc: Alexander Viro <viro@zeniv.linux.org.uk>
cc: Matthew Wilcox (Oracle) <willy@infradead.org>
cc: Christoph Hellwig <hch@lst.de>
cc: linux-mm@kvack.org
cc: linux-cachefs@redhat.com
cc: linux-afs@lists.infradead.org
cc: linux-nfs@vger.kernel.org
cc: linux-cifs@vger.kernel.org
cc: ceph-devel@vger.kernel.org
cc: v9fs-developer@lists.sourceforge.net
cc: linux-fsdevel@vger.kernel.org
Link: https://lore.kernel.org/r/3577430.1579705075@warthog.procyon.org.uk/ # rfc
Link: https://lore.kernel.org/r/158861205740.340223.16592990225607814022.stgit@warthog.procyon.org.uk/ # rfc
Link: https://lore.kernel.org/r/159465785214.1376674.6062549291411362531.stgit@warthog.procyon.org.uk/
Link: https://lore.kernel.org/r/160588477334.3465195.3608963255682568730.stgit@warthog.procyon.org.uk/ # rfc
Link: https://lore.kernel.org/r/161118129703.1232039.17141248432017826976.stgit@warthog.procyon.org.uk/ # rfc
Link: https://lore.kernel.org/r/161161026313.2537118.14676007075365418649.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/161340386671.1303470.10752208972482479840.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/161539527815.286939.14607323792547049341.stgit@warthog.procyon.org.uk/ # v4
Link: https://lore.kernel.org/r/161653786033.2770958.14154191921867463240.stgit@warthog.procyon.org.uk/ # v5
Link: https://lore.kernel.org/r/161789064740.6155.11932541175173658065.stgit@warthog.procyon.org.uk/ # v6
Link: https://lore.kernel.org/r/27c369a8f42bb8a617672b2dc0126a5c6df5a050.camel@kernel.org [1]
---
 include/linux/uio.h |  11 ++
 lib/iov_iter.c      | 313 ++++++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 301 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 27ff8eb786dc..5f5ffc45d4aa 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -10,6 +10,7 @@
 #include <uapi/linux/uio.h>
 
 struct page;
+struct address_space;
 struct pipe_inode_info;
 
 struct kvec {
@@ -24,6 +25,7 @@ enum iter_type {
 	ITER_BVEC = 16,
 	ITER_PIPE = 32,
 	ITER_DISCARD = 64,
+	ITER_XARRAY = 128,
 };
 
 struct iov_iter {
@@ -39,6 +41,7 @@ struct iov_iter {
 		const struct iovec *iov;
 		const struct kvec *kvec;
 		const struct bio_vec *bvec;
+		struct xarray *xarray;
 		struct pipe_inode_info *pipe;
 	};
 	union {
@@ -47,6 +50,7 @@ struct iov_iter {
 			unsigned int head;
 			unsigned int start_head;
 		};
+		loff_t xarray_start;
 	};
 };
 
@@ -80,6 +84,11 @@ static inline bool iov_iter_is_discard(const struct iov_iter *i)
 	return iov_iter_type(i) == ITER_DISCARD;
 }
 
+static inline bool iov_iter_is_xarray(const struct iov_iter *i)
+{
+	return iov_iter_type(i) == ITER_XARRAY;
+}
+
 static inline unsigned char iov_iter_rw(const struct iov_iter *i)
 {
 	return i->type & (READ | WRITE);
@@ -221,6 +230,8 @@ void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_
 void iov_iter_pipe(struct iov_iter *i, unsigned int direction, struct pipe_inode_info *pipe,
 			size_t count);
 void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count);
+void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray,
+		     loff_t start, size_t count);
 ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages,
 			size_t maxsize, unsigned maxpages, size_t *start);
 ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages,
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index f66c62aa7154..44fa726a8323 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -76,7 +76,44 @@
 	}						\
 }
 
-#define iterate_all_kinds(i, n, v, I, B, K) {			\
+#define iterate_xarray(i, n, __v, skip, STEP) {		\
+	struct page *head = NULL;				\
+	size_t wanted = n, seg, offset;				\
+	loff_t start = i->xarray_start + skip;			\
+	pgoff_t index = start >> PAGE_SHIFT;			\
+	int j;							\
+								\
+	XA_STATE(xas, i->xarray, index);			\
+								\
+	rcu_read_lock();						\
+	xas_for_each(&xas, head, ULONG_MAX) {				\
+		if (xas_retry(&xas, head))				\
+			continue;					\
+		if (WARN_ON(xa_is_value(head)))				\
+			break;						\
+		if (WARN_ON(PageHuge(head)))				\
+			break;						\
+		for (j = (head->index < index) ? index - head->index : 0; \
+		     j < thp_nr_pages(head); j++) {			\
+			__v.bv_page = head + j;				\
+			offset = (i->xarray_start + skip) & ~PAGE_MASK;	\
+			seg = PAGE_SIZE - offset;			\
+			__v.bv_offset = offset;				\
+			__v.bv_len = min(n, seg);			\
+			(void)(STEP);					\
+			n -= __v.bv_len;				\
+			skip += __v.bv_len;				\
+			if (n == 0)					\
+				break;					\
+		}							\
+		if (n == 0)						\
+			break;						\
+	}							\
+	rcu_read_unlock();					\
+	n = wanted - n;						\
+}
+
+#define iterate_all_kinds(i, n, v, I, B, K, X) {		\
 	if (likely(n)) {					\
 		size_t skip = i->iov_offset;			\
 		if (unlikely(i->type & ITER_BVEC)) {		\
@@ -88,6 +125,9 @@
 			struct kvec v;				\
 			iterate_kvec(i, n, v, kvec, skip, (K))	\
 		} else if (unlikely(i->type & ITER_DISCARD)) {	\
+		} else if (unlikely(i->type & ITER_XARRAY)) {	\
+			struct bio_vec v;			\
+			iterate_xarray(i, n, v, skip, (X));	\
 		} else {					\
 			const struct iovec *iov;		\
 			struct iovec v;				\
@@ -96,7 +136,7 @@
 	}							\
 }
 
-#define iterate_and_advance(i, n, v, I, B, K) {			\
+#define iterate_and_advance(i, n, v, I, B, K, X) {		\
 	if (unlikely(i->count < n))				\
 		n = i->count;					\
 	if (i->count) {						\
@@ -121,6 +161,9 @@
 			i->kvec = kvec;				\
 		} else if (unlikely(i->type & ITER_DISCARD)) {	\
 			skip += n;				\
+		} else if (unlikely(i->type & ITER_XARRAY)) {	\
+			struct bio_vec v;			\
+			iterate_xarray(i, n, v, skip, (X))	\
 		} else {					\
 			const struct iovec *iov;		\
 			struct iovec v;				\
@@ -622,7 +665,9 @@ size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
 		copyout(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len),
 		memcpy_to_page(v.bv_page, v.bv_offset,
 			       (from += v.bv_len) - v.bv_len, v.bv_len),
-		memcpy(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len)
+		memcpy(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len),
+		memcpy_to_page(v.bv_page, v.bv_offset,
+			       (from += v.bv_len) - v.bv_len, v.bv_len)
 	)
 
 	return bytes;
@@ -738,6 +783,16 @@ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
 			bytes = curr_addr - s_addr - rem;
 			return bytes;
 		}
+		}),
+		({
+		rem = copy_mc_to_page(v.bv_page, v.bv_offset,
+				      (from += v.bv_len) - v.bv_len, v.bv_len);
+		if (rem) {
+			curr_addr = (unsigned long) from;
+			bytes = curr_addr - s_addr - rem;
+			rcu_read_unlock();
+			return bytes;
+		}
 		})
 	)
 
@@ -759,7 +814,9 @@ size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
 		copyin((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
 		memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
 				 v.bv_offset, v.bv_len),
-		memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
+		memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
+		memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
+				 v.bv_offset, v.bv_len)
 	)
 
 	return bytes;
@@ -785,7 +842,9 @@ bool _copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i)
 		0;}),
 		memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
 				 v.bv_offset, v.bv_len),
-		memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
+		memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
+		memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
+				 v.bv_offset, v.bv_len)
 	)
 
 	iov_iter_advance(i, bytes);
@@ -805,7 +864,9 @@ size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
 					 v.iov_base, v.iov_len),
 		memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
 				 v.bv_offset, v.bv_len),
-		memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
+		memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
+		memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
+				 v.bv_offset, v.bv_len)
 	)
 
 	return bytes;
@@ -840,7 +901,9 @@ size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
 		memcpy_page_flushcache((to += v.bv_len) - v.bv_len, v.bv_page,
 				 v.bv_offset, v.bv_len),
 		memcpy_flushcache((to += v.iov_len) - v.iov_len, v.iov_base,
-			v.iov_len)
+			v.iov_len),
+		memcpy_page_flushcache((to += v.bv_len) - v.bv_len, v.bv_page,
+				 v.bv_offset, v.bv_len)
 	)
 
 	return bytes;
@@ -864,7 +927,9 @@ bool _copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i)
 		0;}),
 		memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
 				 v.bv_offset, v.bv_len),
-		memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
+		memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
+		memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
+				 v.bv_offset, v.bv_len)
 	)
 
 	iov_iter_advance(i, bytes);
@@ -901,7 +966,7 @@ size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
 {
 	if (unlikely(!page_copy_sane(page, offset, bytes)))
 		return 0;
-	if (i->type & (ITER_BVEC|ITER_KVEC)) {
+	if (i->type & (ITER_BVEC | ITER_KVEC | ITER_XARRAY)) {
 		void *kaddr = kmap_atomic(page);
 		size_t wanted = copy_to_iter(kaddr + offset, bytes, i);
 		kunmap_atomic(kaddr);
@@ -924,7 +989,7 @@ size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
 		WARN_ON(1);
 		return 0;
 	}
-	if (i->type & (ITER_BVEC|ITER_KVEC)) {
+	if (i->type & (ITER_BVEC | ITER_KVEC | ITER_XARRAY)) {
 		void *kaddr = kmap_atomic(page);
 		size_t wanted = _copy_from_iter(kaddr + offset, bytes, i);
 		kunmap_atomic(kaddr);
@@ -968,7 +1033,8 @@ size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
 	iterate_and_advance(i, bytes, v,
 		clear_user(v.iov_base, v.iov_len),
 		memzero_page(v.bv_page, v.bv_offset, v.bv_len),
-		memset(v.iov_base, 0, v.iov_len)
+		memset(v.iov_base, 0, v.iov_len),
+		memzero_page(v.bv_page, v.bv_offset, v.bv_len)
 	)
 
 	return bytes;
@@ -992,7 +1058,9 @@ size_t iov_iter_copy_from_user_atomic(struct page *page,
 		copyin((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
 		memcpy_from_page((p += v.bv_len) - v.bv_len, v.bv_page,
 				 v.bv_offset, v.bv_len),
-		memcpy((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
+		memcpy((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
+		memcpy_from_page((p += v.bv_len) - v.bv_len, v.bv_page,
+				 v.bv_offset, v.bv_len)
 	)
 	kunmap_atomic(kaddr);
 	return bytes;
@@ -1078,11 +1146,16 @@ void iov_iter_advance(struct iov_iter *i, size_t size)
 		i->count -= size;
 		return;
 	}
+	if (unlikely(iov_iter_is_xarray(i))) {
+		i->iov_offset += size;
+		i->count -= size;
+		return;
+	}
 	if (iov_iter_is_bvec(i)) {
 		iov_iter_bvec_advance(i, size);
 		return;
 	}
-	iterate_and_advance(i, size, v, 0, 0, 0)
+	iterate_and_advance(i, size, v, 0, 0, 0, 0)
 }
 EXPORT_SYMBOL(iov_iter_advance);
 
@@ -1126,7 +1199,12 @@ void iov_iter_revert(struct iov_iter *i, size_t unroll)
 		return;
 	}
 	unroll -= i->iov_offset;
-	if (iov_iter_is_bvec(i)) {
+	if (iov_iter_is_xarray(i)) {
+		BUG(); /* We should never go beyond the start of the specified
+			* range since we might then be straying into pages that
+			* aren't pinned.
+			*/
+	} else if (iov_iter_is_bvec(i)) {
 		const struct bio_vec *bvec = i->bvec;
 		while (1) {
 			size_t n = (--bvec)->bv_len;
@@ -1163,9 +1241,9 @@ size_t iov_iter_single_seg_count(const struct iov_iter *i)
 		return i->count;	// it is a silly place, anyway
 	if (i->nr_segs == 1)
 		return i->count;
-	if (unlikely(iov_iter_is_discard(i)))
+	if (unlikely(iov_iter_is_discard(i) || iov_iter_is_xarray(i)))
 		return i->count;
-	else if (iov_iter_is_bvec(i))
+	if (iov_iter_is_bvec(i))
 		return min(i->count, i->bvec->bv_len - i->iov_offset);
 	else
 		return min(i->count, i->iov->iov_len - i->iov_offset);
@@ -1213,6 +1291,31 @@ void iov_iter_pipe(struct iov_iter *i, unsigned int direction,
 }
 EXPORT_SYMBOL(iov_iter_pipe);
 
+/**
+ * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray
+ * @i: The iterator to initialise.
+ * @direction: The direction of the transfer.
+ * @xarray: The xarray to access.
+ * @start: The start file position.
+ * @count: The size of the I/O buffer in bytes.
+ *
+ * Set up an I/O iterator to either draw data out of the pages attached to an
+ * inode or to inject data into those pages.  The pages *must* be prevented
+ * from evaporation, either by taking a ref on them or locking them by the
+ * caller.
+ */
+void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
+		     struct xarray *xarray, loff_t start, size_t count)
+{
+	BUG_ON(direction & ~1);
+	i->type = ITER_XARRAY | (direction & (READ | WRITE));
+	i->xarray = xarray;
+	i->xarray_start = start;
+	i->count = count;
+	i->iov_offset = 0;
+}
+EXPORT_SYMBOL(iov_iter_xarray);
+
 /**
  * iov_iter_discard - Initialise an I/O iterator that discards data
  * @i: The iterator to initialise.
@@ -1246,7 +1349,8 @@ unsigned long iov_iter_alignment(const struct iov_iter *i)
 	iterate_all_kinds(i, size, v,
 		(res |= (unsigned long)v.iov_base | v.iov_len, 0),
 		res |= v.bv_offset | v.bv_len,
-		res |= (unsigned long)v.iov_base | v.iov_len
+		res |= (unsigned long)v.iov_base | v.iov_len,
+		res |= v.bv_offset | v.bv_len
 	)
 	return res;
 }
@@ -1268,7 +1372,9 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
 		(res |= (!res ? 0 : (unsigned long)v.bv_offset) |
 			(size != v.bv_len ? size : 0)),
 		(res |= (!res ? 0 : (unsigned long)v.iov_base) |
-			(size != v.iov_len ? size : 0))
+			(size != v.iov_len ? size : 0)),
+		(res |= (!res ? 0 : (unsigned long)v.bv_offset) |
+			(size != v.bv_len ? size : 0))
 		);
 	return res;
 }
@@ -1318,6 +1424,75 @@ static ssize_t pipe_get_pages(struct iov_iter *i,
 	return __pipe_get_pages(i, min(maxsize, capacity), pages, iter_head, start);
 }
 
+static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa,
+					  pgoff_t index, unsigned int nr_pages)
+{
+	XA_STATE(xas, xa, index);
+	struct page *page;
+	unsigned int ret = 0;
+
+	rcu_read_lock();
+	for (page = xas_load(&xas); page; page = xas_next(&xas)) {
+		if (xas_retry(&xas, page))
+			continue;
+
+		/* Has the page moved or been split? */
+		if (unlikely(page != xas_reload(&xas))) {
+			xas_reset(&xas);
+			continue;
+		}
+
+		pages[ret] = find_subpage(page, xas.xa_index);
+		get_page(pages[ret]);
+		if (++ret == nr_pages)
+			break;
+	}
+	rcu_read_unlock();
+	return ret;
+}
+
+static ssize_t iter_xarray_get_pages(struct iov_iter *i,
+				     struct page **pages, size_t maxsize,
+				     unsigned maxpages, size_t *_start_offset)
+{
+	unsigned nr, offset;
+	pgoff_t index, count;
+	size_t size = maxsize, actual;
+	loff_t pos;
+
+	if (!size || !maxpages)
+		return 0;
+
+	pos = i->xarray_start + i->iov_offset;
+	index = pos >> PAGE_SHIFT;
+	offset = pos & ~PAGE_MASK;
+	*_start_offset = offset;
+
+	count = 1;
+	if (size > PAGE_SIZE - offset) {
+		size -= PAGE_SIZE - offset;
+		count += size >> PAGE_SHIFT;
+		size &= ~PAGE_MASK;
+		if (size)
+			count++;
+	}
+
+	if (count > maxpages)
+		count = maxpages;
+
+	nr = iter_xarray_populate_pages(pages, i->xarray, index, count);
+	if (nr == 0)
+		return 0;
+
+	actual = PAGE_SIZE * nr;
+	actual -= offset;
+	if (nr == count && size > 0) {
+		unsigned last_offset = (nr > 1) ? 0 : offset;
+		actual -= PAGE_SIZE - (last_offset + size);
+	}
+	return actual;
+}
+
 ssize_t iov_iter_get_pages(struct iov_iter *i,
 		   struct page **pages, size_t maxsize, unsigned maxpages,
 		   size_t *start)
@@ -1327,6 +1502,8 @@ ssize_t iov_iter_get_pages(struct iov_iter *i,
 
 	if (unlikely(iov_iter_is_pipe(i)))
 		return pipe_get_pages(i, pages, maxsize, maxpages, start);
+	if (unlikely(iov_iter_is_xarray(i)))
+		return iter_xarray_get_pages(i, pages, maxsize, maxpages, start);
 	if (unlikely(iov_iter_is_discard(i)))
 		return -EFAULT;
 
@@ -1353,7 +1530,8 @@ ssize_t iov_iter_get_pages(struct iov_iter *i,
 		return v.bv_len;
 	}),({
 		return -EFAULT;
-	})
+	}),
+	0
 	)
 	return 0;
 }
@@ -1397,6 +1575,51 @@ static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
 	return n;
 }
 
+static ssize_t iter_xarray_get_pages_alloc(struct iov_iter *i,
+					   struct page ***pages, size_t maxsize,
+					   size_t *_start_offset)
+{
+	struct page **p;
+	unsigned nr, offset;
+	pgoff_t index, count;
+	size_t size = maxsize, actual;
+	loff_t pos;
+
+	if (!size)
+		return 0;
+
+	pos = i->xarray_start + i->iov_offset;
+	index = pos >> PAGE_SHIFT;
+	offset = pos & ~PAGE_MASK;
+	*_start_offset = offset;
+
+	count = 1;
+	if (size > PAGE_SIZE - offset) {
+		size -= PAGE_SIZE - offset;
+		count += size >> PAGE_SHIFT;
+		size &= ~PAGE_MASK;
+		if (size)
+			count++;
+	}
+
+	p = get_pages_array(count);
+	if (!p)
+		return -ENOMEM;
+	*pages = p;
+
+	nr = iter_xarray_populate_pages(p, i->xarray, index, count);
+	if (nr == 0)
+		return 0;
+
+	actual = PAGE_SIZE * nr;
+	actual -= offset;
+	if (nr == count && size > 0) {
+		unsigned last_offset = (nr > 1) ? 0 : offset;
+		actual -= PAGE_SIZE - (last_offset + size);
+	}
+	return actual;
+}
+
 ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
 		   struct page ***pages, size_t maxsize,
 		   size_t *start)
@@ -1408,6 +1631,8 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
 
 	if (unlikely(iov_iter_is_pipe(i)))
 		return pipe_get_pages_alloc(i, pages, maxsize, start);
+	if (unlikely(iov_iter_is_xarray(i)))
+		return iter_xarray_get_pages_alloc(i, pages, maxsize, start);
 	if (unlikely(iov_iter_is_discard(i)))
 		return -EFAULT;
 
@@ -1440,7 +1665,7 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
 		return v.bv_len;
 	}),({
 		return -EFAULT;
-	})
+	}), 0
 	)
 	return 0;
 }
@@ -1478,6 +1703,13 @@ size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
 				      v.iov_base, v.iov_len,
 				      sum, off);
 		off += v.iov_len;
+	}), ({
+		char *p = kmap_atomic(v.bv_page);
+		sum = csum_and_memcpy((to += v.bv_len) - v.bv_len,
+				      p + v.bv_offset, v.bv_len,
+				      sum, off);
+		kunmap_atomic(p);
+		off += v.bv_len;
 	})
 	)
 	*csum = sum;
@@ -1519,6 +1751,13 @@ bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum,
 				      v.iov_base, v.iov_len,
 				      sum, off);
 		off += v.iov_len;
+	}), ({
+		char *p = kmap_atomic(v.bv_page);
+		sum = csum_and_memcpy((to += v.bv_len) - v.bv_len,
+				      p + v.bv_offset, v.bv_len,
+				      sum, off);
+		kunmap_atomic(p);
+		off += v.bv_len;
 	})
 	)
 	*csum = sum;
@@ -1565,6 +1804,13 @@ size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
 				     (from += v.iov_len) - v.iov_len,
 				     v.iov_len, sum, off);
 		off += v.iov_len;
+	}), ({
+		char *p = kmap_atomic(v.bv_page);
+		sum = csum_and_memcpy(p + v.bv_offset,
+				      (from += v.bv_len) - v.bv_len,
+				      v.bv_len, sum, off);
+		kunmap_atomic(p);
+		off += v.bv_len;
 	})
 	)
 	csstate->csum = sum;
@@ -1615,6 +1861,21 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages)
 		npages = pipe_space_for_user(iter_head, pipe->tail, pipe);
 		if (npages >= maxpages)
 			return maxpages;
+	} else if (unlikely(iov_iter_is_xarray(i))) {
+		unsigned offset;
+
+		offset = (i->xarray_start + i->iov_offset) & ~PAGE_MASK;
+
+		npages = 1;
+		if (size > PAGE_SIZE - offset) {
+			size -= PAGE_SIZE - offset;
+			npages += size >> PAGE_SHIFT;
+			size &= ~PAGE_MASK;
+			if (size)
+				npages++;
+		}
+		if (npages >= maxpages)
+			return maxpages;
 	} else iterate_all_kinds(i, size, v, ({
 		unsigned long p = (unsigned long)v.iov_base;
 		npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE)
@@ -1631,7 +1892,8 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages)
 			- p / PAGE_SIZE;
 		if (npages >= maxpages)
 			return maxpages;
-	})
+	}),
+	0
 	)
 	return npages;
 }
@@ -1644,7 +1906,7 @@ const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
 		WARN_ON(1);
 		return NULL;
 	}
-	if (unlikely(iov_iter_is_discard(new)))
+	if (unlikely(iov_iter_is_discard(new) || iov_iter_is_xarray(new)))
 		return NULL;
 	if (iov_iter_is_bvec(new))
 		return new->bvec = kmemdup(new->bvec,
@@ -1849,7 +2111,12 @@ int iov_iter_for_each_range(struct iov_iter *i, size_t bytes,
 		kunmap(v.bv_page);
 		err;}), ({
 		w = v;
-		err = f(&w, context);})
+		err = f(&w, context);}), ({
+		w.iov_base = kmap(v.bv_page) + v.bv_offset;
+		w.iov_len = v.bv_len;
+		err = f(&w, context);
+		kunmap(v.bv_page);
+		err;})
 	)
 	return err;
 }
-- 
cgit v1.2.3


From 73e10ded33a1cfc0c72404aaedc493e9813b6239 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 10 Feb 2020 10:00:21 +0000
Subject: mm: Add set/end/wait functions for PG_private_2

Add three functions to manipulate PG_private_2:

 (*) set_page_private_2() - Set the flag and take an appropriate reference
     on the flagged page.

 (*) end_page_private_2() - Clear the flag, drop the reference and wake up
     any waiters, somewhat analogously with end_page_writeback().

 (*) wait_on_page_private_2() - Wait for the flag to be cleared.  A killable
     variant, wait_on_page_private_2_killable(), is also provided.

Wrappers will need to be placed in the netfs lib header in the patch that
adds that.

[This implements a suggestion by Linus[1] to not mix the terminology of
 PG_private_2 and PG_fscache in the mm core function]

Changes:
v7:
- Use compound_head() in all the functions to make them THP safe[6].

v5:
- Add set and end functions, calling the end function end rather than
  unlock[3].
- Keep a ref on the page when PG_private_2 is set[4][5].

v4:
- Remove extern from the declaration[2].

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Tested-by: Jeff Layton <jlayton@kernel.org>
Tested-by: Dave Wysochanski <dwysocha@redhat.com>
Tested-By: Marc Dionne <marc.dionne@auristor.com>
cc: Alexander Viro <viro@zeniv.linux.org.uk>
cc: Christoph Hellwig <hch@lst.de>
cc: linux-mm@kvack.org
cc: linux-cachefs@redhat.com
cc: linux-afs@lists.infradead.org
cc: linux-nfs@vger.kernel.org
cc: linux-cifs@vger.kernel.org
cc: ceph-devel@vger.kernel.org
cc: v9fs-developer@lists.sourceforge.net
cc: linux-fsdevel@vger.kernel.org
Link: https://lore.kernel.org/r/1330473.1612974547@warthog.procyon.org.uk/ # v1
Link: https://lore.kernel.org/r/CAHk-=wjgA-74ddehziVk=XAEMTKswPu1Yw4uaro1R3ibs27ztw@mail.gmail.com/ [1]
Link: https://lore.kernel.org/r/20210216102659.GA27714@lst.de/ [2]
Link: https://lore.kernel.org/r/161340387944.1303470.7944159520278177652.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/161539528910.286939.1252328699383291173.stgit@warthog.procyon.org.uk # v4
Link: https://lore.kernel.org/r/20210321105309.GG3420@casper.infradead.org [3]
Link: https://lore.kernel.org/r/CAHk-=wh+2gbF7XEjYc=HV9w_2uVzVf7vs60BPz0gFA=+pUm3ww@mail.gmail.com/ [4]
Link: https://lore.kernel.org/r/CAHk-=wjSGsRj7xwhSMQ6dAQiz53xA39pOG+XA_WeTgwBBu4uqg@mail.gmail.com/ [5]
Link: https://lore.kernel.org/r/20210408145057.GN2531743@casper.infradead.org/ [6]
Link: https://lore.kernel.org/r/161653788200.2770958.9517755716374927208.stgit@warthog.procyon.org.uk/ # v5
Link: https://lore.kernel.org/r/161789066013.6155.9816857201817288382.stgit@warthog.procyon.org.uk/ # v6
---
 include/linux/pagemap.h | 20 ++++++++++++++++
 mm/filemap.c            | 61 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 81 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 8c9947fd62f3..bb4433c98d02 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -688,6 +688,26 @@ void wait_for_stable_page(struct page *page);
 
 void page_endio(struct page *page, bool is_write, int err);
 
+/**
+ * set_page_private_2 - Set PG_private_2 on a page and take a ref
+ * @page: The page.
+ *
+ * Set the PG_private_2 flag on a page and take the reference needed for the VM
+ * to handle its lifetime correctly.  This sets the flag and takes the
+ * reference unconditionally, so care must be taken not to set the flag again
+ * if it's already set.
+ */
+static inline void set_page_private_2(struct page *page)
+{
+	page = compound_head(page);
+	get_page(page);
+	SetPagePrivate2(page);
+}
+
+void end_page_private_2(struct page *page);
+void wait_on_page_private_2(struct page *page);
+int wait_on_page_private_2_killable(struct page *page);
+
 /*
  * Add an arbitrary waiter to a page's wait queue
  */
diff --git a/mm/filemap.c b/mm/filemap.c
index 43700480d897..afe22f09960e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1432,6 +1432,67 @@ void unlock_page(struct page *page)
 }
 EXPORT_SYMBOL(unlock_page);
 
+/**
+ * end_page_private_2 - Clear PG_private_2 and release any waiters
+ * @page: The page
+ *
+ * Clear the PG_private_2 bit on a page and wake up any sleepers waiting for
+ * this.  The page ref held for PG_private_2 being set is released.
+ *
+ * This is, for example, used when a netfs page is being written to a local
+ * disk cache, thereby allowing writes to the cache for the same page to be
+ * serialised.
+ */
+void end_page_private_2(struct page *page)
+{
+	page = compound_head(page);
+	VM_BUG_ON_PAGE(!PagePrivate2(page), page);
+	clear_bit_unlock(PG_private_2, &page->flags);
+	wake_up_page_bit(page, PG_private_2);
+	put_page(page);
+}
+EXPORT_SYMBOL(end_page_private_2);
+
+/**
+ * wait_on_page_private_2 - Wait for PG_private_2 to be cleared on a page
+ * @page: The page to wait on
+ *
+ * Wait for PG_private_2 (aka PG_fscache) to be cleared on a page.
+ */
+void wait_on_page_private_2(struct page *page)
+{
+	page = compound_head(page);
+	while (PagePrivate2(page))
+		wait_on_page_bit(page, PG_private_2);
+}
+EXPORT_SYMBOL(wait_on_page_private_2);
+
+/**
+ * wait_on_page_private_2_killable - Wait for PG_private_2 to be cleared on a page
+ * @page: The page to wait on
+ *
+ * Wait for PG_private_2 (aka PG_fscache) to be cleared on a page or until a
+ * fatal signal is received by the calling task.
+ *
+ * Return:
+ * - 0 if successful.
+ * - -EINTR if a fatal signal was encountered.
+ */
+int wait_on_page_private_2_killable(struct page *page)
+{
+	int ret = 0;
+
+	page = compound_head(page);
+	while (PagePrivate2(page)) {
+		ret = wait_on_page_bit_killable(page, PG_private_2);
+		if (ret < 0)
+			break;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(wait_on_page_private_2_killable);
+
 /**
  * end_page_writeback - end writeback against a page
  * @page: the page
-- 
cgit v1.2.3


From fcd9ae4f7f3b5fbd549285bab0478a339113620e Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 7 Apr 2021 21:18:55 +0100
Subject: mm/filemap: Pass the file_ra_state in the ractl

For readahead_expand(), we need to modify the file ra_state, so pass it
down by adding it to the ractl.  We have to do this because it's not always
the same as f_ra in the struct file that is already being passed.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: Jeff Layton <jlayton@kernel.org>
Tested-by: Dave Wysochanski <dwysocha@redhat.com>
Tested-By: Marc Dionne <marc.dionne@auristor.com>
Link: https://lore.kernel.org/r/20210407201857.3582797-2-willy@infradead.org/
Link: https://lore.kernel.org/r/161789067431.6155.8063840447229665720.stgit@warthog.procyon.org.uk/ # v6
---
 fs/ext4/verity.c        |  2 +-
 fs/f2fs/file.c          |  2 +-
 fs/f2fs/verity.c        |  2 +-
 include/linux/pagemap.h | 20 +++++++++++---------
 mm/filemap.c            |  4 ++--
 mm/internal.h           |  7 +++----
 mm/readahead.c          | 22 +++++++++++-----------
 7 files changed, 30 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index 00e3cbde472e..07438f46b558 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -370,7 +370,7 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode,
 					       pgoff_t index,
 					       unsigned long num_ra_pages)
 {
-	DEFINE_READAHEAD(ractl, NULL, inode->i_mapping, index);
+	DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index);
 	struct page *page;
 
 	index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT;
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index d26ff2ae3f5e..c1e6f669a0c4 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -4051,7 +4051,7 @@ out:
 
 static int redirty_blocks(struct inode *inode, pgoff_t page_idx, int len)
 {
-	DEFINE_READAHEAD(ractl, NULL, inode->i_mapping, page_idx);
+	DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, page_idx);
 	struct address_space *mapping = inode->i_mapping;
 	struct page *page;
 	pgoff_t redirty_idx = page_idx;
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
index 054ec852b5ea..a7beff28a3c5 100644
--- a/fs/f2fs/verity.c
+++ b/fs/f2fs/verity.c
@@ -228,7 +228,7 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode,
 					       pgoff_t index,
 					       unsigned long num_ra_pages)
 {
-	DEFINE_READAHEAD(ractl, NULL, inode->i_mapping, index);
+	DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index);
 	struct page *page;
 
 	index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT;
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index bb4433c98d02..4220ded38f4b 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -812,20 +812,23 @@ static inline int add_to_page_cache(struct page *page,
  * @file: The file, used primarily by network filesystems for authentication.
  *	  May be NULL if invoked internally by the filesystem.
  * @mapping: Readahead this filesystem object.
+ * @ra: File readahead state.  May be NULL.
  */
 struct readahead_control {
 	struct file *file;
 	struct address_space *mapping;
+	struct file_ra_state *ra;
 /* private: use the readahead_* accessors instead */
 	pgoff_t _index;
 	unsigned int _nr_pages;
 	unsigned int _batch_count;
 };
 
-#define DEFINE_READAHEAD(rac, f, m, i)					\
-	struct readahead_control rac = {				\
+#define DEFINE_READAHEAD(ractl, f, r, m, i)				\
+	struct readahead_control ractl = {				\
 		.file = f,						\
 		.mapping = m,						\
+		.ra = r,						\
 		._index = i,						\
 	}
 
@@ -833,10 +836,9 @@ struct readahead_control {
 
 void page_cache_ra_unbounded(struct readahead_control *,
 		unsigned long nr_to_read, unsigned long lookahead_count);
-void page_cache_sync_ra(struct readahead_control *, struct file_ra_state *,
+void page_cache_sync_ra(struct readahead_control *, unsigned long req_count);
+void page_cache_async_ra(struct readahead_control *, struct page *,
 		unsigned long req_count);
-void page_cache_async_ra(struct readahead_control *, struct file_ra_state *,
-		struct page *, unsigned long req_count);
 
 /**
  * page_cache_sync_readahead - generic file readahead
@@ -856,8 +858,8 @@ void page_cache_sync_readahead(struct address_space *mapping,
 		struct file_ra_state *ra, struct file *file, pgoff_t index,
 		unsigned long req_count)
 {
-	DEFINE_READAHEAD(ractl, file, mapping, index);
-	page_cache_sync_ra(&ractl, ra, req_count);
+	DEFINE_READAHEAD(ractl, file, ra, mapping, index);
+	page_cache_sync_ra(&ractl, req_count);
 }
 
 /**
@@ -879,8 +881,8 @@ void page_cache_async_readahead(struct address_space *mapping,
 		struct file_ra_state *ra, struct file *file,
 		struct page *page, pgoff_t index, unsigned long req_count)
 {
-	DEFINE_READAHEAD(ractl, file, mapping, index);
-	page_cache_async_ra(&ractl, ra, page, req_count);
+	DEFINE_READAHEAD(ractl, file, ra, mapping, index);
+	page_cache_async_ra(&ractl, page, req_count);
 }
 
 /**
diff --git a/mm/filemap.c b/mm/filemap.c
index afe22f09960e..46e0321ba87a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2832,7 +2832,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
 	struct file *file = vmf->vma->vm_file;
 	struct file_ra_state *ra = &file->f_ra;
 	struct address_space *mapping = file->f_mapping;
-	DEFINE_READAHEAD(ractl, file, mapping, vmf->pgoff);
+	DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff);
 	struct file *fpin = NULL;
 	unsigned int mmap_miss;
 
@@ -2844,7 +2844,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
 
 	if (vmf->vma->vm_flags & VM_SEQ_READ) {
 		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
-		page_cache_sync_ra(&ractl, ra, ra->ra_pages);
+		page_cache_sync_ra(&ractl, ra->ra_pages);
 		return fpin;
 	}
 
diff --git a/mm/internal.h b/mm/internal.h
index 1432feec62df..83a07b2a7b1f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -51,13 +51,12 @@ void unmap_page_range(struct mmu_gather *tlb,
 
 void do_page_cache_ra(struct readahead_control *, unsigned long nr_to_read,
 		unsigned long lookahead_size);
-void force_page_cache_ra(struct readahead_control *, struct file_ra_state *,
-		unsigned long nr);
+void force_page_cache_ra(struct readahead_control *, unsigned long nr);
 static inline void force_page_cache_readahead(struct address_space *mapping,
 		struct file *file, pgoff_t index, unsigned long nr_to_read)
 {
-	DEFINE_READAHEAD(ractl, file, mapping, index);
-	force_page_cache_ra(&ractl, &file->f_ra, nr_to_read);
+	DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, index);
+	force_page_cache_ra(&ractl, nr_to_read);
 }
 
 unsigned find_lock_entries(struct address_space *mapping, pgoff_t start,
diff --git a/mm/readahead.c b/mm/readahead.c
index c5b0457415be..2088569a947e 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -272,9 +272,10 @@ void do_page_cache_ra(struct readahead_control *ractl,
  * memory at once.
  */
 void force_page_cache_ra(struct readahead_control *ractl,
-		struct file_ra_state *ra, unsigned long nr_to_read)
+		unsigned long nr_to_read)
 {
 	struct address_space *mapping = ractl->mapping;
+	struct file_ra_state *ra = ractl->ra;
 	struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
 	unsigned long max_pages, index;
 
@@ -433,10 +434,10 @@ static int try_context_readahead(struct address_space *mapping,
  * A minimal readahead algorithm for trivial sequential/random reads.
  */
 static void ondemand_readahead(struct readahead_control *ractl,
-		struct file_ra_state *ra, bool hit_readahead_marker,
-		unsigned long req_size)
+		bool hit_readahead_marker, unsigned long req_size)
 {
 	struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host);
+	struct file_ra_state *ra = ractl->ra;
 	unsigned long max_pages = ra->ra_pages;
 	unsigned long add_pages;
 	unsigned long index = readahead_index(ractl);
@@ -550,7 +551,7 @@ readit:
 }
 
 void page_cache_sync_ra(struct readahead_control *ractl,
-		struct file_ra_state *ra, unsigned long req_count)
+		unsigned long req_count)
 {
 	bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM);
 
@@ -560,7 +561,7 @@ void page_cache_sync_ra(struct readahead_control *ractl,
 	 * read-ahead will do the right thing and limit the read to just the
 	 * requested range, which we'll set to 1 page for this case.
 	 */
-	if (!ra->ra_pages || blk_cgroup_congested()) {
+	if (!ractl->ra->ra_pages || blk_cgroup_congested()) {
 		if (!ractl->file)
 			return;
 		req_count = 1;
@@ -569,21 +570,20 @@ void page_cache_sync_ra(struct readahead_control *ractl,
 
 	/* be dumb */
 	if (do_forced_ra) {
-		force_page_cache_ra(ractl, ra, req_count);
+		force_page_cache_ra(ractl, req_count);
 		return;
 	}
 
 	/* do read-ahead */
-	ondemand_readahead(ractl, ra, false, req_count);
+	ondemand_readahead(ractl, false, req_count);
 }
 EXPORT_SYMBOL_GPL(page_cache_sync_ra);
 
 void page_cache_async_ra(struct readahead_control *ractl,
-		struct file_ra_state *ra, struct page *page,
-		unsigned long req_count)
+		struct page *page, unsigned long req_count)
 {
 	/* no read-ahead */
-	if (!ra->ra_pages)
+	if (!ractl->ra->ra_pages)
 		return;
 
 	/*
@@ -604,7 +604,7 @@ void page_cache_async_ra(struct readahead_control *ractl,
 		return;
 
 	/* do read-ahead */
-	ondemand_readahead(ractl, ra, true, req_count);
+	ondemand_readahead(ractl, true, req_count);
 }
 EXPORT_SYMBOL_GPL(page_cache_async_ra);
 
-- 
cgit v1.2.3


From c790fbf20a53e8297c97ddb1c0c9d41c060067f3 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 7 Apr 2021 21:18:56 +0100
Subject: fs: Document file_ra_state

Turn the comments into kernel-doc and improve the wording slightly.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: Jeff Layton <jlayton@kernel.org>
Tested-by: Dave Wysochanski <dwysocha@redhat.com>
Tested-By: Marc Dionne <marc.dionne@auristor.com>
Link: https://lore.kernel.org/r/20210407201857.3582797-3-willy@infradead.org/
Link: https://lore.kernel.org/r/161789068619.6155.1397999970593531574.stgit@warthog.procyon.org.uk/ # v6
---
 include/linux/fs.h | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index ec8f3ddf4a6a..33831a8bda52 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -891,18 +891,22 @@ struct fown_struct {
 	int signum;		/* posix.1b rt signal to be delivered on IO */
 };
 
-/*
- * Track a single file's readahead state
+/**
+ * struct file_ra_state - Track a file's readahead state.
+ * @start: Where the most recent readahead started.
+ * @size: Number of pages read in the most recent readahead.
+ * @async_size: Start next readahead when this many pages are left.
+ * @ra_pages: Maximum size of a readahead request.
+ * @mmap_miss: How many mmap accesses missed in the page cache.
+ * @prev_pos: The last byte in the most recent read request.
  */
 struct file_ra_state {
-	pgoff_t start;			/* where readahead started */
-	unsigned int size;		/* # of readahead pages */
-	unsigned int async_size;	/* do asynchronous readahead when
-					   there are only # of pages ahead */
-
-	unsigned int ra_pages;		/* Maximum readahead window */
-	unsigned int mmap_miss;		/* Cache miss stat for mmap accesses */
-	loff_t prev_pos;		/* Cache last read() position */
+	pgoff_t start;
+	unsigned int size;
+	unsigned int async_size;
+	unsigned int ra_pages;
+	unsigned int mmap_miss;
+	loff_t prev_pos;
 };
 
 /*
-- 
cgit v1.2.3


From 3ca236440126f75c91281c53f137794b8d5f884a Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 10 Sep 2020 14:03:27 +0100
Subject: mm: Implement readahead_control pageset expansion

Provide a function, readahead_expand(), that expands the set of pages
specified by a readahead_control object to encompass a revised area with a
proposed start and length.

The proposed area must include all of the old area and may be expanded yet
more by this function so that the edges align on (transparent huge) page
boundaries as allocated.

The expansion will be cut short if a page already exists in either of the
areas being expanded into.  Note that any expansion made in such a case is
not rolled back.

This will be used by fscache so that reads can be expanded to cache granule
boundaries, thereby allowing whole granules to be stored in the cache, but
there are other potential users also.

Changes:
v6:
- Fold in a patch from Matthew Wilcox to tell the ondemand readahead
  algorithm about the expansion so that the next readahead starts at the
  right place[2].

v4:
- Moved the declaration of readahead_expand() to a better place[1].

Suggested-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Tested-by: Jeff Layton <jlayton@kernel.org>
Tested-by: Dave Wysochanski <dwysocha@redhat.com>
Tested-By: Marc Dionne <marc.dionne@auristor.com>
cc: Alexander Viro <viro@zeniv.linux.org.uk>
cc: Christoph Hellwig <hch@lst.de>
cc: Mike Marshall <hubcap@omnibond.com>
cc: linux-mm@kvack.org
cc: linux-cachefs@redhat.com
cc: linux-afs@lists.infradead.org
cc: linux-nfs@vger.kernel.org
cc: linux-cifs@vger.kernel.org
cc: ceph-devel@vger.kernel.org
cc: v9fs-developer@lists.sourceforge.net
cc: linux-fsdevel@vger.kernel.org
Link: https://lore.kernel.org/r/20210217161358.GM2858050@casper.infradead.org/ [1]
Link: https://lore.kernel.org/r/20210407201857.3582797-4-willy@infradead.org/ [2]
Link: https://lore.kernel.org/r/159974633888.2094769.8326206446358128373.stgit@warthog.procyon.org.uk/
Link: https://lore.kernel.org/r/160588479816.3465195.553952688795241765.stgit@warthog.procyon.org.uk/ # rfc
Link: https://lore.kernel.org/r/161118131787.1232039.4863969952441067985.stgit@warthog.procyon.org.uk/ # rfc
Link: https://lore.kernel.org/r/161161028670.2537118.13831420617039766044.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/161340389201.1303470.14353807284546854878.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/161539530488.286939.18085961677838089157.stgit@warthog.procyon.org.uk/ # v4
Link: https://lore.kernel.org/r/161653789422.2770958.2108046612147345000.stgit@warthog.procyon.org.uk/ # v5
Link: https://lore.kernel.org/r/161789069829.6155.4295672417565512161.stgit@warthog.procyon.org.uk/ # v6
---
 include/linux/pagemap.h |  2 ++
 mm/readahead.c          | 75 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 77 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 4220ded38f4b..63ca6430aef5 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -839,6 +839,8 @@ void page_cache_ra_unbounded(struct readahead_control *,
 void page_cache_sync_ra(struct readahead_control *, unsigned long req_count);
 void page_cache_async_ra(struct readahead_control *, struct page *,
 		unsigned long req_count);
+void readahead_expand(struct readahead_control *ractl,
+		      loff_t new_start, size_t new_len);
 
 /**
  * page_cache_sync_readahead - generic file readahead
diff --git a/mm/readahead.c b/mm/readahead.c
index 5b423ecc99f1..d589f147f4c2 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -638,3 +638,78 @@ SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count)
 {
 	return ksys_readahead(fd, offset, count);
 }
+
+/**
+ * readahead_expand - Expand a readahead request
+ * @ractl: The request to be expanded
+ * @new_start: The revised start
+ * @new_len: The revised size of the request
+ *
+ * Attempt to expand a readahead request outwards from the current size to the
+ * specified size by inserting locked pages before and after the current window
+ * to increase the size to the new window.  This may involve the insertion of
+ * THPs, in which case the window may get expanded even beyond what was
+ * requested.
+ *
+ * The algorithm will stop if it encounters a conflicting page already in the
+ * pagecache and leave a smaller expansion than requested.
+ *
+ * The caller must check for this by examining the revised @ractl object for a
+ * different expansion than was requested.
+ */
+void readahead_expand(struct readahead_control *ractl,
+		      loff_t new_start, size_t new_len)
+{
+	struct address_space *mapping = ractl->mapping;
+	struct file_ra_state *ra = ractl->ra;
+	pgoff_t new_index, new_nr_pages;
+	gfp_t gfp_mask = readahead_gfp_mask(mapping);
+
+	new_index = new_start / PAGE_SIZE;
+
+	/* Expand the leading edge downwards */
+	while (ractl->_index > new_index) {
+		unsigned long index = ractl->_index - 1;
+		struct page *page = xa_load(&mapping->i_pages, index);
+
+		if (page && !xa_is_value(page))
+			return; /* Page apparently present */
+
+		page = __page_cache_alloc(gfp_mask);
+		if (!page)
+			return;
+		if (add_to_page_cache_lru(page, mapping, index, gfp_mask) < 0) {
+			put_page(page);
+			return;
+		}
+
+		ractl->_nr_pages++;
+		ractl->_index = page->index;
+	}
+
+	new_len += new_start - readahead_pos(ractl);
+	new_nr_pages = DIV_ROUND_UP(new_len, PAGE_SIZE);
+
+	/* Expand the trailing edge upwards */
+	while (ractl->_nr_pages < new_nr_pages) {
+		unsigned long index = ractl->_index + ractl->_nr_pages;
+		struct page *page = xa_load(&mapping->i_pages, index);
+
+		if (page && !xa_is_value(page))
+			return; /* Page apparently present */
+
+		page = __page_cache_alloc(gfp_mask);
+		if (!page)
+			return;
+		if (add_to_page_cache_lru(page, mapping, index, gfp_mask) < 0) {
+			put_page(page);
+			return;
+		}
+		ractl->_nr_pages++;
+		if (ra) {
+			ra->size++;
+			ra->async_size++;
+		}
+	}
+}
+EXPORT_SYMBOL(readahead_expand);
-- 
cgit v1.2.3


From b533a83f2bf97c22ab862a7493d13d80c93696f0 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 15 Feb 2021 13:23:33 +0000
Subject: netfs, mm: Move PG_fscache helper funcs to linux/netfs.h

Move the PG_fscache related helper funcs (such as SetPageFsCache()) to
linux/netfs.h rather than linux/fscache.h as the intention is to move to a
model where they're used by the network filesystem and the helper library,
but not by fscache/cachefiles itself.

Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: Jeff Layton <jlayton@kernel.org>
Tested-by: Dave Wysochanski <dwysocha@redhat.com>
Tested-By: Marc Dionne <marc.dionne@auristor.com>
cc: Matthew Wilcox <willy@infradead.org>
cc: linux-mm@kvack.org
cc: linux-cachefs@redhat.com
cc: linux-afs@lists.infradead.org
cc: linux-nfs@vger.kernel.org
cc: linux-cifs@vger.kernel.org
cc: ceph-devel@vger.kernel.org
cc: v9fs-developer@lists.sourceforge.net
cc: linux-fsdevel@vger.kernel.org
Link: https://lore.kernel.org/r/161340392347.1303470.18065131603507621762.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/161539534516.286939.6265142985563005000.stgit@warthog.procyon.org.uk/ # v4
Link: https://lore.kernel.org/r/161653792959.2770958.5386546945273988117.stgit@warthog.procyon.org.uk/ # v5
Link: https://lore.kernel.org/r/161789073997.6155.18442271115255650614.stgit@warthog.procyon.org.uk/ # v6
---
 include/linux/fscache.h | 11 +----------
 include/linux/netfs.h   | 29 +++++++++++++++++++++++++++++
 2 files changed, 30 insertions(+), 10 deletions(-)
 create mode 100644 include/linux/netfs.h

(limited to 'include/linux')

diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index a1c928fe98e7..1f8dc72369ee 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -19,6 +19,7 @@
 #include <linux/pagemap.h>
 #include <linux/pagevec.h>
 #include <linux/list_bl.h>
+#include <linux/netfs.h>
 
 #if defined(CONFIG_FSCACHE) || defined(CONFIG_FSCACHE_MODULE)
 #define fscache_available() (1)
@@ -29,16 +30,6 @@
 #endif
 
 
-/*
- * overload PG_private_2 to give us PG_fscache - this is used to indicate that
- * a page is currently backed by a local disk cache
- */
-#define PageFsCache(page)		PagePrivate2((page))
-#define SetPageFsCache(page)		SetPagePrivate2((page))
-#define ClearPageFsCache(page)		ClearPagePrivate2((page))
-#define TestSetPageFsCache(page)	TestSetPagePrivate2((page))
-#define TestClearPageFsCache(page)	TestClearPagePrivate2((page))
-
 /* pattern used to fill dead space in an index entry */
 #define FSCACHE_INDEX_DEADFILL_PATTERN 0x79
 
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
new file mode 100644
index 000000000000..cc1102040488
--- /dev/null
+++ b/include/linux/netfs.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Network filesystem support services.
+ *
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * See:
+ *
+ *	Documentation/filesystems/netfs_library.rst
+ *
+ * for a description of the network filesystem interface declared here.
+ */
+
+#ifndef _LINUX_NETFS_H
+#define _LINUX_NETFS_H
+
+#include <linux/pagemap.h>
+
+/*
+ * Overload PG_private_2 to give us PG_fscache - this is used to indicate that
+ * a page is currently backed by a local disk cache
+ */
+#define PageFsCache(page)		PagePrivate2((page))
+#define SetPageFsCache(page)		SetPagePrivate2((page))
+#define ClearPageFsCache(page)		ClearPagePrivate2((page))
+#define TestSetPageFsCache(page)	TestSetPagePrivate2((page))
+#define TestClearPageFsCache(page)	TestClearPagePrivate2((page))
+
+#endif /* _LINUX_NETFS_H */
-- 
cgit v1.2.3


From 99bff93c17c05470196b2c4e699c3e58d327022b Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 15 Feb 2021 13:20:46 +0000
Subject: netfs, mm: Add set/end/wait_on_page_fscache() aliases

Add set/end/wait_on_page_fscache() as aliases of
set/end/wait_page_private_2().  These allow a page to be marked with
PG_fscache, the flag to be removed and waiters woken, and callers to wait
for the flag to be cleared.  A ref on the page is also taken and dropped.

[Linus suggested putting the fscache-themed functions into the
 caching-specific headers rather than pagemap.h[1]]

Changes:
v5:
- Mirror the changes to the core routines[2].

Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: Jeff Layton <jlayton@kernel.org>
Tested-by: Dave Wysochanski <dwysocha@redhat.com>
Tested-By: Marc Dionne <marc.dionne@auristor.com>
cc: Linus Torvalds <torvalds@linux-foundation.org>
cc: Matthew Wilcox <willy@infradead.org>
cc: linux-mm@kvack.org
cc: linux-cachefs@redhat.com
cc: linux-afs@lists.infradead.org
cc: linux-nfs@vger.kernel.org
cc: linux-cifs@vger.kernel.org
cc: ceph-devel@vger.kernel.org
cc: v9fs-developer@lists.sourceforge.net
cc: linux-fsdevel@vger.kernel.org
Link: https://lore.kernel.org/r/1330473.1612974547@warthog.procyon.org.uk/
Link: https://lore.kernel.org/r/CAHk-=wjgA-74ddehziVk=XAEMTKswPu1Yw4uaro1R3ibs27ztw@mail.gmail.com/ [1]
Link: https://lore.kernel.org/r/161340393568.1303470.4997526899111310530.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/161539536093.286939.5076448803512118764.stgit@warthog.procyon.org.uk/ # v4
Link: https://lore.kernel.org/r/2499407.1616505440@warthog.procyon.org.uk/ [2]
Link: https://lore.kernel.org/r/161653793873.2770958.12157243390965814502.stgit@warthog.procyon.org.uk/ # v5
Link: https://lore.kernel.org/r/161789075327.6155.7432127924219092385.stgit@warthog.procyon.org.uk/ # v6
---
 include/linux/netfs.h | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index cc1102040488..8479d63406f7 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -26,4 +26,61 @@
 #define TestSetPageFsCache(page)	TestSetPagePrivate2((page))
 #define TestClearPageFsCache(page)	TestClearPagePrivate2((page))
 
+/**
+ * set_page_fscache - Set PG_fscache on a page and take a ref
+ * @page: The page.
+ *
+ * Set the PG_fscache (PG_private_2) flag on a page and take the reference
+ * needed for the VM to handle its lifetime correctly.  This sets the flag and
+ * takes the reference unconditionally, so care must be taken not to set the
+ * flag again if it's already set.
+ */
+static inline void set_page_fscache(struct page *page)
+{
+	set_page_private_2(page);
+}
+
+/**
+ * end_page_fscache - Clear PG_fscache and release any waiters
+ * @page: The page
+ *
+ * Clear the PG_fscache (PG_private_2) bit on a page and wake up any sleepers
+ * waiting for this.  The page ref held for PG_private_2 being set is released.
+ *
+ * This is, for example, used when a netfs page is being written to a local
+ * disk cache, thereby allowing writes to the cache for the same page to be
+ * serialised.
+ */
+static inline void end_page_fscache(struct page *page)
+{
+	end_page_private_2(page);
+}
+
+/**
+ * wait_on_page_fscache - Wait for PG_fscache to be cleared on a page
+ * @page: The page to wait on
+ *
+ * Wait for PG_fscache (aka PG_private_2) to be cleared on a page.
+ */
+static inline void wait_on_page_fscache(struct page *page)
+{
+	wait_on_page_private_2(page);
+}
+
+/**
+ * wait_on_page_fscache_killable - Wait for PG_fscache to be cleared on a page
+ * @page: The page to wait on
+ *
+ * Wait for PG_fscache (aka PG_private_2) to be cleared on a page or until a
+ * fatal signal is received by the calling task.
+ *
+ * Return:
+ * - 0 if successful.
+ * - -EINTR if a fatal signal was encountered.
+ */
+static inline int wait_on_page_fscache_killable(struct page *page)
+{
+	return wait_on_page_private_2_killable(page);
+}
+
 #endif /* _LINUX_NETFS_H */
-- 
cgit v1.2.3


From 3d3c95046742e4eebaa4b891b0b01cbbed94ebbd Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 13 May 2020 17:41:20 +0100
Subject: netfs: Provide readahead and readpage netfs helpers

Add a pair of helper functions:

 (*) netfs_readahead()
 (*) netfs_readpage()

to do the work of handling a readahead or a readpage, where the page(s)
that form part of the request may be split between the local cache, the
server or just require clearing, and may be single pages and transparent
huge pages.  This is all handled within the helper.

Note that while both will read from the cache if there is data present,
only netfs_readahead() will expand the request beyond what it was asked to
do, and only netfs_readahead() will write back to the cache.

netfs_readpage(), on the other hand, is synchronous and only fetches the
page (which might be a THP) it is asked for.

The netfs gives the helper parameters from the VM, the cache cookie it
wants to use (or NULL) and a table of operations (only one of which is
mandatory):

 (*) expand_readahead() [optional]

     Called to allow the netfs to request an expansion of a readahead
     request to meet its own alignment requirements.  This is done by
     changing rreq->start and rreq->len.

 (*) clamp_length() [optional]

     Called to allow the netfs to cut down a subrequest to meet its own
     boundary requirements.  If it does this, the helper will generate
     additional subrequests until the full request is satisfied.

 (*) is_still_valid() [optional]

     Called to find out if the data just read from the cache has been
     invalidated and must be reread from the server.

 (*) issue_op() [required]

     Called to ask the netfs to issue a read to the server.  The subrequest
     describes the read.  The read request holds information about the file
     being accessed.

     The netfs can cache information in rreq->netfs_priv.

     Upon completion, the netfs should set the error, transferred and can
     also set NETFS_SREQ_CLEAR_TAIL and then call
     netfs_subreq_terminated().

 (*) done() [optional]

     Called after the pages have been unlocked.  The read request is still
     pinning the file and mapping and may still be pinning pages with
     PG_fscache.  rreq->error indicates any error that has been
     accumulated.

 (*) cleanup() [optional]

     Called when the helper is disposing of a finished read request.  This
     allows the netfs to clear rreq->netfs_priv.

Netfs support is enabled with CONFIG_NETFS_SUPPORT=y.  It will be built
even if CONFIG_FSCACHE=n and in this case much of it should be optimised
away, allowing the filesystem to use it even when caching is disabled.

Changes:
v5:
 - Comment why netfs_readahead() is putting pages[2].
 - Use page_file_mapping() rather than page->mapping[2].
 - Use page_index() rather than page->index[2].
 - Use set_page_fscache()[3] rather than SetPageFsCache() as this takes an
   appropriate ref too[4].

v4:
 - Folded in a kerneldoc comment fix.
 - Folded in a fix for the error handling in the case that ENOMEM occurs.
 - Added flag to netfs_subreq_terminated() to indicate that the caller may
   have been running async and stuff that might sleep needs punting to a
   workqueue (can't use in_softirq()[1]).

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-and-tested-by: Jeff Layton <jlayton@kernel.org>
Tested-by: Dave Wysochanski <dwysocha@redhat.com>
Tested-By: Marc Dionne <marc.dionne@auristor.com>
cc: Matthew Wilcox <willy@infradead.org>
cc: linux-mm@kvack.org
cc: linux-cachefs@redhat.com
cc: linux-afs@lists.infradead.org
cc: linux-nfs@vger.kernel.org
cc: linux-cifs@vger.kernel.org
cc: ceph-devel@vger.kernel.org
cc: v9fs-developer@lists.sourceforge.net
cc: linux-fsdevel@vger.kernel.org
Link: https://lore.kernel.org/r/20210216084230.GA23669@lst.de/ [1]
Link: https://lore.kernel.org/r/20210321014202.GF3420@casper.infradead.org/ [2]
Link: https://lore.kernel.org/r/2499407.1616505440@warthog.procyon.org.uk/ [3]
Link: https://lore.kernel.org/r/CAHk-=wh+2gbF7XEjYc=HV9w_2uVzVf7vs60BPz0gFA=+pUm3ww@mail.gmail.com/ [4]
Link: https://lore.kernel.org/r/160588497406.3465195.18003475695899726222.stgit@warthog.procyon.org.uk/ # rfc
Link: https://lore.kernel.org/r/161118136849.1232039.8923686136144228724.stgit@warthog.procyon.org.uk/ # rfc
Link: https://lore.kernel.org/r/161161032290.2537118.13400578415247339173.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/161340394873.1303470.6237319335883242536.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/161539537375.286939.16642940088716990995.stgit@warthog.procyon.org.uk/ # v4
Link: https://lore.kernel.org/r/161653795430.2770958.4947584573720000554.stgit@warthog.procyon.org.uk/ # v5
Link: https://lore.kernel.org/r/161789076581.6155.6745849361504760209.stgit@warthog.procyon.org.uk/ # v6
---
 fs/Kconfig             |   1 +
 fs/Makefile            |   1 +
 fs/netfs/Makefile      |   6 +
 fs/netfs/internal.h    |  61 +++++
 fs/netfs/read_helper.c | 725 +++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/netfs.h  |  83 ++++++
 6 files changed, 877 insertions(+)
 create mode 100644 fs/netfs/Makefile
 create mode 100644 fs/netfs/internal.h
 create mode 100644 fs/netfs/read_helper.c

(limited to 'include/linux')

diff --git a/fs/Kconfig b/fs/Kconfig
index a55bda4233bb..97e7b77c9309 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -125,6 +125,7 @@ source "fs/overlayfs/Kconfig"
 
 menu "Caches"
 
+source "fs/netfs/Kconfig"
 source "fs/fscache/Kconfig"
 source "fs/cachefiles/Kconfig"
 
diff --git a/fs/Makefile b/fs/Makefile
index 3215fe205256..9c708e1fbe8f 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -67,6 +67,7 @@ obj-y				+= devpts/
 obj-$(CONFIG_DLM)		+= dlm/
  
 # Do not add any filesystems before this line
+obj-$(CONFIG_NETFS_SUPPORT)	+= netfs/
 obj-$(CONFIG_FSCACHE)		+= fscache/
 obj-$(CONFIG_REISERFS_FS)	+= reiserfs/
 obj-$(CONFIG_EXT4_FS)		+= ext4/
diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile
new file mode 100644
index 000000000000..4b4eff2ba369
--- /dev/null
+++ b/fs/netfs/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0
+
+netfs-y := \
+	read_helper.o
+
+obj-$(CONFIG_NETFS_SUPPORT) := netfs.o
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
new file mode 100644
index 000000000000..ee665c0e7dc8
--- /dev/null
+++ b/fs/netfs/internal.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Internal definitions for network filesystem support
+ *
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) "netfs: " fmt
+
+/*
+ * read_helper.c
+ */
+extern unsigned int netfs_debug;
+
+#define netfs_stat(x) do {} while(0)
+#define netfs_stat_d(x) do {} while(0)
+
+/*****************************************************************************/
+/*
+ * debug tracing
+ */
+#define dbgprintk(FMT, ...) \
+	printk("[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
+
+#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
+#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
+#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
+
+#ifdef __KDEBUG
+#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__)
+#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__)
+#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__)
+
+#elif defined(CONFIG_NETFS_DEBUG)
+#define _enter(FMT, ...)			\
+do {						\
+	if (netfs_debug)			\
+		kenter(FMT, ##__VA_ARGS__);	\
+} while (0)
+
+#define _leave(FMT, ...)			\
+do {						\
+	if (netfs_debug)			\
+		kleave(FMT, ##__VA_ARGS__);	\
+} while (0)
+
+#define _debug(FMT, ...)			\
+do {						\
+	if (netfs_debug)			\
+		kdebug(FMT, ##__VA_ARGS__);	\
+} while (0)
+
+#else
+#define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__)
+#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
+#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
+#endif
diff --git a/fs/netfs/read_helper.c b/fs/netfs/read_helper.c
new file mode 100644
index 000000000000..30d4bf6bf28a
--- /dev/null
+++ b/fs/netfs/read_helper.c
@@ -0,0 +1,725 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Network filesystem high-level read support.
+ *
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/module.h>
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/uio.h>
+#include <linux/sched/mm.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/netfs.h>
+#include "internal.h"
+
+MODULE_DESCRIPTION("Network fs support");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
+
+unsigned netfs_debug;	/* debugging mask, exported as module parameter "debug" */
+module_param_named(debug, netfs_debug, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(debug, "Netfs support debugging mask");
+
+static void netfs_rreq_work(struct work_struct *);
+static void __netfs_put_subrequest(struct netfs_read_subrequest *, bool);
+
+static void netfs_put_subrequest(struct netfs_read_subrequest *subreq,
+				 bool was_async)
+{
+	if (refcount_dec_and_test(&subreq->usage))
+		__netfs_put_subrequest(subreq, was_async);
+}
+
+static struct netfs_read_request *netfs_alloc_read_request(
+	const struct netfs_read_request_ops *ops, void *netfs_priv,
+	struct file *file)
+{
+	static atomic_t debug_ids;
+	struct netfs_read_request *rreq;
+
+	rreq = kzalloc(sizeof(struct netfs_read_request), GFP_KERNEL);
+	if (rreq) {
+		rreq->netfs_ops	= ops;
+		rreq->netfs_priv = netfs_priv;
+		rreq->inode	= file_inode(file);
+		rreq->i_size	= i_size_read(rreq->inode);
+		rreq->debug_id	= atomic_inc_return(&debug_ids);
+		INIT_LIST_HEAD(&rreq->subrequests);
+		INIT_WORK(&rreq->work, netfs_rreq_work);
+		refcount_set(&rreq->usage, 1);
+		__set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
+		if (ops->init_rreq)	/* optional hook: only issue_op is mandatory */
+			ops->init_rreq(rreq, file);
+	}
+	return rreq;
+}
+
+static void netfs_get_read_request(struct netfs_read_request *rreq)
+{
+	refcount_inc(&rreq->usage);
+}
+
+static void netfs_rreq_clear_subreqs(struct netfs_read_request *rreq,
+				     bool was_async)
+{
+	struct netfs_read_subrequest *subreq;
+
+	while (!list_empty(&rreq->subrequests)) {
+		subreq = list_first_entry(&rreq->subrequests,
+					  struct netfs_read_subrequest, rreq_link);
+		list_del(&subreq->rreq_link);
+		netfs_put_subrequest(subreq, was_async);
+	}
+}
+
+static void netfs_free_read_request(struct work_struct *work)
+{
+	struct netfs_read_request *rreq =
+		container_of(work, struct netfs_read_request, work);
+	netfs_rreq_clear_subreqs(rreq, false);
+	if (rreq->netfs_priv)
+		rreq->netfs_ops->cleanup(rreq->mapping, rreq->netfs_priv);
+	kfree(rreq);
+}
+
+static void netfs_put_read_request(struct netfs_read_request *rreq, bool was_async)
+{
+	if (refcount_dec_and_test(&rreq->usage)) {
+		if (was_async) {
+			rreq->work.func = netfs_free_read_request;
+			if (!queue_work(system_unbound_wq, &rreq->work))
+				BUG();
+		} else {
+			netfs_free_read_request(&rreq->work);
+		}
+	}
+}
+
+/*
+ * Allocate and partially initialise an I/O request structure.
+ */
+static struct netfs_read_subrequest *netfs_alloc_subrequest(
+	struct netfs_read_request *rreq)
+{
+	struct netfs_read_subrequest *subreq;
+
+	subreq = kzalloc(sizeof(struct netfs_read_subrequest), GFP_KERNEL);
+	if (subreq) {
+		INIT_LIST_HEAD(&subreq->rreq_link);
+		refcount_set(&subreq->usage, 2);
+		subreq->rreq = rreq;
+		netfs_get_read_request(rreq);
+	}
+
+	return subreq;
+}
+
+static void netfs_get_read_subrequest(struct netfs_read_subrequest *subreq)
+{
+	refcount_inc(&subreq->usage);
+}
+
+static void __netfs_put_subrequest(struct netfs_read_subrequest *subreq,
+				   bool was_async)
+{
+	struct netfs_read_request *rreq = subreq->rreq;
+
+	kfree(subreq);
+	netfs_put_read_request(rreq, was_async);
+}
+
+/*
+ * Clear the unread part of an I/O request.
+ */
+static void netfs_clear_unread(struct netfs_read_subrequest *subreq)
+{
+	struct iov_iter iter;
+
+	iov_iter_xarray(&iter, WRITE, &subreq->rreq->mapping->i_pages,
+			subreq->start + subreq->transferred,
+			subreq->len   - subreq->transferred);
+	iov_iter_zero(iov_iter_count(&iter), &iter);
+}
+
+/*
+ * Fill a subrequest region with zeroes.
+ */
+static void netfs_fill_with_zeroes(struct netfs_read_request *rreq,
+				   struct netfs_read_subrequest *subreq)
+{
+	__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+	netfs_subreq_terminated(subreq, 0, false);
+}
+
+/*
+ * Ask the netfs to issue a read request to the server for us.
+ *
+ * The netfs is expected to read from subreq->start + subreq->transferred to
+ * subreq->start + subreq->len - 1.  It may not backtrack and write data into the
+ * buffer prior to the transferred point as it might clobber dirty data
+ * obtained from the cache.
+ *
+ * Alternatively, the netfs is allowed to indicate one of two things:
+ *
+ * - NETFS_SREQ_SHORT_READ: A short read - it will get called again to try and
+ *   make progress.
+ *
+ * - NETFS_SREQ_CLEAR_TAIL: A short read - the rest of the buffer will be
+ *   cleared.
+ */
+static void netfs_read_from_server(struct netfs_read_request *rreq,
+				   struct netfs_read_subrequest *subreq)
+{
+	rreq->netfs_ops->issue_op(subreq);
+}
+
+/*
+ * Release those waiting.
+ */
+static void netfs_rreq_completed(struct netfs_read_request *rreq, bool was_async)
+{
+	netfs_rreq_clear_subreqs(rreq, was_async);
+	netfs_put_read_request(rreq, was_async);
+}
+
+/*
+ * Unlock the pages in a read operation.  We need to set PG_fscache on any
+ * pages we're going to write back before we unlock them.
+ */
+static void netfs_rreq_unlock(struct netfs_read_request *rreq)
+{
+	struct netfs_read_subrequest *subreq;
+	struct page *page;
+	unsigned int iopos, account = 0;
+	pgoff_t start_page = rreq->start / PAGE_SIZE;
+	pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
+	bool subreq_failed = false;
+	int i;
+
+	XA_STATE(xas, &rreq->mapping->i_pages, start_page);
+
+	if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) {
+		__clear_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags);
+		list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
+			__clear_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags);
+		}
+	}
+
+	/* Walk through the pagecache and the I/O request lists simultaneously.
+	 * We may have a mixture of cached and uncached sections and we only
+	 * really want to write out the uncached sections.  This is slightly
+	 * complicated by the possibility that we might have huge pages with a
+	 * mixture inside.
+	 */
+	subreq = list_first_entry_or_null(&rreq->subrequests,
+					  struct netfs_read_subrequest, rreq_link);
+	iopos = 0;
+	subreq_failed = subreq && subreq->error < 0; /* list may be empty */
+
+	rcu_read_lock();
+	xas_for_each(&xas, page, last_page) {
+		unsigned int pgpos = (page->index - start_page) * PAGE_SIZE;
+		unsigned int pgend = pgpos + thp_size(page);
+		bool pg_failed = false;
+
+		for (;;) {
+			if (!subreq) {
+				pg_failed = true;
+				break;
+			}
+			if (test_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags))
+				set_page_fscache(page);
+			pg_failed |= subreq_failed;
+			if (pgend < iopos + subreq->len)
+				break;
+
+			account += subreq->transferred;
+			iopos += subreq->len;
+			if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
+				subreq = list_next_entry(subreq, rreq_link);
+				subreq_failed = (subreq->error < 0);
+			} else {
+				subreq = NULL;
+				subreq_failed = false;
+			}
+			if (pgend == iopos)
+				break;
+		}
+
+		if (!pg_failed) {
+			for (i = 0; i < thp_nr_pages(page); i++)
+				flush_dcache_page(page + i); /* every subpage of a THP */
+			SetPageUptodate(page);
+		}
+
+		if (!test_bit(NETFS_RREQ_DONT_UNLOCK_PAGES, &rreq->flags)) {
+			if (page->index == rreq->no_unlock_page &&
+			    test_bit(NETFS_RREQ_NO_UNLOCK_PAGE, &rreq->flags))
+				_debug("no unlock");
+			else
+				unlock_page(page);
+		}
+	}
+	rcu_read_unlock();
+
+	task_io_account_read(account);
+	if (rreq->netfs_ops->done)
+		rreq->netfs_ops->done(rreq);
+}
+
+/*
+ * Handle a short read.
+ */
+static void netfs_rreq_short_read(struct netfs_read_request *rreq,
+				  struct netfs_read_subrequest *subreq)
+{
+	__clear_bit(NETFS_SREQ_SHORT_READ, &subreq->flags);
+	__set_bit(NETFS_SREQ_SEEK_DATA_READ, &subreq->flags);
+
+	netfs_get_read_subrequest(subreq);
+	atomic_inc(&rreq->nr_rd_ops);
+	netfs_read_from_server(rreq, subreq);
+}
+
+/*
+ * Resubmit any short or failed operations.  Returns true if we got the rreq
+ * ref back.
+ */
+static bool netfs_rreq_perform_resubmissions(struct netfs_read_request *rreq)
+{
+	struct netfs_read_subrequest *subreq;
+
+	WARN_ON(in_interrupt());
+
+	/* We don't want terminating submissions trying to wake us up whilst
+	 * we're still going through the list.
+	 */
+	atomic_inc(&rreq->nr_rd_ops);
+
+	__clear_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
+	list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
+		if (subreq->error) {
+			if (subreq->source != NETFS_READ_FROM_CACHE)
+				break;
+			subreq->source = NETFS_DOWNLOAD_FROM_SERVER;
+			subreq->error = 0;
+			netfs_get_read_subrequest(subreq);
+			atomic_inc(&rreq->nr_rd_ops);
+			netfs_read_from_server(rreq, subreq);
+		} else if (test_bit(NETFS_SREQ_SHORT_READ, &subreq->flags)) {
+			netfs_rreq_short_read(rreq, subreq);
+		}
+	}
+
+	/* If we decrement nr_rd_ops to 0, the usage ref belongs to us. */
+	if (atomic_dec_and_test(&rreq->nr_rd_ops))
+		return true;
+
+	wake_up_var(&rreq->nr_rd_ops);
+	return false;
+}
+
+/*
+ * Assess the state of a read request and decide what to do next.
+ *
+ * Note that we could be in an ordinary kernel thread, on a workqueue or in
+ * softirq context at this point.  We inherit a ref from the caller.
+ */
+static void netfs_rreq_assess(struct netfs_read_request *rreq, bool was_async)
+{
+again:
+	if (!test_bit(NETFS_RREQ_FAILED, &rreq->flags) &&
+	    test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags)) {
+		if (netfs_rreq_perform_resubmissions(rreq))
+			goto again;
+		return;
+	}
+
+	netfs_rreq_unlock(rreq);
+
+	clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
+	wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS);
+
+	netfs_rreq_completed(rreq, was_async);
+}
+
+static void netfs_rreq_work(struct work_struct *work)
+{
+	struct netfs_read_request *rreq =
+		container_of(work, struct netfs_read_request, work);
+	netfs_rreq_assess(rreq, false);
+}
+
+/*
+ * Handle the completion of all outstanding I/O operations on a read request.
+ * We inherit a ref from the caller.
+ */
+static void netfs_rreq_terminated(struct netfs_read_request *rreq,
+				  bool was_async)
+{
+	if (test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags) &&
+	    was_async) {
+		if (!queue_work(system_unbound_wq, &rreq->work))
+			BUG();
+	} else {
+		netfs_rreq_assess(rreq, was_async);
+	}
+}
+
+/**
+ * netfs_subreq_terminated - Note the termination of an I/O operation.
+ * @subreq: The I/O request that has terminated.
+ * @transferred_or_error: The amount of data transferred or an error code.
+ * @was_async: The termination was asynchronous
+ *
+ * This tells the read helper that a contributory I/O operation has terminated,
+ * one way or another, and that it should integrate the results.
+ *
+ * The caller indicates in @transferred_or_error the outcome of the operation,
+ * supplying a positive value to indicate the number of bytes transferred, 0 to
+ * indicate a failure to transfer anything that should be retried or a negative
+ * error code.  The helper will look after reissuing I/O operations as
+ * appropriate and writing downloaded data to the cache.
+ *
+ * If @was_async is true, the caller might be running in softirq or interrupt
+ * context and we can't sleep.
+ */
+void netfs_subreq_terminated(struct netfs_read_subrequest *subreq,
+			     ssize_t transferred_or_error,
+			     bool was_async)
+{
+	struct netfs_read_request *rreq = subreq->rreq;
+	int u;
+
+	_enter("[%u]{%llx,%lx},%zd",
+	       subreq->debug_index, subreq->start, subreq->flags,
+	       transferred_or_error);
+
+	if (IS_ERR_VALUE(transferred_or_error)) {
+		subreq->error = transferred_or_error;
+		goto failed;
+	}
+
+	if (WARN(transferred_or_error > subreq->len - subreq->transferred,
+		 "Subreq overread: R%x[%x] %zd > %zu - %zu",
+		 rreq->debug_id, subreq->debug_index,
+		 transferred_or_error, subreq->len, subreq->transferred))
+		transferred_or_error = subreq->len - subreq->transferred;
+
+	subreq->error = 0;
+	subreq->transferred += transferred_or_error;
+	if (subreq->transferred < subreq->len)
+		goto incomplete;
+
+complete:
+	__clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
+	if (test_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags))
+		set_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags);
+
+out:
+	/* If we decrement nr_rd_ops to 0, the ref belongs to us. */
+	u = atomic_dec_return(&rreq->nr_rd_ops);
+	if (u == 0)
+		netfs_rreq_terminated(rreq, was_async);
+	else if (u == 1)
+		wake_up_var(&rreq->nr_rd_ops);
+
+	netfs_put_subrequest(subreq, was_async);
+	return;
+
+incomplete:
+	if (test_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags)) {
+		netfs_clear_unread(subreq);
+		subreq->transferred = subreq->len;
+		goto complete;
+	}
+
+	if (transferred_or_error == 0) {
+		if (__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) {
+			subreq->error = -ENODATA;
+			goto failed;
+		}
+	} else {
+		__clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
+	}
+
+	__set_bit(NETFS_SREQ_SHORT_READ, &subreq->flags);
+	set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
+	goto out;
+
+failed:
+	if (subreq->source == NETFS_READ_FROM_CACHE) {
+		set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
+	} else {
+		set_bit(NETFS_RREQ_FAILED, &rreq->flags);
+		rreq->error = subreq->error;
+	}
+	goto out;
+}
+EXPORT_SYMBOL(netfs_subreq_terminated);
+
+static enum netfs_read_source netfs_cache_prepare_read(struct netfs_read_subrequest *subreq,
+						       loff_t i_size)
+{
+	struct netfs_read_request *rreq = subreq->rreq;
+
+	if (subreq->start >= rreq->i_size)
+		return NETFS_FILL_WITH_ZEROES;
+	return NETFS_DOWNLOAD_FROM_SERVER;
+}
+
+/*
+ * Work out what sort of subrequest the next one will be.
+ */
+static enum netfs_read_source
+netfs_rreq_prepare_read(struct netfs_read_request *rreq,
+			struct netfs_read_subrequest *subreq)
+{
+	enum netfs_read_source source;
+
+	_enter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size);
+
+	source = netfs_cache_prepare_read(subreq, rreq->i_size);
+	if (source == NETFS_INVALID_READ)
+		goto out;
+
+	if (source == NETFS_DOWNLOAD_FROM_SERVER) {
+		/* Call out to the netfs to let it shrink the request to fit
+		 * its own I/O sizes and boundaries.  If it shrinks it here, it
+		 * will be called again to make simultaneous calls; if it wants
+		 * to make serial calls, it can indicate a short read and then
+		 * we will call it again.
+		 */
+		if (subreq->len > rreq->i_size - subreq->start)
+			subreq->len = rreq->i_size - subreq->start;
+
+		if (rreq->netfs_ops->clamp_length &&
+		    !rreq->netfs_ops->clamp_length(subreq)) {
+			source = NETFS_INVALID_READ;
+			goto out;
+		}
+	}
+
+	if (WARN_ON(subreq->len == 0))
+		source = NETFS_INVALID_READ;
+
+out:
+	subreq->source = source;
+	return source;
+}
+
+/*
+ * Slice off a piece of a read request and submit an I/O request for it.
+ */
+static bool netfs_rreq_submit_slice(struct netfs_read_request *rreq,
+				    unsigned int *_debug_index)
+{
+	struct netfs_read_subrequest *subreq;
+	enum netfs_read_source source;
+
+	subreq = netfs_alloc_subrequest(rreq);
+	if (!subreq)
+		return false;
+
+	subreq->debug_index	= (*_debug_index)++;
+	subreq->start		= rreq->start + rreq->submitted;
+	subreq->len		= rreq->len   - rreq->submitted;
+
+	_debug("slice %llx,%zx,%zx", subreq->start, subreq->len, rreq->submitted);
+	list_add_tail(&subreq->rreq_link, &rreq->subrequests);
+
+	/* Call out to the cache to find out what it can do with the remaining
+	 * subset.  It tells us in subreq->flags what it decided should be done
+	 * and adjusts subreq->len down if the subset crosses a cache boundary.
+	 *
+	 * Then when we hand the subset, it can choose to take a subset of that
+	 * (the starts must coincide), in which case, we go around the loop
+	 * again and ask it to download the next piece.
+	 */
+	source = netfs_rreq_prepare_read(rreq, subreq);
+	if (source == NETFS_INVALID_READ)
+		goto subreq_failed;
+
+	atomic_inc(&rreq->nr_rd_ops);
+
+	rreq->submitted += subreq->len;
+
+	switch (source) {
+	case NETFS_FILL_WITH_ZEROES:
+		netfs_fill_with_zeroes(rreq, subreq);
+		break;
+	case NETFS_DOWNLOAD_FROM_SERVER:
+		netfs_read_from_server(rreq, subreq);
+		break;
+	default:
+		BUG();
+	}
+
+	return true;
+
+subreq_failed:
+	rreq->error = subreq->error;
+	netfs_put_subrequest(subreq, false);
+	return false;
+}
+
+static void netfs_rreq_expand(struct netfs_read_request *rreq,
+			      struct readahead_control *ractl)
+{
+	/* Give the netfs a chance to change the request parameters.  The
+	 * resultant request must contain the original region.
+	 */
+	if (rreq->netfs_ops->expand_readahead)
+		rreq->netfs_ops->expand_readahead(rreq);
+
+	/* Expand the request if the cache wants it to start earlier.  Note
+	 * that the expansion may get further extended if the VM wishes to
+	 * insert THPs and the preferred start and/or end wind up in the middle
+	 * of THPs.
+	 *
+	 * If this is the case, however, the THP size should be an integer
+	 * multiple of the cache granule size, so we get a whole number of
+	 * granules to deal with.
+	 */
+	if (rreq->start  != readahead_pos(ractl) ||
+	    rreq->len != readahead_length(ractl)) {
+		readahead_expand(ractl, rreq->start, rreq->len);
+		rreq->start  = readahead_pos(ractl);
+		rreq->len = readahead_length(ractl);
+	}
+}
+
+/**
+ * netfs_readahead - Helper to manage a read request
+ * @ractl: The description of the readahead request
+ * @ops: The network filesystem's operations for the helper to use
+ * @netfs_priv: Private netfs data to be retained in the request
+ *
+ * Fulfil a readahead request by drawing data from the cache if possible, or
+ * the netfs if not.  Space beyond the EOF is zero-filled.  Multiple I/O
+ * requests from different sources will get munged together.  If necessary, the
+ * readahead window can be expanded in either direction to a more convenient
+ * alignment for RPC efficiency or to make storage in the cache feasible.
+ *
+ * The calling netfs must provide a table of operations, only one of which,
+ * issue_op, is mandatory.  It may also be passed a private token, which will
+ * be retained in rreq->netfs_priv and will be cleaned up by ops->cleanup().
+ *
+ * This is usable whether or not caching is enabled.
+ */
+void netfs_readahead(struct readahead_control *ractl,
+		     const struct netfs_read_request_ops *ops,
+		     void *netfs_priv)
+{
+	struct netfs_read_request *rreq;
+	struct page *page;
+	unsigned int debug_index = 0;
+
+	_enter("%lx,%x", readahead_index(ractl), readahead_count(ractl));
+
+	if (readahead_count(ractl) == 0)
+		goto cleanup;
+
+	rreq = netfs_alloc_read_request(ops, netfs_priv, ractl->file);
+	if (!rreq)
+		goto cleanup;
+	rreq->mapping	= ractl->mapping;
+	rreq->start	= readahead_pos(ractl);
+	rreq->len	= readahead_length(ractl);
+
+	netfs_rreq_expand(rreq, ractl);
+
+	atomic_set(&rreq->nr_rd_ops, 1);
+	do {
+		if (!netfs_rreq_submit_slice(rreq, &debug_index))
+			break;
+
+	} while (rreq->submitted < rreq->len);
+
+	/* Drop the refs on the pages here rather than in the cache or
+	 * filesystem.  The locks will be dropped in netfs_rreq_unlock().
+	 */
+	while ((page = readahead_page(ractl)))
+		put_page(page);
+
+	/* If we decrement nr_rd_ops to 0, the ref belongs to us. */
+	if (atomic_dec_and_test(&rreq->nr_rd_ops))
+		netfs_rreq_assess(rreq, false);
+	return;
+
+cleanup:
+	if (netfs_priv)
+		ops->cleanup(ractl->mapping, netfs_priv);
+	return;
+}
+EXPORT_SYMBOL(netfs_readahead);
+
+/**
+ * netfs_readpage - Helper to manage a readpage request
+ * @file: The file to read from
+ * @page: The page to read
+ * @ops: The network filesystem's operations for the helper to use
+ * @netfs_priv: Private netfs data to be retained in the request
+ *
+ * Fulfil a readpage request by drawing data from the cache if possible, or the
+ * netfs if not.  Space beyond the EOF is zero-filled.  Multiple I/O requests
+ * from different sources will get munged together.
+ *
+ * The calling netfs must provide a table of operations, only one of which,
+ * issue_op, is mandatory.  It may also be passed a private token, which will
+ * be retained in rreq->netfs_priv and will be cleaned up by ops->cleanup().
+ *
+ * This is usable whether or not caching is enabled.
+ */
+int netfs_readpage(struct file *file,
+		   struct page *page,
+		   const struct netfs_read_request_ops *ops,
+		   void *netfs_priv)
+{
+	struct netfs_read_request *rreq;
+	unsigned int debug_index = 0;
+	int ret;
+
+	_enter("%lx", page_index(page));
+
+	rreq = netfs_alloc_read_request(ops, netfs_priv, file);
+	if (!rreq) {
+		if (netfs_priv)
+			ops->cleanup(page_file_mapping(page), netfs_priv);
+		unlock_page(page);
+		return -ENOMEM;
+	}
+	rreq->mapping	= page_file_mapping(page);
+	rreq->start	= page_index(page) * PAGE_SIZE;
+	rreq->len	= thp_size(page);
+
+	netfs_get_read_request(rreq);
+
+	atomic_set(&rreq->nr_rd_ops, 1);
+	do {
+		if (!netfs_rreq_submit_slice(rreq, &debug_index))
+			break;
+
+	} while (rreq->submitted < rreq->len);
+
+	/* Keep nr_rd_ops incremented so that the ref always belongs to us, and
+	 * the service code isn't punted off to a random thread pool to
+	 * process.
+	 */
+	do {
+		wait_var_event(&rreq->nr_rd_ops, atomic_read(&rreq->nr_rd_ops) == 1);
+		netfs_rreq_assess(rreq, false);
+	} while (test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags));
+
+	ret = rreq->error;
+	if (ret == 0 && rreq->submitted < rreq->len)
+		ret = -EIO;
+	netfs_put_read_request(rreq, false);
+	return ret;
+}
+EXPORT_SYMBOL(netfs_readpage);
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 8479d63406f7..59e926e62d2e 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -14,6 +14,8 @@
 #ifndef _LINUX_NETFS_H
 #define _LINUX_NETFS_H
 
+#include <linux/workqueue.h>
+#include <linux/fs.h>
 #include <linux/pagemap.h>
 
 /*
@@ -83,4 +85,85 @@ static inline int wait_on_page_fscache_killable(struct page *page)
 	return wait_on_page_private_2_killable(page);
 }
 
+enum netfs_read_source {
+	NETFS_FILL_WITH_ZEROES,
+	NETFS_DOWNLOAD_FROM_SERVER,
+	NETFS_READ_FROM_CACHE,
+	NETFS_INVALID_READ,
+} __mode(byte);
+
+/*
+ * Descriptor for a single component subrequest.
+ */
+struct netfs_read_subrequest {
+	struct netfs_read_request *rreq;	/* Supervising read request */
+	struct list_head	rreq_link;	/* Link in rreq->subrequests */
+	loff_t			start;		/* Where to start the I/O */
+	size_t			len;		/* Size of the I/O */
+	size_t			transferred;	/* Amount of data transferred */
+	refcount_t		usage;
+	short			error;		/* 0 or error that occurred */
+	unsigned short		debug_index;	/* Index in list (for debugging output) */
+	enum netfs_read_source	source;		/* Where to read from */
+	unsigned long		flags;
+#define NETFS_SREQ_WRITE_TO_CACHE	0	/* Set if should write to cache */
+#define NETFS_SREQ_CLEAR_TAIL		1	/* Set if the rest of the read should be cleared */
+#define NETFS_SREQ_SHORT_READ		2	/* Set if there was a short read from the cache */
+#define NETFS_SREQ_SEEK_DATA_READ	3	/* Set if ->read() should SEEK_DATA first */
+#define NETFS_SREQ_NO_PROGRESS		4	/* Set if we didn't manage to read any data */
+};
+
+/*
+ * Descriptor for a read helper request.  This is used to make multiple I/O
+ * requests on a variety of sources and then stitch the result together.
+ */
+struct netfs_read_request {
+	struct work_struct	work;
+	struct inode		*inode;		/* The file being accessed */
+	struct address_space	*mapping;	/* The mapping being accessed */
+	struct list_head	subrequests;	/* Requests to fetch I/O from disk or net */
+	void			*netfs_priv;	/* Private data for the netfs */
+	unsigned int		debug_id;
+	atomic_t		nr_rd_ops;	/* Number of read ops in progress */
+	size_t			submitted;	/* Amount submitted for I/O so far */
+	size_t			len;		/* Length of the request */
+	short			error;		/* 0 or error that occurred */
+	loff_t			i_size;		/* Size of the file */
+	loff_t			start;		/* Start position */
+	pgoff_t			no_unlock_page;	/* Don't unlock this page after read */
+	refcount_t		usage;
+	unsigned long		flags;
+#define NETFS_RREQ_INCOMPLETE_IO	0	/* Some ioreqs terminated short or with error */
+#define NETFS_RREQ_WRITE_TO_CACHE	1	/* Need to write to the cache */
+#define NETFS_RREQ_NO_UNLOCK_PAGE	2	/* Don't unlock no_unlock_page on completion */
+#define NETFS_RREQ_DONT_UNLOCK_PAGES	3	/* Don't unlock the pages on completion */
+#define NETFS_RREQ_FAILED		4	/* The request failed */
+#define NETFS_RREQ_IN_PROGRESS		5	/* Unlocked when the request completes */
+	const struct netfs_read_request_ops *netfs_ops;
+};
+
+/*
+ * Operations the network filesystem can/must provide to the helpers.
+ */
+struct netfs_read_request_ops {
+	void (*init_rreq)(struct netfs_read_request *rreq, struct file *file);
+	void (*expand_readahead)(struct netfs_read_request *rreq);
+	bool (*clamp_length)(struct netfs_read_subrequest *subreq);
+	void (*issue_op)(struct netfs_read_subrequest *subreq);
+	bool (*is_still_valid)(struct netfs_read_request *rreq);
+	void (*done)(struct netfs_read_request *rreq);
+	void (*cleanup)(struct address_space *mapping, void *netfs_priv);
+};
+
+struct readahead_control;
+extern void netfs_readahead(struct readahead_control *,
+			    const struct netfs_read_request_ops *,
+			    void *);
+extern int netfs_readpage(struct file *,
+			  struct page *,
+			  const struct netfs_read_request_ops *,
+			  void *);
+
+extern void netfs_subreq_terminated(struct netfs_read_subrequest *, ssize_t, bool);
+
 #endif /* _LINUX_NETFS_H */
-- 
cgit v1.2.3


From 77b4d2c6316ab096e3f77eea240144941434f2a4 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 18 Sep 2020 09:25:13 +0100
Subject: netfs: Add tracepoints

Add three tracepoints to track the activity of the read helpers:

 (1) netfs/netfs_read

     This logs entry to the read helpers and also expansion of the range in
     a readahead request.

 (2) netfs/netfs_rreq

     This logs the progress of netfs_read_request objects which track
     read requests.  A read request may be a compound of multiple
     subrequests.

 (3) netfs/netfs_sreq

     This logs the progress of netfs_read_subrequest objects, which track
     the contributions from various sources to a read request.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-and-tested-by: Jeff Layton <jlayton@kernel.org>
Tested-by: Dave Wysochanski <dwysocha@redhat.com>
Tested-By: Marc Dionne <marc.dionne@auristor.com>
cc: Matthew Wilcox <willy@infradead.org>
cc: linux-mm@kvack.org
cc: linux-cachefs@redhat.com
cc: linux-afs@lists.infradead.org
cc: linux-nfs@vger.kernel.org
cc: linux-cifs@vger.kernel.org
cc: ceph-devel@vger.kernel.org
cc: v9fs-developer@lists.sourceforge.net
cc: linux-fsdevel@vger.kernel.org
Link: https://lore.kernel.org/r/161118138060.1232039.5353374588021776217.stgit@warthog.procyon.org.uk/ # rfc
Link: https://lore.kernel.org/r/161161033468.2537118.14021843889844001905.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/161340395843.1303470.7355519662919639648.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/161539538693.286939.10171713520419106334.stgit@warthog.procyon.org.uk/ # v4
Link: https://lore.kernel.org/r/161653796447.2770958.1870655382450862155.stgit@warthog.procyon.org.uk/ # v5
Link: https://lore.kernel.org/r/161789078003.6155.17814844411672989942.stgit@warthog.procyon.org.uk/ # v6
---
 fs/netfs/read_helper.c       |  26 ++++++
 include/linux/netfs.h        |   1 +
 include/trace/events/netfs.h | 199 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 226 insertions(+)
 create mode 100644 include/trace/events/netfs.h

(limited to 'include/linux')

diff --git a/fs/netfs/read_helper.c b/fs/netfs/read_helper.c
index 30d4bf6bf28a..799eee7f4ee6 100644
--- a/fs/netfs/read_helper.c
+++ b/fs/netfs/read_helper.c
@@ -16,6 +16,8 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/netfs.h>
 #include "internal.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/netfs.h>
 
 MODULE_DESCRIPTION("Network fs support");
 MODULE_AUTHOR("Red Hat, Inc.");
@@ -84,6 +86,7 @@ static void netfs_free_read_request(struct work_struct *work)
 	netfs_rreq_clear_subreqs(rreq, false);
 	if (rreq->netfs_priv)
 		rreq->netfs_ops->cleanup(rreq->mapping, rreq->netfs_priv);
+	trace_netfs_rreq(rreq, netfs_rreq_trace_free);
 	kfree(rreq);
 }
 
@@ -129,6 +132,7 @@ static void __netfs_put_subrequest(struct netfs_read_subrequest *subreq,
 {
 	struct netfs_read_request *rreq = subreq->rreq;
 
+	trace_netfs_sreq(subreq, netfs_sreq_trace_free);
 	kfree(subreq);
 	netfs_put_read_request(rreq, was_async);
 }
@@ -183,6 +187,7 @@ static void netfs_read_from_server(struct netfs_read_request *rreq,
  */
 static void netfs_rreq_completed(struct netfs_read_request *rreq, bool was_async)
 {
+	trace_netfs_rreq(rreq, netfs_rreq_trace_done);
 	netfs_rreq_clear_subreqs(rreq, was_async);
 	netfs_put_read_request(rreq, was_async);
 }
@@ -221,6 +226,8 @@ static void netfs_rreq_unlock(struct netfs_read_request *rreq)
 	iopos = 0;
 	subreq_failed = (subreq->error < 0);
 
+	trace_netfs_rreq(rreq, netfs_rreq_trace_unlock);
+
 	rcu_read_lock();
 	xas_for_each(&xas, page, last_page) {
 		unsigned int pgpos = (page->index - start_page) * PAGE_SIZE;
@@ -281,6 +288,8 @@ static void netfs_rreq_short_read(struct netfs_read_request *rreq,
 	__clear_bit(NETFS_SREQ_SHORT_READ, &subreq->flags);
 	__set_bit(NETFS_SREQ_SEEK_DATA_READ, &subreq->flags);
 
+	trace_netfs_sreq(subreq, netfs_sreq_trace_resubmit_short);
+
 	netfs_get_read_subrequest(subreq);
 	atomic_inc(&rreq->nr_rd_ops);
 	netfs_read_from_server(rreq, subreq);
@@ -296,6 +305,8 @@ static bool netfs_rreq_perform_resubmissions(struct netfs_read_request *rreq)
 
 	WARN_ON(in_interrupt());
 
+	trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit);
+
 	/* We don't want terminating submissions trying to wake us up whilst
 	 * we're still going through the list.
 	 */
@@ -308,6 +319,7 @@ static bool netfs_rreq_perform_resubmissions(struct netfs_read_request *rreq)
 				break;
 			subreq->source = NETFS_DOWNLOAD_FROM_SERVER;
 			subreq->error = 0;
+			trace_netfs_sreq(subreq, netfs_sreq_trace_download_instead);
 			netfs_get_read_subrequest(subreq);
 			atomic_inc(&rreq->nr_rd_ops);
 			netfs_read_from_server(rreq, subreq);
@@ -332,6 +344,8 @@ static bool netfs_rreq_perform_resubmissions(struct netfs_read_request *rreq)
  */
 static void netfs_rreq_assess(struct netfs_read_request *rreq, bool was_async)
 {
+	trace_netfs_rreq(rreq, netfs_rreq_trace_assess);
+
 again:
 	if (!test_bit(NETFS_RREQ_FAILED, &rreq->flags) &&
 	    test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags)) {
@@ -422,6 +436,8 @@ complete:
 		set_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags);
 
 out:
+	trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
+
 	/* If we decrement nr_rd_ops to 0, the ref belongs to us. */
 	u = atomic_dec_return(&rreq->nr_rd_ops);
 	if (u == 0)
@@ -510,6 +526,7 @@ netfs_rreq_prepare_read(struct netfs_read_request *rreq,
 
 out:
 	subreq->source = source;
+	trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
 	return source;
 }
 
@@ -549,6 +566,7 @@ static bool netfs_rreq_submit_slice(struct netfs_read_request *rreq,
 
 	rreq->submitted += subreq->len;
 
+	trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
 	switch (source) {
 	case NETFS_FILL_WITH_ZEROES:
 		netfs_fill_with_zeroes(rreq, subreq);
@@ -591,6 +609,9 @@ static void netfs_rreq_expand(struct netfs_read_request *rreq,
 		readahead_expand(ractl, rreq->start, rreq->len);
 		rreq->start  = readahead_pos(ractl);
 		rreq->len = readahead_length(ractl);
+
+		trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
+				 netfs_read_trace_expanded);
 	}
 }
 
@@ -632,6 +653,9 @@ void netfs_readahead(struct readahead_control *ractl,
 	rreq->start	= readahead_pos(ractl);
 	rreq->len	= readahead_length(ractl);
 
+	trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
+			 netfs_read_trace_readahead);
+
 	netfs_rreq_expand(rreq, ractl);
 
 	atomic_set(&rreq->nr_rd_ops, 1);
@@ -698,6 +722,8 @@ int netfs_readpage(struct file *file,
 	rreq->start	= page_index(page) * PAGE_SIZE;
 	rreq->len	= thp_size(page);
 
+	trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage);
+
 	netfs_get_read_request(rreq);
 
 	atomic_set(&rreq->nr_rd_ops, 1);
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 59e926e62d2e..8e8c6a4e4dde 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -124,6 +124,7 @@ struct netfs_read_request {
 	struct list_head	subrequests;	/* Requests to fetch I/O from disk or net */
 	void			*netfs_priv;	/* Private data for the netfs */
 	unsigned int		debug_id;
+	unsigned int		cookie_debug_id;
 	atomic_t		nr_rd_ops;	/* Number of read ops in progress */
 	size_t			submitted;	/* Amount submitted for I/O so far */
 	size_t			len;		/* Length of the request */
diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h
new file mode 100644
index 000000000000..12ad382764c5
--- /dev/null
+++ b/include/trace/events/netfs.h
@@ -0,0 +1,199 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Network filesystem support module tracepoints
+ *
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM netfs
+
+#if !defined(_TRACE_NETFS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_NETFS_H
+
+#include <linux/tracepoint.h>
+
+/*
+ * Define enums for tracing information.
+ */
+#ifndef __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY
+#define __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY
+
+enum netfs_read_trace {
+	netfs_read_trace_expanded,
+	netfs_read_trace_readahead,
+	netfs_read_trace_readpage,
+};
+
+enum netfs_rreq_trace {
+	netfs_rreq_trace_assess,
+	netfs_rreq_trace_done,
+	netfs_rreq_trace_free,
+	netfs_rreq_trace_resubmit,
+	netfs_rreq_trace_unlock,
+	netfs_rreq_trace_unmark,
+	netfs_rreq_trace_write,
+};
+
+enum netfs_sreq_trace {
+	netfs_sreq_trace_download_instead,
+	netfs_sreq_trace_free,
+	netfs_sreq_trace_prepare,
+	netfs_sreq_trace_resubmit_short,
+	netfs_sreq_trace_submit,
+	netfs_sreq_trace_terminated,
+	netfs_sreq_trace_write,
+	netfs_sreq_trace_write_term,
+};
+
+#endif
+
+#define netfs_read_traces					\
+	EM(netfs_read_trace_expanded,		"EXPANDED ")	\
+	EM(netfs_read_trace_readahead,		"READAHEAD")	\
+	E_(netfs_read_trace_readpage,		"READPAGE ")
+
+#define netfs_rreq_traces					\
+	EM(netfs_rreq_trace_assess,		"ASSESS")	\
+	EM(netfs_rreq_trace_done,		"DONE  ")	\
+	EM(netfs_rreq_trace_free,		"FREE  ")	\
+	EM(netfs_rreq_trace_resubmit,		"RESUBM")	\
+	EM(netfs_rreq_trace_unlock,		"UNLOCK")	\
+	EM(netfs_rreq_trace_unmark,		"UNMARK")	\
+	E_(netfs_rreq_trace_write,		"WRITE ")
+
+#define netfs_sreq_sources					\
+	EM(NETFS_FILL_WITH_ZEROES,		"ZERO")		\
+	EM(NETFS_DOWNLOAD_FROM_SERVER,		"DOWN")		\
+	EM(NETFS_READ_FROM_CACHE,		"READ")		\
+	E_(NETFS_INVALID_READ,			"INVL")
+
+#define netfs_sreq_traces					\
+	EM(netfs_sreq_trace_download_instead,	"RDOWN")	\
+	EM(netfs_sreq_trace_free,		"FREE ")	\
+	EM(netfs_sreq_trace_prepare,		"PREP ")	\
+	EM(netfs_sreq_trace_resubmit_short,	"SHORT")	\
+	EM(netfs_sreq_trace_submit,		"SUBMT")	\
+	EM(netfs_sreq_trace_terminated,		"TERM ")	\
+	EM(netfs_sreq_trace_write,		"WRITE")	\
+	E_(netfs_sreq_trace_write_term,		"WTERM")
+
+
+/*
+ * Export enum symbols to userspace.
+ */
+#undef EM
+#undef E_
+#define EM(a, b) TRACE_DEFINE_ENUM(a);
+#define E_(a, b) TRACE_DEFINE_ENUM(a);
+
+netfs_read_traces;
+netfs_rreq_traces;
+netfs_sreq_sources;
+netfs_sreq_traces;
+
+/*
+ * Now redefine the EM() and E_() macros to map the enums to the strings that
+ * will be printed in the output.
+ */
+#undef EM
+#undef E_
+#define EM(a, b)	{ a, b },
+#define E_(a, b)	{ a, b }
+
+TRACE_EVENT(netfs_read,
+	    TP_PROTO(struct netfs_read_request *rreq,
+		     loff_t start, size_t len,
+		     enum netfs_read_trace what),
+
+	    TP_ARGS(rreq, start, len, what),
+
+	    TP_STRUCT__entry(
+		    __field(unsigned int,		rreq		)
+		    __field(unsigned int,		cookie		)
+		    __field(loff_t,			start		)
+		    __field(size_t,			len		)
+		    __field(enum netfs_read_trace,	what		)
+			     ),
+
+	    TP_fast_assign(
+		    __entry->rreq	= rreq->debug_id;
+		    __entry->cookie	= rreq->cookie_debug_id;
+		    __entry->start	= start;
+		    __entry->len	= len;
+		    __entry->what	= what;
+			   ),
+
+	    TP_printk("R=%08x %s c=%08x s=%llx %zx",
+		      __entry->rreq,
+		      __print_symbolic(__entry->what, netfs_read_traces),
+		      __entry->cookie,
+		      __entry->start, __entry->len)
+	    );
+
+TRACE_EVENT(netfs_rreq,
+	    TP_PROTO(struct netfs_read_request *rreq,
+		     enum netfs_rreq_trace what),
+
+	    TP_ARGS(rreq, what),
+
+	    TP_STRUCT__entry(
+		    __field(unsigned int,		rreq		)
+		    __field(unsigned short,		flags		)
+		    __field(enum netfs_rreq_trace,	what		)
+			     ),
+
+	    TP_fast_assign(
+		    __entry->rreq	= rreq->debug_id;
+		    __entry->flags	= rreq->flags;
+		    __entry->what	= what;
+			   ),
+
+	    TP_printk("R=%08x %s f=%02x",
+		      __entry->rreq,
+		      __print_symbolic(__entry->what, netfs_rreq_traces),
+		      __entry->flags)
+	    );
+
+TRACE_EVENT(netfs_sreq,
+	    TP_PROTO(struct netfs_read_subrequest *sreq,
+		     enum netfs_sreq_trace what),
+
+	    TP_ARGS(sreq, what),
+
+	    TP_STRUCT__entry(
+		    __field(unsigned int,		rreq		)
+		    __field(unsigned short,		index		)
+		    __field(short,			error		)
+		    __field(unsigned short,		flags		)
+		    __field(enum netfs_read_source,	source		)
+		    __field(enum netfs_sreq_trace,	what		)
+		    __field(size_t,			len		)
+		    __field(size_t,			transferred	)
+		    __field(loff_t,			start		)
+			     ),
+
+	    TP_fast_assign(
+		    __entry->rreq	= sreq->rreq->debug_id;
+		    __entry->index	= sreq->debug_index;
+		    __entry->error	= sreq->error;
+		    __entry->flags	= sreq->flags;
+		    __entry->source	= sreq->source;
+		    __entry->what	= what;
+		    __entry->len	= sreq->len;
+		    __entry->transferred = sreq->transferred;
+		    __entry->start	= sreq->start;
+			   ),
+
+	    TP_printk("R=%08x[%u] %s %s f=%02x s=%llx %zx/%zx e=%d",
+		      __entry->rreq, __entry->index,
+		      __print_symbolic(__entry->what, netfs_sreq_traces),
+		      __print_symbolic(__entry->source, netfs_sreq_sources),
+		      __entry->flags,
+		      __entry->start, __entry->transferred, __entry->len,
+		      __entry->error)
+	    );
+
+#endif /* _TRACE_NETFS_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
-- 
cgit v1.2.3


From 289af54cc67ace285b6d4335a54324562894c4e2 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 3 Nov 2020 11:32:41 +0000
Subject: netfs: Gather stats

Gather statistics from the netfs interface that can be exported through a
seqfile.  This is intended to be called by a later patch when viewing
/proc/fs/fscache/stats.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-and-tested-by: Jeff Layton <jlayton@kernel.org>
Tested-by: Dave Wysochanski <dwysocha@redhat.com>
Tested-By: Marc Dionne <marc.dionne@auristor.com>
cc: Matthew Wilcox <willy@infradead.org>
cc: linux-mm@kvack.org
cc: linux-cachefs@redhat.com
cc: linux-afs@lists.infradead.org
cc: linux-nfs@vger.kernel.org
cc: linux-cifs@vger.kernel.org
cc: ceph-devel@vger.kernel.org
cc: v9fs-developer@lists.sourceforge.net
cc: linux-fsdevel@vger.kernel.org
Link: https://lore.kernel.org/r/161118139247.1232039.10556850937548511068.stgit@warthog.procyon.org.uk/ # rfc
Link: https://lore.kernel.org/r/161161034669.2537118.2761232524997091480.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/161340397101.1303470.17581910581108378458.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/161539539959.286939.6794352576462965914.stgit@warthog.procyon.org.uk/ # v4
Link: https://lore.kernel.org/r/161653797700.2770958.5801990354413178228.stgit@warthog.procyon.org.uk/ # v5
Link: https://lore.kernel.org/r/161789079281.6155.17141344853277186500.stgit@warthog.procyon.org.uk/ # v6
---
 fs/netfs/Kconfig       | 15 ++++++++++++++
 fs/netfs/Makefile      |  3 +--
 fs/netfs/internal.h    | 34 +++++++++++++++++++++++++++++++
 fs/netfs/read_helper.c | 23 +++++++++++++++++++++
 fs/netfs/stats.c       | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/netfs.h  |  1 +
 6 files changed, 128 insertions(+), 2 deletions(-)
 create mode 100644 fs/netfs/stats.c

(limited to 'include/linux')

diff --git a/fs/netfs/Kconfig b/fs/netfs/Kconfig
index 2ebf90e6ca95..578112713703 100644
--- a/fs/netfs/Kconfig
+++ b/fs/netfs/Kconfig
@@ -6,3 +6,18 @@ config NETFS_SUPPORT
 	  This option enables support for network filesystems, including
 	  helpers for high-level buffered I/O, abstracting out read
 	  segmentation, local caching and transparent huge page support.
+
+config NETFS_STATS
+	bool "Gather statistical information on local caching"
+	depends on NETFS_SUPPORT && PROC_FS
+	help
+	  This option causes statistical information to be gathered on local
+	  caching and exported through file:
+
+		/proc/fs/fscache/stats
+
+	  The gathering of statistics adds a certain amount of overhead to
+	  execution as there are quite a few stats gathered, and on a
+	  multi-CPU system these may be on cachelines that keep bouncing
+	  between CPUs.  On the other hand, the stats are very useful for
+	  debugging purposes.  Saying 'Y' here is recommended.
diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile
index 4b4eff2ba369..c15bfc966d96 100644
--- a/fs/netfs/Makefile
+++ b/fs/netfs/Makefile
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
 
-netfs-y := \
-	read_helper.o
+netfs-y := read_helper.o stats.o
 
 obj-$(CONFIG_NETFS_SUPPORT) := netfs.o
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index ee665c0e7dc8..98b6f4516da1 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -16,8 +16,42 @@
  */
 extern unsigned int netfs_debug;
 
+/*
+ * stats.c
+ */
+#ifdef CONFIG_NETFS_STATS
+extern atomic_t netfs_n_rh_readahead;
+extern atomic_t netfs_n_rh_readpage;
+extern atomic_t netfs_n_rh_rreq;
+extern atomic_t netfs_n_rh_sreq;
+extern atomic_t netfs_n_rh_download;
+extern atomic_t netfs_n_rh_download_done;
+extern atomic_t netfs_n_rh_download_failed;
+extern atomic_t netfs_n_rh_download_instead;
+extern atomic_t netfs_n_rh_read;
+extern atomic_t netfs_n_rh_read_done;
+extern atomic_t netfs_n_rh_read_failed;
+extern atomic_t netfs_n_rh_zero;
+extern atomic_t netfs_n_rh_short_read;
+extern atomic_t netfs_n_rh_write;
+extern atomic_t netfs_n_rh_write_done;
+extern atomic_t netfs_n_rh_write_failed;
+
+
+static inline void netfs_stat(atomic_t *stat)
+{
+	atomic_inc(stat);
+}
+
+static inline void netfs_stat_d(atomic_t *stat)
+{
+	atomic_dec(stat);
+}
+
+#else
 #define netfs_stat(x) do {} while(0)
 #define netfs_stat_d(x) do {} while(0)
+#endif
 
 /*****************************************************************************/
 /*
diff --git a/fs/netfs/read_helper.c b/fs/netfs/read_helper.c
index 799eee7f4ee6..6d6ed30f417e 100644
--- a/fs/netfs/read_helper.c
+++ b/fs/netfs/read_helper.c
@@ -56,6 +56,7 @@ static struct netfs_read_request *netfs_alloc_read_request(
 		refcount_set(&rreq->usage, 1);
 		__set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
 		ops->init_rreq(rreq, file);
+		netfs_stat(&netfs_n_rh_rreq);
 	}
 
 	return rreq;
@@ -88,6 +89,7 @@ static void netfs_free_read_request(struct work_struct *work)
 		rreq->netfs_ops->cleanup(rreq->mapping, rreq->netfs_priv);
 	trace_netfs_rreq(rreq, netfs_rreq_trace_free);
 	kfree(rreq);
+	netfs_stat_d(&netfs_n_rh_rreq);
 }
 
 static void netfs_put_read_request(struct netfs_read_request *rreq, bool was_async)
@@ -117,6 +119,7 @@ static struct netfs_read_subrequest *netfs_alloc_subrequest(
 		refcount_set(&subreq->usage, 2);
 		subreq->rreq = rreq;
 		netfs_get_read_request(rreq);
+		netfs_stat(&netfs_n_rh_sreq);
 	}
 
 	return subreq;
@@ -134,6 +137,7 @@ static void __netfs_put_subrequest(struct netfs_read_subrequest *subreq,
 
 	trace_netfs_sreq(subreq, netfs_sreq_trace_free);
 	kfree(subreq);
+	netfs_stat_d(&netfs_n_rh_sreq);
 	netfs_put_read_request(rreq, was_async);
 }
 
@@ -156,6 +160,7 @@ static void netfs_clear_unread(struct netfs_read_subrequest *subreq)
 static void netfs_fill_with_zeroes(struct netfs_read_request *rreq,
 				   struct netfs_read_subrequest *subreq)
 {
+	netfs_stat(&netfs_n_rh_zero);
 	__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
 	netfs_subreq_terminated(subreq, 0, false);
 }
@@ -179,6 +184,7 @@ static void netfs_fill_with_zeroes(struct netfs_read_request *rreq,
 static void netfs_read_from_server(struct netfs_read_request *rreq,
 				   struct netfs_read_subrequest *subreq)
 {
+	netfs_stat(&netfs_n_rh_download);
 	rreq->netfs_ops->issue_op(subreq);
 }
 
@@ -288,6 +294,7 @@ static void netfs_rreq_short_read(struct netfs_read_request *rreq,
 	__clear_bit(NETFS_SREQ_SHORT_READ, &subreq->flags);
 	__set_bit(NETFS_SREQ_SEEK_DATA_READ, &subreq->flags);
 
+	netfs_stat(&netfs_n_rh_short_read);
 	trace_netfs_sreq(subreq, netfs_sreq_trace_resubmit_short);
 
 	netfs_get_read_subrequest(subreq);
@@ -319,6 +326,7 @@ static bool netfs_rreq_perform_resubmissions(struct netfs_read_request *rreq)
 				break;
 			subreq->source = NETFS_DOWNLOAD_FROM_SERVER;
 			subreq->error = 0;
+			netfs_stat(&netfs_n_rh_download_instead);
 			trace_netfs_sreq(subreq, netfs_sreq_trace_download_instead);
 			netfs_get_read_subrequest(subreq);
 			atomic_inc(&rreq->nr_rd_ops);
@@ -414,6 +422,17 @@ void netfs_subreq_terminated(struct netfs_read_subrequest *subreq,
 	       subreq->debug_index, subreq->start, subreq->flags,
 	       transferred_or_error);
 
+	switch (subreq->source) {
+	case NETFS_READ_FROM_CACHE:
+		netfs_stat(&netfs_n_rh_read_done);
+		break;
+	case NETFS_DOWNLOAD_FROM_SERVER:
+		netfs_stat(&netfs_n_rh_download_done);
+		break;
+	default:
+		break;
+	}
+
 	if (IS_ERR_VALUE(transferred_or_error)) {
 		subreq->error = transferred_or_error;
 		goto failed;
@@ -470,8 +489,10 @@ incomplete:
 
 failed:
 	if (subreq->source == NETFS_READ_FROM_CACHE) {
+		netfs_stat(&netfs_n_rh_read_failed);
 		set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
 	} else {
+		netfs_stat(&netfs_n_rh_download_failed);
 		set_bit(NETFS_RREQ_FAILED, &rreq->flags);
 		rreq->error = subreq->error;
 	}
@@ -653,6 +674,7 @@ void netfs_readahead(struct readahead_control *ractl,
 	rreq->start	= readahead_pos(ractl);
 	rreq->len	= readahead_length(ractl);
 
+	netfs_stat(&netfs_n_rh_readahead);
 	trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
 			 netfs_read_trace_readahead);
 
@@ -722,6 +744,7 @@ int netfs_readpage(struct file *file,
 	rreq->start	= page_index(page) * PAGE_SIZE;
 	rreq->len	= thp_size(page);
 
+	netfs_stat(&netfs_n_rh_readpage);
 	trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage);
 
 	netfs_get_read_request(rreq);
diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c
new file mode 100644
index 000000000000..df6ff5718f25
--- /dev/null
+++ b/fs/netfs/stats.c
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Netfs support statistics
+ *
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/export.h>
+#include <linux/seq_file.h>
+#include <linux/netfs.h>
+#include "internal.h"
+
+atomic_t netfs_n_rh_readahead;
+atomic_t netfs_n_rh_readpage;
+atomic_t netfs_n_rh_rreq;
+atomic_t netfs_n_rh_sreq;
+atomic_t netfs_n_rh_download;
+atomic_t netfs_n_rh_download_done;
+atomic_t netfs_n_rh_download_failed;
+atomic_t netfs_n_rh_download_instead;
+atomic_t netfs_n_rh_read;
+atomic_t netfs_n_rh_read_done;
+atomic_t netfs_n_rh_read_failed;
+atomic_t netfs_n_rh_zero;
+atomic_t netfs_n_rh_short_read;
+atomic_t netfs_n_rh_write;
+atomic_t netfs_n_rh_write_done;
+atomic_t netfs_n_rh_write_failed;
+
+void netfs_stats_show(struct seq_file *m)
+{
+	seq_printf(m, "RdHelp : RA=%u RP=%u rr=%u sr=%u\n",
+		   atomic_read(&netfs_n_rh_readahead),
+		   atomic_read(&netfs_n_rh_readpage),
+		   atomic_read(&netfs_n_rh_rreq),
+		   atomic_read(&netfs_n_rh_sreq));
+	seq_printf(m, "RdHelp : ZR=%u sh=%u\n",
+		   atomic_read(&netfs_n_rh_zero),
+		   atomic_read(&netfs_n_rh_short_read));
+	seq_printf(m, "RdHelp : DL=%u ds=%u df=%u di=%u\n",
+		   atomic_read(&netfs_n_rh_download),
+		   atomic_read(&netfs_n_rh_download_done),
+		   atomic_read(&netfs_n_rh_download_failed),
+		   atomic_read(&netfs_n_rh_download_instead));
+	seq_printf(m, "RdHelp : RD=%u rs=%u rf=%u\n",
+		   atomic_read(&netfs_n_rh_read),
+		   atomic_read(&netfs_n_rh_read_done),
+		   atomic_read(&netfs_n_rh_read_failed));
+	seq_printf(m, "RdHelp : WR=%u ws=%u wf=%u\n",
+		   atomic_read(&netfs_n_rh_write),
+		   atomic_read(&netfs_n_rh_write_done),
+		   atomic_read(&netfs_n_rh_write_failed));
+}
+EXPORT_SYMBOL(netfs_stats_show);
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 8e8c6a4e4dde..db4af80cbae3 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -166,5 +166,6 @@ extern int netfs_readpage(struct file *,
 			  void *);
 
 extern void netfs_subreq_terminated(struct netfs_read_subrequest *, ssize_t, bool);
+extern void netfs_stats_show(struct seq_file *);
 
 #endif /* _LINUX_NETFS_H */
-- 
cgit v1.2.3


From e1b1240c1ff5f8bfba797f14996d8bac8a9ec437 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 22 Sep 2020 11:06:07 +0100
Subject: netfs: Add write_begin helper

Add a helper to do the pre-reading work for the netfs write_begin address
space op.

Changes
v6:
- Fixed a missing rreq put in netfs_write_begin()[3].
- Use DEFINE_READAHEAD()[4].

v5:
- Made the wait for PG_fscache in netfs_write_begin() killable[2].

v4:
- Added flag to netfs_subreq_terminated() to indicate that the caller may
  have been running async and stuff that might sleep needs punting to a
  workqueue (can't use in_softirq()[1]).

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-and-tested-by: Jeff Layton <jlayton@kernel.org>
Tested-by: Dave Wysochanski <dwysocha@redhat.com>
Tested-By: Marc Dionne <marc.dionne@auristor.com>
cc: Matthew Wilcox <willy@infradead.org>
cc: linux-mm@kvack.org
cc: linux-cachefs@redhat.com
cc: linux-afs@lists.infradead.org
cc: linux-nfs@vger.kernel.org
cc: linux-cifs@vger.kernel.org
cc: ceph-devel@vger.kernel.org
cc: v9fs-developer@lists.sourceforge.net
cc: linux-fsdevel@vger.kernel.org
Link: https://lore.kernel.org/r/20210216084230.GA23669@lst.de/ [1]
Link: https://lore.kernel.org/r/2499407.1616505440@warthog.procyon.org.uk/ [2]
Link: https://lore.kernel.org/r/161781042127.463527.9154479794406046987.stgit@warthog.procyon.org.uk/ [3]
Link: https://lore.kernel.org/r/1234933.1617886271@warthog.procyon.org.uk/ [4]
Link: https://lore.kernel.org/r/160588543960.3465195.2792938973035886168.stgit@warthog.procyon.org.uk/ # rfc
Link: https://lore.kernel.org/r/161118140165.1232039.16418853874312234477.stgit@warthog.procyon.org.uk/ # rfc
Link: https://lore.kernel.org/r/161161035539.2537118.15674887534950908530.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/161340398368.1303470.11242918276563276090.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/161539541541.286939.1889738674057013729.stgit@warthog.procyon.org.uk/ # v4
Link: https://lore.kernel.org/r/161653798616.2770958.17213315845968485563.stgit@warthog.procyon.org.uk/ # v5
Link: https://lore.kernel.org/r/161789080530.6155.1011847312392330491.stgit@warthog.procyon.org.uk/ # v6
---
 fs/netfs/internal.h          |   2 +
 fs/netfs/read_helper.c       | 164 +++++++++++++++++++++++++++++++++++++++++++
 fs/netfs/stats.c             |  11 ++-
 include/linux/netfs.h        |   8 +++
 include/trace/events/netfs.h |   4 +-
 5 files changed, 185 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index 98b6f4516da1..b7f2c4459f33 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -34,8 +34,10 @@ extern atomic_t netfs_n_rh_read_failed;
 extern atomic_t netfs_n_rh_zero;
 extern atomic_t netfs_n_rh_short_read;
 extern atomic_t netfs_n_rh_write;
+extern atomic_t netfs_n_rh_write_begin;
 extern atomic_t netfs_n_rh_write_done;
 extern atomic_t netfs_n_rh_write_failed;
+extern atomic_t netfs_n_rh_write_zskip;
 
 
 static inline void netfs_stat(atomic_t *stat)
diff --git a/fs/netfs/read_helper.c b/fs/netfs/read_helper.c
index 6d6ed30f417e..da34aedea053 100644
--- a/fs/netfs/read_helper.c
+++ b/fs/netfs/read_helper.c
@@ -772,3 +772,167 @@ int netfs_readpage(struct file *file,
 	return ret;
 }
 EXPORT_SYMBOL(netfs_readpage);
+
+static void netfs_clear_thp(struct page *page)
+{
+	unsigned int i;
+
+	for (i = 0; i < thp_nr_pages(page); i++)
+		clear_highpage(page + i);
+}
+
+/**
+ * netfs_write_begin - Helper to prepare for writing
+ * @file: The file to read from
+ * @mapping: The mapping to read from
+ * @pos: File position at which the write will begin
+ * @len: The length of the write in this page
+ * @flags: AOP_* flags
+ * @_page: Where to put the resultant page
+ * @_fsdata: Place for the netfs to store a cookie
+ * @ops: The network filesystem's operations for the helper to use
+ * @netfs_priv: Private netfs data to be retained in the request
+ *
+ * Pre-read data for a write-begin request by drawing data from the cache if
+ * possible, or the netfs if not.  Space beyond the EOF is zero-filled.
+ * Multiple I/O requests from different sources will get munged together.  If
+ * necessary, the readahead window can be expanded in either direction to a
+ * more convenient alignment for RPC efficiency or to make storage in the cache
+ * feasible.
+ *
+ * The calling netfs must provide a table of operations.  issue_op is mandatory
+ * and is_cache_enabled must also be set as it is called without a NULL check.
+ *
+ * The check_write_begin() operation can be provided to check for and flush
+ * conflicting writes once the page is grabbed and locked.  It is passed a
+ * pointer to the fsdata cookie that gets returned to the VM to be passed to
+ * write_end.  It is permitted to sleep.  It should return 0 if the request
+ * should go ahead; unlock the page and return -EAGAIN to cause the page to be
+ * regot; or return an error.
+ *
+ * This is usable whether or not caching is enabled.
+ */
+int netfs_write_begin(struct file *file, struct address_space *mapping,
+		      loff_t pos, unsigned int len, unsigned int flags,
+		      struct page **_page, void **_fsdata,
+		      const struct netfs_read_request_ops *ops,
+		      void *netfs_priv)
+{
+	struct netfs_read_request *rreq;
+	struct page *page, *xpage;
+	struct inode *inode = file_inode(file);
+	unsigned int debug_index = 0;
+	pgoff_t index = pos >> PAGE_SHIFT;
+	int pos_in_page = pos & ~PAGE_MASK;
+	loff_t size;
+	int ret;
+
+	DEFINE_READAHEAD(ractl, file, NULL, mapping, index);
+
+retry:
+	page = grab_cache_page_write_begin(mapping, index, 0);
+	if (!page)
+		return -ENOMEM;
+
+	if (ops->check_write_begin) {
+		/* Allow the netfs (eg. ceph) to flush conflicts. */
+		ret = ops->check_write_begin(file, pos, len, page, _fsdata);
+		if (ret < 0) {
+			if (ret == -EAGAIN)
+				goto retry;
+			goto error;
+		}
+	}
+
+	if (PageUptodate(page))
+		goto have_page;
+
+	/* If the page is beyond the EOF, we want to clear it - unless it's
+	 * within the cache granule containing the EOF, in which case we need
+	 * to preload the granule.
+	 */
+	size = i_size_read(inode);
+	if (!ops->is_cache_enabled(inode) &&
+	    ((pos_in_page == 0 && len == thp_size(page)) ||
+	     (pos >= size) ||
+	     (pos_in_page == 0 && (pos + len) >= size))) {
+		netfs_clear_thp(page);
+		SetPageUptodate(page);
+		netfs_stat(&netfs_n_rh_write_zskip);
+		goto have_page_no_wait;
+	}
+
+	ret = -ENOMEM;
+	rreq = netfs_alloc_read_request(ops, netfs_priv, file);
+	if (!rreq)
+		goto error;
+	rreq->mapping		= page->mapping;
+	rreq->start		= page->index * PAGE_SIZE;
+	rreq->len		= thp_size(page);
+	rreq->no_unlock_page	= page->index;
+	__set_bit(NETFS_RREQ_NO_UNLOCK_PAGE, &rreq->flags);
+	netfs_priv = NULL;
+
+	netfs_stat(&netfs_n_rh_write_begin);
+	trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin);
+
+	/* Expand the request to meet caching requirements and download
+	 * preferences.
+	 */
+	ractl._nr_pages = thp_nr_pages(page);
+	netfs_rreq_expand(rreq, &ractl);
+	netfs_get_read_request(rreq);
+
+	/* We hold the page locks, so we can drop the references */
+	while ((xpage = readahead_page(&ractl)))
+		if (xpage != page)
+			put_page(xpage);
+
+	atomic_set(&rreq->nr_rd_ops, 1);
+	do {
+		if (!netfs_rreq_submit_slice(rreq, &debug_index))
+			break;
+
+	} while (rreq->submitted < rreq->len);
+
+	/* Keep nr_rd_ops incremented so that the ref always belongs to us, and
+	 * the service code isn't punted off to a random thread pool to
+	 * process.
+	 */
+	for (;;) {
+		wait_var_event(&rreq->nr_rd_ops, atomic_read(&rreq->nr_rd_ops) == 1);
+		netfs_rreq_assess(rreq, false);
+		if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags))
+			break;
+		cond_resched();
+	}
+
+	ret = rreq->error;
+	if (ret == 0 && rreq->submitted < rreq->len)
+		ret = -EIO;
+	netfs_put_read_request(rreq, false);
+	if (ret < 0)
+		goto error;
+
+have_page:
+	ret = wait_on_page_fscache_killable(page);
+	if (ret < 0)
+		goto error;
+have_page_no_wait:
+	if (netfs_priv)
+		ops->cleanup(mapping, netfs_priv);
+	*_page = page;
+	_leave(" = 0");
+	return 0;
+
+error_put:
+	netfs_put_read_request(rreq, false);
+error:
+	unlock_page(page);
+	put_page(page);
+	if (netfs_priv)
+		ops->cleanup(mapping, netfs_priv);
+	_leave(" = %d", ret);
+	return ret;
+}
+EXPORT_SYMBOL(netfs_write_begin);
diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c
index df6ff5718f25..9ae538c85378 100644
--- a/fs/netfs/stats.c
+++ b/fs/netfs/stats.c
@@ -24,19 +24,24 @@ atomic_t netfs_n_rh_read_failed;
 atomic_t netfs_n_rh_zero;
 atomic_t netfs_n_rh_short_read;
 atomic_t netfs_n_rh_write;
+atomic_t netfs_n_rh_write_begin;
 atomic_t netfs_n_rh_write_done;
 atomic_t netfs_n_rh_write_failed;
+atomic_t netfs_n_rh_write_zskip;
 
 void netfs_stats_show(struct seq_file *m)
 {
-	seq_printf(m, "RdHelp : RA=%u RP=%u rr=%u sr=%u\n",
+	seq_printf(m, "RdHelp : RA=%u RP=%u WB=%u WBZ=%u rr=%u sr=%u\n",
 		   atomic_read(&netfs_n_rh_readahead),
 		   atomic_read(&netfs_n_rh_readpage),
+		   atomic_read(&netfs_n_rh_write_begin),
+		   atomic_read(&netfs_n_rh_write_zskip),
 		   atomic_read(&netfs_n_rh_rreq),
 		   atomic_read(&netfs_n_rh_sreq));
-	seq_printf(m, "RdHelp : ZR=%u sh=%u\n",
+	seq_printf(m, "RdHelp : ZR=%u sh=%u sk=%u\n",
 		   atomic_read(&netfs_n_rh_zero),
-		   atomic_read(&netfs_n_rh_short_read));
+		   atomic_read(&netfs_n_rh_short_read),
+		   atomic_read(&netfs_n_rh_write_zskip));
 	seq_printf(m, "RdHelp : DL=%u ds=%u df=%u di=%u\n",
 		   atomic_read(&netfs_n_rh_download),
 		   atomic_read(&netfs_n_rh_download_done),
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index db4af80cbae3..99659ed9524e 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -147,11 +147,14 @@ struct netfs_read_request {
  * Operations the network filesystem can/must provide to the helpers.
  */
 struct netfs_read_request_ops {
+	bool (*is_cache_enabled)(struct inode *inode);
 	void (*init_rreq)(struct netfs_read_request *rreq, struct file *file);
 	void (*expand_readahead)(struct netfs_read_request *rreq);
 	bool (*clamp_length)(struct netfs_read_subrequest *subreq);
 	void (*issue_op)(struct netfs_read_subrequest *subreq);
 	bool (*is_still_valid)(struct netfs_read_request *rreq);
+	int (*check_write_begin)(struct file *file, loff_t pos, unsigned len,
+				 struct page *page, void **_fsdata);
 	void (*done)(struct netfs_read_request *rreq);
 	void (*cleanup)(struct address_space *mapping, void *netfs_priv);
 };
@@ -164,6 +167,11 @@ extern int netfs_readpage(struct file *,
 			  struct page *,
 			  const struct netfs_read_request_ops *,
 			  void *);
+extern int netfs_write_begin(struct file *, struct address_space *,
+			     loff_t, unsigned int, unsigned int, struct page **,
+			     void **,
+			     const struct netfs_read_request_ops *,
+			     void *);
 
 extern void netfs_subreq_terminated(struct netfs_read_subrequest *, ssize_t, bool);
 extern void netfs_stats_show(struct seq_file *);
diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h
index 12ad382764c5..a2bf6cd84bd4 100644
--- a/include/trace/events/netfs.h
+++ b/include/trace/events/netfs.h
@@ -22,6 +22,7 @@ enum netfs_read_trace {
 	netfs_read_trace_expanded,
 	netfs_read_trace_readahead,
 	netfs_read_trace_readpage,
+	netfs_read_trace_write_begin,
 };
 
 enum netfs_rreq_trace {
@@ -50,7 +51,8 @@ enum netfs_sreq_trace {
 #define netfs_read_traces					\
 	EM(netfs_read_trace_expanded,		"EXPANDED ")	\
 	EM(netfs_read_trace_readahead,		"READAHEAD")	\
-	E_(netfs_read_trace_readpage,		"READPAGE ")
+	EM(netfs_read_trace_readpage,		"READPAGE ")	\
+	E_(netfs_read_trace_write_begin,	"WRITEBEGN")
 
 #define netfs_rreq_traces					\
 	EM(netfs_rreq_trace_assess,		"ASSESS")	\
-- 
cgit v1.2.3


From 726218fdc22c9b52f16e1228499a804bbf262a20 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 6 Feb 2020 14:22:24 +0000
Subject: netfs: Define an interface to talk to a cache

Add an interface to the netfs helper library for reading data from the
cache instead of downloading it from the server and support for writing
data just downloaded or cleared to the cache.

The API passes an iov_iter to the cache read/write routines to indicate the
data/buffer to be used.  This is done using the ITER_XARRAY type to provide
direct access to the netfs inode's pagecache.

When the netfs's ->begin_cache_operation() method is called, this must fill
in the cache_resources in the netfs_read_request struct, including the
netfs_cache_ops used by the helper lib to talk to the cache.  The helper
lib does not directly access the cache.

Changes:
v6:
- Call trace_netfs_read() after beginning the cache op so that the cookie
  debug ID can be logged[3].
- Don't record the error from writing to the cache.  We don't want to pass
  it back to the netfs[4].
- Fix copy-to-cache subreq amalgamation to not round up as it goes along
  otherwise it overcalculates the length of the write[5].

v5:
- Use end_page_fscache() rather than unlock_page_fscache()[2].

v4:
- Added flag to netfs_subreq_terminated() to indicate that the caller may
  have been running async and stuff that might sleep needs punting to a
  workqueue (can't use in_softirq()[1]).
- Add missing inc of netfs_n_rh_read stat.
- Move initial definition of fscache_begin_read_operation() elsewhere.
- Need to call op->begin_cache_operation() from netfs_write_begin().

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-and-tested-by: Jeff Layton <jlayton@kernel.org>
Tested-by: Dave Wysochanski <dwysocha@redhat.com>
Tested-By: Marc Dionne <marc.dionne@auristor.com>
cc: Matthew Wilcox <willy@infradead.org>
cc: linux-mm@kvack.org
cc: linux-cachefs@redhat.com
cc: linux-afs@lists.infradead.org
cc: linux-nfs@vger.kernel.org
cc: linux-cifs@vger.kernel.org
cc: ceph-devel@vger.kernel.org
cc: v9fs-developer@lists.sourceforge.net
cc: linux-fsdevel@vger.kernel.org
Link: https://lore.kernel.org/r/20210216084230.GA23669@lst.de/ [1]
Link: https://lore.kernel.org/r/2499407.1616505440@warthog.procyon.org.uk/ [2]
Link: https://lore.kernel.org/r/161781045123.463527.14533348855710902201.stgit@warthog.procyon.org.uk/ [3]
Link: https://lore.kernel.org/r/161781046256.463527.18158681600085556192.stgit@warthog.procyon.org.uk/ [4]
Link: https://lore.kernel.org/r/161781047695.463527.7463536103593997492.stgit@warthog.procyon.org.uk/ [5]
Link: https://lore.kernel.org/r/161118141321.1232039.8296910406755622458.stgit@warthog.procyon.org.uk/ # rfc
Link: https://lore.kernel.org/r/161161036700.2537118.11170748455436854978.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/161340399569.1303470.1138884774643385730.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/161539542874.286939.13337898213448136687.stgit@warthog.procyon.org.uk/ # v4
Link: https://lore.kernel.org/r/161653799826.2770958.9015430297426331950.stgit@warthog.procyon.org.uk/ # v5
Link: https://lore.kernel.org/r/161789081462.6155.3853904866933313256.stgit@warthog.procyon.org.uk/ # v6
---
 fs/netfs/read_helper.c       | 239 ++++++++++++++++++++++++++++++++++++++++++-
 include/linux/netfs.h        |  55 ++++++++++
 include/trace/events/netfs.h |   2 +
 3 files changed, 295 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/netfs/read_helper.c b/fs/netfs/read_helper.c
index da34aedea053..cd3b61d5e192 100644
--- a/fs/netfs/read_helper.c
+++ b/fs/netfs/read_helper.c
@@ -88,6 +88,8 @@ static void netfs_free_read_request(struct work_struct *work)
 	if (rreq->netfs_priv)
 		rreq->netfs_ops->cleanup(rreq->mapping, rreq->netfs_priv);
 	trace_netfs_rreq(rreq, netfs_rreq_trace_free);
+	if (rreq->cache_resources.ops)
+		rreq->cache_resources.ops->end_operation(&rreq->cache_resources);
 	kfree(rreq);
 	netfs_stat_d(&netfs_n_rh_rreq);
 }
@@ -154,6 +156,34 @@ static void netfs_clear_unread(struct netfs_read_subrequest *subreq)
 	iov_iter_zero(iov_iter_count(&iter), &iter);
 }
 
+static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error,
+					bool was_async)
+{
+	struct netfs_read_subrequest *subreq = priv;
+
+	netfs_subreq_terminated(subreq, transferred_or_error, was_async);
+}
+
+/*
+ * Issue a read against the cache.
+ * - Eats the caller's ref on subreq.
+ */
+static void netfs_read_from_cache(struct netfs_read_request *rreq,
+				  struct netfs_read_subrequest *subreq,
+				  bool seek_data)
+{
+	struct netfs_cache_resources *cres = &rreq->cache_resources;
+	struct iov_iter iter;
+
+	netfs_stat(&netfs_n_rh_read);
+	iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages,
+			subreq->start + subreq->transferred,
+			subreq->len   - subreq->transferred);
+
+	cres->ops->read(cres, subreq->start, &iter, seek_data,
+			netfs_cache_read_terminated, subreq);
+}
+
 /*
  * Fill a subrequest region with zeroes.
  */
@@ -198,6 +228,141 @@ static void netfs_rreq_completed(struct netfs_read_request *rreq, bool was_async
 	netfs_put_read_request(rreq, was_async);
 }
 
+/*
+ * Deal with the completion of writing the data to the cache.  We have to clear
+ * the PG_fscache bits on the pages involved and release the caller's ref.
+ *
+ * May be called in softirq mode and we inherit a ref from the caller.
+ */
+static void netfs_rreq_unmark_after_write(struct netfs_read_request *rreq,
+					  bool was_async)
+{
+	struct netfs_read_subrequest *subreq;
+	struct page *page;
+	pgoff_t unlocked = 0;
+	bool have_unlocked = false;
+
+	rcu_read_lock();
+
+	list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
+		XA_STATE(xas, &rreq->mapping->i_pages, subreq->start / PAGE_SIZE);
+
+		xas_for_each(&xas, page, (subreq->start + subreq->len - 1) / PAGE_SIZE) {
+			/* We might have multiple writes from the same huge
+			 * page, but we mustn't unlock a page more than once.
+			 */
+			if (have_unlocked && page->index <= unlocked)
+				continue;
+			unlocked = page->index;
+			end_page_fscache(page);
+			have_unlocked = true;
+		}
+	}
+
+	rcu_read_unlock();
+	netfs_rreq_completed(rreq, was_async);
+}
+
+static void netfs_rreq_copy_terminated(void *priv, ssize_t transferred_or_error,
+				       bool was_async)
+{
+	struct netfs_read_subrequest *subreq = priv;
+	struct netfs_read_request *rreq = subreq->rreq;
+
+	if (IS_ERR_VALUE(transferred_or_error)) {
+		netfs_stat(&netfs_n_rh_write_failed);
+	} else {
+		netfs_stat(&netfs_n_rh_write_done);
+	}
+
+	trace_netfs_sreq(subreq, netfs_sreq_trace_write_term);
+
+	/* If we decrement nr_wr_ops to 0, the ref belongs to us. */
+	if (atomic_dec_and_test(&rreq->nr_wr_ops))
+		netfs_rreq_unmark_after_write(rreq, was_async);
+
+	netfs_put_subrequest(subreq, was_async);
+}
+
+/*
+ * Perform any outstanding writes to the cache.  We inherit a ref from the
+ * caller.
+ */
+static void netfs_rreq_do_write_to_cache(struct netfs_read_request *rreq)
+{
+	struct netfs_cache_resources *cres = &rreq->cache_resources;
+	struct netfs_read_subrequest *subreq, *next, *p;
+	struct iov_iter iter;
+	int ret;
+
+	trace_netfs_rreq(rreq, netfs_rreq_trace_write);
+
+	/* We don't want terminating writes trying to wake us up whilst we're
+	 * still going through the list.
+	 */
+	atomic_inc(&rreq->nr_wr_ops);
+
+	list_for_each_entry_safe(subreq, p, &rreq->subrequests, rreq_link) {
+		if (!test_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags)) {
+			list_del_init(&subreq->rreq_link);
+			netfs_put_subrequest(subreq, false);
+		}
+	}
+
+	list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
+		/* Amalgamate adjacent writes */
+		while (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
+			next = list_next_entry(subreq, rreq_link);
+			if (next->start != subreq->start + subreq->len)
+				break;
+			subreq->len += next->len;
+			list_del_init(&next->rreq_link);
+			netfs_put_subrequest(next, false);
+		}
+
+		ret = cres->ops->prepare_write(cres, &subreq->start, &subreq->len,
+					       rreq->i_size);
+		if (ret < 0) {
+			trace_netfs_sreq(subreq, netfs_sreq_trace_write_skip);
+			continue;
+		}
+
+		iov_iter_xarray(&iter, WRITE, &rreq->mapping->i_pages,
+				subreq->start, subreq->len);
+
+		atomic_inc(&rreq->nr_wr_ops);
+		netfs_stat(&netfs_n_rh_write);
+		netfs_get_read_subrequest(subreq);
+		trace_netfs_sreq(subreq, netfs_sreq_trace_write);
+		cres->ops->write(cres, subreq->start, &iter,
+				 netfs_rreq_copy_terminated, subreq);
+	}
+
+	/* If we decrement nr_wr_ops to 0, the usage ref belongs to us. */
+	if (atomic_dec_and_test(&rreq->nr_wr_ops))
+		netfs_rreq_unmark_after_write(rreq, false);
+}
+
+static void netfs_rreq_write_to_cache_work(struct work_struct *work)
+{
+	struct netfs_read_request *rreq =
+		container_of(work, struct netfs_read_request, work);
+
+	netfs_rreq_do_write_to_cache(rreq);
+}
+
+static void netfs_rreq_write_to_cache(struct netfs_read_request *rreq,
+				      bool was_async)
+{
+	if (was_async) {
+		rreq->work.func = netfs_rreq_write_to_cache_work;
+		if (!queue_work(system_unbound_wq, &rreq->work))
+			BUG();
+	} else {
+		netfs_rreq_do_write_to_cache(rreq);
+	}
+}
+
 /*
  * Unlock the pages in a read operation.  We need to set PG_fscache on any
  * pages we're going to write back before we unlock them.
@@ -299,7 +464,10 @@ static void netfs_rreq_short_read(struct netfs_read_request *rreq,
 
 	netfs_get_read_subrequest(subreq);
 	atomic_inc(&rreq->nr_rd_ops);
-	netfs_read_from_server(rreq, subreq);
+	if (subreq->source == NETFS_READ_FROM_CACHE)
+		netfs_read_from_cache(rreq, subreq, true);
+	else
+		netfs_read_from_server(rreq, subreq);
 }
 
 /*
@@ -344,6 +512,25 @@ static bool netfs_rreq_perform_resubmissions(struct netfs_read_request *rreq)
 	return false;
 }
 
+/*
+ * Check to see if the data read is still valid.
+ */
+static void netfs_rreq_is_still_valid(struct netfs_read_request *rreq)
+{
+	struct netfs_read_subrequest *subreq;
+
+	if (!rreq->netfs_ops->is_still_valid ||
+	    rreq->netfs_ops->is_still_valid(rreq))
+		return;
+
+	list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
+		if (subreq->source == NETFS_READ_FROM_CACHE) {
+			subreq->error = -ESTALE;
+			__set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
+		}
+	}
+}
+
 /*
  * Assess the state of a read request and decide what to do next.
  *
@@ -355,6 +542,8 @@ static void netfs_rreq_assess(struct netfs_read_request *rreq, bool was_async)
 	trace_netfs_rreq(rreq, netfs_rreq_trace_assess);
 
 again:
+	netfs_rreq_is_still_valid(rreq);
+
 	if (!test_bit(NETFS_RREQ_FAILED, &rreq->flags) &&
 	    test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags)) {
 		if (netfs_rreq_perform_resubmissions(rreq))
@@ -367,6 +556,9 @@ again:
 	clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
 	wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS);
 
+	if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags))
+		return netfs_rreq_write_to_cache(rreq, was_async);
+
 	netfs_rreq_completed(rreq, was_async);
 }
 
@@ -504,7 +696,10 @@ static enum netfs_read_source netfs_cache_prepare_read(struct netfs_read_subrequ
 						       loff_t i_size)
 {
 	struct netfs_read_request *rreq = subreq->rreq;
+	struct netfs_cache_resources *cres = &rreq->cache_resources;
 
+	if (cres->ops)
+		return cres->ops->prepare_read(subreq, i_size);
 	if (subreq->start >= rreq->i_size)
 		return NETFS_FILL_WITH_ZEROES;
 	return NETFS_DOWNLOAD_FROM_SERVER;
@@ -595,6 +790,9 @@ static bool netfs_rreq_submit_slice(struct netfs_read_request *rreq,
 	case NETFS_DOWNLOAD_FROM_SERVER:
 		netfs_read_from_server(rreq, subreq);
 		break;
+	case NETFS_READ_FROM_CACHE:
+		netfs_read_from_cache(rreq, subreq, false);
+		break;
 	default:
 		BUG();
 	}
@@ -607,9 +805,23 @@ subreq_failed:
 	return false;
 }
 
+static void netfs_cache_expand_readahead(struct netfs_read_request *rreq,
+					 loff_t *_start, size_t *_len, loff_t i_size)
+{
+	struct netfs_cache_resources *cres = &rreq->cache_resources;
+
+	if (cres->ops && cres->ops->expand_readahead)
+		cres->ops->expand_readahead(cres, _start, _len, i_size);
+}
+
 static void netfs_rreq_expand(struct netfs_read_request *rreq,
 			      struct readahead_control *ractl)
 {
+	/* Give the cache a chance to change the request parameters.  The
+	 * resultant request must contain the original region.
+	 */
+	netfs_cache_expand_readahead(rreq, &rreq->start, &rreq->len, rreq->i_size);
+
 	/* Give the netfs a chance to change the request parameters.  The
 	 * resultant request must contain the original region.
 	 */
@@ -661,6 +873,7 @@ void netfs_readahead(struct readahead_control *ractl,
 	struct netfs_read_request *rreq;
 	struct page *page;
 	unsigned int debug_index = 0;
+	int ret;
 
 	_enter("%lx,%x", readahead_index(ractl), readahead_count(ractl));
 
@@ -674,6 +887,12 @@ void netfs_readahead(struct readahead_control *ractl,
 	rreq->start	= readahead_pos(ractl);
 	rreq->len	= readahead_length(ractl);
 
+	if (ops->begin_cache_operation) {
+		ret = ops->begin_cache_operation(rreq);
+		if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
+			goto cleanup_free;
+	}
+
 	netfs_stat(&netfs_n_rh_readahead);
 	trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
 			 netfs_read_trace_readahead);
@@ -698,6 +917,9 @@ void netfs_readahead(struct readahead_control *ractl,
 		netfs_rreq_assess(rreq, false);
 	return;
 
+cleanup_free:
+	netfs_put_read_request(rreq, false);
+	return;
 cleanup:
 	if (netfs_priv)
 		ops->cleanup(ractl->mapping, netfs_priv);
@@ -744,6 +966,14 @@ int netfs_readpage(struct file *file,
 	rreq->start	= page_index(page) * PAGE_SIZE;
 	rreq->len	= thp_size(page);
 
+	if (ops->begin_cache_operation) {
+		ret = ops->begin_cache_operation(rreq);
+		if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) {
+			unlock_page(page);
+			goto out;
+		}
+	}
+
 	netfs_stat(&netfs_n_rh_readpage);
 	trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage);
 
@@ -768,6 +998,7 @@ int netfs_readpage(struct file *file,
 	ret = rreq->error;
 	if (ret == 0 && rreq->submitted < rreq->len)
 		ret = -EIO;
+out:
 	netfs_put_read_request(rreq, false);
 	return ret;
 }
@@ -873,6 +1104,12 @@ retry:
 	__set_bit(NETFS_RREQ_NO_UNLOCK_PAGE, &rreq->flags);
 	netfs_priv = NULL;
 
+	if (ops->begin_cache_operation) {
+		ret = ops->begin_cache_operation(rreq);
+		if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
+			goto error_put;
+	}
+
 	netfs_stat(&netfs_n_rh_write_begin);
 	trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin);
 
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 99659ed9524e..9062adfa2fb9 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -92,6 +92,18 @@ enum netfs_read_source {
 	NETFS_INVALID_READ,
 } __mode(byte);
 
+typedef void (*netfs_io_terminated_t)(void *priv, ssize_t transferred_or_error,
+				      bool was_async);
+
+/*
+ * Resources required to do operations on a cache.
+ */
+struct netfs_cache_resources {
+	const struct netfs_cache_ops	*ops;
+	void				*cache_priv;
+	void				*cache_priv2;
+};
+
 /*
  * Descriptor for a single component subrequest.
  */
@@ -121,11 +133,13 @@ struct netfs_read_request {
 	struct work_struct	work;
 	struct inode		*inode;		/* The file being accessed */
 	struct address_space	*mapping;	/* The mapping being accessed */
+	struct netfs_cache_resources cache_resources;
 	struct list_head	subrequests;	/* Requests to fetch I/O from disk or net */
 	void			*netfs_priv;	/* Private data for the netfs */
 	unsigned int		debug_id;
 	unsigned int		cookie_debug_id;
 	atomic_t		nr_rd_ops;	/* Number of read ops in progress */
+	atomic_t		nr_wr_ops;	/* Number of write ops in progress */
 	size_t			submitted;	/* Amount submitted for I/O so far */
 	size_t			len;		/* Length of the request */
 	short			error;		/* 0 or error that occurred */
@@ -149,6 +163,7 @@ struct netfs_read_request {
 struct netfs_read_request_ops {
 	bool (*is_cache_enabled)(struct inode *inode);
 	void (*init_rreq)(struct netfs_read_request *rreq, struct file *file);
+	int (*begin_cache_operation)(struct netfs_read_request *rreq);
 	void (*expand_readahead)(struct netfs_read_request *rreq);
 	bool (*clamp_length)(struct netfs_read_subrequest *subreq);
 	void (*issue_op)(struct netfs_read_subrequest *subreq);
@@ -159,6 +174,46 @@ struct netfs_read_request_ops {
 	void (*cleanup)(struct address_space *mapping, void *netfs_priv);
 };
 
+/*
+ * Table of operations for access to a cache.  This is obtained by
+ * rreq->netfs_ops->begin_cache_operation().
+ */
+struct netfs_cache_ops {
+	/* End an operation */
+	void (*end_operation)(struct netfs_cache_resources *cres);
+
+	/* Read data from the cache */
+	int (*read)(struct netfs_cache_resources *cres,
+		    loff_t start_pos,
+		    struct iov_iter *iter,
+		    bool seek_data,
+		    netfs_io_terminated_t term_func,
+		    void *term_func_priv);
+
+	/* Write data to the cache */
+	int (*write)(struct netfs_cache_resources *cres,
+		     loff_t start_pos,
+		     struct iov_iter *iter,
+		     netfs_io_terminated_t term_func,
+		     void *term_func_priv);
+
+	/* Expand readahead request */
+	void (*expand_readahead)(struct netfs_cache_resources *cres,
+				 loff_t *_start, size_t *_len, loff_t i_size);
+
+	/* Prepare a read operation, shortening it to a cached/uncached
+	 * boundary as appropriate.
+	 */
+	enum netfs_read_source (*prepare_read)(struct netfs_read_subrequest *subreq,
+					       loff_t i_size);
+
+	/* Prepare a write operation, working out what part of the write we can
+	 * actually do.
+	 */
+	int (*prepare_write)(struct netfs_cache_resources *cres,
+			     loff_t *_start, size_t *_len, loff_t i_size);
+};
+
 struct readahead_control;
 extern void netfs_readahead(struct readahead_control *,
 			    const struct netfs_read_request_ops *,
diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h
index a2bf6cd84bd4..e3ebeabd3852 100644
--- a/include/trace/events/netfs.h
+++ b/include/trace/events/netfs.h
@@ -43,6 +43,7 @@ enum netfs_sreq_trace {
 	netfs_sreq_trace_submit,
 	netfs_sreq_trace_terminated,
 	netfs_sreq_trace_write,
+	netfs_sreq_trace_write_skip,
 	netfs_sreq_trace_write_term,
 };
 
@@ -77,6 +78,7 @@ enum netfs_sreq_trace {
 	EM(netfs_sreq_trace_submit,		"SUBMT")	\
 	EM(netfs_sreq_trace_terminated,		"TERM ")	\
 	EM(netfs_sreq_trace_write,		"WRITE")	\
+	EM(netfs_sreq_trace_write_skip,		"SKIP ")	\
 	E_(netfs_sreq_trace_write_term,		"WTERM")
 
 
-- 
cgit v1.2.3


From 26aaeffcafe6cbb7c3978fa6ed7555122f8c9f8c Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 22 Feb 2021 11:39:47 +0000
Subject: fscache, cachefiles: Add alternate API to use kiocb for read/write to
 cache

Add an alternate API by which the cache can be accessed through a kiocb,
doing async DIO, rather than using the current API that tells the cache
where all the pages are.

The new API is intended to be used in conjunction with the netfs helper
library.  A filesystem must pick one or the other and not mix them.

Filesystems wanting to use the new API must #define FSCACHE_USE_NEW_IO_API
before #including the header.  This prevents them from continuing to use
the old API at the same time as there are incompatibilities in how the
PG_fscache page bit is used.

Changes:
v6:
 - Provide a routine to shape a write so that the start and length can be
   aligned for DIO[3].

v4:
 - Use the vfs_iocb_iter_read/write() helpers[1]
 - Move initial definition of fscache_begin_read_operation() here.
 - Remove a commented-out line[2]
 - Combine ki->term_func calls in cachefiles_read_complete()[2].
 - Remove explicit NULL initialiser[2].
 - Remove extern on func decl[2].
 - Put in param names on func decl[2].
 - Remove redundant else[2].
 - Fill out the kdoc comment for fscache_begin_read_operation().
 - Rename fs/fscache/page2.c to io.c to match later patches.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-and-tested-by: Jeff Layton <jlayton@kernel.org>
Tested-by: Dave Wysochanski <dwysocha@redhat.com>
Tested-By: Marc Dionne <marc.dionne@auristor.com>
cc: Christoph Hellwig <hch@lst.de>
cc: linux-cachefs@redhat.com
cc: linux-afs@lists.infradead.org
cc: linux-nfs@vger.kernel.org
cc: linux-cifs@vger.kernel.org
cc: ceph-devel@vger.kernel.org
cc: v9fs-developer@lists.sourceforge.net
cc: linux-fsdevel@vger.kernel.org
Link: https://lore.kernel.org/r/20210216102614.GA27555@lst.de/ [1]
Link: https://lore.kernel.org/r/20210216084230.GA23669@lst.de/ [2]
Link: https://lore.kernel.org/r/161781047695.463527.7463536103593997492.stgit@warthog.procyon.org.uk/ [3]
Link: https://lore.kernel.org/r/161118142558.1232039.17993829899588971439.stgit@warthog.procyon.org.uk/ # rfc
Link: https://lore.kernel.org/r/161161037850.2537118.8819808229350326503.stgit@warthog.procyon.org.uk/ # v2
Link: https://lore.kernel.org/r/161340402057.1303470.8038373593844486698.stgit@warthog.procyon.org.uk/ # v3
Link: https://lore.kernel.org/r/161539545919.286939.14573472672781434757.stgit@warthog.procyon.org.uk/ # v4
Link: https://lore.kernel.org/r/161653801477.2770958.10543270629064934227.stgit@warthog.procyon.org.uk/ # v5
Link: https://lore.kernel.org/r/161789084517.6155.12799689829859169640.stgit@warthog.procyon.org.uk/ # v6
---
 fs/cachefiles/Makefile        |   1 +
 fs/cachefiles/interface.c     |   5 +-
 fs/cachefiles/internal.h      |   9 +
 fs/cachefiles/io.c            | 420 ++++++++++++++++++++++++++++++++++++++++++
 fs/fscache/Kconfig            |   1 +
 fs/fscache/Makefile           |   1 +
 fs/fscache/internal.h         |   4 +
 fs/fscache/io.c               | 116 ++++++++++++
 fs/fscache/page.c             |   2 +-
 fs/fscache/stats.c            |   1 +
 include/linux/fscache-cache.h |   4 +
 include/linux/fscache.h       |  39 ++++
 12 files changed, 600 insertions(+), 3 deletions(-)
 create mode 100644 fs/cachefiles/io.c
 create mode 100644 fs/fscache/io.c

(limited to 'include/linux')

diff --git a/fs/cachefiles/Makefile b/fs/cachefiles/Makefile
index 891dedda5905..2227dc2d5498 100644
--- a/fs/cachefiles/Makefile
+++ b/fs/cachefiles/Makefile
@@ -7,6 +7,7 @@ cachefiles-y := \
 	bind.o \
 	daemon.o \
 	interface.o \
+	io.o \
 	key.o \
 	main.o \
 	namei.o \
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 5efa6a3702c0..da3948fdb615 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -319,8 +319,8 @@ static void cachefiles_drop_object(struct fscache_object *_object)
 /*
  * dispose of a reference to an object
  */
-static void cachefiles_put_object(struct fscache_object *_object,
-				  enum fscache_obj_ref_trace why)
+void cachefiles_put_object(struct fscache_object *_object,
+			   enum fscache_obj_ref_trace why)
 {
 	struct cachefiles_object *object;
 	struct fscache_cache *cache;
@@ -568,4 +568,5 @@ const struct fscache_cache_ops cachefiles_cache_ops = {
 	.uncache_page		= cachefiles_uncache_page,
 	.dissociate_pages	= cachefiles_dissociate_pages,
 	.check_consistency	= cachefiles_check_consistency,
+	.begin_read_operation	= cachefiles_begin_read_operation,
 };
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index cf9bd6401c2d..4ed83aa5253b 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -150,6 +150,9 @@ extern int cachefiles_has_space(struct cachefiles_cache *cache,
  */
 extern const struct fscache_cache_ops cachefiles_cache_ops;
 
+void cachefiles_put_object(struct fscache_object *_object,
+			   enum fscache_obj_ref_trace why);
+
 /*
  * key.c
  */
@@ -217,6 +220,12 @@ extern int cachefiles_allocate_pages(struct fscache_retrieval *,
 extern int cachefiles_write_page(struct fscache_storage *, struct page *);
 extern void cachefiles_uncache_page(struct fscache_object *, struct page *);
 
+/*
+ * rdwr2.c
+ */
+extern int cachefiles_begin_read_operation(struct netfs_read_request *,
+					   struct fscache_retrieval *);
+
 /*
  * security.c
  */
diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c
new file mode 100644
index 000000000000..b13fb45fc3f3
--- /dev/null
+++ b/fs/cachefiles/io.c
@@ -0,0 +1,420 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* kiocb-using read/write
+ *
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/mount.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/uio.h>
+#include <linux/sched/mm.h>
+#include <linux/netfs.h>
+#include "internal.h"
+
+struct cachefiles_kiocb {
+	struct kiocb		iocb;
+	refcount_t		ki_refcnt;
+	loff_t			start;
+	union {
+		size_t		skipped;
+		size_t		len;
+	};
+	netfs_io_terminated_t	term_func;
+	void			*term_func_priv;
+	bool			was_async;
+};
+
+static inline void cachefiles_put_kiocb(struct cachefiles_kiocb *ki)
+{
+	if (refcount_dec_and_test(&ki->ki_refcnt)) {
+		fput(ki->iocb.ki_filp);
+		kfree(ki);
+	}
+}
+
+/*
+ * Handle completion of a read from the cache.
+ */
+static void cachefiles_read_complete(struct kiocb *iocb, long ret, long ret2)
+{
+	struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb);
+
+	_enter("%ld,%ld", ret, ret2);
+
+	if (ki->term_func) {
+		if (ret >= 0)
+			ret += ki->skipped;
+		ki->term_func(ki->term_func_priv, ret, ki->was_async);
+	}
+
+	cachefiles_put_kiocb(ki);
+}
+
+/*
+ * Initiate a read from the cache.
+ */
+static int cachefiles_read(struct netfs_cache_resources *cres,
+			   loff_t start_pos,
+			   struct iov_iter *iter,
+			   bool seek_data,
+			   netfs_io_terminated_t term_func,
+			   void *term_func_priv)
+{
+	struct cachefiles_kiocb *ki;
+	struct file *file = cres->cache_priv2;
+	unsigned int old_nofs;
+	ssize_t ret = -ENOBUFS;
+	size_t len = iov_iter_count(iter), skipped = 0;
+
+	_enter("%pD,%li,%llx,%zx/%llx",
+	       file, file_inode(file)->i_ino, start_pos, len,
+	       i_size_read(file->f_inode));
+
+	/* If the caller asked us to seek for data before doing the read, then
+	 * we should do that now.  If we find a gap, we fill it with zeros.
+	 */
+	if (seek_data) {
+		loff_t off = start_pos, off2;
+
+		off2 = vfs_llseek(file, off, SEEK_DATA);
+		if (off2 < 0 && off2 >= (loff_t)-MAX_ERRNO && off2 != -ENXIO) {
+			skipped = 0;
+			ret = off2;
+			goto presubmission_error;
+		}
+
+		if (off2 == -ENXIO || off2 >= start_pos + len) {
+			/* The region is beyond the EOF or there's no more data
+			 * in the region, so clear the rest of the buffer and
+			 * return success.
+			 */
+			iov_iter_zero(len, iter);
+			skipped = len;
+			ret = 0;
+			goto presubmission_error;
+		}
+
+		skipped = off2 - off;
+		iov_iter_zero(skipped, iter);
+	}
+
+	ret = -ENOBUFS;
+	ki = kzalloc(sizeof(struct cachefiles_kiocb), GFP_KERNEL);
+	if (!ki)
+		goto presubmission_error;
+
+	refcount_set(&ki->ki_refcnt, 2);
+	ki->iocb.ki_filp	= file;
+	ki->iocb.ki_pos		= start_pos + skipped;
+	ki->iocb.ki_flags	= IOCB_DIRECT;
+	ki->iocb.ki_hint	= ki_hint_validate(file_write_hint(file));
+	ki->iocb.ki_ioprio	= get_current_ioprio();
+	ki->skipped		= skipped;
+	ki->term_func		= term_func;
+	ki->term_func_priv	= term_func_priv;
+	ki->was_async		= true;
+
+	if (ki->term_func)
+		ki->iocb.ki_complete = cachefiles_read_complete;
+
+	get_file(ki->iocb.ki_filp);
+
+	old_nofs = memalloc_nofs_save();
+	ret = vfs_iocb_iter_read(file, &ki->iocb, iter);
+	memalloc_nofs_restore(old_nofs);
+	switch (ret) {
+	case -EIOCBQUEUED:
+		goto in_progress;
+
+	case -ERESTARTSYS:
+	case -ERESTARTNOINTR:
+	case -ERESTARTNOHAND:
+	case -ERESTART_RESTARTBLOCK:
+		/* There's no easy way to restart the syscall since other AIO's
+		 * may be already running. Just fail this IO with EINTR.
+		 */
+		ret = -EINTR;
+		fallthrough;
+	default:
+		ki->was_async = false;
+		cachefiles_read_complete(&ki->iocb, ret, 0);
+		if (ret > 0)
+			ret = 0;
+		break;
+	}
+
+in_progress:
+	cachefiles_put_kiocb(ki);
+	_leave(" = %zd", ret);
+	return ret;
+
+presubmission_error:
+	if (term_func)
+		term_func(term_func_priv, ret < 0 ? ret : skipped, false);
+	return ret;
+}
+
+/*
+ * Handle completion of a write to the cache.
+ */
+static void cachefiles_write_complete(struct kiocb *iocb, long ret, long ret2)
+{
+	struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb);
+	struct inode *inode = file_inode(ki->iocb.ki_filp);
+
+	_enter("%ld,%ld", ret, ret2);
+
+	/* Tell lockdep we inherited freeze protection from submission thread */
+	__sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
+	__sb_end_write(inode->i_sb, SB_FREEZE_WRITE);
+
+	if (ki->term_func)
+		ki->term_func(ki->term_func_priv, ret, ki->was_async);
+
+	cachefiles_put_kiocb(ki);
+}
+
+/*
+ * Initiate a write to the cache.
+ */
+static int cachefiles_write(struct netfs_cache_resources *cres,
+			    loff_t start_pos,
+			    struct iov_iter *iter,
+			    netfs_io_terminated_t term_func,
+			    void *term_func_priv)
+{
+	struct cachefiles_kiocb *ki;
+	struct inode *inode;
+	struct file *file = cres->cache_priv2;
+	unsigned int old_nofs;
+	ssize_t ret = -ENOBUFS;
+	size_t len = iov_iter_count(iter);
+
+	_enter("%pD,%li,%llx,%zx/%llx",
+	       file, file_inode(file)->i_ino, start_pos, len,
+	       i_size_read(file->f_inode));
+
+	ki = kzalloc(sizeof(struct cachefiles_kiocb), GFP_KERNEL);
+	if (!ki)
+		goto presubmission_error;
+
+	refcount_set(&ki->ki_refcnt, 2);
+	ki->iocb.ki_filp	= file;
+	ki->iocb.ki_pos		= start_pos;
+	ki->iocb.ki_flags	= IOCB_DIRECT | IOCB_WRITE;
+	ki->iocb.ki_hint	= ki_hint_validate(file_write_hint(file));
+	ki->iocb.ki_ioprio	= get_current_ioprio();
+	ki->start		= start_pos;
+	ki->len			= len;
+	ki->term_func		= term_func;
+	ki->term_func_priv	= term_func_priv;
+	ki->was_async		= true;
+
+	if (ki->term_func)
+		ki->iocb.ki_complete = cachefiles_write_complete;
+
+	/* Open-code file_start_write here to grab freeze protection, which
+	 * will be released by another thread in aio_complete_rw().  Fool
+	 * lockdep by telling it the lock got released so that it doesn't
+	 * complain about the held lock when we return to userspace.
+	 */
+	inode = file_inode(file);
+	__sb_start_write(inode->i_sb, SB_FREEZE_WRITE);
+	__sb_writers_release(inode->i_sb, SB_FREEZE_WRITE);
+
+	get_file(ki->iocb.ki_filp);
+
+	old_nofs = memalloc_nofs_save();
+	ret = vfs_iocb_iter_write(file, &ki->iocb, iter);
+	memalloc_nofs_restore(old_nofs);
+	switch (ret) {
+	case -EIOCBQUEUED:
+		goto in_progress;
+
+	case -ERESTARTSYS:
+	case -ERESTARTNOINTR:
+	case -ERESTARTNOHAND:
+	case -ERESTART_RESTARTBLOCK:
+		/* There's no easy way to restart the syscall since other AIO's
+		 * may be already running. Just fail this IO with EINTR.
+		 */
+		ret = -EINTR;
+		fallthrough;
+	default:
+		ki->was_async = false;
+		cachefiles_write_complete(&ki->iocb, ret, 0);
+		if (ret > 0)
+			ret = 0;
+		break;
+	}
+
+in_progress:
+	cachefiles_put_kiocb(ki);
+	_leave(" = %zd", ret);
+	return ret;
+
+presubmission_error:
+	if (term_func)
+		term_func(term_func_priv, -ENOMEM, false);
+	return -ENOMEM;
+}
+
+/*
+ * Prepare a read operation, shortening it to a cached/uncached
+ * boundary as appropriate.
+ */
+static enum netfs_read_source cachefiles_prepare_read(struct netfs_read_subrequest *subreq,
+						      loff_t i_size)
+{
+	struct fscache_retrieval *op = subreq->rreq->cache_resources.cache_priv;
+	struct cachefiles_object *object;
+	struct cachefiles_cache *cache;
+	const struct cred *saved_cred;
+	struct file *file = subreq->rreq->cache_resources.cache_priv2;
+	loff_t off, to;
+
+	_enter("%zx @%llx/%llx", subreq->len, subreq->start, i_size);
+
+	object = container_of(op->op.object,
+			      struct cachefiles_object, fscache);
+	cache = container_of(object->fscache.cache,
+			     struct cachefiles_cache, cache);
+
+	if (!file)
+		goto cache_fail_nosec;
+
+	if (subreq->start >= i_size)
+		return NETFS_FILL_WITH_ZEROES;
+
+	cachefiles_begin_secure(cache, &saved_cred);
+
+	off = vfs_llseek(file, subreq->start, SEEK_DATA);
+	if (off < 0 && off >= (loff_t)-MAX_ERRNO) {
+		if (off == (loff_t)-ENXIO)
+			goto download_and_store;
+		goto cache_fail;
+	}
+
+	if (off >= subreq->start + subreq->len)
+		goto download_and_store;
+
+	if (off > subreq->start) {
+		off = round_up(off, cache->bsize);
+		subreq->len = off - subreq->start;
+		goto download_and_store;
+	}
+
+	to = vfs_llseek(file, subreq->start, SEEK_HOLE);
+	if (to < 0 && to >= (loff_t)-MAX_ERRNO)
+		goto cache_fail;
+
+	if (to < subreq->start + subreq->len) {
+		if (subreq->start + subreq->len >= i_size)
+			to = round_up(to, cache->bsize);
+		else
+			to = round_down(to, cache->bsize);
+		subreq->len = to - subreq->start;
+	}
+
+	cachefiles_end_secure(cache, saved_cred);
+	return NETFS_READ_FROM_CACHE;
+
+download_and_store:
+	if (cachefiles_has_space(cache, 0, (subreq->len + PAGE_SIZE - 1) / PAGE_SIZE) == 0)
+		__set_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags);
+cache_fail:
+	cachefiles_end_secure(cache, saved_cred);
+cache_fail_nosec:
+	return NETFS_DOWNLOAD_FROM_SERVER;
+}
+
+/*
+ * Prepare for a write to occur.
+ */
+static int cachefiles_prepare_write(struct netfs_cache_resources *cres,
+				    loff_t *_start, size_t *_len, loff_t i_size)
+{
+	loff_t start = *_start;
+	size_t len = *_len, down;
+
+	/* Round to DIO size */
+	down = start - round_down(start, PAGE_SIZE);
+	*_start = start - down;
+	*_len = round_up(down + len, PAGE_SIZE);
+	return 0;
+}
+
+/*
+ * Clean up an operation.
+ */
+static void cachefiles_end_operation(struct netfs_cache_resources *cres)
+{
+	struct fscache_retrieval *op = cres->cache_priv;
+	struct file *file = cres->cache_priv2;
+
+	_enter("");
+
+	if (file)
+		fput(file);
+	if (op) {
+		fscache_op_complete(&op->op, false);
+		fscache_put_retrieval(op);
+	}
+
+	_leave("");
+}
+
+static const struct netfs_cache_ops cachefiles_netfs_cache_ops = {
+	.end_operation		= cachefiles_end_operation,
+	.read			= cachefiles_read,
+	.write			= cachefiles_write,
+	.prepare_read		= cachefiles_prepare_read,
+	.prepare_write		= cachefiles_prepare_write,
+};
+
+/*
+ * Open the cache file when beginning a cache operation.
+ */
+int cachefiles_begin_read_operation(struct netfs_read_request *rreq,
+				    struct fscache_retrieval *op)
+{
+	struct cachefiles_object *object;
+	struct cachefiles_cache *cache;
+	struct path path;
+	struct file *file;
+
+	_enter("");
+
+	object = container_of(op->op.object,
+			      struct cachefiles_object, fscache);
+	cache = container_of(object->fscache.cache,
+			     struct cachefiles_cache, cache);
+
+	path.mnt = cache->mnt;
+	path.dentry = object->backer;
+	file = open_with_fake_path(&path, O_RDWR | O_LARGEFILE | O_DIRECT,
+				   d_inode(object->backer), cache->cache_cred);
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+	if (!S_ISREG(file_inode(file)->i_mode))
+		goto error_file;
+	if (unlikely(!file->f_op->read_iter) ||
+	    unlikely(!file->f_op->write_iter)) {
+		pr_notice("Cache does not support read_iter and write_iter\n");
+		goto error_file;
+	}
+
+	fscache_get_retrieval(op);
+	rreq->cache_resources.cache_priv = op;
+	rreq->cache_resources.cache_priv2 = file;
+	rreq->cache_resources.ops = &cachefiles_netfs_cache_ops;
+	rreq->cookie_debug_id = object->fscache.debug_id;
+	_leave("");
+	return 0;
+
+error_file:
+	fput(file);
+	return -EIO;
+}
diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig
index 5e796e6c38e5..427efa73b9bd 100644
--- a/fs/fscache/Kconfig
+++ b/fs/fscache/Kconfig
@@ -2,6 +2,7 @@
 
 config FSCACHE
 	tristate "General filesystem local caching manager"
+	select NETFS_SUPPORT
 	help
 	  This option enables a generic filesystem caching manager that can be
 	  used by various network and other filesystems to cache data locally.
diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile
index 79e08e05ef84..3b2ffa93ac18 100644
--- a/fs/fscache/Makefile
+++ b/fs/fscache/Makefile
@@ -7,6 +7,7 @@ fscache-y := \
 	cache.o \
 	cookie.o \
 	fsdef.o \
+	io.o \
 	main.o \
 	netfs.o \
 	object.o \
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index 08e91efbce53..c483863b740a 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -142,6 +142,10 @@ extern int fscache_wait_for_operation_activation(struct fscache_object *,
 						 atomic_t *,
 						 atomic_t *);
 extern void fscache_invalidate_writes(struct fscache_cookie *);
+struct fscache_retrieval *fscache_alloc_retrieval(struct fscache_cookie *cookie,
+						  struct address_space *mapping,
+						  fscache_rw_complete_t end_io_func,
+						  void *context);
 
 /*
  * proc.c
diff --git a/fs/fscache/io.c b/fs/fscache/io.c
new file mode 100644
index 000000000000..8ecc1141802f
--- /dev/null
+++ b/fs/fscache/io.c
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Cache data I/O routines
+ *
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#define FSCACHE_DEBUG_LEVEL PAGE
+#include <linux/module.h>
+#define FSCACHE_USE_NEW_IO_API
+#include <linux/fscache-cache.h>
+#include <linux/slab.h>
+#include <linux/netfs.h>
+#include "internal.h"
+
+/*
+ * Start a cache read operation.
+ * - we return:
+ *   -ENOMEM	- out of memory, some pages may be being read
+ *   -ERESTARTSYS - interrupted, some pages may be being read
+ *   -ENOBUFS	- no backing object or space available in which to cache any
+ *                pages not being read
+ *   -ENODATA	- no data available in the backing object for some or all of
+ *                the pages
+ *   0		- dispatched a read on all pages
+ */
+int __fscache_begin_read_operation(struct netfs_read_request *rreq,
+				   struct fscache_cookie *cookie)
+{
+	struct fscache_retrieval *op;
+	struct fscache_object *object;
+	bool wake_cookie = false;
+	int ret;
+
+	_enter("rr=%08x", rreq->debug_id);
+
+	fscache_stat(&fscache_n_retrievals);
+
+	if (hlist_empty(&cookie->backing_objects))
+		goto nobufs;
+
+	if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) {
+		_leave(" = -ENOBUFS [invalidating]");
+		return -ENOBUFS;
+	}
+
+	ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
+
+	if (fscache_wait_for_deferred_lookup(cookie) < 0)
+		return -ERESTARTSYS;
+
+	op = fscache_alloc_retrieval(cookie, NULL, NULL, NULL);
+	if (!op)
+		return -ENOMEM;
+	trace_fscache_page_op(cookie, NULL, &op->op, fscache_page_op_retr_multi);
+
+	spin_lock(&cookie->lock);
+
+	if (!fscache_cookie_enabled(cookie) ||
+	    hlist_empty(&cookie->backing_objects))
+		goto nobufs_unlock;
+	object = hlist_entry(cookie->backing_objects.first,
+			     struct fscache_object, cookie_link);
+
+	__fscache_use_cookie(cookie);
+	atomic_inc(&object->n_reads);
+	__set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
+
+	if (fscache_submit_op(object, &op->op) < 0)
+		goto nobufs_unlock_dec;
+	spin_unlock(&cookie->lock);
+
+	fscache_stat(&fscache_n_retrieval_ops);
+
+	/* we wait for the operation to become active, and then process it
+	 * *here*, in this thread, and not in the thread pool */
+	ret = fscache_wait_for_operation_activation(
+		object, &op->op,
+		__fscache_stat(&fscache_n_retrieval_op_waits),
+		__fscache_stat(&fscache_n_retrievals_object_dead));
+	if (ret < 0)
+		goto error;
+
+	/* ask the cache to honour the operation */
+	ret = object->cache->ops->begin_read_operation(rreq, op);
+
+error:
+	if (ret == -ENOMEM)
+		fscache_stat(&fscache_n_retrievals_nomem);
+	else if (ret == -ERESTARTSYS)
+		fscache_stat(&fscache_n_retrievals_intr);
+	else if (ret == -ENODATA)
+		fscache_stat(&fscache_n_retrievals_nodata);
+	else if (ret < 0)
+		fscache_stat(&fscache_n_retrievals_nobufs);
+	else
+		fscache_stat(&fscache_n_retrievals_ok);
+
+	fscache_put_retrieval(op);
+	_leave(" = %d", ret);
+	return ret;
+
+nobufs_unlock_dec:
+	atomic_dec(&object->n_reads);
+	wake_cookie = __fscache_unuse_cookie(cookie);
+nobufs_unlock:
+	spin_unlock(&cookie->lock);
+	fscache_put_retrieval(op);
+	if (wake_cookie)
+		__fscache_wake_unused_cookie(cookie);
+nobufs:
+	fscache_stat(&fscache_n_retrievals_nobufs);
+	_leave(" = -ENOBUFS");
+	return -ENOBUFS;
+}
+EXPORT_SYMBOL(__fscache_begin_read_operation);
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 26af6fdf1538..991b0a871744 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -299,7 +299,7 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op)
 /*
  * allocate a retrieval op
  */
-static struct fscache_retrieval *fscache_alloc_retrieval(
+struct fscache_retrieval *fscache_alloc_retrieval(
 	struct fscache_cookie *cookie,
 	struct address_space *mapping,
 	fscache_rw_complete_t end_io_func,
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
index a5aa93ece8c5..a7c3ed89a3e0 100644
--- a/fs/fscache/stats.c
+++ b/fs/fscache/stats.c
@@ -278,5 +278,6 @@ int fscache_stats_show(struct seq_file *m, void *v)
 		   atomic_read(&fscache_n_cache_stale_objects),
 		   atomic_read(&fscache_n_cache_retired_objects),
 		   atomic_read(&fscache_n_cache_culled_objects));
+	netfs_stats_show(m);
 	return 0;
 }
diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index 3f0b19dcfae7..3235ddbdcc09 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -304,6 +304,10 @@ struct fscache_cache_ops {
 
 	/* dissociate a cache from all the pages it was backing */
 	void (*dissociate_pages)(struct fscache_cache *cache);
+
+	/* Begin a read operation for the netfs lib */
+	int (*begin_read_operation)(struct netfs_read_request *rreq,
+				    struct fscache_retrieval *op);
 };
 
 extern struct fscache_cookie fscache_fsdef_index;
diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index 1f8dc72369ee..abc1c4737fb8 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -37,6 +37,7 @@ struct pagevec;
 struct fscache_cache_tag;
 struct fscache_cookie;
 struct fscache_netfs;
+struct netfs_read_request;
 
 typedef void (*fscache_rw_complete_t)(struct page *page,
 				      void *context,
@@ -191,6 +192,10 @@ extern void __fscache_update_cookie(struct fscache_cookie *, const void *);
 extern int __fscache_attr_changed(struct fscache_cookie *);
 extern void __fscache_invalidate(struct fscache_cookie *);
 extern void __fscache_wait_on_invalidate(struct fscache_cookie *);
+
+#ifdef FSCACHE_USE_NEW_IO_API
+extern int __fscache_begin_read_operation(struct netfs_read_request *, struct fscache_cookie *);
+#else
 extern int __fscache_read_or_alloc_page(struct fscache_cookie *,
 					struct page *,
 					fscache_rw_complete_t,
@@ -214,6 +219,8 @@ extern void __fscache_uncache_all_inode_pages(struct fscache_cookie *,
 					      struct inode *);
 extern void __fscache_readpages_cancel(struct fscache_cookie *cookie,
 				       struct list_head *pages);
+#endif /* FSCACHE_USE_NEW_IO_API */
+
 extern void __fscache_disable_cookie(struct fscache_cookie *, const void *, bool);
 extern void __fscache_enable_cookie(struct fscache_cookie *, const void *, loff_t,
 				    bool (*)(void *), void *);
@@ -498,6 +505,36 @@ int fscache_reserve_space(struct fscache_cookie *cookie, loff_t size)
 	return -ENOBUFS;
 }
 
+#ifdef FSCACHE_USE_NEW_IO_API
+
+/**
+ * fscache_begin_read_operation - Begin a read operation for the netfs lib
+ * @rreq: The read request being undertaken
+ * @cookie: The cookie representing the cache object
+ *
+ * Begin a read operation on behalf of the netfs helper library.  @rreq
+ * indicates the read request to which the operation state should be attached;
+ * @cookie indicates the cache object that will be accessed.
+ *
+ * This is intended to be called from the ->begin_cache_operation() netfs lib
+ * operation as implemented by the network filesystem.
+ *
+ * Returns:
+ * * 0		- Success
+ * * -ENOBUFS	- No caching available
+ * * Other error code from the cache, such as -ENOMEM.
+ */
+static inline
+int fscache_begin_read_operation(struct netfs_read_request *rreq,
+				 struct fscache_cookie *cookie)
+{
+	if (fscache_cookie_valid(cookie) && fscache_cookie_enabled(cookie))
+		return __fscache_begin_read_operation(rreq, cookie);
+	return -ENOBUFS;
+}
+
+#else /* FSCACHE_USE_NEW_IO_API */
+
 /**
  * fscache_read_or_alloc_page - Read a page from the cache or allocate a block
  * in which to store it
@@ -777,6 +814,8 @@ void fscache_uncache_all_inode_pages(struct fscache_cookie *cookie,
 		__fscache_uncache_all_inode_pages(cookie, inode);
 }
 
+#endif /* FSCACHE_USE_NEW_IO_API */
+
 /**
  * fscache_disable_cookie - Disable a cookie
  * @cookie: The cookie representing the cache object
-- 
cgit v1.2.3


From a8ce7bd89689997537dd22dcbced46cf23dc19da Mon Sep 17 00:00:00 2001
From: Vincent Whitchurch <vincent.whitchurch@axis.com>
Date: Fri, 23 Apr 2021 13:45:24 +0200
Subject: regulator: core: Fix off_on_delay handling

The jiffies-based off_on_delay implementation has a couple of problems
that cause it to sometimes not actually delay for the required time:

 (1) If, for example, the off_on_delay time is equivalent to one jiffy,
     and the ->last_off_jiffy is set just before a new jiffy starts,
     then _regulator_do_enable() does not wait at all since it checks
     using time_before().

 (2) When jiffies overflows, the value of "remaining" becomes higher
     than "max_delay" and the code simply proceeds without waiting.

Fix these problems by changing it to use ktime_t instead.

[Note that since jiffies doesn't start at zero but at INITIAL_JIFFIES
 ("-5 minutes"), (2) above also led to the code not delaying if
 the first regulator_enable() is called when the ->last_off_jiffy is not
 initialised, such as for regulators with ->constraints->boot_on set.
 It's not clear to me if this was intended or not, but I've preserved
 this behaviour explicitly with the check for a non-zero ->last_off.]

Signed-off-by: Vincent Whitchurch <vincent.whitchurch@axis.com>
Link: https://lore.kernel.org/r/20210423114524.26414-1-vincent.whitchurch@axis.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/core.c         | 33 ++++++++-------------------------
 include/linux/regulator/driver.h |  2 +-
 2 files changed, 9 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c
index 504898ba265e..f192bf19492e 100644
--- a/drivers/regulator/core.c
+++ b/drivers/regulator/core.c
@@ -1443,7 +1443,7 @@ static int set_machine_constraints(struct regulator_dev *rdev)
 		if (rdev->constraints->always_on)
 			rdev->use_count++;
 	} else if (rdev->desc->off_on_delay) {
-		rdev->last_off_jiffy = jiffies;
+		rdev->last_off = ktime_get();
 	}
 
 	print_constraints(rdev);
@@ -2488,29 +2488,15 @@ static int _regulator_do_enable(struct regulator_dev *rdev)
 
 	trace_regulator_enable(rdev_get_name(rdev));
 
-	if (rdev->desc->off_on_delay) {
+	if (rdev->desc->off_on_delay && rdev->last_off) {
 		/* if needed, keep a distance of off_on_delay from last time
 		 * this regulator was disabled.
 		 */
-		unsigned long start_jiffy = jiffies;
-		unsigned long intended, max_delay, remaining;
-
-		max_delay = usecs_to_jiffies(rdev->desc->off_on_delay);
-		intended = rdev->last_off_jiffy + max_delay;
-
-		if (time_before(start_jiffy, intended)) {
-			/* calc remaining jiffies to deal with one-time
-			 * timer wrapping.
-			 * in case of multiple timer wrapping, either it can be
-			 * detected by out-of-range remaining, or it cannot be
-			 * detected and we get a penalty of
-			 * _regulator_enable_delay().
-			 */
-			remaining = intended - start_jiffy;
-			if (remaining <= max_delay)
-				_regulator_enable_delay(
-						jiffies_to_usecs(remaining));
-		}
+		ktime_t end = ktime_add_us(rdev->last_off, rdev->desc->off_on_delay);
+		s64 remaining = ktime_us_delta(end, ktime_get());
+
+		if (remaining > 0)
+			_regulator_enable_delay(remaining);
 	}
 
 	if (rdev->ena_pin) {
@@ -2740,11 +2726,8 @@ static int _regulator_do_disable(struct regulator_dev *rdev)
 			return ret;
 	}
 
-	/* cares about last_off_jiffy only if off_on_delay is required by
-	 * device.
-	 */
 	if (rdev->desc->off_on_delay)
-		rdev->last_off_jiffy = jiffies;
+		rdev->last_off = ktime_get();
 
 	trace_regulator_disable_complete(rdev_get_name(rdev));
 
diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h
index 597ed117086f..4ea520c248e9 100644
--- a/include/linux/regulator/driver.h
+++ b/include/linux/regulator/driver.h
@@ -476,7 +476,7 @@ struct regulator_dev {
 	unsigned int is_switch:1;
 
 	/* time when this regulator was disabled last time */
-	unsigned long last_off_jiffy;
+	ktime_t last_off;
 };
 
 struct regulator_dev *
-- 
cgit v1.2.3


From 06ec5acc7747f225154fcafaf2afe52324694baa Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Tue, 2 Mar 2021 13:54:42 +0200
Subject: net/mlx5: E-Switch, Return eswitch max ports when eswitch is
 supported

mlx5_eswitch_get_total_vports() doesn't honor MLX5_ESWITCH Kconfig flag.

When MLX5_ESWITCH is disabled, the FS layer continues to initialize
eswitch-specific ACL namespaces.
Instead, start honoring the MLX5_ESWITCH flag and perform vport-specific
initialization only when the vport count is non-zero.

Signed-off-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Roi Dayan <roid@nvidia.com>
Reviewed-by: Vu Pham <vuhuong@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 13 +++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/vport.c   | 14 --------------
 include/linux/mlx5/eswitch.h                      | 11 +++++++++--
 3 files changed, 22 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 1bb229ecd43b..c3a58224ae12 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -2205,3 +2205,16 @@ void mlx5_esw_unlock(struct mlx5_eswitch *esw)
 {
 	up_write(&esw->mode_lock);
 }
+
+/**
+ * mlx5_eswitch_get_total_vports - Get total vports of the eswitch
+ *
+ * @dev: Pointer to core device
+ *
+ * mlx5_eswitch_get_total_vports returns total number of eswitch vports.
+ */
+u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev)
+{
+	return MLX5_SPECIAL_VPORTS(dev) + mlx5_core_max_vfs(dev) + mlx5_sf_max_functions(dev);
+}
+EXPORT_SYMBOL_GPL(mlx5_eswitch_get_total_vports);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
index e05c5c0f3ae1..457ad42eaa2a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
@@ -1151,20 +1151,6 @@ u64 mlx5_query_nic_system_image_guid(struct mlx5_core_dev *mdev)
 }
 EXPORT_SYMBOL_GPL(mlx5_query_nic_system_image_guid);
 
-/**
- * mlx5_eswitch_get_total_vports - Get total vports of the eswitch
- *
- * @dev:	Pointer to core device
- *
- * mlx5_eswitch_get_total_vports returns total number of vports for
- * the eswitch.
- */
-u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev)
-{
-	return MLX5_SPECIAL_VPORTS(dev) + mlx5_core_max_vfs(dev) + mlx5_sf_max_functions(dev);
-}
-EXPORT_SYMBOL_GPL(mlx5_eswitch_get_total_vports);
-
 int mlx5_vport_get_other_func_cap(struct mlx5_core_dev *dev, u16 function_id, void *out)
 {
 	u16 opmod = (MLX5_CAP_GENERAL << 1) | (HCA_CAP_OPMOD_GET_MAX & 0x01);
diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h
index 9cf1da2883c6..17109b65c1ac 100644
--- a/include/linux/mlx5/eswitch.h
+++ b/include/linux/mlx5/eswitch.h
@@ -65,8 +65,6 @@ struct mlx5_flow_handle *
 mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *on_esw,
 				    struct mlx5_eswitch_rep *rep, u32 sqn);
 
-u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev);
-
 #ifdef CONFIG_MLX5_ESWITCH
 enum devlink_eswitch_encap_mode
 mlx5_eswitch_get_encap_mode(const struct mlx5_core_dev *dev);
@@ -126,6 +124,8 @@ u32 mlx5_eswitch_get_vport_metadata_for_set(struct mlx5_eswitch *esw,
 #define ESW_TUN_SLOW_TABLE_GOTO_VPORT_MARK ESW_TUN_OPTS_MASK
 
 u8 mlx5_eswitch_mode(struct mlx5_core_dev *dev);
+u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev);
+
 #else  /* CONFIG_MLX5_ESWITCH */
 
 static inline u8 mlx5_eswitch_mode(struct mlx5_core_dev *dev)
@@ -162,10 +162,17 @@ mlx5_eswitch_get_vport_metadata_mask(void)
 {
 	return 0;
 }
+
+static inline u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev)
+{
+	return 0;
+}
+
 #endif /* CONFIG_MLX5_ESWITCH */
 
 static inline bool is_mdev_switchdev_mode(struct mlx5_core_dev *dev)
 {
 	return mlx5_eswitch_mode(dev) == MLX5_ESWITCH_OFFLOADS;
 }
+
 #endif
-- 
cgit v1.2.3


From 9f8c7100c8f9879b7e972205cd1f33f0bc1cc8cb Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Tue, 2 Mar 2021 14:10:49 +0200
Subject: net/mlx5: E-Switch, Prepare to return total vports from eswitch
 struct

Total vports are already stored during eswitch initialization. Instead
of calculating it every time, read it directly from the eswitch.

Additionally, the host PF's SF vport information is available using the
QUERY_HCA_CAP command. It is not available through HCA_CAP of the
eswitch manager PF.
Hence, this patch prepares to return the total eswitch vport count from
the existing eswitch struct.

This further helps to keep eswitch port counting macros and logic within
eswitch.

Signed-off-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Roi Dayan <roid@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 9 ++++++---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 8 ++++++++
 include/linux/mlx5/vport.h                        | 8 --------
 3 files changed, 14 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index c3a58224ae12..f0974aa94574 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1573,8 +1573,8 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev)
 	if (!MLX5_VPORT_MANAGER(dev))
 		return 0;
 
-	total_vports = mlx5_eswitch_get_total_vports(dev);
-
+	total_vports = MLX5_SPECIAL_VPORTS(dev) + mlx5_core_max_vfs(dev) +
+			mlx5_sf_max_functions(dev);
 	esw_info(dev,
 		 "Total vports %d, per vport: max uc(%d) max mc(%d)\n",
 		 total_vports,
@@ -2215,6 +2215,9 @@ void mlx5_esw_unlock(struct mlx5_eswitch *esw)
  */
 u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev)
 {
-	return MLX5_SPECIAL_VPORTS(dev) + mlx5_core_max_vfs(dev) + mlx5_sf_max_functions(dev);
+	struct mlx5_eswitch *esw;
+
+	esw = dev->priv.eswitch;
+	return mlx5_esw_allowed(esw) ? esw->total_vports : 0;
 }
 EXPORT_SYMBOL_GPL(mlx5_eswitch_get_total_vports);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index b289d756a7e4..5ab480a5745d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -545,6 +545,14 @@ static inline u16 mlx5_eswitch_first_host_vport_num(struct mlx5_core_dev *dev)
 		MLX5_VPORT_PF : MLX5_VPORT_FIRST_VF;
 }
 
+#define MLX5_VPORT_PF_PLACEHOLDER		(1u)
+#define MLX5_VPORT_UPLINK_PLACEHOLDER		(1u)
+#define MLX5_VPORT_ECPF_PLACEHOLDER(mdev)	(mlx5_ecpf_vport_exists(mdev))
+
+#define MLX5_SPECIAL_VPORTS(mdev) (MLX5_VPORT_PF_PLACEHOLDER +		\
+				   MLX5_VPORT_UPLINK_PLACEHOLDER +	\
+				   MLX5_VPORT_ECPF_PLACEHOLDER(mdev))
+
 static inline int mlx5_esw_sf_start_idx(const struct mlx5_eswitch *esw)
 {
 	/* PF and VF vports indices start from 0 to max_vfs */
diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h
index 4db87bcfce7b..aad53cb72f17 100644
--- a/include/linux/mlx5/vport.h
+++ b/include/linux/mlx5/vport.h
@@ -36,14 +36,6 @@
 #include <linux/mlx5/driver.h>
 #include <linux/mlx5/device.h>
 
-#define MLX5_VPORT_PF_PLACEHOLDER		(1u)
-#define MLX5_VPORT_UPLINK_PLACEHOLDER		(1u)
-#define MLX5_VPORT_ECPF_PLACEHOLDER(mdev)	(mlx5_ecpf_vport_exists(mdev))
-
-#define MLX5_SPECIAL_VPORTS(mdev) (MLX5_VPORT_PF_PLACEHOLDER +		\
-				   MLX5_VPORT_UPLINK_PLACEHOLDER +	\
-				   MLX5_VPORT_ECPF_PLACEHOLDER(mdev))
-
 #define MLX5_VPORT_MANAGER(mdev)					\
 	(MLX5_CAP_GEN(mdev, vport_group_manager) &&			\
 	 (MLX5_CAP_GEN(mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) &&	\
-- 
cgit v1.2.3


From 1fdd7433a98a2f5511f49ad3f3b82bdd6f77265c Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 1 Apr 2021 16:27:23 -0700
Subject: kbuild: add an elfnote for whether vmlinux is built with lto

Currently, clang LTO built vmlinux won't work with pahole.
LTO introduced cross-cu dwarf tag references and broke the
current pahole model, which handles one cu at a time.
The solution is to merge all cu's as one pahole cu as in [1].
We would like to do this merging only if cross-cu dwarf
references happen. The LTO build mode is a pretty good
indication for that.

In an earlier version of this patch ([2]), the clang flag
-grecord-gcc-switches was proposed to be added to the compilation flags
so pahole could detect "-flto" and then merge cu's.
This will increase the binary size by 1% without LTO though.

Arnaldo suggested to use a note to indicate the vmlinux
is built with LTO. Such a cheap way to get whether the vmlinux
is built with LTO or not helps pahole but is also useful
for tracing as LTO may inline/delete/demote global functions,
promote static functions, etc.

So this patch added an elfnote with a new type LINUX_ELFNOTE_LTO_INFO.
The owner of the note is "Linux".

With gcc 8.4.1 and clang trunk, without LTO, I got
  $ readelf -n vmlinux
  Displaying notes found in: .notes
    Owner                Data size        Description
  ...
    Linux                0x00000004       func
     description data: 00 00 00 00
  ...
With "readelf -x ".notes" vmlinux", I can verify the above "func"
with type code 0x101.

With clang thin-LTO, I got the same as above except the following:
     description data: 01 00 00 00
which indicates the vmlinux is built with LTO.

  [1] https://lore.kernel.org/bpf/20210325065316.3121287-1-yhs@fb.com/
  [2] https://lore.kernel.org/bpf/20210331001623.2778934-1-yhs@fb.com/

Suggested-by: Arnaldo Carvalho de Melo <arnaldo.melo@gmail.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Tested-by: Sedat Dilek <sedat.dilek@gmail.com> # LLVM/Clang v12.0.0-rc4 (x86-64)
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---
 include/linux/elfnote-lto.h | 14 ++++++++++++++
 init/version.c              |  2 ++
 scripts/mod/modpost.c       |  2 ++
 3 files changed, 18 insertions(+)
 create mode 100644 include/linux/elfnote-lto.h

(limited to 'include/linux')

diff --git a/include/linux/elfnote-lto.h b/include/linux/elfnote-lto.h
new file mode 100644
index 000000000000..d4635a3ecc4f
--- /dev/null
+++ b/include/linux/elfnote-lto.h
@@ -0,0 +1,14 @@
+#ifndef __ELFNOTE_LTO_H
+#define __ELFNOTE_LTO_H
+
+#include <linux/elfnote.h>
+
+#define LINUX_ELFNOTE_LTO_INFO	0x101
+
+#ifdef CONFIG_LTO
+#define BUILD_LTO_INFO	ELFNOTE32("Linux", LINUX_ELFNOTE_LTO_INFO, 1)
+#else
+#define BUILD_LTO_INFO	ELFNOTE32("Linux", LINUX_ELFNOTE_LTO_INFO, 0)
+#endif
+
+#endif /* __ELFNOTE_LTO_H */
diff --git a/init/version.c b/init/version.c
index 92afc782b043..1a356f5493e8 100644
--- a/init/version.c
+++ b/init/version.c
@@ -9,6 +9,7 @@
 
 #include <generated/compile.h>
 #include <linux/build-salt.h>
+#include <linux/elfnote-lto.h>
 #include <linux/export.h>
 #include <linux/uts.h>
 #include <linux/utsname.h>
@@ -45,3 +46,4 @@ const char linux_proc_banner[] =
 	" (" LINUX_COMPILER ") %s\n";
 
 BUILD_SALT;
+BUILD_LTO_INFO;
diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
index 20aab6960559..3e623ccc020b 100644
--- a/scripts/mod/modpost.c
+++ b/scripts/mod/modpost.c
@@ -2193,10 +2193,12 @@ static void add_header(struct buffer *b, struct module *mod)
 	 */
 	buf_printf(b, "#define INCLUDE_VERMAGIC\n");
 	buf_printf(b, "#include <linux/build-salt.h>\n");
+	buf_printf(b, "#include <linux/elfnote-lto.h>\n");
 	buf_printf(b, "#include <linux/vermagic.h>\n");
 	buf_printf(b, "#include <linux/compiler.h>\n");
 	buf_printf(b, "\n");
 	buf_printf(b, "BUILD_SALT;\n");
+	buf_printf(b, "BUILD_LTO_INFO;\n");
 	buf_printf(b, "\n");
 	buf_printf(b, "MODULE_INFO(vermagic, VERMAGIC_STRING);\n");
 	buf_printf(b, "MODULE_INFO(name, KBUILD_MODNAME);\n");
-- 
cgit v1.2.3


From 0e0345b77ac4605d5447b252d220e4a2ee118da7 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 15 Apr 2021 20:36:07 +0300
Subject: kbuild: redo fake deps at include/config/*.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Make include/config/foo/bar.h fake deps files generation simpler.

* delete .h suffix
	those aren't header files, shorten filenames,

* delete tolower()
	Linux filesystems can deal with both upper and lowercase
	filenames very well,

* put everything in 1 directory
	Presumably 'mkdir -p' split is from dark times when filesystems
	handled huge directories badly, disks were round adding to
	seek times.

	x86_64 allmodconfig lists 12364 files in include/config.

	../obj/include/config/
	├── 104_QUAD_8
	├── 60XX_WDT
	├── 64BIT
		...
	├── ZSWAP_DEFAULT_ON
	├── ZSWAP_ZPOOL_DEFAULT
	└── ZSWAP_ZPOOL_DEFAULT_ZBUD

	0 directories, 12364 files

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---
 include/linux/compiler-version.h |  2 +-
 init/Kconfig                     |  2 +-
 kernel/gen_kheaders.sh           |  2 +-
 scripts/Makefile.build           |  4 ++--
 scripts/basic/fixdep.c           | 43 ++++++----------------------------------
 scripts/kconfig/confdata.c       | 15 +++++---------
 6 files changed, 16 insertions(+), 52 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler-version.h b/include/linux/compiler-version.h
index 2b2972c77c62..573fa85b6c0c 100644
--- a/include/linux/compiler-version.h
+++ b/include/linux/compiler-version.h
@@ -9,6 +9,6 @@
  * This header exists to force full rebuild when the compiler is upgraded.
  *
  * When fixdep scans this, it will find this string "CONFIG_CC_VERSION_TEXT"
- * and add dependency on include/config/cc/version/text.h, which is touched
+ * and add dependency on include/config/CC_VERSION_TEXT, which is touched
  * by Kconfig when the version string from the compiler changes.
  */
diff --git a/init/Kconfig b/init/Kconfig
index b5744d32c4df..c2339da03468 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -21,7 +21,7 @@ config CC_VERSION_TEXT
 
 	  - Ensure full rebuild when the compiler is updated
 	    include/linux/compiler-version.h contains this option in the comment
-	    line so fixdep adds include/config/cc/version/text.h into the
+	    line so fixdep adds include/config/CC_VERSION_TEXT into the
 	    auto-generated dependency. When the compiler is updated, syncconfig
 	    will touch it and then every file will be rebuilt.
 
diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh
index c1510f0ab3ea..34a1dc2abc7d 100755
--- a/kernel/gen_kheaders.sh
+++ b/kernel/gen_kheaders.sh
@@ -36,7 +36,7 @@ all_dirs="$all_dirs $dir_list"
 #
 # When Kconfig regenerates include/generated/autoconf.h, its timestamp is
 # updated, but the contents might be still the same. When any CONFIG option is
-# changed, Kconfig touches the corresponding timestamp file include/config/*.h.
+# changed, Kconfig touches the corresponding timestamp file include/config/*.
 # Hence, the md5sum detects the configuration change anyway. We do not need to
 # check include/generated/autoconf.h explicitly.
 #
diff --git a/scripts/Makefile.build b/scripts/Makefile.build
index e47c054d3db2..5e39b0517186 100644
--- a/scripts/Makefile.build
+++ b/scripts/Makefile.build
@@ -239,8 +239,8 @@ endif # CONFIG_STACK_VALIDATION
 
 # Rebuild all objects when objtool changes, or is enabled/disabled.
 objtool_dep = $(objtool_obj)					\
-	      $(wildcard include/config/orc/unwinder.h		\
-			 include/config/stack/validation.h)
+	      $(wildcard include/config/ORC_UNWINDER		\
+			 include/config/STACK_VALIDATION)
 
 ifdef CONFIG_TRIM_UNUSED_KSYMS
 cmd_gen_ksymdeps = \
diff --git a/scripts/basic/fixdep.c b/scripts/basic/fixdep.c
index d98540552941..44e887cff49b 100644
--- a/scripts/basic/fixdep.c
+++ b/scripts/basic/fixdep.c
@@ -34,7 +34,7 @@
  * the config symbols are rebuilt.
  *
  * So if the user changes his CONFIG_HIS_DRIVER option, only the objects
- * which depend on "include/config/his/driver.h" will be rebuilt,
+ * which depend on "include/config/HIS_DRIVER" will be rebuilt,
  * so most likely only his driver ;-)
  *
  * The idea above dates, by the way, back to Michael E Chastain, AFAIK.
@@ -74,7 +74,7 @@
  *
  * and then basically copies the .<target>.d file to stdout, in the
  * process filtering out the dependency on autoconf.h and adding
- * dependencies on include/config/my/option.h for every
+ * dependencies on include/config/MY_OPTION for every
  * CONFIG_MY_OPTION encountered in any of the prerequisites.
  *
  * We don't even try to really parse the header files, but
@@ -107,8 +107,8 @@ static void usage(void)
 
 /*
  * In the intended usage of this program, the stdout is redirected to .*.cmd
- * files. The return value of printf() and putchar() must be checked to catch
- * any error, e.g. "No space left on device".
+ * files. The return value of printf() must be checked to catch any error,
+ * e.g. "No space left on device".
  */
 static void xprintf(const char *format, ...)
 {
@@ -124,38 +124,6 @@ static void xprintf(const char *format, ...)
 	va_end(ap);
 }
 
-static void xputchar(int c)
-{
-	int ret;
-
-	ret = putchar(c);
-	if (ret == EOF) {
-		perror("fixdep");
-		exit(1);
-	}
-}
-
-/*
- * Print out a dependency path from a symbol name
- */
-static void print_dep(const char *m, int slen, const char *dir)
-{
-	int c, prev_c = '/', i;
-
-	xprintf("    $(wildcard %s/", dir);
-	for (i = 0; i < slen; i++) {
-		c = m[i];
-		if (c == '_')
-			c = '/';
-		else
-			c = tolower(c);
-		if (c != '/' || prev_c != '/')
-			xputchar(c);
-		prev_c = c;
-	}
-	xprintf(".h) \\\n");
-}
-
 struct item {
 	struct item	*next;
 	unsigned int	len;
@@ -220,7 +188,8 @@ static void use_config(const char *m, int slen)
 	    return;
 
 	define_config(m, slen, hash);
-	print_dep(m, slen, "include/config");
+	/* Print out a dependency path from a symbol name. */
+	xprintf("    $(wildcard include/config/%.*s) \\\n", slen, m);
 }
 
 /* test if s ends in sub */
diff --git a/scripts/kconfig/confdata.c b/scripts/kconfig/confdata.c
index 2568dbe16ed6..c7b7e1c62a9c 100644
--- a/scripts/kconfig/confdata.c
+++ b/scripts/kconfig/confdata.c
@@ -130,19 +130,14 @@ static size_t depfile_prefix_len;
 static int conf_touch_dep(const char *name)
 {
 	int fd, ret;
-	const char *s;
-	char *d, c;
+	char *d;
 
-	/* check overflow: prefix + name + ".h" + '\0' must fit in buffer. */
-	if (depfile_prefix_len + strlen(name) + 3 > sizeof(depfile_path))
+	/* check overflow: prefix + name + '\0' must fit in buffer. */
+	if (depfile_prefix_len + strlen(name) + 1 > sizeof(depfile_path))
 		return -1;
 
 	d = depfile_path + depfile_prefix_len;
-	s = name;
-
-	while ((c = *s++))
-		*d++ = (c == '_') ? '/' : tolower(c);
-	strcpy(d, ".h");
+	strcpy(d, name);
 
 	/* Assume directory path already exists. */
 	fd = open(depfile_path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
@@ -465,7 +460,7 @@ load:
 					 * Reading from include/config/auto.conf
 					 * If CONFIG_FOO previously existed in
 					 * auto.conf but it is missing now,
-					 * include/config/foo.h must be touched.
+					 * include/config/FOO must be touched.
 					 */
 					conf_touch_dep(line + strlen(CONFIG_));
 				else
-- 
cgit v1.2.3


From 81dd4d4d6178306ab31db91bdc7353d485bdafce Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Sat, 24 Apr 2021 10:04:15 -0500
Subject: dmaengine: idxd: Add IDXD performance monitor support

Implement the IDXD performance monitor capability (named 'perfmon' in
the DSA (Data Streaming Accelerator) spec [1]), which supports the
collection of information about key events occurring during DSA and
IAX (Intel Analytics Accelerator) device execution, to assist in
performance tuning and debugging.

The idxd perfmon support is implemented as part of the IDXD driver and
interfaces with the Linux perf framework.  It has several features in
common with the existing uncore pmu support:

  - it does not support sampling
  - does not support per-thread counting

However it also has some unique features not present in the core and
uncore support:

  - all general-purpose counters are identical, thus no event constraints
  - operation is always system-wide

While the core perf subsystem assumes that all counters are by default
per-cpu, the uncore pmus are socket-scoped and use a cpu mask to
restrict counting to one cpu from each socket.  IDXD counters use a
similar strategy but expand the scope even further; since IDXD
counters are system-wide and can be read from any cpu, the IDXD perf
driver picks a single cpu to do the work (with cpu hotplug notifiers
to choose a different cpu if the chosen one is taken off-line).

More specifically, the perf userspace tool by default opens a counter
for each cpu for an event.  However, if it finds a cpumask file
associated with the pmu under sysfs, as is the case with the uncore
pmus, it will open counters only on the cpus specified by the cpumask.
Since perfmon only needs to open a single counter per event for a
given IDXD device, the perfmon driver will create a sysfs cpumask file
for the device and insert the first cpu of the system into it.  When a
user uses perf to open an event, perf will open a single counter on
the cpu specified by the cpu mask.  This amounts to the default
system-wide rather than per-cpu counting mentioned previously for
perfmon pmu events.  In order to keep the cpu mask up-to-date, the
driver implements cpu hotplug support for multiple devices, as IDXD
usually enumerates and registers more than one idxd device.

The perfmon driver implements basic perfmon hardware capability
discovery and configuration, and is initialized by the IDXD driver's
probe function.  During initialization, the driver retrieves the total
number of supported performance counters, the pmu ID, and the device
type from idxd device, and registers itself under the Linux perf
framework.

The perf userspace tool can be used to monitor single or multiple
events depending on the given configuration, as well as event groups,
which are also supported by the perfmon driver.  The user configures
events using the perf tool command-line interface by specifying the
event and corresponding event category, along with an optional set of
filters that can be used to restrict counting to specific work queues,
traffic classes, page and transfer sizes, and engines (See [1] for
specifics).

With the configuration specified by the user, the perf tool issues a
system call passing that information to the kernel, which uses it to
initialize the specified event(s).  The event(s) are opened and
started, and following termination of the perf command, they're
stopped.  At that point, the perfmon driver will read the latest count
for the event(s), calculate the difference between the latest counter
values and previously tracked counter values, and display the final
incremental count as the event count for the cycle.  An overflow
handler registered on the IDXD irq path is used to account for counter
overflows, which are signaled by an overflow interrupt.

Below are a couple of examples of perf usage for monitoring DSA events.

The following monitors all events in the 'engine' category.  Because
no filters are specified, this captures all engine events for the
workload, which in this case is 19 iterations of the work generated by
the kernel dmatest module.

Details describing the events can be found in Appendix D of [1],
Performance Monitoring Events, but briefly they are:

  event 0x1:  total input data processed, in 32-byte units
  event 0x2:  total data written, in 32-byte units
  event 0x4:  number of work descriptors that read the source
  event 0x8:  number of work descriptors that write the destination
  event 0x10: number of work descriptors dispatched from batch descriptors
  event 0x20: number of work descriptors dispatched from work queues

 # perf stat -e dsa0/event=0x1,event_category=0x1/,
                dsa0/event=0x2,event_category=0x1/,
		dsa0/event=0x4,event_category=0x1/,
		dsa0/event=0x8,event_category=0x1/,
		dsa0/event=0x10,event_category=0x1/,
		dsa0/event=0x20,event_category=0x1/
		  modprobe dmatest channel=dma0chan0 timeout=2000
		  iterations=19 run=1 wait=1

     Performance counter stats for 'system wide':

                 5,332      dsa0/event=0x1,event_category=0x1/
                 5,327      dsa0/event=0x2,event_category=0x1/
                    19      dsa0/event=0x4,event_category=0x1/
                    19      dsa0/event=0x8,event_category=0x1/
                     0      dsa0/event=0x10,event_category=0x1/
                    19      dsa0/event=0x20,event_category=0x1/

          21.977436186 seconds time elapsed

The command below illustrates filter usage with a simple example.  It
specifies that MEM_MOVE operations should be counted for the DSA
device dsa0 (event 0x8 corresponds to the EV_MEM_MOVE event - Number
of Memory Move Descriptors, which is part of event category 0x3 -
Operations. The detailed category and event IDs are available in
Appendix D, Performance Monitoring Events, of [1]).  In addition to
the event and event category, a number of filters are also specified
(the detailed filter values are available in Chapter 6.4 (Filter
Support) of [1]), which will restrict counting to only those events
that meet all of the filter criteria.  In this case, the filters
specify that only MEM_MOVE operations that are serviced by work queue
wq0 and specifically engine number engine0 and traffic class tc0
having sizes between 0 and 4k and page size of between 0 and 1G result
in a counter hit; anything else will be filtered out and not appear in
the final count.  Note that filters are optional - any filter not
specified is assumed to be all ones and will pass anything.

 # perf stat -e dsa0/filter_wq=0x1,filter_tc=0x1,filter_sz=0x7,
                filter_eng=0x1,event=0x8,event_category=0x3/
		  modprobe dmatest channel=dma0chan0 timeout=2000
		  iterations=19 run=1 wait=1

     Performance counter stats for 'system wide':

       19      dsa0/filter_wq=0x1,filter_tc=0x1,filter_sz=0x7,
               filter_eng=0x1,event=0x8,event_category=0x3/

          21.865914091 seconds time elapsed

The output above reflects that the unspecified workload resulted in
the counting of 19 MEM_MOVE operation events that met the filter
criteria.

[1]: https://software.intel.com/content/www/us/en/develop/download/intel-data-streaming-accelerator-preliminary-architecture-specification.html

[ Based on work originally by Jing Lin. ]

Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Link: https://lore.kernel.org/r/0c5080a7d541904c4ad42b848c76a1ce056ddac7.1619276133.git.zanussi@kernel.org
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 .../ABI/testing/sysfs-bus-event_source-devices-dsa |  30 +
 drivers/dma/Kconfig                                |  12 +
 drivers/dma/idxd/Makefile                          |   2 +
 drivers/dma/idxd/idxd.h                            |  45 ++
 drivers/dma/idxd/perfmon.c                         | 662 +++++++++++++++++++++
 drivers/dma/idxd/perfmon.h                         | 119 ++++
 drivers/dma/idxd/registers.h                       | 108 ++++
 include/linux/cpuhotplug.h                         |   1 +
 8 files changed, 979 insertions(+)
 create mode 100644 Documentation/ABI/testing/sysfs-bus-event_source-devices-dsa
 create mode 100644 drivers/dma/idxd/perfmon.c
 create mode 100644 drivers/dma/idxd/perfmon.h

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-bus-event_source-devices-dsa b/Documentation/ABI/testing/sysfs-bus-event_source-devices-dsa
new file mode 100644
index 000000000000..3c7d132281b0
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-event_source-devices-dsa
@@ -0,0 +1,30 @@
+What:		/sys/bus/event_source/devices/dsa*/format
+Date:		April 2021
+KernelVersion:  5.13
+Contact:	Tom Zanussi <tom.zanussi@linux.intel.com>
+Description:	Read-only.  Attribute group to describe the magic bits
+		that go into perf_event_attr.config or
+		perf_event_attr.config1 for the IDXD DSA pmu.  (See also
+		ABI/testing/sysfs-bus-event_source-devices-format).
+
+		Each attribute in this group defines a bit range in
+		perf_event_attr.config or perf_event_attr.config1.
+		All supported attributes are listed below (See the
+		IDXD DSA Spec for possible attribute values)::
+
+		    event_category = "config:0-3"    - event category
+		    event          = "config:4-31"   - event ID
+
+		    filter_wq      = "config1:0-31"  - workqueue filter
+		    filter_tc      = "config1:32-39" - traffic class filter
+		    filter_pgsz    = "config1:40-43" - page size filter
+		    filter_sz      = "config1:44-51" - transfer size filter
+		    filter_eng     = "config1:52-59" - engine filter
+
+What:		/sys/bus/event_source/devices/dsa*/cpumask
+Date:		April 2021
+KernelVersion:  5.13
+Contact:	Tom Zanussi <tom.zanussi@linux.intel.com>
+Description:    Read-only.  This file always returns the cpu to which the
+                IDXD DSA pmu is bound for access to all dsa pmu
+		performance monitoring events.
diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index 0c2827fd8c19..b417217c148c 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -300,6 +300,18 @@ config INTEL_IDXD_SVM
 	depends on PCI_PASID
 	depends on PCI_IOV
 
+config INTEL_IDXD_PERFMON
+	bool "Intel Data Accelerators performance monitor support"
+	depends on INTEL_IDXD
+	help
+	  Enable performance monitor (pmu) support for the Intel(R)
+	  data accelerators present in Intel Xeon CPU.  With this
+	  enabled, perf can be used to monitor the DSA (Intel Data
+	  Streaming Accelerator) events described in the Intel DSA
+	  spec.
+
+	  If unsure, say N.
+
 config INTEL_IOATDMA
 	tristate "Intel I/OAT DMA support"
 	depends on PCI && X86_64
diff --git a/drivers/dma/idxd/Makefile b/drivers/dma/idxd/Makefile
index 8978b898d777..6d11558756f8 100644
--- a/drivers/dma/idxd/Makefile
+++ b/drivers/dma/idxd/Makefile
@@ -1,2 +1,4 @@
 obj-$(CONFIG_INTEL_IDXD) += idxd.o
 idxd-y := init.o irq.o device.o sysfs.o submit.o dma.o cdev.o
+
+idxd-$(CONFIG_INTEL_IDXD_PERFMON) += perfmon.o
diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h
index 97c96ca6ab70..26482c7d4c3a 100644
--- a/drivers/dma/idxd/idxd.h
+++ b/drivers/dma/idxd/idxd.h
@@ -9,6 +9,8 @@
 #include <linux/wait.h>
 #include <linux/cdev.h>
 #include <linux/idr.h>
+#include <linux/pci.h>
+#include <linux/perf_event.h>
 #include "registers.h"
 
 #define IDXD_DRIVER_VERSION	"1.00"
@@ -29,6 +31,7 @@ enum idxd_type {
 };
 
 #define IDXD_NAME_SIZE		128
+#define IDXD_PMU_EVENT_MAX	64
 
 struct idxd_device_driver {
 	struct device_driver drv;
@@ -61,6 +64,31 @@ struct idxd_group {
 	int tc_b;
 };
 
+struct idxd_pmu {
+	struct idxd_device *idxd;
+
+	struct perf_event *event_list[IDXD_PMU_EVENT_MAX];
+	int n_events;
+
+	DECLARE_BITMAP(used_mask, IDXD_PMU_EVENT_MAX);
+
+	struct pmu pmu;
+	char name[IDXD_NAME_SIZE];
+	int cpu;
+
+	int n_counters;
+	int counter_width;
+	int n_event_categories;
+
+	bool per_counter_caps_supported;
+	unsigned long supported_event_categories;
+
+	unsigned long supported_filters;
+	int n_filters;
+
+	struct hlist_node cpuhp_node;
+};
+
 #define IDXD_MAX_PRIORITY	0xf
 
 enum idxd_wq_state {
@@ -241,6 +269,8 @@ struct idxd_device {
 	struct work_struct work;
 
 	int *int_handles;
+
+	struct idxd_pmu *idxd_pmu;
 };
 
 /* IDXD software descriptor */
@@ -437,4 +467,19 @@ int idxd_cdev_get_major(struct idxd_device *idxd);
 int idxd_wq_add_cdev(struct idxd_wq *wq);
 void idxd_wq_del_cdev(struct idxd_wq *wq);
 
+/* perfmon */
+#if IS_ENABLED(CONFIG_INTEL_IDXD_PERFMON)
+int perfmon_pmu_init(struct idxd_device *idxd);
+void perfmon_pmu_remove(struct idxd_device *idxd);
+void perfmon_counter_overflow(struct idxd_device *idxd);
+void perfmon_init(void);
+void perfmon_exit(void);
+#else
+static inline int perfmon_pmu_init(struct idxd_device *idxd) { return 0; }
+static inline void perfmon_pmu_remove(struct idxd_device *idxd) {}
+static inline void perfmon_counter_overflow(struct idxd_device *idxd) {}
+static inline void perfmon_init(void) {}
+static inline void perfmon_exit(void) {}
+#endif
+
 #endif
diff --git a/drivers/dma/idxd/perfmon.c b/drivers/dma/idxd/perfmon.c
new file mode 100644
index 000000000000..d73004f47cf4
--- /dev/null
+++ b/drivers/dma/idxd/perfmon.c
@@ -0,0 +1,662 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2020 Intel Corporation. All rights rsvd. */
+
+#include <linux/sched/task.h>
+#include <linux/io-64-nonatomic-lo-hi.h>
+#include "idxd.h"
+#include "perfmon.h"
+
+static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr,
+			    char *buf);
+
+static cpumask_t		perfmon_dsa_cpu_mask;
+static bool			cpuhp_set_up;
+static enum cpuhp_state		cpuhp_slot;
+
+/*
+ * perf userspace reads this attribute to determine which cpus to open
+ * counters on.  It's connected to perfmon_dsa_cpu_mask, which is
+ * maintained by the cpu hotplug handlers.
+ */
+static DEVICE_ATTR_RO(cpumask);
+
+static struct attribute *perfmon_cpumask_attrs[] = {
+	&dev_attr_cpumask.attr,
+	NULL,
+};
+
+static struct attribute_group cpumask_attr_group = {
+	.attrs = perfmon_cpumask_attrs,
+};
+
+/*
+ * These attributes specify the bits in the config word that the perf
+ * syscall uses to pass the event ids and categories to perfmon.
+ */
+DEFINE_PERFMON_FORMAT_ATTR(event_category, "config:0-3");
+DEFINE_PERFMON_FORMAT_ATTR(event, "config:4-31");
+
+/*
+ * These attributes specify the bits in the config1 word that the perf
+ * syscall uses to pass filter data to perfmon.
+ */
+DEFINE_PERFMON_FORMAT_ATTR(filter_wq, "config1:0-31");
+DEFINE_PERFMON_FORMAT_ATTR(filter_tc, "config1:32-39");
+DEFINE_PERFMON_FORMAT_ATTR(filter_pgsz, "config1:40-43");
+DEFINE_PERFMON_FORMAT_ATTR(filter_sz, "config1:44-51");
+DEFINE_PERFMON_FORMAT_ATTR(filter_eng, "config1:52-59");
+
+#define PERFMON_FILTERS_START	2
+#define PERFMON_FILTERS_MAX	5
+
+static struct attribute *perfmon_format_attrs[] = {
+	&format_attr_idxd_event_category.attr,
+	&format_attr_idxd_event.attr,
+	&format_attr_idxd_filter_wq.attr,
+	&format_attr_idxd_filter_tc.attr,
+	&format_attr_idxd_filter_pgsz.attr,
+	&format_attr_idxd_filter_sz.attr,
+	&format_attr_idxd_filter_eng.attr,
+	NULL,
+};
+
+static struct attribute_group perfmon_format_attr_group = {
+	.name = "format",
+	.attrs = perfmon_format_attrs,
+};
+
+static const struct attribute_group *perfmon_attr_groups[] = {
+	&perfmon_format_attr_group,
+	&cpumask_attr_group,
+	NULL,
+};
+
+static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr,
+			    char *buf)
+{
+	return cpumap_print_to_pagebuf(true, buf, &perfmon_dsa_cpu_mask);
+}
+
+static bool is_idxd_event(struct idxd_pmu *idxd_pmu, struct perf_event *event)
+{
+	return &idxd_pmu->pmu == event->pmu;
+}
+
+static int perfmon_collect_events(struct idxd_pmu *idxd_pmu,
+				  struct perf_event *leader,
+				  bool do_grp)
+{
+	struct perf_event *event;
+	int n, max_count;
+
+	max_count = idxd_pmu->n_counters;
+	n = idxd_pmu->n_events;
+
+	if (n >= max_count)
+		return -EINVAL;
+
+	if (is_idxd_event(idxd_pmu, leader)) {
+		idxd_pmu->event_list[n] = leader;
+		idxd_pmu->event_list[n]->hw.idx = n;
+		n++;
+	}
+
+	if (!do_grp)
+		return n;
+
+	for_each_sibling_event(event, leader) {
+		if (!is_idxd_event(idxd_pmu, event) ||
+		    event->state <= PERF_EVENT_STATE_OFF)
+			continue;
+
+		if (n >= max_count)
+			return -EINVAL;
+
+		idxd_pmu->event_list[n] = event;
+		idxd_pmu->event_list[n]->hw.idx = n;
+		n++;
+	}
+
+	return n;
+}
+
+static void perfmon_assign_hw_event(struct idxd_pmu *idxd_pmu,
+				    struct perf_event *event, int idx)
+{
+	struct idxd_device *idxd = idxd_pmu->idxd;
+	struct hw_perf_event *hwc = &event->hw;
+
+	hwc->idx = idx;
+	hwc->config_base = ioread64(CNTRCFG_REG(idxd, idx));
+	hwc->event_base = ioread64(CNTRCFG_REG(idxd, idx));
+}
+
+static int perfmon_assign_event(struct idxd_pmu *idxd_pmu,
+				struct perf_event *event)
+{
+	int i;
+
+	for (i = 0; i < IDXD_PMU_EVENT_MAX; i++)
+		if (!test_and_set_bit(i, idxd_pmu->used_mask))
+			return i;
+
+	return -EINVAL;
+}
+
+/*
+ * Check whether there are enough counters to satisfy that all the
+ * events in the group can actually be scheduled at the same time.
+ *
+ * To do this, create a fake idxd_pmu object so the event collection
+ * and assignment functions can be used without affecting the internal
+ * state of the real idxd_pmu object.
+ */
+static int perfmon_validate_group(struct idxd_pmu *pmu,
+				  struct perf_event *event)
+{
+	struct perf_event *leader = event->group_leader;
+	struct idxd_pmu *fake_pmu;
+	int i, ret = 0, n, idx;
+
+	fake_pmu = kzalloc(sizeof(*fake_pmu), GFP_KERNEL);
+	if (!fake_pmu)
+		return -ENOMEM;
+
+	fake_pmu->pmu.name = pmu->pmu.name;
+	fake_pmu->n_counters = pmu->n_counters;
+
+	n = perfmon_collect_events(fake_pmu, leader, true);
+	if (n < 0) {
+		ret = n;
+		goto out;
+	}
+
+	fake_pmu->n_events = n;
+	n = perfmon_collect_events(fake_pmu, event, false);
+	if (n < 0) {
+		ret = n;
+		goto out;
+	}
+
+	fake_pmu->n_events = n;
+
+	for (i = 0; i < n; i++) {
+		event = fake_pmu->event_list[i];
+
+		idx = perfmon_assign_event(fake_pmu, event);
+		if (idx < 0) {
+			ret = idx;
+			goto out;
+		}
+	}
+out:
+	kfree(fake_pmu);
+
+	return ret;
+}
+
+static int perfmon_pmu_event_init(struct perf_event *event)
+{
+	struct idxd_device *idxd;
+	int ret = 0;
+
+	idxd = event_to_idxd(event);
+	event->hw.idx = -1;
+
+	if (event->attr.type != event->pmu->type)
+		return -ENOENT;
+
+	/* sampling not supported */
+	if (event->attr.sample_period)
+		return -EINVAL;
+
+	if (event->cpu < 0)
+		return -EINVAL;
+
+	if (event->pmu != &idxd->idxd_pmu->pmu)
+		return -EINVAL;
+
+	event->hw.event_base = ioread64(PERFMON_TABLE_OFFSET(idxd));
+	event->cpu = idxd->idxd_pmu->cpu;
+	event->hw.config = event->attr.config;
+
+	if (event->group_leader != event)
+		 /* non-group events have themselves as leader */
+		ret = perfmon_validate_group(idxd->idxd_pmu, event);
+
+	return ret;
+}
+
+static inline u64 perfmon_pmu_read_counter(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct idxd_device *idxd;
+	int cntr = hwc->idx;
+
+	idxd = event_to_idxd(event);
+
+	return ioread64(CNTRDATA_REG(idxd, cntr));
+}
+
+static void perfmon_pmu_event_update(struct perf_event *event)
+{
+	struct idxd_device *idxd = event_to_idxd(event);
+	u64 prev_raw_count, new_raw_count, delta, p, n;
+	int shift = 64 - idxd->idxd_pmu->counter_width;
+	struct hw_perf_event *hwc = &event->hw;
+
+	do {
+		prev_raw_count = local64_read(&hwc->prev_count);
+		new_raw_count = perfmon_pmu_read_counter(event);
+	} while (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
+			new_raw_count) != prev_raw_count);
+
+	n = (new_raw_count << shift);
+	p = (prev_raw_count << shift);
+
+	delta = ((n - p) >> shift);
+
+	local64_add(delta, &event->count);
+}
+
+void perfmon_counter_overflow(struct idxd_device *idxd)
+{
+	int i, n_counters, max_loop = OVERFLOW_SIZE;
+	struct perf_event *event;
+	unsigned long ovfstatus;
+
+	n_counters = min(idxd->idxd_pmu->n_counters, OVERFLOW_SIZE);
+
+	ovfstatus = ioread32(OVFSTATUS_REG(idxd));
+
+	/*
+	 * While updating overflowed counters, other counters behind
+	 * them could overflow and be missed in a given pass.
+	 * Normally this could happen at most n_counters times, but in
+	 * theory a tiny counter width could result in continual
+	 * overflows and endless looping.  max_loop provides a
+	 * failsafe in that highly unlikely case.
+	 */
+	while (ovfstatus && max_loop--) {
+		/* Figure out which counter(s) overflowed */
+		for_each_set_bit(i, &ovfstatus, n_counters) {
+			unsigned long ovfstatus_clear = 0;
+
+			/* Update event->count for overflowed counter */
+			event = idxd->idxd_pmu->event_list[i];
+			perfmon_pmu_event_update(event);
+			/* Writing 1 to OVFSTATUS bit clears it */
+			set_bit(i, &ovfstatus_clear);
+			iowrite32(ovfstatus_clear, OVFSTATUS_REG(idxd));
+		}
+
+		ovfstatus = ioread32(OVFSTATUS_REG(idxd));
+	}
+
+	/*
+	 * Should never happen.  If so, it means a counter(s) looped
+	 * around twice while this handler was running.
+	 */
+	WARN_ON_ONCE(ovfstatus);
+}
+
+static inline void perfmon_reset_config(struct idxd_device *idxd)
+{
+	iowrite32(CONFIG_RESET, PERFRST_REG(idxd));
+	iowrite32(0, OVFSTATUS_REG(idxd));
+	iowrite32(0, PERFFRZ_REG(idxd));
+}
+
+static inline void perfmon_reset_counters(struct idxd_device *idxd)
+{
+	iowrite32(CNTR_RESET, PERFRST_REG(idxd));
+}
+
+static inline void perfmon_reset(struct idxd_device *idxd)
+{
+	perfmon_reset_config(idxd);
+	perfmon_reset_counters(idxd);
+}
+
+static void perfmon_pmu_event_start(struct perf_event *event, int mode)
+{
+	u32 flt_wq, flt_tc, flt_pg_sz, flt_xfer_sz, flt_eng = 0;
+	u64 cntr_cfg, cntrdata, event_enc, event_cat = 0;
+	struct hw_perf_event *hwc = &event->hw;
+	union filter_cfg flt_cfg;
+	union event_cfg event_cfg;
+	struct idxd_device *idxd;
+	int cntr;
+
+	idxd = event_to_idxd(event);
+
+	event->hw.idx = hwc->idx;
+	cntr = hwc->idx;
+
+	/* Obtain event category and event value from user space */
+	event_cfg.val = event->attr.config;
+	flt_cfg.val = event->attr.config1;
+	event_cat = event_cfg.event_cat;
+	event_enc = event_cfg.event_enc;
+
+	/* Obtain filter configuration from user space */
+	flt_wq = flt_cfg.wq;
+	flt_tc = flt_cfg.tc;
+	flt_pg_sz = flt_cfg.pg_sz;
+	flt_xfer_sz = flt_cfg.xfer_sz;
+	flt_eng = flt_cfg.eng;
+
+	if (flt_wq && test_bit(FLT_WQ, &idxd->idxd_pmu->supported_filters))
+		iowrite32(flt_wq, FLTCFG_REG(idxd, cntr, FLT_WQ));
+	if (flt_tc && test_bit(FLT_TC, &idxd->idxd_pmu->supported_filters))
+		iowrite32(flt_tc, FLTCFG_REG(idxd, cntr, FLT_TC));
+	if (flt_pg_sz && test_bit(FLT_PG_SZ, &idxd->idxd_pmu->supported_filters))
+		iowrite32(flt_pg_sz, FLTCFG_REG(idxd, cntr, FLT_PG_SZ));
+	if (flt_xfer_sz && test_bit(FLT_XFER_SZ, &idxd->idxd_pmu->supported_filters))
+		iowrite32(flt_xfer_sz, FLTCFG_REG(idxd, cntr, FLT_XFER_SZ));
+	if (flt_eng && test_bit(FLT_ENG, &idxd->idxd_pmu->supported_filters))
+		iowrite32(flt_eng, FLTCFG_REG(idxd, cntr, FLT_ENG));
+
+	/* Read the start value */
+	cntrdata = ioread64(CNTRDATA_REG(idxd, cntr));
+	local64_set(&event->hw.prev_count, cntrdata);
+
+	/* Set counter to event/category */
+	cntr_cfg = event_cat << CNTRCFG_CATEGORY_SHIFT;
+	cntr_cfg |= event_enc << CNTRCFG_EVENT_SHIFT;
+	/* Set interrupt on overflow and counter enable bits */
+	cntr_cfg |= (CNTRCFG_IRQ_OVERFLOW | CNTRCFG_ENABLE);
+
+	iowrite64(cntr_cfg, CNTRCFG_REG(idxd, cntr));
+}
+
+static void perfmon_pmu_event_stop(struct perf_event *event, int mode)
+{
+	struct idxd_device *idxd = event_to_idxd(event);
+	struct hw_perf_event *hwc = &event->hw;
+	int i, cntr = hwc->idx;
+	u64 cntr_cfg;
+
+	/* Drop the event from event_list, closing the gap it leaves */
+	for (i = 0; i < idxd->idxd_pmu->n_events; i++) {
+		if (event != idxd->idxd_pmu->event_list[i])
+			continue;
+
+		for (++i; i < idxd->idxd_pmu->n_events; i++)
+			idxd->idxd_pmu->event_list[i - 1] = idxd->idxd_pmu->event_list[i];
+		--idxd->idxd_pmu->n_events;
+		break;
+	}
+
+	cntr_cfg = ioread64(CNTRCFG_REG(idxd, cntr));
+	cntr_cfg &= ~CNTRCFG_ENABLE;
+	iowrite64(cntr_cfg, CNTRCFG_REG(idxd, cntr));
+
+	/* mode is a PERF_EF_* bit mask, so test the bit, not equality */
+	if (mode & PERF_EF_UPDATE)
+		perfmon_pmu_event_update(event);
+
+	/* Release the counter so perfmon_assign_event() can reuse it */
+	event->hw.idx = -1;
+	clear_bit(cntr, idxd->idxd_pmu->used_mask);
+}
+
+static void perfmon_pmu_event_del(struct perf_event *event, int mode)
+{
+	perfmon_pmu_event_stop(event, PERF_EF_UPDATE);
+}
+
+static int perfmon_pmu_event_add(struct perf_event *event, int flags)
+{
+	struct idxd_device *idxd = event_to_idxd(event);
+	struct idxd_pmu *idxd_pmu = idxd->idxd_pmu;
+	struct hw_perf_event *hwc = &event->hw;
+	int idx, n;
+
+	n = perfmon_collect_events(idxd_pmu, event, false);
+	if (n < 0)
+		return n;
+
+	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+	if (!(flags & PERF_EF_START))
+		hwc->state |= PERF_HES_ARCH;
+
+	idx = perfmon_assign_event(idxd_pmu, event);
+	if (idx < 0)
+		return idx;
+
+	perfmon_assign_hw_event(idxd_pmu, event, idx);
+
+	if (flags & PERF_EF_START)
+		perfmon_pmu_event_start(event, 0);
+
+	idxd_pmu->n_events = n;
+
+	return 0;
+}
+
+static void enable_perfmon_pmu(struct idxd_device *idxd)
+{
+	iowrite32(COUNTER_UNFREEZE, PERFFRZ_REG(idxd));
+}
+
+static void disable_perfmon_pmu(struct idxd_device *idxd)
+{
+	iowrite32(COUNTER_FREEZE, PERFFRZ_REG(idxd));
+}
+
+static void perfmon_pmu_enable(struct pmu *pmu)
+{
+	struct idxd_device *idxd = pmu_to_idxd(pmu);
+
+	enable_perfmon_pmu(idxd);
+}
+
+static void perfmon_pmu_disable(struct pmu *pmu)
+{
+	struct idxd_device *idxd = pmu_to_idxd(pmu);
+
+	disable_perfmon_pmu(idxd);
+}
+
+static void skip_filter(int i)
+{
+	int j;
+
+	for (j = i; j < PERFMON_FILTERS_MAX; j++)
+		perfmon_format_attrs[PERFMON_FILTERS_START + j] =
+			perfmon_format_attrs[PERFMON_FILTERS_START + j + 1];
+}
+
+static void idxd_pmu_init(struct idxd_pmu *idxd_pmu)
+{
+	int i;
+
+	/* Go high to low: skip_filter() shifts every later slot down by one */
+	for (i = PERFMON_FILTERS_MAX - 1; i >= 0; i--)
+		if (!test_bit(i, &idxd_pmu->supported_filters))
+			skip_filter(i);
+
+	idxd_pmu->pmu.name		= idxd_pmu->name;
+	idxd_pmu->pmu.attr_groups	= perfmon_attr_groups;
+	idxd_pmu->pmu.task_ctx_nr	= perf_invalid_context;
+	idxd_pmu->pmu.event_init	= perfmon_pmu_event_init;
+	idxd_pmu->pmu.pmu_enable	= perfmon_pmu_enable;
+	idxd_pmu->pmu.pmu_disable	= perfmon_pmu_disable;
+	idxd_pmu->pmu.add		= perfmon_pmu_event_add;
+	idxd_pmu->pmu.del		= perfmon_pmu_event_del;
+	idxd_pmu->pmu.start		= perfmon_pmu_event_start;
+	idxd_pmu->pmu.stop		= perfmon_pmu_event_stop;
+	idxd_pmu->pmu.read		= perfmon_pmu_event_update;
+	idxd_pmu->pmu.capabilities	= PERF_PMU_CAP_NO_EXCLUDE;
+	idxd_pmu->pmu.module		= THIS_MODULE;
+}
+
+void perfmon_pmu_remove(struct idxd_device *idxd)
+{
+	if (!idxd->idxd_pmu)
+		return;
+
+	cpuhp_state_remove_instance(cpuhp_slot, &idxd->idxd_pmu->cpuhp_node);
+	perf_pmu_unregister(&idxd->idxd_pmu->pmu);
+	kfree(idxd->idxd_pmu);
+	idxd->idxd_pmu = NULL;
+}
+
+static int perf_event_cpu_online(unsigned int cpu, struct hlist_node *node)
+{
+	struct idxd_pmu *idxd_pmu;
+
+	idxd_pmu = hlist_entry_safe(node, typeof(*idxd_pmu), cpuhp_node);
+
+	/* select the first online CPU as the designated reader */
+	if (cpumask_empty(&perfmon_dsa_cpu_mask)) {
+		cpumask_set_cpu(cpu, &perfmon_dsa_cpu_mask);
+		idxd_pmu->cpu = cpu;
+	}
+
+	return 0;
+}
+
+static int perf_event_cpu_offline(unsigned int cpu, struct hlist_node *node)
+{
+	struct idxd_pmu *idxd_pmu;
+	unsigned int target;
+
+	idxd_pmu = hlist_entry_safe(node, typeof(*idxd_pmu), cpuhp_node);
+
+	if (!cpumask_test_and_clear_cpu(cpu, &perfmon_dsa_cpu_mask))
+		return 0;
+
+	target = cpumask_any_but(cpu_online_mask, cpu);
+
+	/*
+	 * Migrate only to a valid target; -1 is not a usable dst_cpu.
+	 */
+	if (target < nr_cpu_ids) {
+		cpumask_set_cpu(target, &perfmon_dsa_cpu_mask);
+		perf_pmu_migrate_context(&idxd_pmu->pmu, cpu, target);
+	}
+
+	return 0;
+}
+
+/* Set up and register the perf PMU for @idxd; returns 0 or a negative errno. */
+int perfmon_pmu_init(struct idxd_device *idxd)
+{
+	union idxd_perfcap perfcap;
+	struct idxd_pmu *idxd_pmu;
+	int rc = -ENODEV;
+
+	/* perfmon module initialization failed, nothing to do */
+	if (!cpuhp_set_up)
+		return -ENODEV;
+
+	/*
+	 * If perfmon_offset or num_counters is 0, it means perfmon is
+	 * not supported on this hardware.
+	 */
+	if (idxd->perfmon_offset == 0)
+		return -ENODEV;
+
+	idxd_pmu = kzalloc(sizeof(*idxd_pmu), GFP_KERNEL);
+	if (!idxd_pmu)
+		return -ENOMEM;
+
+	idxd_pmu->idxd = idxd;
+	idxd->idxd_pmu = idxd_pmu;
+
+	if (idxd->data->type == IDXD_TYPE_DSA) {
+		rc = sprintf(idxd_pmu->name, "dsa%d", idxd->id);
+		if (rc < 0)
+			goto free;
+	} else if (idxd->data->type == IDXD_TYPE_IAX) {
+		rc = sprintf(idxd_pmu->name, "iax%d", idxd->id);
+		if (rc < 0)
+			goto free;
+	} else {
+		goto free;
+	}
+
+	/* sprintf() left the name length in rc; restore the error default */
+	rc = -ENODEV;
+
+	perfmon_reset(idxd);
+
+	perfcap.bits = ioread64(PERFCAP_REG(idxd));
+
+	/*
+	 * If total perf counter is 0, stop further registration.
+	 * This is necessary in order to support driver running on
+	 * guest which does not have pmon support.
+	 */
+	if (perfcap.num_perf_counter == 0)
+		goto free;
+
+	/* A counter width of 0 means it can't count */
+	if (perfcap.counter_width == 0)
+		goto free;
+
+	/* Overflow interrupt and counter freeze support must be available */
+	if (!perfcap.overflow_interrupt || !perfcap.counter_freeze)
+		goto free;
+
+	/* Number of event categories cannot be 0 */
+	if (perfcap.num_event_category == 0)
+		goto free;
+
+	/* We don't support per-counter capabilities for now. */
+	if (perfcap.cap_per_counter)
+		goto free;
+
+	idxd_pmu->n_event_categories = perfcap.num_event_category;
+	idxd_pmu->supported_event_categories = perfcap.global_event_category;
+	idxd_pmu->per_counter_caps_supported = perfcap.cap_per_counter;
+
+	/* check filter capability.  If 0, then filters are not supported */
+	idxd_pmu->supported_filters = perfcap.filter;
+	if (perfcap.filter)
+		idxd_pmu->n_filters = hweight8(perfcap.filter);
+
+	/* Store the total number of counters and the counter width */
+	idxd_pmu->n_counters = perfcap.num_perf_counter;
+	idxd_pmu->counter_width = perfcap.counter_width;
+
+	idxd_pmu_init(idxd_pmu);
+
+	rc = perf_pmu_register(&idxd_pmu->pmu, idxd_pmu->name, -1);
+	if (rc)
+		goto free;
+
+	rc = cpuhp_state_add_instance(cpuhp_slot, &idxd_pmu->cpuhp_node);
+	if (rc) {
+		perf_pmu_unregister(&idxd->idxd_pmu->pmu);
+		goto free;
+	}
+out:
+	return rc;
+free:
+	kfree(idxd_pmu);
+	idxd->idxd_pmu = NULL;
+
+	goto out;
+}
+
+void __init perfmon_init(void)
+{
+	int rc = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
+					 "driver/dma/idxd/perf:online",
+					 perf_event_cpu_online,
+					 perf_event_cpu_offline);
+	if (WARN_ON(rc < 0))
+		return;
+
+	cpuhp_slot = rc;
+	cpuhp_set_up = true;
+}
+
+void __exit perfmon_exit(void)
+{
+	if (cpuhp_set_up)
+		cpuhp_remove_multi_state(cpuhp_slot);
+}
diff --git a/drivers/dma/idxd/perfmon.h b/drivers/dma/idxd/perfmon.h
new file mode 100644
index 000000000000..9a081a1bc605
--- /dev/null
+++ b/drivers/dma/idxd/perfmon.h
@@ -0,0 +1,119 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2020 Intel Corporation. All rights rsvd. */
+
+#ifndef _PERFMON_H_
+#define _PERFMON_H_
+
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/sbitmap.h>
+#include <linux/dmaengine.h>
+#include <linux/percpu-rwsem.h>
+#include <linux/wait.h>
+#include <linux/cdev.h>
+#include <linux/uuid.h>
+#include <linux/idxd.h>
+#include <linux/perf_event.h>
+#include "registers.h"
+
+static inline struct idxd_pmu *event_to_pmu(struct perf_event *event)
+{
+	struct idxd_pmu *idxd_pmu;
+	struct pmu *pmu;
+
+	pmu = event->pmu;
+	idxd_pmu = container_of(pmu, struct idxd_pmu, pmu);
+
+	return idxd_pmu;
+}
+
+static inline struct idxd_device *event_to_idxd(struct perf_event *event)
+{
+	struct idxd_pmu *idxd_pmu;
+	struct pmu *pmu;
+
+	pmu = event->pmu;
+	idxd_pmu = container_of(pmu, struct idxd_pmu, pmu);
+
+	return idxd_pmu->idxd;
+}
+
+static inline struct idxd_device *pmu_to_idxd(struct pmu *pmu)
+{
+	struct idxd_pmu *idxd_pmu;
+
+	idxd_pmu = container_of(pmu, struct idxd_pmu, pmu);
+
+	return idxd_pmu->idxd;
+}
+
+enum dsa_perf_events {
+	DSA_PERF_EVENT_WQ = 0,
+	DSA_PERF_EVENT_ENGINE,
+	DSA_PERF_EVENT_ADDR_TRANS,
+	DSA_PERF_EVENT_OP,
+	DSA_PERF_EVENT_COMPL,
+	DSA_PERF_EVENT_MAX,
+};
+
+enum filter_enc {
+	FLT_WQ = 0,
+	FLT_TC,
+	FLT_PG_SZ,
+	FLT_XFER_SZ,
+	FLT_ENG,
+	FLT_MAX,
+};
+
+#define CONFIG_RESET		0x0000000000000001
+#define CNTR_RESET		0x0000000000000002
+#define CNTR_ENABLE		0x0000000000000001
+#define INTR_OVFL		0x0000000000000002
+
+#define COUNTER_FREEZE		0x00000000FFFFFFFF
+#define COUNTER_UNFREEZE	0x0000000000000000
+#define OVERFLOW_SIZE		32
+
+#define CNTRCFG_ENABLE		BIT(0)
+#define CNTRCFG_IRQ_OVERFLOW	BIT(1)
+#define CNTRCFG_CATEGORY_SHIFT	8
+#define CNTRCFG_EVENT_SHIFT	32
+
+#define PERFMON_TABLE_OFFSET(_idxd)				\
+({								\
+	typeof(_idxd) __idxd = (_idxd);				\
+	((__idxd)->reg_base + (__idxd)->perfmon_offset);	\
+})
+#define PERFMON_REG_OFFSET(idxd, offset)			\
+	(PERFMON_TABLE_OFFSET(idxd) + (offset))
+
+#define PERFCAP_REG(idxd)	(PERFMON_REG_OFFSET(idxd, IDXD_PERFCAP_OFFSET))
+#define PERFRST_REG(idxd)	(PERFMON_REG_OFFSET(idxd, IDXD_PERFRST_OFFSET))
+#define OVFSTATUS_REG(idxd)	(PERFMON_REG_OFFSET(idxd, IDXD_OVFSTATUS_OFFSET))
+#define PERFFRZ_REG(idxd)	(PERFMON_REG_OFFSET(idxd, IDXD_PERFFRZ_OFFSET))
+
+#define FLTCFG_REG(idxd, cntr, flt)				\
+	(PERFMON_REG_OFFSET(idxd, IDXD_FLTCFG_OFFSET) +	((cntr) * 32) + ((flt) * 4))
+
+#define CNTRCFG_REG(idxd, cntr)					\
+	(PERFMON_REG_OFFSET(idxd, IDXD_CNTRCFG_OFFSET) + ((cntr) * 8))
+#define CNTRDATA_REG(idxd, cntr)					\
+	(PERFMON_REG_OFFSET(idxd, IDXD_CNTRDATA_OFFSET) + ((cntr) * 8))
+#define CNTRCAP_REG(idxd, cntr)					\
+	(PERFMON_REG_OFFSET(idxd, IDXD_CNTRCAP_OFFSET) + ((cntr) * 8))
+
+#define EVNTCAP_REG(idxd, category) \
+	(PERFMON_REG_OFFSET(idxd, IDXD_EVNTCAP_OFFSET) + ((category) * 8))
+
+#define DEFINE_PERFMON_FORMAT_ATTR(_name, _format)			\
+static ssize_t __perfmon_idxd_##_name##_show(struct kobject *kobj,	\
+				struct kobj_attribute *attr,		\
+				char *page)				\
+{									\
+	BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);			\
+	return sprintf(page, _format "\n");				\
+}									\
+static struct kobj_attribute format_attr_idxd_##_name =			\
+	__ATTR(_name, 0444, __perfmon_idxd_##_name##_show, NULL)
+
+#endif
diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h
index 6c11375cc56a..c970c3f025f0 100644
--- a/drivers/dma/idxd/registers.h
+++ b/drivers/dma/idxd/registers.h
@@ -386,4 +386,112 @@ union wqcfg {
 #define GRPENGCFG_OFFSET(idxd_dev, n) ((idxd_dev)->grpcfg_offset + (n) * GRPCFG_SIZE + 32)
 #define GRPFLGCFG_OFFSET(idxd_dev, n) ((idxd_dev)->grpcfg_offset + (n) * GRPCFG_SIZE + 40)
 
+/* Following is performance monitor registers */
+#define IDXD_PERFCAP_OFFSET		0x0
+union idxd_perfcap {
+	struct {
+		u64 num_perf_counter:6;
+		u64 rsvd1:2;
+		u64 counter_width:8;
+		u64 num_event_category:4;
+		u64 global_event_category:16;
+		u64 filter:8;
+		u64 rsvd2:8;
+		u64 cap_per_counter:1;
+		u64 writeable_counter:1;
+		u64 counter_freeze:1;
+		u64 overflow_interrupt:1;
+		u64 rsvd3:8;
+	};
+	u64 bits;
+} __packed;
+
+#define IDXD_EVNTCAP_OFFSET		0x80
+union idxd_evntcap {
+	struct {
+		u64 events:28;
+		u64 rsvd:36;
+	};
+	u64 bits;
+} __packed;
+
+struct idxd_event {
+	union {
+		struct {
+			u32 event_category:4;
+			u32 events:28;
+		};
+		u32 val;
+	};
+} __packed;
+
+#define IDXD_CNTRCAP_OFFSET		0x800
+struct idxd_cntrcap {
+	union {
+		struct {
+			u32 counter_width:8;
+			u32 rsvd:20;
+			u32 num_events:4;
+		};
+		u32 val;
+	};
+	struct idxd_event events[];
+} __packed;
+
+#define IDXD_PERFRST_OFFSET		0x10
+union idxd_perfrst {
+	struct {
+		u32 perfrst_config:1;
+		u32 perfrst_counter:1;
+		u32 rsvd:30;
+	};
+	u32 val;
+} __packed;
+
+#define IDXD_OVFSTATUS_OFFSET		0x30
+#define IDXD_PERFFRZ_OFFSET		0x20
+#define IDXD_CNTRCFG_OFFSET		0x100
+union idxd_cntrcfg {
+	struct {
+		u64 enable:1;
+		u64 interrupt_ovf:1;
+		u64 global_freeze_ovf:1;
+		u64 rsvd1:5;
+		u64 event_category:4;
+		u64 rsvd2:20;
+		u64 events:28;
+		u64 rsvd3:4;
+	};
+	u64 val;
+} __packed;
+
+#define IDXD_FLTCFG_OFFSET		0x300
+
+#define IDXD_CNTRDATA_OFFSET		0x200
+union idxd_cntrdata {
+	struct {
+		u64 event_count_value;
+	};
+	u64 val;
+} __packed;
+
+union event_cfg {
+	struct {
+		u64 event_cat:4;
+		u64 event_enc:28;
+	};
+	u64 val;
+} __packed;
+
+union filter_cfg {
+	struct {
+		u64 wq:32;
+		u64 tc:8;
+		u64 pg_sz:4;
+		u64 xfer_sz:8;
+		u64 eng:8;
+	};
+	u64 val;
+} __packed;
+
 #endif
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index f14adb882338..264d911424c0 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -167,6 +167,7 @@ enum cpuhp_state {
 	CPUHP_AP_PERF_X86_RAPL_ONLINE,
 	CPUHP_AP_PERF_X86_CQM_ONLINE,
 	CPUHP_AP_PERF_X86_CSTATE_ONLINE,
+	CPUHP_AP_PERF_X86_IDXD_ONLINE,
 	CPUHP_AP_PERF_S390_CF_ONLINE,
 	CPUHP_AP_PERF_S390_CFD_ONLINE,
 	CPUHP_AP_PERF_S390_SF_ONLINE,
-- 
cgit v1.2.3


From 4c95e0728eee33df6b029a5fca82a67daeca201e Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 21 Apr 2021 09:50:59 +0200
Subject: netfilter: ebtables: remove the 3 ebtables pointers from struct net

ebtables stores the table internal data (what gets passed to the
ebt_do_table() interpreter) in struct net.

nftables keeps the internal interpreter format in pernet lists
and passes it via the netfilter core infrastructure (priv pointer).

Do the same for ebtables: the nf_hook_ops are duplicated via kmemdup,
then the ops->priv pointer is set to the table that is being registered.

After that, the netfilter core passes this table info to the hookfn.

This allows removing the pointers from struct net.

Same pattern can be applied to ip/ip6/arptables.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_bridge/ebtables.h |  9 +++----
 include/net/netns/x_tables.h              |  8 ------
 net/bridge/netfilter/ebtable_broute.c     | 10 +++-----
 net/bridge/netfilter/ebtable_filter.c     | 26 +++++++------------
 net/bridge/netfilter/ebtable_nat.c        | 27 +++++++-------------
 net/bridge/netfilter/ebtables.c           | 42 +++++++++++++++++++++++--------
 6 files changed, 58 insertions(+), 64 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h
index 3a956145a25c..a8178253ce53 100644
--- a/include/linux/netfilter_bridge/ebtables.h
+++ b/include/linux/netfilter_bridge/ebtables.h
@@ -100,6 +100,7 @@ struct ebt_table {
 	   unsigned int valid_hooks);
 	/* the data used by the kernel */
 	struct ebt_table_info *private;
+	struct nf_hook_ops *ops;
 	struct module *me;
 };
 
@@ -108,11 +109,9 @@ struct ebt_table {
 
 extern int ebt_register_table(struct net *net,
 			      const struct ebt_table *table,
-			      const struct nf_hook_ops *ops,
-			      struct ebt_table **res);
-extern void ebt_unregister_table(struct net *net, struct ebt_table *table);
-void ebt_unregister_table_pre_exit(struct net *net, const char *tablename,
-				   const struct nf_hook_ops *ops);
+			      const struct nf_hook_ops *ops);
+extern void ebt_unregister_table(struct net *net, const char *tablename);
+void ebt_unregister_table_pre_exit(struct net *net, const char *tablename);
 extern unsigned int ebt_do_table(struct sk_buff *skb,
 				 const struct nf_hook_state *state,
 				 struct ebt_table *table);
diff --git a/include/net/netns/x_tables.h b/include/net/netns/x_tables.h
index 83c8ea2e87a6..d02316ec2906 100644
--- a/include/net/netns/x_tables.h
+++ b/include/net/netns/x_tables.h
@@ -5,16 +5,8 @@
 #include <linux/list.h>
 #include <linux/netfilter_defs.h>
 
-struct ebt_table;
-
 struct netns_xt {
 	bool notrack_deprecated_warning;
 	bool clusterip_deprecated_warning;
-#if defined(CONFIG_BRIDGE_NF_EBTABLES) || \
-    defined(CONFIG_BRIDGE_NF_EBTABLES_MODULE)
-	struct ebt_table *broute_table;
-	struct ebt_table *frame_filter;
-	struct ebt_table *frame_nat;
-#endif
 };
 #endif
diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c
index 32bc2821027f..020b1487ee0c 100644
--- a/net/bridge/netfilter/ebtable_broute.c
+++ b/net/bridge/netfilter/ebtable_broute.c
@@ -66,8 +66,7 @@ static unsigned int ebt_broute(void *priv, struct sk_buff *skb,
 			   NFPROTO_BRIDGE, s->in, NULL, NULL,
 			   s->net, NULL);
 
-	ret = ebt_do_table(skb, &state, state.net->xt.broute_table);
-
+	ret = ebt_do_table(skb, &state, priv);
 	if (ret != NF_DROP)
 		return ret;
 
@@ -101,18 +100,17 @@ static const struct nf_hook_ops ebt_ops_broute = {
 
 static int __net_init broute_net_init(struct net *net)
 {
-	return ebt_register_table(net, &broute_table, &ebt_ops_broute,
-				  &net->xt.broute_table);
+	return ebt_register_table(net, &broute_table, &ebt_ops_broute);
 }
 
 static void __net_exit broute_net_pre_exit(struct net *net)
 {
-	ebt_unregister_table_pre_exit(net, "broute", &ebt_ops_broute);
+	ebt_unregister_table_pre_exit(net, "broute");
 }
 
 static void __net_exit broute_net_exit(struct net *net)
 {
-	ebt_unregister_table(net, net->xt.broute_table);
+	ebt_unregister_table(net, "broute");
 }
 
 static struct pernet_operations broute_net_ops = {
diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c
index bcf982e12f16..8ec0b3736803 100644
--- a/net/bridge/netfilter/ebtable_filter.c
+++ b/net/bridge/netfilter/ebtable_filter.c
@@ -59,34 +59,27 @@ static const struct ebt_table frame_filter = {
 };
 
 static unsigned int
-ebt_in_hook(void *priv, struct sk_buff *skb,
-	    const struct nf_hook_state *state)
+ebt_filter_hook(void *priv, struct sk_buff *skb,
+		const struct nf_hook_state *state)
 {
-	return ebt_do_table(skb, state, state->net->xt.frame_filter);
-}
-
-static unsigned int
-ebt_out_hook(void *priv, struct sk_buff *skb,
-	     const struct nf_hook_state *state)
-{
-	return ebt_do_table(skb, state, state->net->xt.frame_filter);
+	return ebt_do_table(skb, state, priv);
 }
 
 static const struct nf_hook_ops ebt_ops_filter[] = {
 	{
-		.hook		= ebt_in_hook,
+		.hook		= ebt_filter_hook,
 		.pf		= NFPROTO_BRIDGE,
 		.hooknum	= NF_BR_LOCAL_IN,
 		.priority	= NF_BR_PRI_FILTER_BRIDGED,
 	},
 	{
-		.hook		= ebt_in_hook,
+		.hook		= ebt_filter_hook,
 		.pf		= NFPROTO_BRIDGE,
 		.hooknum	= NF_BR_FORWARD,
 		.priority	= NF_BR_PRI_FILTER_BRIDGED,
 	},
 	{
-		.hook		= ebt_out_hook,
+		.hook		= ebt_filter_hook,
 		.pf		= NFPROTO_BRIDGE,
 		.hooknum	= NF_BR_LOCAL_OUT,
 		.priority	= NF_BR_PRI_FILTER_OTHER,
@@ -95,18 +88,17 @@ static const struct nf_hook_ops ebt_ops_filter[] = {
 
 static int __net_init frame_filter_net_init(struct net *net)
 {
-	return ebt_register_table(net, &frame_filter, ebt_ops_filter,
-				  &net->xt.frame_filter);
+	return ebt_register_table(net, &frame_filter, ebt_ops_filter);
 }
 
 static void __net_exit frame_filter_net_pre_exit(struct net *net)
 {
-	ebt_unregister_table_pre_exit(net, "filter", ebt_ops_filter);
+	ebt_unregister_table_pre_exit(net, "filter");
 }
 
 static void __net_exit frame_filter_net_exit(struct net *net)
 {
-	ebt_unregister_table(net, net->xt.frame_filter);
+	ebt_unregister_table(net, "filter");
 }
 
 static struct pernet_operations frame_filter_net_ops = {
diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c
index 0d092773f816..7c8a1064a531 100644
--- a/net/bridge/netfilter/ebtable_nat.c
+++ b/net/bridge/netfilter/ebtable_nat.c
@@ -58,35 +58,27 @@ static const struct ebt_table frame_nat = {
 	.me		= THIS_MODULE,
 };
 
-static unsigned int
-ebt_nat_in(void *priv, struct sk_buff *skb,
-	   const struct nf_hook_state *state)
+static unsigned int ebt_nat_hook(void *priv, struct sk_buff *skb,
+				 const struct nf_hook_state *state)
 {
-	return ebt_do_table(skb, state, state->net->xt.frame_nat);
-}
-
-static unsigned int
-ebt_nat_out(void *priv, struct sk_buff *skb,
-	    const struct nf_hook_state *state)
-{
-	return ebt_do_table(skb, state, state->net->xt.frame_nat);
+	return ebt_do_table(skb, state, priv);
 }
 
 static const struct nf_hook_ops ebt_ops_nat[] = {
 	{
-		.hook		= ebt_nat_out,
+		.hook		= ebt_nat_hook,
 		.pf		= NFPROTO_BRIDGE,
 		.hooknum	= NF_BR_LOCAL_OUT,
 		.priority	= NF_BR_PRI_NAT_DST_OTHER,
 	},
 	{
-		.hook		= ebt_nat_out,
+		.hook		= ebt_nat_hook,
 		.pf		= NFPROTO_BRIDGE,
 		.hooknum	= NF_BR_POST_ROUTING,
 		.priority	= NF_BR_PRI_NAT_SRC,
 	},
 	{
-		.hook		= ebt_nat_in,
+		.hook		= ebt_nat_hook,
 		.pf		= NFPROTO_BRIDGE,
 		.hooknum	= NF_BR_PRE_ROUTING,
 		.priority	= NF_BR_PRI_NAT_DST_BRIDGED,
@@ -95,18 +87,17 @@ static const struct nf_hook_ops ebt_ops_nat[] = {
 
 static int __net_init frame_nat_net_init(struct net *net)
 {
-	return ebt_register_table(net, &frame_nat, ebt_ops_nat,
-				  &net->xt.frame_nat);
+	return ebt_register_table(net, &frame_nat, ebt_ops_nat);
 }
 
 static void __net_exit frame_nat_net_pre_exit(struct net *net)
 {
-	ebt_unregister_table_pre_exit(net, "nat", ebt_ops_nat);
+	ebt_unregister_table_pre_exit(net, "nat");
 }
 
 static void __net_exit frame_nat_net_exit(struct net *net)
 {
-	ebt_unregister_table(net, net->xt.frame_nat);
+	ebt_unregister_table(net, "nat");
 }
 
 static struct pernet_operations frame_nat_net_ops = {
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 96d789c8d1c7..a04596bb2a6e 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -1136,15 +1136,18 @@ static void __ebt_unregister_table(struct net *net, struct ebt_table *table)
 	vfree(table->private->entries);
 	ebt_free_table_info(table->private);
 	vfree(table->private);
+	kfree(table->ops);
 	kfree(table);
 }
 
 int ebt_register_table(struct net *net, const struct ebt_table *input_table,
-		       const struct nf_hook_ops *ops, struct ebt_table **res)
+		       const struct nf_hook_ops *template_ops)
 {
 	struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id);
 	struct ebt_table_info *newinfo;
 	struct ebt_table *t, *table;
+	struct nf_hook_ops *ops;
+	unsigned int num_ops;
 	struct ebt_replace_kernel *repl;
 	int ret, i, countersize;
 	void *p;
@@ -1213,15 +1216,31 @@ int ebt_register_table(struct net *net, const struct ebt_table *input_table,
 		ret = -ENOENT;
 		goto free_unlock;
 	}
+
+	num_ops = hweight32(table->valid_hooks);
+	if (num_ops == 0) {
+		ret = -EINVAL;
+		goto free_unlock;
+	}
+
+	ops = kmemdup(template_ops, sizeof(*ops) * num_ops, GFP_KERNEL);
+	if (!ops) {
+		ret = -ENOMEM;
+		if (newinfo->nentries)
+			module_put(table->me);
+		goto free_unlock;
+	}
+
+	for (i = 0; i < num_ops; i++)
+		ops[i].priv = table;
+
 	list_add(&table->list, &ebt_net->tables);
 	mutex_unlock(&ebt_mutex);
 
-	WRITE_ONCE(*res, table);
-	ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));
-	if (ret) {
+	table->ops = ops;
+	ret = nf_register_net_hooks(net, ops, num_ops);
+	if (ret)
 		__ebt_unregister_table(net, table);
-		*res = NULL;
-	}
 
 	audit_log_nfcfg(repl->name, AF_BRIDGE, repl->nentries,
 			AUDIT_XT_OP_REGISTER, GFP_KERNEL);
@@ -1257,18 +1276,21 @@ static struct ebt_table *__ebt_find_table(struct net *net, const char *name)
 	return NULL;
 }
 
-void ebt_unregister_table_pre_exit(struct net *net, const char *name, const struct nf_hook_ops *ops)
+void ebt_unregister_table_pre_exit(struct net *net, const char *name)
 {
 	struct ebt_table *table = __ebt_find_table(net, name);
 
 	if (table)
-		nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
+		nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks));
 }
 EXPORT_SYMBOL(ebt_unregister_table_pre_exit);
 
-void ebt_unregister_table(struct net *net, struct ebt_table *table)
+void ebt_unregister_table(struct net *net, const char *name)
 {
-	__ebt_unregister_table(net, table);
+	struct ebt_table *table = __ebt_find_table(net, name);
+
+	if (table)
+		__ebt_unregister_table(net, table);
 }
 
 /* userspace just supplied us with counters */
-- 
cgit v1.2.3


From 7716bf090e97aec45e97907ec6a382e4610bdd8f Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 21 Apr 2021 09:51:00 +0200
Subject: netfilter: x_tables: remove ipt_unregister_table

It's the same function as ipt_unregister_table_exit.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_ipv4/ip_tables.h  | 3 ---
 include/linux/netfilter_ipv6/ip6_tables.h | 2 --
 net/ipv4/netfilter/ip_tables.c            | 9 ---------
 net/ipv4/netfilter/iptable_nat.c          | 2 +-
 net/ipv6/netfilter/ip6_tables.c           | 9 ---------
 net/ipv6/netfilter/ip6table_nat.c         | 2 +-
 6 files changed, 2 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv4/ip_tables.h b/include/linux/netfilter_ipv4/ip_tables.h
index c4676d6feeff..9f440eb6cf6c 100644
--- a/include/linux/netfilter_ipv4/ip_tables.h
+++ b/include/linux/netfilter_ipv4/ip_tables.h
@@ -31,9 +31,6 @@ void ipt_unregister_table_pre_exit(struct net *net, struct xt_table *table,
 
 void ipt_unregister_table_exit(struct net *net, struct xt_table *table);
 
-void ipt_unregister_table(struct net *net, struct xt_table *table,
-			  const struct nf_hook_ops *ops);
-
 /* Standard entry. */
 struct ipt_standard {
 	struct ipt_entry entry;
diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h
index 1547d5f9ae06..b88a27ce61b0 100644
--- a/include/linux/netfilter_ipv6/ip6_tables.h
+++ b/include/linux/netfilter_ipv6/ip6_tables.h
@@ -27,8 +27,6 @@ extern void *ip6t_alloc_initial_table(const struct xt_table *);
 int ip6t_register_table(struct net *net, const struct xt_table *table,
 			const struct ip6t_replace *repl,
 			const struct nf_hook_ops *ops, struct xt_table **res);
-void ip6t_unregister_table(struct net *net, struct xt_table *table,
-			   const struct nf_hook_ops *ops);
 void ip6t_unregister_table_pre_exit(struct net *net, struct xt_table *table,
 				    const struct nf_hook_ops *ops);
 void ip6t_unregister_table_exit(struct net *net, struct xt_table *table);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index f77ea0dbe656..2fa7f28b88e3 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1770,14 +1770,6 @@ void ipt_unregister_table_exit(struct net *net, struct xt_table *table)
 	__ipt_unregister_table(net, table);
 }
 
-void ipt_unregister_table(struct net *net, struct xt_table *table,
-			  const struct nf_hook_ops *ops)
-{
-	if (ops)
-		ipt_unregister_table_pre_exit(net, table, ops);
-	__ipt_unregister_table(net, table);
-}
-
 /* Returns 1 if the type and code is matched by the range, 0 otherwise */
 static inline bool
 icmp_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code,
@@ -1924,7 +1916,6 @@ static void __exit ip_tables_fini(void)
 }
 
 EXPORT_SYMBOL(ipt_register_table);
-EXPORT_SYMBOL(ipt_unregister_table);
 EXPORT_SYMBOL(ipt_unregister_table_pre_exit);
 EXPORT_SYMBOL(ipt_unregister_table_exit);
 EXPORT_SYMBOL(ipt_do_table);
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index b0143b109f25..a89c1b9f94c2 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -105,7 +105,7 @@ static int __net_init iptable_nat_table_init(struct net *net)
 
 	ret = ipt_nat_register_lookups(net);
 	if (ret < 0) {
-		ipt_unregister_table(net, net->ipv4.nat_table, NULL);
+		ipt_unregister_table_exit(net, net->ipv4.nat_table);
 		net->ipv4.nat_table = NULL;
 	}
 
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index eb2b5404806c..e605c28cfed5 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1780,14 +1780,6 @@ void ip6t_unregister_table_exit(struct net *net, struct xt_table *table)
 	__ip6t_unregister_table(net, table);
 }
 
-void ip6t_unregister_table(struct net *net, struct xt_table *table,
-			   const struct nf_hook_ops *ops)
-{
-	if (ops)
-		ip6t_unregister_table_pre_exit(net, table, ops);
-	__ip6t_unregister_table(net, table);
-}
-
 /* Returns 1 if the type and code is matched by the range, 0 otherwise */
 static inline bool
 icmp6_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code,
@@ -1935,7 +1927,6 @@ static void __exit ip6_tables_fini(void)
 }
 
 EXPORT_SYMBOL(ip6t_register_table);
-EXPORT_SYMBOL(ip6t_unregister_table);
 EXPORT_SYMBOL(ip6t_unregister_table_pre_exit);
 EXPORT_SYMBOL(ip6t_unregister_table_exit);
 EXPORT_SYMBOL(ip6t_do_table);
diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c
index 0a23265e3caa..4cef1b405074 100644
--- a/net/ipv6/netfilter/ip6table_nat.c
+++ b/net/ipv6/netfilter/ip6table_nat.c
@@ -107,7 +107,7 @@ static int __net_init ip6table_nat_table_init(struct net *net)
 
 	ret = ip6t_nat_register_lookups(net);
 	if (ret < 0) {
-		ip6t_unregister_table(net, net->ipv6.ip6table_nat, NULL);
+		ip6t_unregister_table_exit(net, net->ipv6.ip6table_nat);
 		net->ipv6.ip6table_nat = NULL;
 	}
 	kfree(repl);
-- 
cgit v1.2.3


From 1ef4d6d1af2d0c0c7c9b391365a3894bea291e34 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 21 Apr 2021 09:51:01 +0200
Subject: netfilter: x_tables: add xt_find_table

This will be used to obtain the xt_table struct given address family and
table name.

Followup patches will reduce the number of direct accesses to the xt_table
structures via net->ipv{4,6}.ip(6)table_{nat,mangle,...} pointers, then
remove them.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h |  1 +
 net/netfilter/x_tables.c           | 17 +++++++++++++++++
 2 files changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 8ec48466410a..b2eec7de5280 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -322,6 +322,7 @@ struct xt_target *xt_request_find_target(u8 af, const char *name, u8 revision);
 int xt_find_revision(u8 af, const char *name, u8 revision, int target,
 		     int *err);
 
+struct xt_table *xt_find_table(struct net *net, u8 af, const char *name);
 struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
 				    const char *name);
 struct xt_table *xt_request_find_table_lock(struct net *net, u_int8_t af,
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index b7f8d2ed3cc2..1caba9507228 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -1199,6 +1199,23 @@ void xt_free_table_info(struct xt_table_info *info)
 }
 EXPORT_SYMBOL(xt_free_table_info);
 
+struct xt_table *xt_find_table(struct net *net, u8 af, const char *name)
+{
+	struct xt_pernet *xt_net = net_generic(net, xt_pernet_id);
+	struct xt_table *t;
+
+	mutex_lock(&xt[af].mutex);
+	list_for_each_entry(t, &xt_net->tables[af], list) {
+		if (strcmp(t->name, name) == 0) {
+			mutex_unlock(&xt[af].mutex);
+			return t;
+		}
+	}
+	mutex_unlock(&xt[af].mutex);
+	return NULL;
+}
+EXPORT_SYMBOL(xt_find_table);
+
 /* Find table by name, grabs mutex & ref.  Returns ERR_PTR on error. */
 struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
 				    const char *name)
-- 
cgit v1.2.3


From 20a9df33594fe643f9cf46375a9243e3ab8ed3a6 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 21 Apr 2021 09:51:02 +0200
Subject: netfilter: iptables: unregister the tables by name

xtables stores the xt_table structs in the struct net.  This isn't
needed anymore, the structures could be passed via the netfilter hook
'private' pointer to the hook functions, which would allow us to remove
those pointers from struct net.

As a first step, reduce the number of accesses to the
net->ipv4.iptable_{raw,filter,...} pointers.
This allows the tables to get unregistered by name instead of having to
pass the raw address.

The xt_table structure can be looked up by name+address family instead.

This patch is useless as-is (the backends still have the raw pointer
address), but it lowers the bar to remove those.

It also allows putting the 'was table registered in the first place' check
into ip_tables.c rather than having it in each table sub module.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_ipv4/ip_tables.h |  6 +++---
 net/ipv4/netfilter/ip_tables.c           | 14 ++++++++++----
 net/ipv4/netfilter/iptable_filter.c      |  8 ++------
 net/ipv4/netfilter/iptable_mangle.c      |  8 ++------
 net/ipv4/netfilter/iptable_nat.c         |  6 ++----
 net/ipv4/netfilter/iptable_raw.c         |  8 ++------
 net/ipv4/netfilter/iptable_security.c    |  8 ++------
 7 files changed, 23 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv4/ip_tables.h b/include/linux/netfilter_ipv4/ip_tables.h
index 9f440eb6cf6c..73bcf7f261d2 100644
--- a/include/linux/netfilter_ipv4/ip_tables.h
+++ b/include/linux/netfilter_ipv4/ip_tables.h
@@ -26,10 +26,10 @@ int ipt_register_table(struct net *net, const struct xt_table *table,
 		       const struct ipt_replace *repl,
 		       const struct nf_hook_ops *ops, struct xt_table **res);
 
-void ipt_unregister_table_pre_exit(struct net *net, struct xt_table *table,
-		       const struct nf_hook_ops *ops);
+void ipt_unregister_table_pre_exit(struct net *net, const char *name,
+				   const struct nf_hook_ops *ops);
 
-void ipt_unregister_table_exit(struct net *net, struct xt_table *table);
+void ipt_unregister_table_exit(struct net *net, const char *name);
 
 /* Standard entry. */
 struct ipt_standard {
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 2fa7f28b88e3..0b859ec2d3f8 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1759,15 +1759,21 @@ out_free:
 	return ret;
 }
 
-void ipt_unregister_table_pre_exit(struct net *net, struct xt_table *table,
+void ipt_unregister_table_pre_exit(struct net *net, const char *name,
 				   const struct nf_hook_ops *ops)
 {
-	nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
+	struct xt_table *table = xt_find_table(net, NFPROTO_IPV4, name);
+
+	if (table)
+		nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
 }
 
-void ipt_unregister_table_exit(struct net *net, struct xt_table *table)
+void ipt_unregister_table_exit(struct net *net, const char *name)
 {
-	__ipt_unregister_table(net, table);
+	struct xt_table *table = xt_find_table(net, NFPROTO_IPV4, name);
+
+	if (table)
+		__ipt_unregister_table(net, table);
 }
 
 /* Returns 1 if the type and code is matched by the range, 0 otherwise */
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index 8f7bc1ee7453..a39998c7977f 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -74,16 +74,12 @@ static int __net_init iptable_filter_net_init(struct net *net)
 
 static void __net_exit iptable_filter_net_pre_exit(struct net *net)
 {
-	if (net->ipv4.iptable_filter)
-		ipt_unregister_table_pre_exit(net, net->ipv4.iptable_filter,
-					      filter_ops);
+	ipt_unregister_table_pre_exit(net, "filter", filter_ops);
 }
 
 static void __net_exit iptable_filter_net_exit(struct net *net)
 {
-	if (!net->ipv4.iptable_filter)
-		return;
-	ipt_unregister_table_exit(net, net->ipv4.iptable_filter);
+	ipt_unregister_table_exit(net, "filter");
 	net->ipv4.iptable_filter = NULL;
 }
 
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index 833079589273..7d1713e22553 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -102,16 +102,12 @@ static int __net_init iptable_mangle_table_init(struct net *net)
 
 static void __net_exit iptable_mangle_net_pre_exit(struct net *net)
 {
-	if (net->ipv4.iptable_mangle)
-		ipt_unregister_table_pre_exit(net, net->ipv4.iptable_mangle,
-					      mangle_ops);
+	ipt_unregister_table_pre_exit(net, "mangle", mangle_ops);
 }
 
 static void __net_exit iptable_mangle_net_exit(struct net *net)
 {
-	if (!net->ipv4.iptable_mangle)
-		return;
-	ipt_unregister_table_exit(net, net->ipv4.iptable_mangle);
+	ipt_unregister_table_exit(net, "mangle");
 	net->ipv4.iptable_mangle = NULL;
 }
 
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index a89c1b9f94c2..16bf3009642e 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -105,7 +105,7 @@ static int __net_init iptable_nat_table_init(struct net *net)
 
 	ret = ipt_nat_register_lookups(net);
 	if (ret < 0) {
-		ipt_unregister_table_exit(net, net->ipv4.nat_table);
+		ipt_unregister_table_exit(net, "nat");
 		net->ipv4.nat_table = NULL;
 	}
 
@@ -121,9 +121,7 @@ static void __net_exit iptable_nat_net_pre_exit(struct net *net)
 
 static void __net_exit iptable_nat_net_exit(struct net *net)
 {
-	if (!net->ipv4.nat_table)
-		return;
-	ipt_unregister_table_exit(net, net->ipv4.nat_table);
+	ipt_unregister_table_exit(net, "nat");
 	net->ipv4.nat_table = NULL;
 }
 
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index 9abfe6bf2cb9..a1f556464b93 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -69,16 +69,12 @@ static int __net_init iptable_raw_table_init(struct net *net)
 
 static void __net_exit iptable_raw_net_pre_exit(struct net *net)
 {
-	if (net->ipv4.iptable_raw)
-		ipt_unregister_table_pre_exit(net, net->ipv4.iptable_raw,
-					      rawtable_ops);
+	ipt_unregister_table_pre_exit(net, "raw", rawtable_ops);
 }
 
 static void __net_exit iptable_raw_net_exit(struct net *net)
 {
-	if (!net->ipv4.iptable_raw)
-		return;
-	ipt_unregister_table_exit(net, net->ipv4.iptable_raw);
+	ipt_unregister_table_exit(net, "raw");
 	net->ipv4.iptable_raw = NULL;
 }
 
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index 415c1975d770..33eded4f9080 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -64,16 +64,12 @@ static int __net_init iptable_security_table_init(struct net *net)
 
 static void __net_exit iptable_security_net_pre_exit(struct net *net)
 {
-	if (net->ipv4.iptable_security)
-		ipt_unregister_table_pre_exit(net, net->ipv4.iptable_security,
-					      sectbl_ops);
+	ipt_unregister_table_pre_exit(net, "security", sectbl_ops);
 }
 
 static void __net_exit iptable_security_net_exit(struct net *net)
 {
-	if (!net->ipv4.iptable_security)
-		return;
-	ipt_unregister_table_exit(net, net->ipv4.iptable_security);
+	ipt_unregister_table_exit(net, "security");
 	net->ipv4.iptable_security = NULL;
 }
 
-- 
cgit v1.2.3


From 6c0717545f2ca61c95f5f739da845e77cc8bd498 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 21 Apr 2021 09:51:03 +0200
Subject: netfilter: ip6tables: unregister the tables by name

Same as the previous patch, but for ip6tables.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_ipv6/ip6_tables.h |  4 ++--
 net/ipv6/netfilter/ip6_tables.c           | 14 ++++++++++----
 net/ipv6/netfilter/ip6table_filter.c      |  9 +++------
 net/ipv6/netfilter/ip6table_mangle.c      |  9 ++-------
 net/ipv6/netfilter/ip6table_nat.c         |  6 ++----
 net/ipv6/netfilter/ip6table_raw.c         |  9 +++------
 net/ipv6/netfilter/ip6table_security.c    |  8 ++------
 7 files changed, 24 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h
index b88a27ce61b0..8c07426e18a8 100644
--- a/include/linux/netfilter_ipv6/ip6_tables.h
+++ b/include/linux/netfilter_ipv6/ip6_tables.h
@@ -27,9 +27,9 @@ extern void *ip6t_alloc_initial_table(const struct xt_table *);
 int ip6t_register_table(struct net *net, const struct xt_table *table,
 			const struct ip6t_replace *repl,
 			const struct nf_hook_ops *ops, struct xt_table **res);
-void ip6t_unregister_table_pre_exit(struct net *net, struct xt_table *table,
+void ip6t_unregister_table_pre_exit(struct net *net, const char *name,
 				    const struct nf_hook_ops *ops);
-void ip6t_unregister_table_exit(struct net *net, struct xt_table *table);
+void ip6t_unregister_table_exit(struct net *net, const char *name);
 extern unsigned int ip6t_do_table(struct sk_buff *skb,
 				  const struct nf_hook_state *state,
 				  struct xt_table *table);
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index e605c28cfed5..11c80da12ee3 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1769,15 +1769,21 @@ out_free:
 	return ret;
 }
 
-void ip6t_unregister_table_pre_exit(struct net *net, struct xt_table *table,
+void ip6t_unregister_table_pre_exit(struct net *net, const char *name,
 				    const struct nf_hook_ops *ops)
 {
-	nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
+	struct xt_table *table = xt_find_table(net, NFPROTO_IPV6, name);
+
+	if (table)
+		nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
 }
 
-void ip6t_unregister_table_exit(struct net *net, struct xt_table *table)
+void ip6t_unregister_table_exit(struct net *net, const char *name)
 {
-	__ip6t_unregister_table(net, table);
+	struct xt_table *table = xt_find_table(net, NFPROTO_IPV6, name);
+
+	if (table)
+		__ip6t_unregister_table(net, table);
 }
 
 /* Returns 1 if the type and code is matched by the range, 0 otherwise */
diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c
index 88337b51ffbf..0c9f75e23ca0 100644
--- a/net/ipv6/netfilter/ip6table_filter.c
+++ b/net/ipv6/netfilter/ip6table_filter.c
@@ -75,16 +75,13 @@ static int __net_init ip6table_filter_net_init(struct net *net)
 
 static void __net_exit ip6table_filter_net_pre_exit(struct net *net)
 {
-	if (net->ipv6.ip6table_filter)
-		ip6t_unregister_table_pre_exit(net, net->ipv6.ip6table_filter,
-					       filter_ops);
+	ip6t_unregister_table_pre_exit(net, "filter",
+				       filter_ops);
 }
 
 static void __net_exit ip6table_filter_net_exit(struct net *net)
 {
-	if (!net->ipv6.ip6table_filter)
-		return;
-	ip6t_unregister_table_exit(net, net->ipv6.ip6table_filter);
+	ip6t_unregister_table_exit(net, "filter");
 	net->ipv6.ip6table_filter = NULL;
 }
 
diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c
index cee74803d7a1..9a2266662508 100644
--- a/net/ipv6/netfilter/ip6table_mangle.c
+++ b/net/ipv6/netfilter/ip6table_mangle.c
@@ -95,17 +95,12 @@ static int __net_init ip6table_mangle_table_init(struct net *net)
 
 static void __net_exit ip6table_mangle_net_pre_exit(struct net *net)
 {
-	if (net->ipv6.ip6table_mangle)
-		ip6t_unregister_table_pre_exit(net, net->ipv6.ip6table_mangle,
-					       mangle_ops);
+	ip6t_unregister_table_pre_exit(net, "mangle", mangle_ops);
 }
 
 static void __net_exit ip6table_mangle_net_exit(struct net *net)
 {
-	if (!net->ipv6.ip6table_mangle)
-		return;
-
-	ip6t_unregister_table_exit(net, net->ipv6.ip6table_mangle);
+	ip6t_unregister_table_exit(net, "mangle");
 	net->ipv6.ip6table_mangle = NULL;
 }
 
diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c
index 4cef1b405074..7eb61e6b1e52 100644
--- a/net/ipv6/netfilter/ip6table_nat.c
+++ b/net/ipv6/netfilter/ip6table_nat.c
@@ -107,7 +107,7 @@ static int __net_init ip6table_nat_table_init(struct net *net)
 
 	ret = ip6t_nat_register_lookups(net);
 	if (ret < 0) {
-		ip6t_unregister_table_exit(net, net->ipv6.ip6table_nat);
+		ip6t_unregister_table_exit(net, "nat");
 		net->ipv6.ip6table_nat = NULL;
 	}
 	kfree(repl);
@@ -122,9 +122,7 @@ static void __net_exit ip6table_nat_net_pre_exit(struct net *net)
 
 static void __net_exit ip6table_nat_net_exit(struct net *net)
 {
-	if (!net->ipv6.ip6table_nat)
-		return;
-	ip6t_unregister_table_exit(net, net->ipv6.ip6table_nat);
+	ip6t_unregister_table_exit(net, "nat");
 	net->ipv6.ip6table_nat = NULL;
 }
 
diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c
index 8f9e742226f7..c9a4aada40ba 100644
--- a/net/ipv6/netfilter/ip6table_raw.c
+++ b/net/ipv6/netfilter/ip6table_raw.c
@@ -68,16 +68,13 @@ static int __net_init ip6table_raw_table_init(struct net *net)
 
 static void __net_exit ip6table_raw_net_pre_exit(struct net *net)
 {
-	if (net->ipv6.ip6table_raw)
-		ip6t_unregister_table_pre_exit(net, net->ipv6.ip6table_raw,
-					       rawtable_ops);
+	ip6t_unregister_table_pre_exit(net, "raw",
+				       rawtable_ops);
 }
 
 static void __net_exit ip6table_raw_net_exit(struct net *net)
 {
-	if (!net->ipv6.ip6table_raw)
-		return;
-	ip6t_unregister_table_exit(net, net->ipv6.ip6table_raw);
+	ip6t_unregister_table_exit(net, "raw");
 	net->ipv6.ip6table_raw = NULL;
 }
 
diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c
index 5e8c48fed032..73067e08662f 100644
--- a/net/ipv6/netfilter/ip6table_security.c
+++ b/net/ipv6/netfilter/ip6table_security.c
@@ -63,16 +63,12 @@ static int __net_init ip6table_security_table_init(struct net *net)
 
 static void __net_exit ip6table_security_net_pre_exit(struct net *net)
 {
-	if (net->ipv6.ip6table_security)
-		ip6t_unregister_table_pre_exit(net, net->ipv6.ip6table_security,
-					       sectbl_ops);
+	ip6t_unregister_table_pre_exit(net, "security", sectbl_ops);
 }
 
 static void __net_exit ip6table_security_net_exit(struct net *net)
 {
-	if (!net->ipv6.ip6table_security)
-		return;
-	ip6t_unregister_table_exit(net, net->ipv6.ip6table_security);
+	ip6t_unregister_table_exit(net, "security");
 	net->ipv6.ip6table_security = NULL;
 }
 
-- 
cgit v1.2.3


From 4d705399191c3cfe1264588b3a4a8115e6c3b161 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 21 Apr 2021 09:51:04 +0200
Subject: netfilter: arptables: unregister the tables by name

and again, this time for arptables.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_arp/arp_tables.h |  4 ++--
 net/ipv4/netfilter/arp_tables.c          | 14 ++++++++++----
 net/ipv4/netfilter/arptable_filter.c     |  8 ++------
 3 files changed, 14 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_arp/arp_tables.h b/include/linux/netfilter_arp/arp_tables.h
index 26a13294318c..9ec73dcc8fd6 100644
--- a/include/linux/netfilter_arp/arp_tables.h
+++ b/include/linux/netfilter_arp/arp_tables.h
@@ -52,8 +52,8 @@ extern void *arpt_alloc_initial_table(const struct xt_table *);
 int arpt_register_table(struct net *net, const struct xt_table *table,
 			const struct arpt_replace *repl,
 			const struct nf_hook_ops *ops, struct xt_table **res);
-void arpt_unregister_table(struct net *net, struct xt_table *table);
-void arpt_unregister_table_pre_exit(struct net *net, struct xt_table *table,
+void arpt_unregister_table(struct net *net, const char *name);
+void arpt_unregister_table_pre_exit(struct net *net, const char *name,
 				    const struct nf_hook_ops *ops);
 extern unsigned int arpt_do_table(struct sk_buff *skb,
 				  const struct nf_hook_state *state,
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index d6d45d820d79..8a16b0dc5271 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -1541,16 +1541,22 @@ out_free:
 	return ret;
 }
 
-void arpt_unregister_table_pre_exit(struct net *net, struct xt_table *table,
+void arpt_unregister_table_pre_exit(struct net *net, const char *name,
 				    const struct nf_hook_ops *ops)
 {
-	nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
+	struct xt_table *table = xt_find_table(net, NFPROTO_ARP, name);
+
+	if (table)
+		nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
 }
 EXPORT_SYMBOL(arpt_unregister_table_pre_exit);
 
-void arpt_unregister_table(struct net *net, struct xt_table *table)
+void arpt_unregister_table(struct net *net, const char *name)
 {
-	__arpt_unregister_table(net, table);
+	struct xt_table *table = xt_find_table(net, NFPROTO_ARP, name);
+
+	if (table)
+		__arpt_unregister_table(net, table);
 }
 
 /* The built-in targets: standard (NULL) and error. */
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index 6c300ba5634e..c121e13dc78c 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -58,16 +58,12 @@ static int __net_init arptable_filter_table_init(struct net *net)
 
 static void __net_exit arptable_filter_net_pre_exit(struct net *net)
 {
-	if (net->ipv4.arptable_filter)
-		arpt_unregister_table_pre_exit(net, net->ipv4.arptable_filter,
-					       arpfilter_ops);
+	arpt_unregister_table_pre_exit(net, "filter", arpfilter_ops);
 }
 
 static void __net_exit arptable_filter_net_exit(struct net *net)
 {
-	if (!net->ipv4.arptable_filter)
-		return;
-	arpt_unregister_table(net, net->ipv4.arptable_filter);
+	arpt_unregister_table(net, "filter");
 	net->ipv4.arptable_filter = NULL;
 }
 
-- 
cgit v1.2.3


From ae689334225ff0e4ef112459ecd24aea932c2b00 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 21 Apr 2021 09:51:07 +0200
Subject: netfilter: ip_tables: pass table pointer via nf_hook_ops

iptable_x modules rely on 'struct net' to contain a pointer to the
table that should be evaluated.

In order to remove these pointers from struct net, pass them via
the 'priv' pointer in a similar fashion as nf_tables passes the
rule data.

To do that, duplicate the nf_hook_ops array passed in from the
iptable_x modules, update the ops->priv pointers of the copy to
refer to the table and then change the hookfn implementations to
just pass the 'priv' argument to the traverser.

After this patch, the xt_table pointers can already be removed
from struct net.

However, changes to struct net result in re-compile of the entire
network stack, so do the removal after arptables and ip6tables
have been converted as well.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h       |  3 ++
 include/linux/netfilter_ipv4/ip_tables.h |  6 ++--
 net/ipv4/netfilter/ip_tables.c           | 53 ++++++++++++++++++++++----------
 net/ipv4/netfilter/iptable_filter.c      |  8 ++---
 net/ipv4/netfilter/iptable_mangle.c      | 14 ++++-----
 net/ipv4/netfilter/iptable_nat.c         | 26 ++++++++--------
 net/ipv4/netfilter/iptable_raw.c         |  8 ++---
 net/ipv4/netfilter/iptable_security.c    |  8 ++---
 net/netfilter/x_tables.c                 |  1 +
 9 files changed, 71 insertions(+), 56 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index b2eec7de5280..a52cc22f806a 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -229,6 +229,9 @@ struct xt_table {
 	/* Man behind the curtain... */
 	struct xt_table_info *private;
 
+	/* hook ops that register the table with the netfilter core */
+	struct nf_hook_ops *ops;
+
 	/* Set this to THIS_MODULE if you are a module, otherwise NULL */
 	struct module *me;
 
diff --git a/include/linux/netfilter_ipv4/ip_tables.h b/include/linux/netfilter_ipv4/ip_tables.h
index 73bcf7f261d2..0fdab3246ef5 100644
--- a/include/linux/netfilter_ipv4/ip_tables.h
+++ b/include/linux/netfilter_ipv4/ip_tables.h
@@ -24,11 +24,9 @@
 
 int ipt_register_table(struct net *net, const struct xt_table *table,
 		       const struct ipt_replace *repl,
-		       const struct nf_hook_ops *ops, struct xt_table **res);
-
-void ipt_unregister_table_pre_exit(struct net *net, const char *name,
-				   const struct nf_hook_ops *ops);
+		       const struct nf_hook_ops *ops);
 
+void ipt_unregister_table_pre_exit(struct net *net, const char *name);
 void ipt_unregister_table_exit(struct net *net, const char *name);
 
 /* Standard entry. */
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 0b859ec2d3f8..d6caaed5dd45 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1716,9 +1716,11 @@ static void __ipt_unregister_table(struct net *net, struct xt_table *table)
 
 int ipt_register_table(struct net *net, const struct xt_table *table,
 		       const struct ipt_replace *repl,
-		       const struct nf_hook_ops *ops, struct xt_table **res)
+		       const struct nf_hook_ops *template_ops)
 {
-	int ret;
+	struct nf_hook_ops *ops;
+	unsigned int num_ops;
+	int ret, i;
 	struct xt_table_info *newinfo;
 	struct xt_table_info bootstrap = {0};
 	void *loc_cpu_entry;
@@ -1732,40 +1734,57 @@ int ipt_register_table(struct net *net, const struct xt_table *table,
 	memcpy(loc_cpu_entry, repl->entries, repl->size);
 
 	ret = translate_table(net, newinfo, loc_cpu_entry, repl);
-	if (ret != 0)
-		goto out_free;
+	if (ret != 0) {
+		xt_free_table_info(newinfo);
+		return ret;
+	}
 
 	new_table = xt_register_table(net, table, &bootstrap, newinfo);
 	if (IS_ERR(new_table)) {
-		ret = PTR_ERR(new_table);
-		goto out_free;
+		xt_free_table_info(newinfo);
+		return PTR_ERR(new_table);
 	}
 
-	/* set res now, will see skbs right after nf_register_net_hooks */
-	WRITE_ONCE(*res, new_table);
-	if (!ops)
+	/* No template? No need to do anything. This is used by 'nat' table, it registers
+	 * with the nat core instead of the netfilter core.
+	 */
+	if (!template_ops)
 		return 0;
 
-	ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));
-	if (ret != 0) {
-		__ipt_unregister_table(net, new_table);
-		*res = NULL;
+	num_ops = hweight32(table->valid_hooks);
+	if (num_ops == 0) {
+		ret = -EINVAL;
+		goto out_free;
 	}
 
+	ops = kmemdup(template_ops, sizeof(*ops) * num_ops, GFP_KERNEL);
+	if (!ops) {
+		ret = -ENOMEM;
+		goto out_free;
+	}
+
+	for (i = 0; i < num_ops; i++)
+		ops[i].priv = new_table;
+
+	new_table->ops = ops;
+
+	ret = nf_register_net_hooks(net, ops, num_ops);
+	if (ret != 0)
+		goto out_free;
+
 	return ret;
 
 out_free:
-	xt_free_table_info(newinfo);
+	__ipt_unregister_table(net, new_table);
 	return ret;
 }
 
-void ipt_unregister_table_pre_exit(struct net *net, const char *name,
-				   const struct nf_hook_ops *ops)
+void ipt_unregister_table_pre_exit(struct net *net, const char *name)
 {
 	struct xt_table *table = xt_find_table(net, NFPROTO_IPV4, name);
 
 	if (table)
-		nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
+		nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks));
 }
 
 void ipt_unregister_table_exit(struct net *net, const char *name)
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index 84573fa78d1e..8272df7c6ad5 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -34,7 +34,7 @@ static unsigned int
 iptable_filter_hook(void *priv, struct sk_buff *skb,
 		    const struct nf_hook_state *state)
 {
-	return ipt_do_table(skb, state, state->net->ipv4.iptable_filter);
+	return ipt_do_table(skb, state, priv);
 }
 
 static struct nf_hook_ops *filter_ops __read_mostly;
@@ -55,8 +55,7 @@ static int __net_init iptable_filter_table_init(struct net *net)
 	((struct ipt_standard *)repl->entries)[1].target.verdict =
 		forward ? -NF_ACCEPT - 1 : -NF_DROP - 1;
 
-	err = ipt_register_table(net, &packet_filter, repl, filter_ops,
-				 &net->ipv4.iptable_filter);
+	err = ipt_register_table(net, &packet_filter, repl, filter_ops);
 	kfree(repl);
 	return err;
 }
@@ -71,13 +70,12 @@ static int __net_init iptable_filter_net_init(struct net *net)
 
 static void __net_exit iptable_filter_net_pre_exit(struct net *net)
 {
-	ipt_unregister_table_pre_exit(net, "filter", filter_ops);
+	ipt_unregister_table_pre_exit(net, "filter");
 }
 
 static void __net_exit iptable_filter_net_exit(struct net *net)
 {
 	ipt_unregister_table_exit(net, "filter");
-	net->ipv4.iptable_filter = NULL;
 }
 
 static struct pernet_operations iptable_filter_net_ops = {
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index 98e9e9053d85..2abc3836f391 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -37,7 +37,7 @@ static const struct xt_table packet_mangler = {
 };
 
 static unsigned int
-ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state)
+ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state, void *priv)
 {
 	unsigned int ret;
 	const struct iphdr *iph;
@@ -53,7 +53,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state)
 	daddr = iph->daddr;
 	tos = iph->tos;
 
-	ret = ipt_do_table(skb, state, state->net->ipv4.iptable_mangle);
+	ret = ipt_do_table(skb, state, priv);
 	/* Reroute for ANY change. */
 	if (ret != NF_DROP && ret != NF_STOLEN) {
 		iph = ip_hdr(skb);
@@ -78,8 +78,8 @@ iptable_mangle_hook(void *priv,
 		     const struct nf_hook_state *state)
 {
 	if (state->hook == NF_INET_LOCAL_OUT)
-		return ipt_mangle_out(skb, state);
-	return ipt_do_table(skb, state, state->net->ipv4.iptable_mangle);
+		return ipt_mangle_out(skb, state, priv);
+	return ipt_do_table(skb, state, priv);
 }
 
 static struct nf_hook_ops *mangle_ops __read_mostly;
@@ -91,21 +91,19 @@ static int __net_init iptable_mangle_table_init(struct net *net)
 	repl = ipt_alloc_initial_table(&packet_mangler);
 	if (repl == NULL)
 		return -ENOMEM;
-	ret = ipt_register_table(net, &packet_mangler, repl, mangle_ops,
-				 &net->ipv4.iptable_mangle);
+	ret = ipt_register_table(net, &packet_mangler, repl, mangle_ops);
 	kfree(repl);
 	return ret;
 }
 
 static void __net_exit iptable_mangle_net_pre_exit(struct net *net)
 {
-	ipt_unregister_table_pre_exit(net, "mangle", mangle_ops);
+	ipt_unregister_table_pre_exit(net, "mangle");
 }
 
 static void __net_exit iptable_mangle_net_exit(struct net *net)
 {
 	ipt_unregister_table_exit(net, "mangle");
-	net->ipv4.iptable_mangle = NULL;
 }
 
 static struct pernet_operations iptable_mangle_net_ops = {
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index dfa9dc63a7b5..a9913842ef18 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -66,12 +66,19 @@ static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
 	},
 };
 
-static int ipt_nat_register_lookups(struct net *net, struct xt_table *table)
+static int ipt_nat_register_lookups(struct net *net)
 {
-	struct nf_hook_ops *ops = kmemdup(nf_nat_ipv4_ops, sizeof(nf_nat_ipv4_ops), GFP_KERNEL);
-	struct iptable_nat_pernet *xt_nat_net = net_generic(net, iptable_nat_net_id);
+	struct iptable_nat_pernet *xt_nat_net;
+	struct nf_hook_ops *ops;
+	struct xt_table *table;
 	int i, ret;
 
+	xt_nat_net = net_generic(net, iptable_nat_net_id);
+	table = xt_find_table(net, NFPROTO_IPV4, "nat");
+	if (WARN_ON_ONCE(!table))
+		return -ENOENT;
+
+	ops = kmemdup(nf_nat_ipv4_ops, sizeof(nf_nat_ipv4_ops), GFP_KERNEL);
 	if (!ops)
 		return -ENOMEM;
 
@@ -109,25 +116,21 @@ static void ipt_nat_unregister_lookups(struct net *net)
 static int __net_init iptable_nat_table_init(struct net *net)
 {
 	struct ipt_replace *repl;
-	struct xt_table *table;
 	int ret;
 
 	repl = ipt_alloc_initial_table(&nf_nat_ipv4_table);
 	if (repl == NULL)
 		return -ENOMEM;
-	ret = ipt_register_table(net, &nf_nat_ipv4_table, repl,
-				 NULL, &table);
+
+	ret = ipt_register_table(net, &nf_nat_ipv4_table, repl, NULL);
 	if (ret < 0) {
 		kfree(repl);
 		return ret;
 	}
 
-	ret = ipt_nat_register_lookups(net, table);
-	if (ret < 0) {
+	ret = ipt_nat_register_lookups(net);
+	if (ret < 0)
 		ipt_unregister_table_exit(net, "nat");
-	} else {
-		net->ipv4.nat_table = table;
-	}
 
 	kfree(repl);
 	return ret;
@@ -141,7 +144,6 @@ static void __net_exit iptable_nat_net_pre_exit(struct net *net)
 static void __net_exit iptable_nat_net_exit(struct net *net)
 {
 	ipt_unregister_table_exit(net, "nat");
-	net->ipv4.nat_table = NULL;
 }
 
 static struct pernet_operations iptable_nat_net_ops = {
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index 18776f5a4055..ceef397c1f5f 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -41,7 +41,7 @@ static unsigned int
 iptable_raw_hook(void *priv, struct sk_buff *skb,
 		 const struct nf_hook_state *state)
 {
-	return ipt_do_table(skb, state, state->net->ipv4.iptable_raw);
+	return ipt_do_table(skb, state, priv);
 }
 
 static struct nf_hook_ops *rawtable_ops __read_mostly;
@@ -58,21 +58,19 @@ static int __net_init iptable_raw_table_init(struct net *net)
 	repl = ipt_alloc_initial_table(table);
 	if (repl == NULL)
 		return -ENOMEM;
-	ret = ipt_register_table(net, table, repl, rawtable_ops,
-				 &net->ipv4.iptable_raw);
+	ret = ipt_register_table(net, table, repl, rawtable_ops);
 	kfree(repl);
 	return ret;
 }
 
 static void __net_exit iptable_raw_net_pre_exit(struct net *net)
 {
-	ipt_unregister_table_pre_exit(net, "raw", rawtable_ops);
+	ipt_unregister_table_pre_exit(net, "raw");
 }
 
 static void __net_exit iptable_raw_net_exit(struct net *net)
 {
 	ipt_unregister_table_exit(net, "raw");
-	net->ipv4.iptable_raw = NULL;
 }
 
 static struct pernet_operations iptable_raw_net_ops = {
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index 3df92fb394c5..77973f5fd8f6 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -40,7 +40,7 @@ static unsigned int
 iptable_security_hook(void *priv, struct sk_buff *skb,
 		      const struct nf_hook_state *state)
 {
-	return ipt_do_table(skb, state, state->net->ipv4.iptable_security);
+	return ipt_do_table(skb, state, priv);
 }
 
 static struct nf_hook_ops *sectbl_ops __read_mostly;
@@ -53,21 +53,19 @@ static int __net_init iptable_security_table_init(struct net *net)
 	repl = ipt_alloc_initial_table(&security_table);
 	if (repl == NULL)
 		return -ENOMEM;
-	ret = ipt_register_table(net, &security_table, repl, sectbl_ops,
-				 &net->ipv4.iptable_security);
+	ret = ipt_register_table(net, &security_table, repl, sectbl_ops);
 	kfree(repl);
 	return ret;
 }
 
 static void __net_exit iptable_security_net_pre_exit(struct net *net)
 {
-	ipt_unregister_table_pre_exit(net, "security", sectbl_ops);
+	ipt_unregister_table_pre_exit(net, "security");
 }
 
 static void __net_exit iptable_security_net_exit(struct net *net)
 {
 	ipt_unregister_table_exit(net, "security");
-	net->ipv4.iptable_security = NULL;
 }
 
 static struct pernet_operations iptable_security_net_ops = {
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 1caba9507228..ef37deff8405 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -1498,6 +1498,7 @@ void *xt_unregister_table(struct xt_table *table)
 	mutex_unlock(&xt[table->af].mutex);
 	audit_log_nfcfg(table->name, table->af, private->number,
 			AUDIT_XT_OP_UNREGISTER, GFP_KERNEL);
+	kfree(table->ops);
 	kfree(table);
 
 	return private;
-- 
cgit v1.2.3


From f9006acc8dfe59e25aa75729728ac57a8d84fc32 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 21 Apr 2021 09:51:08 +0200
Subject: netfilter: arp_tables: pass table pointer via nf_hook_ops

Same change as previous patch.  Only difference:
no need to handle NULL template_ops parameter, the only caller
(arptable_filter) always passes non-NULL argument.

This removes all remaining accesses to net->ipv4.arptable_filter.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_arp/arp_tables.h |  2 +-
 net/ipv4/netfilter/arp_tables.c          | 43 +++++++++++++++++++++-----------
 net/ipv4/netfilter/arptable_filter.c     |  6 ++---
 3 files changed, 32 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_arp/arp_tables.h b/include/linux/netfilter_arp/arp_tables.h
index 9ec73dcc8fd6..a0474b4e7782 100644
--- a/include/linux/netfilter_arp/arp_tables.h
+++ b/include/linux/netfilter_arp/arp_tables.h
@@ -51,7 +51,7 @@ struct arpt_error {
 extern void *arpt_alloc_initial_table(const struct xt_table *);
 int arpt_register_table(struct net *net, const struct xt_table *table,
 			const struct arpt_replace *repl,
-			const struct nf_hook_ops *ops, struct xt_table **res);
+			const struct nf_hook_ops *ops);
 void arpt_unregister_table(struct net *net, const char *name);
 void arpt_unregister_table_pre_exit(struct net *net, const char *name,
 				    const struct nf_hook_ops *ops);
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 8a16b0dc5271..b1bb6a7e2dd7 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -1499,10 +1499,11 @@ static void __arpt_unregister_table(struct net *net, struct xt_table *table)
 int arpt_register_table(struct net *net,
 			const struct xt_table *table,
 			const struct arpt_replace *repl,
-			const struct nf_hook_ops *ops,
-			struct xt_table **res)
+			const struct nf_hook_ops *template_ops)
 {
-	int ret;
+	struct nf_hook_ops *ops;
+	unsigned int num_ops;
+	int ret, i;
 	struct xt_table_info *newinfo;
 	struct xt_table_info bootstrap = {0};
 	void *loc_cpu_entry;
@@ -1516,28 +1517,42 @@ int arpt_register_table(struct net *net,
 	memcpy(loc_cpu_entry, repl->entries, repl->size);
 
 	ret = translate_table(net, newinfo, loc_cpu_entry, repl);
-	if (ret != 0)
-		goto out_free;
+	if (ret != 0) {
+		xt_free_table_info(newinfo);
+		return ret;
+	}
 
 	new_table = xt_register_table(net, table, &bootstrap, newinfo);
 	if (IS_ERR(new_table)) {
-		ret = PTR_ERR(new_table);
-		goto out_free;
+		xt_free_table_info(newinfo);
+		return PTR_ERR(new_table);
 	}
 
-	/* set res now, will see skbs right after nf_register_net_hooks */
-	WRITE_ONCE(*res, new_table);
+	num_ops = hweight32(table->valid_hooks);
+	if (num_ops == 0) {
+		ret = -EINVAL;
+		goto out_free;
+	}
 
-	ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));
-	if (ret != 0) {
-		__arpt_unregister_table(net, new_table);
-		*res = NULL;
+	ops = kmemdup(template_ops, sizeof(*ops) * num_ops, GFP_KERNEL);
+	if (!ops) {
+		ret = -ENOMEM;
+		goto out_free;
 	}
 
+	for (i = 0; i < num_ops; i++)
+		ops[i].priv = new_table;
+
+	new_table->ops = ops;
+
+	ret = nf_register_net_hooks(net, ops, num_ops);
+	if (ret != 0)
+		goto out_free;
+
 	return ret;
 
 out_free:
-	xt_free_table_info(newinfo);
+	__arpt_unregister_table(net, new_table);
 	return ret;
 }
 
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index 924f096a6d89..b8f45e9bbec8 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -34,7 +34,7 @@ static unsigned int
 arptable_filter_hook(void *priv, struct sk_buff *skb,
 		     const struct nf_hook_state *state)
 {
-	return arpt_do_table(skb, state, state->net->ipv4.arptable_filter);
+	return arpt_do_table(skb, state, priv);
 }
 
 static struct nf_hook_ops *arpfilter_ops __read_mostly;
@@ -47,8 +47,7 @@ static int __net_init arptable_filter_table_init(struct net *net)
 	repl = arpt_alloc_initial_table(&packet_filter);
 	if (repl == NULL)
 		return -ENOMEM;
-	err = arpt_register_table(net, &packet_filter, repl, arpfilter_ops,
-				  &net->ipv4.arptable_filter);
+	err = arpt_register_table(net, &packet_filter, repl, arpfilter_ops);
 	kfree(repl);
 	return err;
 }
@@ -61,7 +60,6 @@ static void __net_exit arptable_filter_net_pre_exit(struct net *net)
 static void __net_exit arptable_filter_net_exit(struct net *net)
 {
 	arpt_unregister_table(net, "filter");
-	net->ipv4.arptable_filter = NULL;
 }
 
 static struct pernet_operations arptable_filter_net_ops = {
-- 
cgit v1.2.3


From ee177a54413a33fe474d55fabb5f8ff390bb27d7 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 21 Apr 2021 09:51:09 +0200
Subject: netfilter: ip6_tables: pass table pointer via nf_hook_ops

Same patch as the ip_tables one: removal of all accesses to ip6_tables
xt_table pointers.  After this patch the struct net xt_table anchors
can be removed.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_ipv6/ip6_tables.h |  5 ++-
 net/ipv6/netfilter/ip6_tables.c           | 51 ++++++++++++++++++++-----------
 net/ipv6/netfilter/ip6table_filter.c      |  9 ++----
 net/ipv6/netfilter/ip6table_mangle.c      | 14 ++++-----
 net/ipv6/netfilter/ip6table_nat.c         | 24 ++++++++-------
 net/ipv6/netfilter/ip6table_raw.c         |  9 ++----
 net/ipv6/netfilter/ip6table_security.c    |  8 ++---
 7 files changed, 63 insertions(+), 57 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h
index 8c07426e18a8..11d0e725fe79 100644
--- a/include/linux/netfilter_ipv6/ip6_tables.h
+++ b/include/linux/netfilter_ipv6/ip6_tables.h
@@ -26,9 +26,8 @@ extern void *ip6t_alloc_initial_table(const struct xt_table *);
 
 int ip6t_register_table(struct net *net, const struct xt_table *table,
 			const struct ip6t_replace *repl,
-			const struct nf_hook_ops *ops, struct xt_table **res);
-void ip6t_unregister_table_pre_exit(struct net *net, const char *name,
-				    const struct nf_hook_ops *ops);
+			const struct nf_hook_ops *ops);
+void ip6t_unregister_table_pre_exit(struct net *net, const char *name);
 void ip6t_unregister_table_exit(struct net *net, const char *name);
 extern unsigned int ip6t_do_table(struct sk_buff *skb,
 				  const struct nf_hook_state *state,
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 11c80da12ee3..e763716ffa25 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1725,10 +1725,11 @@ static void __ip6t_unregister_table(struct net *net, struct xt_table *table)
 
 int ip6t_register_table(struct net *net, const struct xt_table *table,
 			const struct ip6t_replace *repl,
-			const struct nf_hook_ops *ops,
-			struct xt_table **res)
+			const struct nf_hook_ops *template_ops)
 {
-	int ret;
+	struct nf_hook_ops *ops;
+	unsigned int num_ops;
+	int ret, i;
 	struct xt_table_info *newinfo;
 	struct xt_table_info bootstrap = {0};
 	void *loc_cpu_entry;
@@ -1742,40 +1743,54 @@ int ip6t_register_table(struct net *net, const struct xt_table *table,
 	memcpy(loc_cpu_entry, repl->entries, repl->size);
 
 	ret = translate_table(net, newinfo, loc_cpu_entry, repl);
-	if (ret != 0)
-		goto out_free;
+	if (ret != 0) {
+		xt_free_table_info(newinfo);
+		return ret;
+	}
 
 	new_table = xt_register_table(net, table, &bootstrap, newinfo);
 	if (IS_ERR(new_table)) {
-		ret = PTR_ERR(new_table);
-		goto out_free;
+		xt_free_table_info(newinfo);
+		return PTR_ERR(new_table);
 	}
 
-	/* set res now, will see skbs right after nf_register_net_hooks */
-	WRITE_ONCE(*res, new_table);
-	if (!ops)
+	if (!template_ops)
 		return 0;
 
-	ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));
-	if (ret != 0) {
-		__ip6t_unregister_table(net, new_table);
-		*res = NULL;
+	num_ops = hweight32(table->valid_hooks);
+	if (num_ops == 0) {
+		ret = -EINVAL;
+		goto out_free;
 	}
 
+	ops = kmemdup(template_ops, sizeof(*ops) * num_ops, GFP_KERNEL);
+	if (!ops) {
+		ret = -ENOMEM;
+		goto out_free;
+	}
+
+	for (i = 0; i < num_ops; i++)
+		ops[i].priv = new_table;
+
+	new_table->ops = ops;
+
+	ret = nf_register_net_hooks(net, ops, num_ops);
+	if (ret != 0)
+		goto out_free;
+
 	return ret;
 
 out_free:
-	xt_free_table_info(newinfo);
+	__ip6t_unregister_table(net, new_table);
 	return ret;
 }
 
-void ip6t_unregister_table_pre_exit(struct net *net, const char *name,
-				    const struct nf_hook_ops *ops)
+void ip6t_unregister_table_pre_exit(struct net *net, const char *name)
 {
 	struct xt_table *table = xt_find_table(net, NFPROTO_IPV6, name);
 
 	if (table)
-		nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
+		nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks));
 }
 
 void ip6t_unregister_table_exit(struct net *net, const char *name)
diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c
index 2bcafa3e2d35..bb784ea7bbd3 100644
--- a/net/ipv6/netfilter/ip6table_filter.c
+++ b/net/ipv6/netfilter/ip6table_filter.c
@@ -35,7 +35,7 @@ static unsigned int
 ip6table_filter_hook(void *priv, struct sk_buff *skb,
 		     const struct nf_hook_state *state)
 {
-	return ip6t_do_table(skb, state, state->net->ipv6.ip6table_filter);
+	return ip6t_do_table(skb, state, priv);
 }
 
 static struct nf_hook_ops *filter_ops __read_mostly;
@@ -56,8 +56,7 @@ static int __net_init ip6table_filter_table_init(struct net *net)
 	((struct ip6t_standard *)repl->entries)[1].target.verdict =
 		forward ? -NF_ACCEPT - 1 : -NF_DROP - 1;
 
-	err = ip6t_register_table(net, &packet_filter, repl, filter_ops,
-				  &net->ipv6.ip6table_filter);
+	err = ip6t_register_table(net, &packet_filter, repl, filter_ops);
 	kfree(repl);
 	return err;
 }
@@ -72,14 +71,12 @@ static int __net_init ip6table_filter_net_init(struct net *net)
 
 static void __net_exit ip6table_filter_net_pre_exit(struct net *net)
 {
-	ip6t_unregister_table_pre_exit(net, "filter",
-				       filter_ops);
+	ip6t_unregister_table_pre_exit(net, "filter");
 }
 
 static void __net_exit ip6table_filter_net_exit(struct net *net)
 {
 	ip6t_unregister_table_exit(net, "filter");
-	net->ipv6.ip6table_filter = NULL;
 }
 
 static struct pernet_operations ip6table_filter_net_ops = {
diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c
index 14e22022bf41..c76cffd63041 100644
--- a/net/ipv6/netfilter/ip6table_mangle.c
+++ b/net/ipv6/netfilter/ip6table_mangle.c
@@ -32,7 +32,7 @@ static const struct xt_table packet_mangler = {
 };
 
 static unsigned int
-ip6t_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state)
+ip6t_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state, void *priv)
 {
 	unsigned int ret;
 	struct in6_addr saddr, daddr;
@@ -49,7 +49,7 @@ ip6t_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state)
 	/* flowlabel and prio (includes version, which shouldn't change either */
 	flowlabel = *((u_int32_t *)ipv6_hdr(skb));
 
-	ret = ip6t_do_table(skb, state, state->net->ipv6.ip6table_mangle);
+	ret = ip6t_do_table(skb, state, priv);
 
 	if (ret != NF_DROP && ret != NF_STOLEN &&
 	    (!ipv6_addr_equal(&ipv6_hdr(skb)->saddr, &saddr) ||
@@ -71,8 +71,8 @@ ip6table_mangle_hook(void *priv, struct sk_buff *skb,
 		     const struct nf_hook_state *state)
 {
 	if (state->hook == NF_INET_LOCAL_OUT)
-		return ip6t_mangle_out(skb, state);
-	return ip6t_do_table(skb, state, state->net->ipv6.ip6table_mangle);
+		return ip6t_mangle_out(skb, state, priv);
+	return ip6t_do_table(skb, state, priv);
 }
 
 static struct nf_hook_ops *mangle_ops __read_mostly;
@@ -84,21 +84,19 @@ static int __net_init ip6table_mangle_table_init(struct net *net)
 	repl = ip6t_alloc_initial_table(&packet_mangler);
 	if (repl == NULL)
 		return -ENOMEM;
-	ret = ip6t_register_table(net, &packet_mangler, repl, mangle_ops,
-				  &net->ipv6.ip6table_mangle);
+	ret = ip6t_register_table(net, &packet_mangler, repl, mangle_ops);
 	kfree(repl);
 	return ret;
 }
 
 static void __net_exit ip6table_mangle_net_pre_exit(struct net *net)
 {
-	ip6t_unregister_table_pre_exit(net, "mangle", mangle_ops);
+	ip6t_unregister_table_pre_exit(net, "mangle");
 }
 
 static void __net_exit ip6table_mangle_net_exit(struct net *net)
 {
 	ip6t_unregister_table_exit(net, "mangle");
-	net->ipv6.ip6table_mangle = NULL;
 }
 
 static struct pernet_operations ip6table_mangle_net_ops = {
diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c
index 69b7f9601d03..b0292251e655 100644
--- a/net/ipv6/netfilter/ip6table_nat.c
+++ b/net/ipv6/netfilter/ip6table_nat.c
@@ -68,12 +68,19 @@ static const struct nf_hook_ops nf_nat_ipv6_ops[] = {
 	},
 };
 
-static int ip6t_nat_register_lookups(struct net *net, struct xt_table *table)
+static int ip6t_nat_register_lookups(struct net *net)
 {
-	struct nf_hook_ops *ops = kmemdup(nf_nat_ipv6_ops, sizeof(nf_nat_ipv6_ops), GFP_KERNEL);
-	struct ip6table_nat_pernet *xt_nat_net = net_generic(net, ip6table_nat_net_id);
+	struct ip6table_nat_pernet *xt_nat_net;
+	struct nf_hook_ops *ops;
+	struct xt_table *table;
 	int i, ret;
 
+	table = xt_find_table(net, NFPROTO_IPV6, "nat");
+	if (WARN_ON_ONCE(!table))
+		return -ENOENT;
+
+	xt_nat_net = net_generic(net, ip6table_nat_net_id);
+	ops = kmemdup(nf_nat_ipv6_ops, sizeof(nf_nat_ipv6_ops), GFP_KERNEL);
 	if (!ops)
 		return -ENOMEM;
 
@@ -111,25 +118,21 @@ static void ip6t_nat_unregister_lookups(struct net *net)
 static int __net_init ip6table_nat_table_init(struct net *net)
 {
 	struct ip6t_replace *repl;
-	struct xt_table *table;
 	int ret;
 
 	repl = ip6t_alloc_initial_table(&nf_nat_ipv6_table);
 	if (repl == NULL)
 		return -ENOMEM;
 	ret = ip6t_register_table(net, &nf_nat_ipv6_table, repl,
-				  NULL, &table);
+				  NULL);
 	if (ret < 0) {
 		kfree(repl);
 		return ret;
 	}
 
-	ret = ip6t_nat_register_lookups(net, table);
-	if (ret < 0) {
+	ret = ip6t_nat_register_lookups(net);
+	if (ret < 0)
 		ip6t_unregister_table_exit(net, "nat");
-	} else {
-		net->ipv6.ip6table_nat = table;
-	}
 
 	kfree(repl);
 	return ret;
@@ -143,7 +146,6 @@ static void __net_exit ip6table_nat_net_pre_exit(struct net *net)
 static void __net_exit ip6table_nat_net_exit(struct net *net)
 {
 	ip6t_unregister_table_exit(net, "nat");
-	net->ipv6.ip6table_nat = NULL;
 }
 
 static struct pernet_operations ip6table_nat_net_ops = {
diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c
index ae3df59f0350..f63c106c521e 100644
--- a/net/ipv6/netfilter/ip6table_raw.c
+++ b/net/ipv6/netfilter/ip6table_raw.c
@@ -40,7 +40,7 @@ static unsigned int
 ip6table_raw_hook(void *priv, struct sk_buff *skb,
 		  const struct nf_hook_state *state)
 {
-	return ip6t_do_table(skb, state, state->net->ipv6.ip6table_raw);
+	return ip6t_do_table(skb, state, priv);
 }
 
 static struct nf_hook_ops *rawtable_ops __read_mostly;
@@ -57,22 +57,19 @@ static int __net_init ip6table_raw_table_init(struct net *net)
 	repl = ip6t_alloc_initial_table(table);
 	if (repl == NULL)
 		return -ENOMEM;
-	ret = ip6t_register_table(net, table, repl, rawtable_ops,
-				  &net->ipv6.ip6table_raw);
+	ret = ip6t_register_table(net, table, repl, rawtable_ops);
 	kfree(repl);
 	return ret;
 }
 
 static void __net_exit ip6table_raw_net_pre_exit(struct net *net)
 {
-	ip6t_unregister_table_pre_exit(net, "raw",
-				       rawtable_ops);
+	ip6t_unregister_table_pre_exit(net, "raw");
 }
 
 static void __net_exit ip6table_raw_net_exit(struct net *net)
 {
 	ip6t_unregister_table_exit(net, "raw");
-	net->ipv6.ip6table_raw = NULL;
 }
 
 static struct pernet_operations ip6table_raw_net_ops = {
diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c
index 83ca632cbf88..8dc335cf450b 100644
--- a/net/ipv6/netfilter/ip6table_security.c
+++ b/net/ipv6/netfilter/ip6table_security.c
@@ -39,7 +39,7 @@ static unsigned int
 ip6table_security_hook(void *priv, struct sk_buff *skb,
 		       const struct nf_hook_state *state)
 {
-	return ip6t_do_table(skb, state, state->net->ipv6.ip6table_security);
+	return ip6t_do_table(skb, state, priv);
 }
 
 static struct nf_hook_ops *sectbl_ops __read_mostly;
@@ -52,21 +52,19 @@ static int __net_init ip6table_security_table_init(struct net *net)
 	repl = ip6t_alloc_initial_table(&security_table);
 	if (repl == NULL)
 		return -ENOMEM;
-	ret = ip6t_register_table(net, &security_table, repl, sectbl_ops,
-				  &net->ipv6.ip6table_security);
+	ret = ip6t_register_table(net, &security_table, repl, sectbl_ops);
 	kfree(repl);
 	return ret;
 }
 
 static void __net_exit ip6table_security_net_pre_exit(struct net *net)
 {
-	ip6t_unregister_table_pre_exit(net, "security", sectbl_ops);
+	ip6t_unregister_table_pre_exit(net, "security");
 }
 
 static void __net_exit ip6table_security_net_exit(struct net *net)
 {
 	ip6t_unregister_table_exit(net, "security");
-	net->ipv6.ip6table_security = NULL;
 }
 
 static struct pernet_operations ip6table_security_net_ops = {
-- 
cgit v1.2.3


From 95aafe911db602d19b00d2a88c3d54a84119f5dc Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Sun, 25 Apr 2021 02:30:38 +0200
Subject: net: ethernet: ixp4xx: Support device tree probing

This adds device tree probing to the IXP4xx ethernet
driver.

Add a platform data bool to tell us whether to
register an MDIO bus for the device or not, as well
as the corresponding NPE.

We need to drop the memory region request as part of
this since the OF core will request the memory for the
device.

Cc: Zoltan HERPAI <wigyori@uid0.hu>
Cc: Raylynn Knight <rayknight@me.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/xscale/Kconfig      |   1 +
 drivers/net/ethernet/xscale/ixp4xx_eth.c | 210 +++++++++++++++++++++----------
 include/linux/platform_data/eth_ixp4xx.h |   2 +
 3 files changed, 150 insertions(+), 63 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/xscale/Kconfig b/drivers/net/ethernet/xscale/Kconfig
index 7b83a6e5d894..468ffe3d1707 100644
--- a/drivers/net/ethernet/xscale/Kconfig
+++ b/drivers/net/ethernet/xscale/Kconfig
@@ -22,6 +22,7 @@ config IXP4XX_ETH
 	tristate "Intel IXP4xx Ethernet support"
 	depends on ARM && ARCH_IXP4XX && IXP4XX_NPE && IXP4XX_QMGR
 	select PHYLIB
+	select OF_MDIO if OF
 	select NET_PTP_CLASSIFY
 	help
 	  Say Y here if you want to use built-in Ethernet ports
diff --git a/drivers/net/ethernet/xscale/ixp4xx_eth.c b/drivers/net/ethernet/xscale/ixp4xx_eth.c
index 9d323e8595e2..1149e88e6454 100644
--- a/drivers/net/ethernet/xscale/ixp4xx_eth.c
+++ b/drivers/net/ethernet/xscale/ixp4xx_eth.c
@@ -28,6 +28,7 @@
 #include <linux/kernel.h>
 #include <linux/net_tstamp.h>
 #include <linux/of.h>
+#include <linux/of_mdio.h>
 #include <linux/phy.h>
 #include <linux/platform_data/eth_ixp4xx.h>
 #include <linux/platform_device.h>
@@ -165,7 +166,6 @@ struct eth_regs {
 };
 
 struct port {
-	struct resource *mem_res;
 	struct eth_regs __iomem *regs;
 	struct npe *npe;
 	struct net_device *netdev;
@@ -250,6 +250,7 @@ static inline void memcpy_swab32(u32 *dest, u32 *src, int cnt)
 static DEFINE_SPINLOCK(mdio_lock);
 static struct eth_regs __iomem *mdio_regs; /* mdio command and status only */
 static struct mii_bus *mdio_bus;
+static struct device_node *mdio_bus_np;
 static int ports_open;
 static struct port *npe_port_tab[MAX_NPES];
 static struct dma_pool *dma_pool;
@@ -533,7 +534,8 @@ static int ixp4xx_mdio_register(struct eth_regs __iomem *regs)
 	mdio_bus->write = &ixp4xx_mdio_write;
 	snprintf(mdio_bus->id, MII_BUS_ID_SIZE, "ixp4xx-eth-0");
 
-	if ((err = mdiobus_register(mdio_bus)))
+	err = of_mdiobus_register(mdio_bus, mdio_bus_np);
+	if (err)
 		mdiobus_free(mdio_bus);
 	return err;
 }
@@ -1358,18 +1360,118 @@ static const struct net_device_ops ixp4xx_netdev_ops = {
 	.ndo_validate_addr = eth_validate_addr,
 };
 
+#ifdef CONFIG_OF
+static struct eth_plat_info *ixp4xx_of_get_platdata(struct device *dev)
+{
+	struct device_node *np = dev->of_node;
+	struct of_phandle_args queue_spec;
+	struct of_phandle_args npe_spec;
+	struct device_node *mdio_np;
+	struct eth_plat_info *plat;
+	int ret;
+
+	plat = devm_kzalloc(dev, sizeof(*plat), GFP_KERNEL);
+	if (!plat)
+		return NULL;
+
+	ret = of_parse_phandle_with_fixed_args(np, "intel,npe-handle", 1, 0,
+					       &npe_spec);
+	if (ret) {
+		dev_err(dev, "no NPE engine specified\n");
+		return NULL;
+	}
+	/* NPE ID 0x00, 0x10, 0x20... */
+	plat->npe = (npe_spec.args[0] << 4);
+
+	/* Check if this device has an MDIO bus */
+	mdio_np = of_get_child_by_name(np, "mdio");
+	if (mdio_np) {
+		plat->has_mdio = true;
+		mdio_bus_np = mdio_np;
+		/* DO NOT put the mdio_np, it will be used */
+	}
+
+	/* Get the rx queue as a resource from queue manager */
+	ret = of_parse_phandle_with_fixed_args(np, "queue-rx", 1, 0,
+					       &queue_spec);
+	if (ret) {
+		dev_err(dev, "no rx queue phandle\n");
+		return NULL;
+	}
+	plat->rxq = queue_spec.args[0];
+
+	/* Get the txready queue as resource from queue manager */
+	ret = of_parse_phandle_with_fixed_args(np, "queue-txready", 1, 0,
+					       &queue_spec);
+	if (ret) {
+		dev_err(dev, "no txready queue phandle\n");
+		return NULL;
+	}
+	plat->txreadyq = queue_spec.args[0];
+
+	return plat;
+}
+#else
+static struct eth_plat_info *ixp4xx_of_get_platdata(struct device *dev)
+{
+	return NULL;
+}
+#endif
+
 static int ixp4xx_eth_probe(struct platform_device *pdev)
 {
 	struct phy_device *phydev = NULL;
 	struct device *dev = &pdev->dev;
+	struct device_node *np = dev->of_node;
 	struct eth_plat_info *plat;
-	resource_size_t regs_phys;
 	struct net_device *ndev;
 	struct resource *res;
 	struct port *port;
 	int err;
 
-	plat = dev_get_platdata(dev);
+	if (np) {
+		plat = ixp4xx_of_get_platdata(dev);
+		if (!plat)
+			return -ENODEV;
+	} else {
+		plat = dev_get_platdata(dev);
+		if (!plat)
+			return -ENODEV;
+		plat->npe = pdev->id;
+		switch (plat->npe) {
+		case IXP4XX_ETH_NPEA:
+			/* If the MDIO bus is not up yet, defer probe */
+			break;
+		case IXP4XX_ETH_NPEB:
+			/* On all except IXP43x, NPE-B is used for the MDIO bus.
+			 * If there is no NPE-B in the feature set, bail out,
+			 * else we have the MDIO bus here.
+			 */
+			if (!cpu_is_ixp43x()) {
+				if (!(ixp4xx_read_feature_bits() &
+				      IXP4XX_FEATURE_NPEB_ETH0))
+					return -ENODEV;
+				/* Else register the MDIO bus on NPE-B */
+				plat->has_mdio = true;
+			}
+			break;
+		case IXP4XX_ETH_NPEC:
+			/* IXP43x lacks NPE-B and uses NPE-C for the MDIO bus
+			 * access, if there is no NPE-C, no bus, nothing works,
+			 * so bail out.
+			 */
+			if (cpu_is_ixp43x()) {
+				if (!(ixp4xx_read_feature_bits() &
+				      IXP4XX_FEATURE_NPEC_ETH))
+					return -ENODEV;
+				/* Else register the MDIO bus on NPE-C */
+				plat->has_mdio = true;
+			}
+			break;
+		default:
+			return -ENODEV;
+		}
+	}
 
 	if (!(ndev = devm_alloc_etherdev(dev, sizeof(struct port))))
 		return -ENOMEM;
@@ -1377,59 +1479,29 @@ static int ixp4xx_eth_probe(struct platform_device *pdev)
 	SET_NETDEV_DEV(ndev, dev);
 	port = netdev_priv(ndev);
 	port->netdev = ndev;
-	port->id = pdev->id;
+	port->id = plat->npe;
 
 	/* Get the port resource and remap */
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	if (!res)
 		return -ENODEV;
-	regs_phys = res->start;
 	port->regs = devm_ioremap_resource(dev, res);
 	if (IS_ERR(port->regs))
 		return PTR_ERR(port->regs);
 
-	switch (port->id) {
-	case IXP4XX_ETH_NPEA:
-		/* If the MDIO bus is not up yet, defer probe */
-		if (!mdio_bus)
-			return -EPROBE_DEFER;
-		break;
-	case IXP4XX_ETH_NPEB:
-		/*
-		 * On all except IXP43x, NPE-B is used for the MDIO bus.
-		 * If there is no NPE-B in the feature set, bail out, else
-		 * register the MDIO bus.
-		 */
-		if (!cpu_is_ixp43x()) {
-			if (!(ixp4xx_read_feature_bits() &
-			      IXP4XX_FEATURE_NPEB_ETH0))
-				return -ENODEV;
-			/* Else register the MDIO bus on NPE-B */
-			if ((err = ixp4xx_mdio_register(port->regs)))
-				return err;
-		}
-		if (!mdio_bus)
-			return -EPROBE_DEFER;
-		break;
-	case IXP4XX_ETH_NPEC:
-		/*
-		 * IXP43x lacks NPE-B and uses NPE-C for the MDIO bus access,
-		 * of there is no NPE-C, no bus, nothing works, so bail out.
-		 */
-		if (cpu_is_ixp43x()) {
-			if (!(ixp4xx_read_feature_bits() &
-			      IXP4XX_FEATURE_NPEC_ETH))
-				return -ENODEV;
-			/* Else register the MDIO bus on NPE-C */
-			if ((err = ixp4xx_mdio_register(port->regs)))
-				return err;
+	/* Register the MDIO bus if we have it */
+	if (plat->has_mdio) {
+		err = ixp4xx_mdio_register(port->regs);
+		if (err) {
+			dev_err(dev, "failed to register MDIO bus\n");
+			return err;
 		}
-		if (!mdio_bus)
-			return -EPROBE_DEFER;
-		break;
-	default:
-		return -ENODEV;
 	}
+	/* If the instance with the MDIO bus has not yet appeared,
+	 * defer probing until it gets probed.
+	 */
+	if (!mdio_bus)
+		return -EPROBE_DEFER;
 
 	ndev->netdev_ops = &ixp4xx_netdev_ops;
 	ndev->ethtool_ops = &ixp4xx_ethtool_ops;
@@ -1440,12 +1512,6 @@ static int ixp4xx_eth_probe(struct platform_device *pdev)
 	if (!(port->npe = npe_request(NPE_ID(port->id))))
 		return -EIO;
 
-	port->mem_res = request_mem_region(regs_phys, REGS_SIZE, ndev->name);
-	if (!port->mem_res) {
-		err = -EBUSY;
-		goto err_npe_rel;
-	}
-
 	port->plat = plat;
 	npe_port_tab[NPE_ID(port->id)] = port;
 	memcpy(ndev->dev_addr, plat->hwaddr, ETH_ALEN);
@@ -1458,15 +1524,26 @@ static int ixp4xx_eth_probe(struct platform_device *pdev)
 	__raw_writel(DEFAULT_CORE_CNTRL, &port->regs->core_control);
 	udelay(50);
 
-	phydev = mdiobus_get_phy(mdio_bus, plat->phy);
-	if (IS_ERR(phydev)) {
-		err = PTR_ERR(phydev);
-		goto err_free_mem;
+	if (np) {
+		phydev = of_phy_get_and_connect(ndev, np, ixp4xx_adjust_link);
+	} else {
+		phydev = mdiobus_get_phy(mdio_bus, plat->phy);
+		if (IS_ERR(phydev)) {
+			err = PTR_ERR(phydev);
+			dev_err(dev, "could not connect phydev (%d)\n", err);
+			goto err_free_mem;
+		}
+		err = phy_connect_direct(ndev, phydev, ixp4xx_adjust_link,
+					 PHY_INTERFACE_MODE_MII);
+		if (err)
+			goto err_free_mem;
+
 	}
-	err = phy_connect_direct(ndev, phydev, ixp4xx_adjust_link,
-				 PHY_INTERFACE_MODE_MII);
-	if (err)
+	if (!phydev) {
+		err = -ENODEV;
+		dev_err(dev, "no phydev\n");
 		goto err_free_mem;
+	}
 
 	phydev->irq = PHY_POLL;
 
@@ -1482,8 +1559,6 @@ err_phy_dis:
 	phy_disconnect(phydev);
 err_free_mem:
 	npe_port_tab[NPE_ID(port->id)] = NULL;
-	release_resource(port->mem_res);
-err_npe_rel:
 	npe_release(port->npe);
 	return err;
 }
@@ -1499,12 +1574,21 @@ static int ixp4xx_eth_remove(struct platform_device *pdev)
 	ixp4xx_mdio_remove();
 	npe_port_tab[NPE_ID(port->id)] = NULL;
 	npe_release(port->npe);
-	release_resource(port->mem_res);
 	return 0;
 }
 
+static const struct of_device_id ixp4xx_eth_of_match[] = {
+	{
+		.compatible = "intel,ixp4xx-ethernet",
+	},
+	{ },
+};
+
 static struct platform_driver ixp4xx_eth_driver = {
-	.driver.name	= DRV_NAME,
+	.driver = {
+		.name = DRV_NAME,
+		.of_match_table = of_match_ptr(ixp4xx_eth_of_match),
+	},
 	.probe		= ixp4xx_eth_probe,
 	.remove		= ixp4xx_eth_remove,
 };
diff --git a/include/linux/platform_data/eth_ixp4xx.h b/include/linux/platform_data/eth_ixp4xx.h
index 6f652ea0c6ae..114b0940729f 100644
--- a/include/linux/platform_data/eth_ixp4xx.h
+++ b/include/linux/platform_data/eth_ixp4xx.h
@@ -14,6 +14,8 @@ struct eth_plat_info {
 	u8 rxq;		/* configurable, currently 0 - 31 only */
 	u8 txreadyq;
 	u8 hwaddr[6];
+	u8 npe;		/* NPE instance used by this interface */
+	bool has_mdio;	/* If this instance has an MDIO bus */
 };
 
 #endif
-- 
cgit v1.2.3


From a655536571747575fcaac3c93252b0032d878545 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 23 Apr 2021 00:17:09 +0200
Subject: netfilter: nfnetlink: add struct nfnl_info and pass it to callbacks

Add a new structure to reduce callback footprint and to facilitate
extensions of the nfnetlink callback interface in the future.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nfnetlink.h  |  13 ++-
 net/netfilter/ipset/ip_set_core.c    | 149 ++++++++++--------------
 net/netfilter/nf_conntrack_netlink.c | 214 +++++++++++++++++------------------
 net/netfilter/nfnetlink.c            |  18 ++-
 net/netfilter/nfnetlink_acct.c       |  44 ++++---
 net/netfilter/nfnetlink_cthelper.c   |  30 ++---
 net/netfilter/nfnetlink_cttimeout.c  | 101 ++++++++---------
 net/netfilter/nfnetlink_log.c        |  26 ++---
 net/netfilter/nfnetlink_osf.c        |  19 ++--
 net/netfilter/nfnetlink_queue.c      |  12 +-
 10 files changed, 286 insertions(+), 340 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h
index d4c14257db5d..1baa3205b199 100644
--- a/include/linux/netfilter/nfnetlink.h
+++ b/include/linux/netfilter/nfnetlink.h
@@ -7,11 +7,16 @@
 #include <net/netlink.h>
 #include <uapi/linux/netfilter/nfnetlink.h>
 
+struct nfnl_info {
+	struct net		*net;
+	struct sock		*sk;
+	const struct nlmsghdr	*nlh;
+	struct netlink_ext_ack	*extack;
+};
+
 struct nfnl_callback {
-	int (*call)(struct net *net, struct sock *nl, struct sk_buff *skb,
-		    const struct nlmsghdr *nlh,
-		    const struct nlattr * const cda[],
-		    struct netlink_ext_ack *extack);
+	int (*call)(struct sk_buff *skb, const struct nfnl_info *info,
+		    const struct nlattr * const cda[]);
 	int (*call_rcu)(struct net *net, struct sock *nl, struct sk_buff *skb,
 			const struct nlmsghdr *nlh,
 			const struct nlattr * const cda[],
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 359ff8ec236a..bf9902c1daa8 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -1031,26 +1031,22 @@ find_free_id(struct ip_set_net *inst, const char *name, ip_set_id_t *index,
 	return 0;
 }
 
-static int ip_set_none(struct net *net, struct sock *ctnl, struct sk_buff *skb,
-		       const struct nlmsghdr *nlh,
-		       const struct nlattr * const attr[],
-		       struct netlink_ext_ack *extack)
+static int ip_set_none(struct sk_buff *skb, const struct nfnl_info *info,
+		       const struct nlattr * const attr[])
 {
 	return -EOPNOTSUPP;
 }
 
-static int ip_set_create(struct net *net, struct sock *ctnl,
-			 struct sk_buff *skb, const struct nlmsghdr *nlh,
-			 const struct nlattr * const attr[],
-			 struct netlink_ext_ack *extack)
+static int ip_set_create(struct sk_buff *skb, const struct nfnl_info *info,
+			 const struct nlattr * const attr[])
 {
-	struct ip_set_net *inst = ip_set_pernet(net);
+	struct ip_set_net *inst = ip_set_pernet(info->net);
 	struct ip_set *set, *clash = NULL;
 	ip_set_id_t index = IPSET_INVALID_ID;
 	struct nlattr *tb[IPSET_ATTR_CREATE_MAX + 1] = {};
 	const char *name, *typename;
 	u8 family, revision;
-	u32 flags = flag_exist(nlh);
+	u32 flags = flag_exist(info->nlh);
 	int ret = 0;
 
 	if (unlikely(protocol_min_failed(attr) ||
@@ -1101,7 +1097,7 @@ static int ip_set_create(struct net *net, struct sock *ctnl,
 	/* Set create flags depending on the type revision */
 	set->flags |= set->type->create_flags[revision];
 
-	ret = set->type->create(net, set, tb, flags);
+	ret = set->type->create(info->net, set, tb, flags);
 	if (ret != 0)
 		goto put_out;
 
@@ -1183,12 +1179,10 @@ ip_set_destroy_set(struct ip_set *set)
 	kfree(set);
 }
 
-static int ip_set_destroy(struct net *net, struct sock *ctnl,
-			  struct sk_buff *skb, const struct nlmsghdr *nlh,
-			  const struct nlattr * const attr[],
-			  struct netlink_ext_ack *extack)
+static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
+			  const struct nlattr * const attr[])
 {
-	struct ip_set_net *inst = ip_set_pernet(net);
+	struct ip_set_net *inst = ip_set_pernet(info->net);
 	struct ip_set *s;
 	ip_set_id_t i;
 	int ret = 0;
@@ -1230,7 +1224,7 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl,
 		/* Modified by ip_set_destroy() only, which is serialized */
 		inst->is_destroyed = false;
 	} else {
-		u32 flags = flag_exist(nlh);
+		u32 flags = flag_exist(info->nlh);
 		s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]),
 				    &i);
 		if (!s) {
@@ -1264,12 +1258,10 @@ ip_set_flush_set(struct ip_set *set)
 	ip_set_unlock(set);
 }
 
-static int ip_set_flush(struct net *net, struct sock *ctnl, struct sk_buff *skb,
-			const struct nlmsghdr *nlh,
-			const struct nlattr * const attr[],
-			struct netlink_ext_ack *extack)
+static int ip_set_flush(struct sk_buff *skb, const struct nfnl_info *info,
+			const struct nlattr * const attr[])
 {
-	struct ip_set_net *inst = ip_set_pernet(net);
+	struct ip_set_net *inst = ip_set_pernet(info->net);
 	struct ip_set *s;
 	ip_set_id_t i;
 
@@ -1304,12 +1296,10 @@ ip_set_setname2_policy[IPSET_ATTR_CMD_MAX + 1] = {
 				    .len = IPSET_MAXNAMELEN - 1 },
 };
 
-static int ip_set_rename(struct net *net, struct sock *ctnl,
-			 struct sk_buff *skb, const struct nlmsghdr *nlh,
-			 const struct nlattr * const attr[],
-			 struct netlink_ext_ack *extack)
+static int ip_set_rename(struct sk_buff *skb, const struct nfnl_info *info,
+			 const struct nlattr * const attr[])
 {
-	struct ip_set_net *inst = ip_set_pernet(net);
+	struct ip_set_net *inst = ip_set_pernet(info->net);
 	struct ip_set *set, *s;
 	const char *name2;
 	ip_set_id_t i;
@@ -1354,12 +1344,10 @@ out:
  * so the ip_set_list always contains valid pointers to the sets.
  */
 
-static int ip_set_swap(struct net *net, struct sock *ctnl, struct sk_buff *skb,
-		       const struct nlmsghdr *nlh,
-		       const struct nlattr * const attr[],
-		       struct netlink_ext_ack *extack)
+static int ip_set_swap(struct sk_buff *skb, const struct nfnl_info *info,
+		       const struct nlattr * const attr[])
 {
-	struct ip_set_net *inst = ip_set_pernet(net);
+	struct ip_set_net *inst = ip_set_pernet(info->net);
 	struct ip_set *from, *to;
 	ip_set_id_t from_id, to_id;
 	char from_name[IPSET_MAXNAMELEN];
@@ -1669,10 +1657,8 @@ out:
 	return ret < 0 ? ret : skb->len;
 }
 
-static int ip_set_dump(struct net *net, struct sock *ctnl, struct sk_buff *skb,
-		       const struct nlmsghdr *nlh,
-		       const struct nlattr * const attr[],
-		       struct netlink_ext_ack *extack)
+static int ip_set_dump(struct sk_buff *skb, const struct nfnl_info *info,
+		       const struct nlattr * const attr[])
 {
 	if (unlikely(protocol_min_failed(attr)))
 		return -IPSET_ERR_PROTOCOL;
@@ -1683,7 +1669,7 @@ static int ip_set_dump(struct net *net, struct sock *ctnl, struct sk_buff *skb,
 			.dump = ip_set_dump_do,
 			.done = ip_set_dump_done,
 		};
-		return netlink_dump_start(ctnl, skb, nlh, &c);
+		return netlink_dump_start(info->sk, skb, info->nlh, &c);
 	}
 }
 
@@ -1817,30 +1803,24 @@ static int ip_set_ad(struct net *net, struct sock *ctnl,
 	return ret;
 }
 
-static int ip_set_uadd(struct net *net, struct sock *ctnl,
-		       struct sk_buff *skb, const struct nlmsghdr *nlh,
-		       const struct nlattr * const attr[],
-		       struct netlink_ext_ack *extack)
+static int ip_set_uadd(struct sk_buff *skb, const struct nfnl_info *info,
+		       const struct nlattr * const attr[])
 {
-	return ip_set_ad(net, ctnl, skb,
-			 IPSET_ADD, nlh, attr, extack);
+	return ip_set_ad(info->net, info->sk, skb,
+			 IPSET_ADD, info->nlh, attr, info->extack);
 }
 
-static int ip_set_udel(struct net *net, struct sock *ctnl,
-		       struct sk_buff *skb, const struct nlmsghdr *nlh,
-		       const struct nlattr * const attr[],
-		       struct netlink_ext_ack *extack)
+static int ip_set_udel(struct sk_buff *skb, const struct nfnl_info *info,
+		       const struct nlattr * const attr[])
 {
-	return ip_set_ad(net, ctnl, skb,
-			 IPSET_DEL, nlh, attr, extack);
+	return ip_set_ad(info->net, info->sk, skb,
+			 IPSET_DEL, info->nlh, attr, info->extack);
 }
 
-static int ip_set_utest(struct net *net, struct sock *ctnl, struct sk_buff *skb,
-			const struct nlmsghdr *nlh,
-			const struct nlattr * const attr[],
-			struct netlink_ext_ack *extack)
+static int ip_set_utest(struct sk_buff *skb, const struct nfnl_info *info,
+			const struct nlattr * const attr[])
 {
-	struct ip_set_net *inst = ip_set_pernet(net);
+	struct ip_set_net *inst = ip_set_pernet(info->net);
 	struct ip_set *set;
 	struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {};
 	int ret = 0;
@@ -1872,12 +1852,10 @@ static int ip_set_utest(struct net *net, struct sock *ctnl, struct sk_buff *skb,
 
 /* Get headed data of a set */
 
-static int ip_set_header(struct net *net, struct sock *ctnl,
-			 struct sk_buff *skb, const struct nlmsghdr *nlh,
-			 const struct nlattr * const attr[],
-			 struct netlink_ext_ack *extack)
+static int ip_set_header(struct sk_buff *skb, const struct nfnl_info *info,
+			 const struct nlattr * const attr[])
 {
-	struct ip_set_net *inst = ip_set_pernet(net);
+	struct ip_set_net *inst = ip_set_pernet(info->net);
 	const struct ip_set *set;
 	struct sk_buff *skb2;
 	struct nlmsghdr *nlh2;
@@ -1895,7 +1873,7 @@ static int ip_set_header(struct net *net, struct sock *ctnl,
 	if (!skb2)
 		return -ENOMEM;
 
-	nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
+	nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0,
 			 IPSET_CMD_HEADER);
 	if (!nlh2)
 		goto nlmsg_failure;
@@ -1907,7 +1885,8 @@ static int ip_set_header(struct net *net, struct sock *ctnl,
 		goto nla_put_failure;
 	nlmsg_end(skb2, nlh2);
 
-	ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
+	ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid,
+			      MSG_DONTWAIT);
 	if (ret < 0)
 		return ret;
 
@@ -1929,10 +1908,8 @@ static const struct nla_policy ip_set_type_policy[IPSET_ATTR_CMD_MAX + 1] = {
 	[IPSET_ATTR_FAMILY]	= { .type = NLA_U8 },
 };
 
-static int ip_set_type(struct net *net, struct sock *ctnl, struct sk_buff *skb,
-		       const struct nlmsghdr *nlh,
-		       const struct nlattr * const attr[],
-		       struct netlink_ext_ack *extack)
+static int ip_set_type(struct sk_buff *skb, const struct nfnl_info *info,
+		       const struct nlattr * const attr[])
 {
 	struct sk_buff *skb2;
 	struct nlmsghdr *nlh2;
@@ -1955,7 +1932,7 @@ static int ip_set_type(struct net *net, struct sock *ctnl, struct sk_buff *skb,
 	if (!skb2)
 		return -ENOMEM;
 
-	nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
+	nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0,
 			 IPSET_CMD_TYPE);
 	if (!nlh2)
 		goto nlmsg_failure;
@@ -1968,7 +1945,8 @@ static int ip_set_type(struct net *net, struct sock *ctnl, struct sk_buff *skb,
 	nlmsg_end(skb2, nlh2);
 
 	pr_debug("Send TYPE, nlmsg_len: %u\n", nlh2->nlmsg_len);
-	ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
+	ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid,
+			      MSG_DONTWAIT);
 	if (ret < 0)
 		return ret;
 
@@ -1988,10 +1966,8 @@ ip_set_protocol_policy[IPSET_ATTR_CMD_MAX + 1] = {
 	[IPSET_ATTR_PROTOCOL]	= { .type = NLA_U8 },
 };
 
-static int ip_set_protocol(struct net *net, struct sock *ctnl,
-			   struct sk_buff *skb, const struct nlmsghdr *nlh,
-			   const struct nlattr * const attr[],
-			   struct netlink_ext_ack *extack)
+static int ip_set_protocol(struct sk_buff *skb, const struct nfnl_info *info,
+			   const struct nlattr * const attr[])
 {
 	struct sk_buff *skb2;
 	struct nlmsghdr *nlh2;
@@ -2004,7 +1980,7 @@ static int ip_set_protocol(struct net *net, struct sock *ctnl,
 	if (!skb2)
 		return -ENOMEM;
 
-	nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
+	nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0,
 			 IPSET_CMD_PROTOCOL);
 	if (!nlh2)
 		goto nlmsg_failure;
@@ -2014,7 +1990,8 @@ static int ip_set_protocol(struct net *net, struct sock *ctnl,
 		goto nla_put_failure;
 	nlmsg_end(skb2, nlh2);
 
-	ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
+	ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid,
+			      MSG_DONTWAIT);
 	if (ret < 0)
 		return ret;
 
@@ -2029,12 +2006,10 @@ nlmsg_failure:
 
 /* Get set by name or index, from userspace */
 
-static int ip_set_byname(struct net *net, struct sock *ctnl,
-			 struct sk_buff *skb, const struct nlmsghdr *nlh,
-			 const struct nlattr * const attr[],
-			 struct netlink_ext_ack *extack)
+static int ip_set_byname(struct sk_buff *skb, const struct nfnl_info *info,
+			 const struct nlattr * const attr[])
 {
-	struct ip_set_net *inst = ip_set_pernet(net);
+	struct ip_set_net *inst = ip_set_pernet(info->net);
 	struct sk_buff *skb2;
 	struct nlmsghdr *nlh2;
 	ip_set_id_t id = IPSET_INVALID_ID;
@@ -2053,7 +2028,7 @@ static int ip_set_byname(struct net *net, struct sock *ctnl,
 	if (!skb2)
 		return -ENOMEM;
 
-	nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
+	nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0,
 			 IPSET_CMD_GET_BYNAME);
 	if (!nlh2)
 		goto nlmsg_failure;
@@ -2063,7 +2038,8 @@ static int ip_set_byname(struct net *net, struct sock *ctnl,
 		goto nla_put_failure;
 	nlmsg_end(skb2, nlh2);
 
-	ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
+	ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid,
+			      MSG_DONTWAIT);
 	if (ret < 0)
 		return ret;
 
@@ -2081,12 +2057,10 @@ static const struct nla_policy ip_set_index_policy[IPSET_ATTR_CMD_MAX + 1] = {
 	[IPSET_ATTR_INDEX]	= { .type = NLA_U16 },
 };
 
-static int ip_set_byindex(struct net *net, struct sock *ctnl,
-			  struct sk_buff *skb, const struct nlmsghdr *nlh,
-			  const struct nlattr * const attr[],
-			  struct netlink_ext_ack *extack)
+static int ip_set_byindex(struct sk_buff *skb, const struct nfnl_info *info,
+			  const struct nlattr * const attr[])
 {
-	struct ip_set_net *inst = ip_set_pernet(net);
+	struct ip_set_net *inst = ip_set_pernet(info->net);
 	struct sk_buff *skb2;
 	struct nlmsghdr *nlh2;
 	ip_set_id_t id = IPSET_INVALID_ID;
@@ -2108,7 +2082,7 @@ static int ip_set_byindex(struct net *net, struct sock *ctnl,
 	if (!skb2)
 		return -ENOMEM;
 
-	nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
+	nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0,
 			 IPSET_CMD_GET_BYINDEX);
 	if (!nlh2)
 		goto nlmsg_failure;
@@ -2117,7 +2091,8 @@ static int ip_set_byindex(struct net *net, struct sock *ctnl,
 		goto nla_put_failure;
 	nlmsg_end(skb2, nlh2);
 
-	ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
+	ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid,
+			      MSG_DONTWAIT);
 	if (ret < 0)
 		return ret;
 
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 44e3cb80e2e0..5147a63b3d1b 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1524,17 +1524,15 @@ static int ctnetlink_flush_conntrack(struct net *net,
 	return 0;
 }
 
-static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl,
-				   struct sk_buff *skb,
-				   const struct nlmsghdr *nlh,
-				   const struct nlattr * const cda[],
-				   struct netlink_ext_ack *extack)
+static int ctnetlink_del_conntrack(struct sk_buff *skb,
+				   const struct nfnl_info *info,
+				   const struct nlattr * const cda[])
 {
+	struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
 	struct nf_conntrack_tuple_hash *h;
 	struct nf_conntrack_tuple tuple;
-	struct nf_conn *ct;
-	struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	struct nf_conntrack_zone zone;
+	struct nf_conn *ct;
 	int err;
 
 	err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone);
@@ -1550,15 +1548,15 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl,
 	else {
 		u_int8_t u3 = nfmsg->version ? nfmsg->nfgen_family : AF_UNSPEC;
 
-		return ctnetlink_flush_conntrack(net, cda,
+		return ctnetlink_flush_conntrack(info->net, cda,
 						 NETLINK_CB(skb).portid,
-						 nlmsg_report(nlh), u3);
+						 nlmsg_report(info->nlh), u3);
 	}
 
 	if (err < 0)
 		return err;
 
-	h = nf_conntrack_find_get(net, &zone, &tuple);
+	h = nf_conntrack_find_get(info->net, &zone, &tuple);
 	if (!h)
 		return -ENOENT;
 
@@ -1578,28 +1576,26 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl,
 		}
 	}
 
-	nf_ct_delete(ct, NETLINK_CB(skb).portid, nlmsg_report(nlh));
+	nf_ct_delete(ct, NETLINK_CB(skb).portid, nlmsg_report(info->nlh));
 	nf_ct_put(ct);
 
 	return 0;
 }
 
-static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl,
-				   struct sk_buff *skb,
-				   const struct nlmsghdr *nlh,
-				   const struct nlattr * const cda[],
-				   struct netlink_ext_ack *extack)
+static int ctnetlink_get_conntrack(struct sk_buff *skb,
+				   const struct nfnl_info *info,
+				   const struct nlattr * const cda[])
 {
+	struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
+	u_int8_t u3 = nfmsg->nfgen_family;
 	struct nf_conntrack_tuple_hash *h;
 	struct nf_conntrack_tuple tuple;
-	struct nf_conn *ct;
-	struct sk_buff *skb2 = NULL;
-	struct nfgenmsg *nfmsg = nlmsg_data(nlh);
-	u_int8_t u3 = nfmsg->nfgen_family;
 	struct nf_conntrack_zone zone;
+	struct sk_buff *skb2;
+	struct nf_conn *ct;
 	int err;
 
-	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
 		struct netlink_dump_control c = {
 			.start = ctnetlink_start,
 			.dump = ctnetlink_dump_table,
@@ -1607,7 +1603,7 @@ static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl,
 			.data = (void *)cda,
 		};
 
-		return netlink_dump_start(ctnl, skb, nlh, &c);
+		return netlink_dump_start(info->sk, skb, info->nlh, &c);
 	}
 
 	err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone);
@@ -1626,7 +1622,7 @@ static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl,
 	if (err < 0)
 		return err;
 
-	h = nf_conntrack_find_get(net, &zone, &tuple);
+	h = nf_conntrack_find_get(info->net, &zone, &tuple);
 	if (!h)
 		return -ENOENT;
 
@@ -1639,13 +1635,16 @@ static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl,
 		return -ENOMEM;
 	}
 
-	err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
-				  NFNL_MSG_TYPE(nlh->nlmsg_type), ct, true, 0);
+	err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).portid,
+				  info->nlh->nlmsg_seq,
+				  NFNL_MSG_TYPE(info->nlh->nlmsg_type), ct,
+				  true, 0);
 	nf_ct_put(ct);
 	if (err <= 0)
 		goto free;
 
-	err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
+	err = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid,
+			      MSG_DONTWAIT);
 	if (err < 0)
 		goto out;
 
@@ -1743,18 +1742,16 @@ ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb)
 	return ctnetlink_dump_list(skb, cb, true);
 }
 
-static int ctnetlink_get_ct_dying(struct net *net, struct sock *ctnl,
-				  struct sk_buff *skb,
-				  const struct nlmsghdr *nlh,
-				  const struct nlattr * const cda[],
-				  struct netlink_ext_ack *extack)
+static int ctnetlink_get_ct_dying(struct sk_buff *skb,
+				  const struct nfnl_info *info,
+				  const struct nlattr * const cda[])
 {
-	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
 		struct netlink_dump_control c = {
 			.dump = ctnetlink_dump_dying,
 			.done = ctnetlink_done_list,
 		};
-		return netlink_dump_start(ctnl, skb, nlh, &c);
+		return netlink_dump_start(info->sk, skb, info->nlh, &c);
 	}
 
 	return -EOPNOTSUPP;
@@ -1766,18 +1763,16 @@ ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb)
 	return ctnetlink_dump_list(skb, cb, false);
 }
 
-static int ctnetlink_get_ct_unconfirmed(struct net *net, struct sock *ctnl,
-					struct sk_buff *skb,
-					const struct nlmsghdr *nlh,
-					const struct nlattr * const cda[],
-					struct netlink_ext_ack *extack)
+static int ctnetlink_get_ct_unconfirmed(struct sk_buff *skb,
+					const struct nfnl_info *info,
+					const struct nlattr * const cda[])
 {
-	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
 		struct netlink_dump_control c = {
 			.dump = ctnetlink_dump_unconfirmed,
 			.done = ctnetlink_done_list,
 		};
-		return netlink_dump_start(ctnl, skb, nlh, &c);
+		return netlink_dump_start(info->sk, skb, info->nlh, &c);
 	}
 
 	return -EOPNOTSUPP;
@@ -2374,18 +2369,16 @@ err1:
 	return ERR_PTR(err);
 }
 
-static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl,
-				   struct sk_buff *skb,
-				   const struct nlmsghdr *nlh,
-				   const struct nlattr * const cda[],
-				   struct netlink_ext_ack *extack)
+static int ctnetlink_new_conntrack(struct sk_buff *skb,
+				   const struct nfnl_info *info,
+				   const struct nlattr * const cda[])
 {
+	struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
 	struct nf_conntrack_tuple otuple, rtuple;
 	struct nf_conntrack_tuple_hash *h = NULL;
-	struct nfgenmsg *nfmsg = nlmsg_data(nlh);
-	struct nf_conn *ct;
 	u_int8_t u3 = nfmsg->nfgen_family;
 	struct nf_conntrack_zone zone;
+	struct nf_conn *ct;
 	int err;
 
 	err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone);
@@ -2407,13 +2400,13 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl,
 	}
 
 	if (cda[CTA_TUPLE_ORIG])
-		h = nf_conntrack_find_get(net, &zone, &otuple);
+		h = nf_conntrack_find_get(info->net, &zone, &otuple);
 	else if (cda[CTA_TUPLE_REPLY])
-		h = nf_conntrack_find_get(net, &zone, &rtuple);
+		h = nf_conntrack_find_get(info->net, &zone, &rtuple);
 
 	if (h == NULL) {
 		err = -ENOENT;
-		if (nlh->nlmsg_flags & NLM_F_CREATE) {
+		if (info->nlh->nlmsg_flags & NLM_F_CREATE) {
 			enum ip_conntrack_events events;
 
 			if (!cda[CTA_TUPLE_ORIG] || !cda[CTA_TUPLE_REPLY])
@@ -2421,8 +2414,8 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl,
 			if (otuple.dst.protonum != rtuple.dst.protonum)
 				return -EINVAL;
 
-			ct = ctnetlink_create_conntrack(net, &zone, cda, &otuple,
-							&rtuple, u3);
+			ct = ctnetlink_create_conntrack(info->net, &zone, cda,
+							&otuple, &rtuple, u3);
 			if (IS_ERR(ct))
 				return PTR_ERR(ct);
 
@@ -2445,7 +2438,7 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl,
 						      (1 << IPCT_SYNPROXY) |
 						      events,
 						      ct, NETLINK_CB(skb).portid,
-						      nlmsg_report(nlh));
+						      nlmsg_report(info->nlh));
 			nf_ct_put(ct);
 		}
 
@@ -2455,7 +2448,7 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl,
 
 	err = -EEXIST;
 	ct = nf_ct_tuplehash_to_ctrack(h);
-	if (!(nlh->nlmsg_flags & NLM_F_EXCL)) {
+	if (!(info->nlh->nlmsg_flags & NLM_F_EXCL)) {
 		err = ctnetlink_change_conntrack(ct, cda);
 		if (err == 0) {
 			nf_conntrack_eventmask_report((1 << IPCT_REPLY) |
@@ -2467,7 +2460,7 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl,
 						      (1 << IPCT_MARK) |
 						      (1 << IPCT_SYNPROXY),
 						      ct, NETLINK_CB(skb).portid,
-						      nlmsg_report(nlh));
+						      nlmsg_report(info->nlh));
 		}
 	}
 
@@ -2539,17 +2532,15 @@ ctnetlink_ct_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb)
 	return skb->len;
 }
 
-static int ctnetlink_stat_ct_cpu(struct net *net, struct sock *ctnl,
-				 struct sk_buff *skb,
-				 const struct nlmsghdr *nlh,
-				 const struct nlattr * const cda[],
-				 struct netlink_ext_ack *extack)
+static int ctnetlink_stat_ct_cpu(struct sk_buff *skb,
+				 const struct nfnl_info *info,
+				 const struct nlattr * const cda[])
 {
-	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
 		struct netlink_dump_control c = {
 			.dump = ctnetlink_ct_stat_cpu_dump,
 		};
-		return netlink_dump_start(ctnl, skb, nlh, &c);
+		return netlink_dump_start(info->sk, skb, info->nlh, &c);
 	}
 
 	return 0;
@@ -2585,10 +2576,8 @@ nlmsg_failure:
 	return -1;
 }
 
-static int ctnetlink_stat_ct(struct net *net, struct sock *ctnl,
-			     struct sk_buff *skb, const struct nlmsghdr *nlh,
-			     const struct nlattr * const cda[],
-			     struct netlink_ext_ack *extack)
+static int ctnetlink_stat_ct(struct sk_buff *skb, const struct nfnl_info *info,
+			     const struct nlattr * const cda[])
 {
 	struct sk_buff *skb2;
 	int err;
@@ -2598,13 +2587,14 @@ static int ctnetlink_stat_ct(struct net *net, struct sock *ctnl,
 		return -ENOMEM;
 
 	err = ctnetlink_stat_ct_fill_info(skb2, NETLINK_CB(skb).portid,
-					  nlh->nlmsg_seq,
-					  NFNL_MSG_TYPE(nlh->nlmsg_type),
+					  info->nlh->nlmsg_seq,
+					  NFNL_MSG_TYPE(info->nlh->nlmsg_type),
 					  sock_net(skb->sk));
 	if (err <= 0)
 		goto free;
 
-	err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
+	err = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid,
+			      MSG_DONTWAIT);
 	if (err < 0)
 		goto out;
 
@@ -3284,29 +3274,29 @@ static int ctnetlink_dump_exp_ct(struct net *net, struct sock *ctnl,
 	return err;
 }
 
-static int ctnetlink_get_expect(struct net *net, struct sock *ctnl,
-				struct sk_buff *skb, const struct nlmsghdr *nlh,
-				const struct nlattr * const cda[],
-				struct netlink_ext_ack *extack)
+static int ctnetlink_get_expect(struct sk_buff *skb,
+				const struct nfnl_info *info,
+				const struct nlattr * const cda[])
 {
+	struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
+	u_int8_t u3 = nfmsg->nfgen_family;
 	struct nf_conntrack_tuple tuple;
 	struct nf_conntrack_expect *exp;
-	struct sk_buff *skb2;
-	struct nfgenmsg *nfmsg = nlmsg_data(nlh);
-	u_int8_t u3 = nfmsg->nfgen_family;
 	struct nf_conntrack_zone zone;
+	struct sk_buff *skb2;
 	int err;
 
-	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
 		if (cda[CTA_EXPECT_MASTER])
-			return ctnetlink_dump_exp_ct(net, ctnl, skb, nlh, cda,
-						     extack);
+			return ctnetlink_dump_exp_ct(info->net, info->sk, skb,
+						     info->nlh, cda,
+						     info->extack);
 		else {
 			struct netlink_dump_control c = {
 				.dump = ctnetlink_exp_dump_table,
 				.done = ctnetlink_exp_done,
 			};
-			return netlink_dump_start(ctnl, skb, nlh, &c);
+			return netlink_dump_start(info->sk, skb, info->nlh, &c);
 		}
 	}
 
@@ -3326,7 +3316,7 @@ static int ctnetlink_get_expect(struct net *net, struct sock *ctnl,
 	if (err < 0)
 		return err;
 
-	exp = nf_ct_expect_find_get(net, &zone, &tuple);
+	exp = nf_ct_expect_find_get(info->net, &zone, &tuple);
 	if (!exp)
 		return -ENOENT;
 
@@ -3348,13 +3338,15 @@ static int ctnetlink_get_expect(struct net *net, struct sock *ctnl,
 
 	rcu_read_lock();
 	err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).portid,
-				      nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp);
+				      info->nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW,
+				      exp);
 	rcu_read_unlock();
 	nf_ct_expect_put(exp);
 	if (err <= 0)
 		goto free;
 
-	err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
+	err = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid,
+			      MSG_DONTWAIT);
 	if (err < 0)
 		goto out;
 
@@ -3382,15 +3374,14 @@ static bool expect_iter_all(struct nf_conntrack_expect *exp, void *data)
 	return true;
 }
 
-static int ctnetlink_del_expect(struct net *net, struct sock *ctnl,
-				struct sk_buff *skb, const struct nlmsghdr *nlh,
-				const struct nlattr * const cda[],
-				struct netlink_ext_ack *extack)
+static int ctnetlink_del_expect(struct sk_buff *skb,
+				const struct nfnl_info *info,
+				const struct nlattr * const cda[])
 {
+	struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
+	u_int8_t u3 = nfmsg->nfgen_family;
 	struct nf_conntrack_expect *exp;
 	struct nf_conntrack_tuple tuple;
-	struct nfgenmsg *nfmsg = nlmsg_data(nlh);
-	u_int8_t u3 = nfmsg->nfgen_family;
 	struct nf_conntrack_zone zone;
 	int err;
 
@@ -3406,7 +3397,7 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl,
 			return err;
 
 		/* bump usage count to 2 */
-		exp = nf_ct_expect_find_get(net, &zone, &tuple);
+		exp = nf_ct_expect_find_get(info->net, &zone, &tuple);
 		if (!exp)
 			return -ENOENT;
 
@@ -3422,7 +3413,7 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl,
 		spin_lock_bh(&nf_conntrack_expect_lock);
 		if (del_timer(&exp->timeout)) {
 			nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).portid,
-						   nlmsg_report(nlh));
+						   nlmsg_report(info->nlh));
 			nf_ct_expect_put(exp);
 		}
 		spin_unlock_bh(&nf_conntrack_expect_lock);
@@ -3432,14 +3423,14 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl,
 	} else if (cda[CTA_EXPECT_HELP_NAME]) {
 		char *name = nla_data(cda[CTA_EXPECT_HELP_NAME]);
 
-		nf_ct_expect_iterate_net(net, expect_iter_name, name,
+		nf_ct_expect_iterate_net(info->net, expect_iter_name, name,
 					 NETLINK_CB(skb).portid,
-					 nlmsg_report(nlh));
+					 nlmsg_report(info->nlh));
 	} else {
 		/* This basically means we have to flush everything*/
-		nf_ct_expect_iterate_net(net, expect_iter_all, NULL,
+		nf_ct_expect_iterate_net(info->net, expect_iter_all, NULL,
 					 NETLINK_CB(skb).portid,
-					 nlmsg_report(nlh));
+					 nlmsg_report(info->nlh));
 	}
 
 	return 0;
@@ -3635,15 +3626,14 @@ err_ct:
 	return err;
 }
 
-static int ctnetlink_new_expect(struct net *net, struct sock *ctnl,
-				struct sk_buff *skb, const struct nlmsghdr *nlh,
-				const struct nlattr * const cda[],
-				struct netlink_ext_ack *extack)
+static int ctnetlink_new_expect(struct sk_buff *skb,
+				const struct nfnl_info *info,
+				const struct nlattr * const cda[])
 {
+	struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
+	u_int8_t u3 = nfmsg->nfgen_family;
 	struct nf_conntrack_tuple tuple;
 	struct nf_conntrack_expect *exp;
-	struct nfgenmsg *nfmsg = nlmsg_data(nlh);
-	u_int8_t u3 = nfmsg->nfgen_family;
 	struct nf_conntrack_zone zone;
 	int err;
 
@@ -3662,20 +3652,20 @@ static int ctnetlink_new_expect(struct net *net, struct sock *ctnl,
 		return err;
 
 	spin_lock_bh(&nf_conntrack_expect_lock);
-	exp = __nf_ct_expect_find(net, &zone, &tuple);
+	exp = __nf_ct_expect_find(info->net, &zone, &tuple);
 	if (!exp) {
 		spin_unlock_bh(&nf_conntrack_expect_lock);
 		err = -ENOENT;
-		if (nlh->nlmsg_flags & NLM_F_CREATE) {
-			err = ctnetlink_create_expect(net, &zone, cda, u3,
+		if (info->nlh->nlmsg_flags & NLM_F_CREATE) {
+			err = ctnetlink_create_expect(info->net, &zone, cda, u3,
 						      NETLINK_CB(skb).portid,
-						      nlmsg_report(nlh));
+						      nlmsg_report(info->nlh));
 		}
 		return err;
 	}
 
 	err = -EEXIST;
-	if (!(nlh->nlmsg_flags & NLM_F_EXCL))
+	if (!(info->nlh->nlmsg_flags & NLM_F_EXCL))
 		err = ctnetlink_change_expect(exp, cda);
 	spin_unlock_bh(&nf_conntrack_expect_lock);
 
@@ -3736,17 +3726,15 @@ ctnetlink_exp_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb)
 	return skb->len;
 }
 
-static int ctnetlink_stat_exp_cpu(struct net *net, struct sock *ctnl,
-				  struct sk_buff *skb,
-				  const struct nlmsghdr *nlh,
-				  const struct nlattr * const cda[],
-				  struct netlink_ext_ack *extack)
+static int ctnetlink_stat_exp_cpu(struct sk_buff *skb,
+				  const struct nfnl_info *info,
+				  const struct nlattr * const cda[])
 {
-	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
 		struct netlink_dump_control c = {
 			.dump = ctnetlink_exp_stat_cpu_dump,
 		};
-		return netlink_dump_start(ctnl, skb, nlh, &c);
+		return netlink_dump_start(info->sk, skb, info->nlh, &c);
 	}
 
 	return 0;
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 06f5886f652e..5f04b67bf47e 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -252,6 +252,12 @@ replay:
 		struct nlattr *attr = (void *)nlh + min_len;
 		int attrlen = nlh->nlmsg_len - min_len;
 		__u8 subsys_id = NFNL_SUBSYS_ID(type);
+		struct nfnl_info info = {
+			.net	= net,
+			.sk	= nfnlnet->nfnl,
+			.nlh	= nlh,
+			.extack	= extack,
+		};
 
 		/* Sanity-check NFNL_MAX_ATTR_COUNT */
 		if (ss->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT) {
@@ -276,14 +282,14 @@ replay:
 			rcu_read_unlock();
 			nfnl_lock(subsys_id);
 			if (nfnl_dereference_protected(subsys_id) != ss ||
-			    nfnetlink_find_client(type, ss) != nc)
+			    nfnetlink_find_client(type, ss) != nc) {
 				err = -EAGAIN;
-			else if (nc->call)
-				err = nc->call(net, nfnlnet->nfnl, skb, nlh,
-					       (const struct nlattr **)cda,
-					       extack);
-			else
+			} else if (nc->call) {
+				err = nc->call(skb, &info,
+					       (const struct nlattr **)cda);
+			} else {
 				err = -EINVAL;
+			}
 			nfnl_unlock(subsys_id);
 		}
 		if (err == -EAGAIN)
diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c
index 6895f31c5fbb..9cb4b21b8e95 100644
--- a/net/netfilter/nfnetlink_acct.c
+++ b/net/netfilter/nfnetlink_acct.c
@@ -56,15 +56,13 @@ static inline struct nfnl_acct_net *nfnl_acct_pernet(struct net *net)
 #define NFACCT_F_QUOTA (NFACCT_F_QUOTA_PKTS | NFACCT_F_QUOTA_BYTES)
 #define NFACCT_OVERQUOTA_BIT	2	/* NFACCT_F_OVERQUOTA */
 
-static int nfnl_acct_new(struct net *net, struct sock *nfnl,
-			 struct sk_buff *skb, const struct nlmsghdr *nlh,
-			 const struct nlattr * const tb[],
-			 struct netlink_ext_ack *extack)
+static int nfnl_acct_new(struct sk_buff *skb, const struct nfnl_info *info,
+			 const struct nlattr * const tb[])
 {
-	struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net);
+	struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(info->net);
 	struct nf_acct *nfacct, *matching = NULL;
-	char *acct_name;
 	unsigned int size = 0;
+	char *acct_name;
 	u32 flags = 0;
 
 	if (!tb[NFACCT_NAME])
@@ -78,7 +76,7 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl,
 		if (strncmp(nfacct->name, acct_name, NFACCT_NAME_MAX) != 0)
 			continue;
 
-                if (nlh->nlmsg_flags & NLM_F_EXCL)
+                if (info->nlh->nlmsg_flags & NLM_F_EXCL)
 			return -EEXIST;
 
 		matching = nfacct;
@@ -86,7 +84,7 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl,
         }
 
 	if (matching) {
-		if (nlh->nlmsg_flags & NLM_F_REPLACE) {
+		if (info->nlh->nlmsg_flags & NLM_F_REPLACE) {
 			/* reset counters if you request a replacement. */
 			atomic64_set(&matching->pkts, 0);
 			atomic64_set(&matching->bytes, 0);
@@ -273,17 +271,15 @@ static int nfnl_acct_start(struct netlink_callback *cb)
 	return 0;
 }
 
-static int nfnl_acct_get(struct net *net, struct sock *nfnl,
-			 struct sk_buff *skb, const struct nlmsghdr *nlh,
-			 const struct nlattr * const tb[],
-			 struct netlink_ext_ack *extack)
+static int nfnl_acct_get(struct sk_buff *skb, const struct nfnl_info *info,
+			 const struct nlattr * const tb[])
 {
-	struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net);
+	struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(info->net);
 	int ret = -ENOENT;
 	struct nf_acct *cur;
 	char *acct_name;
 
-	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
 		struct netlink_dump_control c = {
 			.dump = nfnl_acct_dump,
 			.start = nfnl_acct_start,
@@ -291,7 +287,7 @@ static int nfnl_acct_get(struct net *net, struct sock *nfnl,
 			.data = (void *)tb[NFACCT_FILTER],
 		};
 
-		return netlink_dump_start(nfnl, skb, nlh, &c);
+		return netlink_dump_start(info->sk, skb, info->nlh, &c);
 	}
 
 	if (!tb[NFACCT_NAME])
@@ -311,15 +307,15 @@ static int nfnl_acct_get(struct net *net, struct sock *nfnl,
 		}
 
 		ret = nfnl_acct_fill_info(skb2, NETLINK_CB(skb).portid,
-					 nlh->nlmsg_seq,
-					 NFNL_MSG_TYPE(nlh->nlmsg_type),
-					 NFNL_MSG_ACCT_NEW, cur);
+					  info->nlh->nlmsg_seq,
+					  NFNL_MSG_TYPE(info->nlh->nlmsg_type),
+					  NFNL_MSG_ACCT_NEW, cur);
 		if (ret <= 0) {
 			kfree_skb(skb2);
 			break;
 		}
-		ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid,
-					MSG_DONTWAIT);
+		ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid,
+				      MSG_DONTWAIT);
 		if (ret > 0)
 			ret = 0;
 
@@ -347,12 +343,10 @@ static int nfnl_acct_try_del(struct nf_acct *cur)
 	return ret;
 }
 
-static int nfnl_acct_del(struct net *net, struct sock *nfnl,
-			 struct sk_buff *skb, const struct nlmsghdr *nlh,
-			 const struct nlattr * const tb[],
-			 struct netlink_ext_ack *extack)
+static int nfnl_acct_del(struct sk_buff *skb, const struct nfnl_info *info,
+			 const struct nlattr * const tb[])
 {
-	struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net);
+	struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(info->net);
 	struct nf_acct *cur, *tmp;
 	int ret = -ENOENT;
 	char *acct_name;
diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c
index 22f6f7fcc724..3d1a5215177b 100644
--- a/net/netfilter/nfnetlink_cthelper.c
+++ b/net/netfilter/nfnetlink_cthelper.c
@@ -408,10 +408,8 @@ nfnl_cthelper_update(const struct nlattr * const tb[],
 	return 0;
 }
 
-static int nfnl_cthelper_new(struct net *net, struct sock *nfnl,
-			     struct sk_buff *skb, const struct nlmsghdr *nlh,
-			     const struct nlattr * const tb[],
-			     struct netlink_ext_ack *extack)
+static int nfnl_cthelper_new(struct sk_buff *skb, const struct nfnl_info *info,
+			     const struct nlattr * const tb[])
 {
 	const char *helper_name;
 	struct nf_conntrack_helper *cur, *helper = NULL;
@@ -441,7 +439,7 @@ static int nfnl_cthelper_new(struct net *net, struct sock *nfnl,
 		     tuple.dst.protonum != cur->tuple.dst.protonum))
 			continue;
 
-		if (nlh->nlmsg_flags & NLM_F_EXCL)
+		if (info->nlh->nlmsg_flags & NLM_F_EXCL)
 			return -EEXIST;
 
 		helper = cur;
@@ -607,10 +605,8 @@ out:
 	return skb->len;
 }
 
-static int nfnl_cthelper_get(struct net *net, struct sock *nfnl,
-			     struct sk_buff *skb, const struct nlmsghdr *nlh,
-			     const struct nlattr * const tb[],
-			     struct netlink_ext_ack *extack)
+static int nfnl_cthelper_get(struct sk_buff *skb, const struct nfnl_info *info,
+			     const struct nlattr * const tb[])
 {
 	int ret = -ENOENT;
 	struct nf_conntrack_helper *cur;
@@ -623,11 +619,11 @@ static int nfnl_cthelper_get(struct net *net, struct sock *nfnl,
 	if (!capable(CAP_NET_ADMIN))
 		return -EPERM;
 
-	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
 		struct netlink_dump_control c = {
 			.dump = nfnl_cthelper_dump_table,
 		};
-		return netlink_dump_start(nfnl, skb, nlh, &c);
+		return netlink_dump_start(info->sk, skb, info->nlh, &c);
 	}
 
 	if (tb[NFCTH_NAME])
@@ -659,15 +655,15 @@ static int nfnl_cthelper_get(struct net *net, struct sock *nfnl,
 		}
 
 		ret = nfnl_cthelper_fill_info(skb2, NETLINK_CB(skb).portid,
-					      nlh->nlmsg_seq,
-					      NFNL_MSG_TYPE(nlh->nlmsg_type),
+					      info->nlh->nlmsg_seq,
+					      NFNL_MSG_TYPE(info->nlh->nlmsg_type),
 					      NFNL_MSG_CTHELPER_NEW, cur);
 		if (ret <= 0) {
 			kfree_skb(skb2);
 			break;
 		}
 
-		ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid,
+		ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid,
 				      MSG_DONTWAIT);
 		if (ret > 0)
 			ret = 0;
@@ -678,10 +674,8 @@ static int nfnl_cthelper_get(struct net *net, struct sock *nfnl,
 	return ret;
 }
 
-static int nfnl_cthelper_del(struct net *net, struct sock *nfnl,
-			     struct sk_buff *skb, const struct nlmsghdr *nlh,
-			     const struct nlattr * const tb[],
-			     struct netlink_ext_ack *extack)
+static int nfnl_cthelper_del(struct sk_buff *skb, const struct nfnl_info *info,
+			     const struct nlattr * const tb[])
 {
 	char *helper_name = NULL;
 	struct nf_conntrack_helper *cur;
diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index 46da5548d0b3..994f3172bf42 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -83,13 +83,11 @@ err:
 	return ret;
 }
 
-static int cttimeout_new_timeout(struct net *net, struct sock *ctnl,
-				 struct sk_buff *skb,
-				 const struct nlmsghdr *nlh,
-				 const struct nlattr * const cda[],
-				 struct netlink_ext_ack *extack)
+static int cttimeout_new_timeout(struct sk_buff *skb,
+				 const struct nfnl_info *info,
+				 const struct nlattr * const cda[])
 {
-	struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net);
+	struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(info->net);
 	__u16 l3num;
 	__u8 l4num;
 	const struct nf_conntrack_l4proto *l4proto;
@@ -111,7 +109,7 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl,
 		if (strncmp(timeout->name, name, CTNL_TIMEOUT_NAME_MAX) != 0)
 			continue;
 
-		if (nlh->nlmsg_flags & NLM_F_EXCL)
+		if (info->nlh->nlmsg_flags & NLM_F_EXCL)
 			return -EEXIST;
 
 		matching = timeout;
@@ -119,7 +117,7 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl,
 	}
 
 	if (matching) {
-		if (nlh->nlmsg_flags & NLM_F_REPLACE) {
+		if (info->nlh->nlmsg_flags & NLM_F_REPLACE) {
 			/* You cannot replace one timeout policy by another of
 			 * different kind, sorry.
 			 */
@@ -129,7 +127,8 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl,
 
 			return ctnl_timeout_parse_policy(&matching->timeout.data,
 							 matching->timeout.l4proto,
-							 net, cda[CTA_TIMEOUT_DATA]);
+							 info->net,
+							 cda[CTA_TIMEOUT_DATA]);
 		}
 
 		return -EBUSY;
@@ -150,8 +149,8 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl,
 		goto err_proto_put;
 	}
 
-	ret = ctnl_timeout_parse_policy(&timeout->timeout.data, l4proto, net,
-					cda[CTA_TIMEOUT_DATA]);
+	ret = ctnl_timeout_parse_policy(&timeout->timeout.data, l4proto,
+					info->net, cda[CTA_TIMEOUT_DATA]);
 	if (ret < 0)
 		goto err;
 
@@ -248,22 +247,20 @@ ctnl_timeout_dump(struct sk_buff *skb, struct netlink_callback *cb)
 	return skb->len;
 }
 
-static int cttimeout_get_timeout(struct net *net, struct sock *ctnl,
-				 struct sk_buff *skb,
-				 const struct nlmsghdr *nlh,
-				 const struct nlattr * const cda[],
-				 struct netlink_ext_ack *extack)
+static int cttimeout_get_timeout(struct sk_buff *skb,
+				 const struct nfnl_info *info,
+				 const struct nlattr * const cda[])
 {
-	struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net);
+	struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(info->net);
 	int ret = -ENOENT;
 	char *name;
 	struct ctnl_timeout *cur;
 
-	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
 		struct netlink_dump_control c = {
 			.dump = ctnl_timeout_dump,
 		};
-		return netlink_dump_start(ctnl, skb, nlh, &c);
+		return netlink_dump_start(info->sk, skb, info->nlh, &c);
 	}
 
 	if (!cda[CTA_TIMEOUT_NAME])
@@ -283,15 +280,15 @@ static int cttimeout_get_timeout(struct net *net, struct sock *ctnl,
 		}
 
 		ret = ctnl_timeout_fill_info(skb2, NETLINK_CB(skb).portid,
-					     nlh->nlmsg_seq,
-					     NFNL_MSG_TYPE(nlh->nlmsg_type),
+					     info->nlh->nlmsg_seq,
+					     NFNL_MSG_TYPE(info->nlh->nlmsg_type),
 					     IPCTNL_MSG_TIMEOUT_NEW, cur);
 		if (ret <= 0) {
 			kfree_skb(skb2);
 			break;
 		}
-		ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid,
-					MSG_DONTWAIT);
+		ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid,
+				      MSG_DONTWAIT);
 		if (ret > 0)
 			ret = 0;
 
@@ -320,13 +317,11 @@ static int ctnl_timeout_try_del(struct net *net, struct ctnl_timeout *timeout)
 	return ret;
 }
 
-static int cttimeout_del_timeout(struct net *net, struct sock *ctnl,
-				 struct sk_buff *skb,
-				 const struct nlmsghdr *nlh,
-				 const struct nlattr * const cda[],
-				 struct netlink_ext_ack *extack)
+static int cttimeout_del_timeout(struct sk_buff *skb,
+				 const struct nfnl_info *info,
+				 const struct nlattr * const cda[])
 {
-	struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net);
+	struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(info->net);
 	struct ctnl_timeout *cur, *tmp;
 	int ret = -ENOENT;
 	char *name;
@@ -334,7 +329,7 @@ static int cttimeout_del_timeout(struct net *net, struct sock *ctnl,
 	if (!cda[CTA_TIMEOUT_NAME]) {
 		list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_list,
 					 head)
-			ctnl_timeout_try_del(net, cur);
+			ctnl_timeout_try_del(info->net, cur);
 
 		return 0;
 	}
@@ -344,7 +339,7 @@ static int cttimeout_del_timeout(struct net *net, struct sock *ctnl,
 		if (strncmp(cur->name, name, CTNL_TIMEOUT_NAME_MAX) != 0)
 			continue;
 
-		ret = ctnl_timeout_try_del(net, cur);
+		ret = ctnl_timeout_try_del(info->net, cur);
 		if (ret < 0)
 			return ret;
 
@@ -353,11 +348,9 @@ static int cttimeout_del_timeout(struct net *net, struct sock *ctnl,
 	return ret;
 }
 
-static int cttimeout_default_set(struct net *net, struct sock *ctnl,
-				 struct sk_buff *skb,
-				 const struct nlmsghdr *nlh,
-				 const struct nlattr * const cda[],
-				 struct netlink_ext_ack *extack)
+static int cttimeout_default_set(struct sk_buff *skb,
+				 const struct nfnl_info *info,
+				 const struct nlattr * const cda[])
 {
 	const struct nf_conntrack_l4proto *l4proto;
 	__u8 l4num;
@@ -377,7 +370,7 @@ static int cttimeout_default_set(struct net *net, struct sock *ctnl,
 		goto err;
 	}
 
-	ret = ctnl_timeout_parse_policy(NULL, l4proto, net,
+	ret = ctnl_timeout_parse_policy(NULL, l4proto, info->net,
 					cda[CTA_TIMEOUT_DATA]);
 	if (ret < 0)
 		goto err;
@@ -427,11 +420,9 @@ nla_put_failure:
 	return -1;
 }
 
-static int cttimeout_default_get(struct net *net, struct sock *ctnl,
-				 struct sk_buff *skb,
-				 const struct nlmsghdr *nlh,
-				 const struct nlattr * const cda[],
-				 struct netlink_ext_ack *extack)
+static int cttimeout_default_get(struct sk_buff *skb,
+				 const struct nfnl_info *info,
+				 const struct nlattr * const cda[])
 {
 	const struct nf_conntrack_l4proto *l4proto;
 	unsigned int *timeouts = NULL;
@@ -453,35 +444,35 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl,
 
 	switch (l4proto->l4proto) {
 	case IPPROTO_ICMP:
-		timeouts = &nf_icmp_pernet(net)->timeout;
+		timeouts = &nf_icmp_pernet(info->net)->timeout;
 		break;
 	case IPPROTO_TCP:
-		timeouts = nf_tcp_pernet(net)->timeouts;
+		timeouts = nf_tcp_pernet(info->net)->timeouts;
 		break;
 	case IPPROTO_UDP:
 	case IPPROTO_UDPLITE:
-		timeouts = nf_udp_pernet(net)->timeouts;
+		timeouts = nf_udp_pernet(info->net)->timeouts;
 		break;
 	case IPPROTO_DCCP:
 #ifdef CONFIG_NF_CT_PROTO_DCCP
-		timeouts = nf_dccp_pernet(net)->dccp_timeout;
+		timeouts = nf_dccp_pernet(info->net)->dccp_timeout;
 #endif
 		break;
 	case IPPROTO_ICMPV6:
-		timeouts = &nf_icmpv6_pernet(net)->timeout;
+		timeouts = &nf_icmpv6_pernet(info->net)->timeout;
 		break;
 	case IPPROTO_SCTP:
 #ifdef CONFIG_NF_CT_PROTO_SCTP
-		timeouts = nf_sctp_pernet(net)->timeouts;
+		timeouts = nf_sctp_pernet(info->net)->timeouts;
 #endif
 		break;
 	case IPPROTO_GRE:
 #ifdef CONFIG_NF_CT_PROTO_GRE
-		timeouts = nf_gre_pernet(net)->timeouts;
+		timeouts = nf_gre_pernet(info->net)->timeouts;
 #endif
 		break;
 	case 255:
-		timeouts = &nf_generic_pernet(net)->timeout;
+		timeouts = &nf_generic_pernet(info->net)->timeout;
 		break;
 	default:
 		WARN_ONCE(1, "Missing timeouts for proto %d", l4proto->l4proto);
@@ -497,9 +488,10 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl,
 		goto err;
 	}
 
-	ret = cttimeout_default_fill_info(net, skb2, NETLINK_CB(skb).portid,
-					  nlh->nlmsg_seq,
-					  NFNL_MSG_TYPE(nlh->nlmsg_type),
+	ret = cttimeout_default_fill_info(info->net, skb2,
+					  NETLINK_CB(skb).portid,
+					  info->nlh->nlmsg_seq,
+					  NFNL_MSG_TYPE(info->nlh->nlmsg_type),
 					  IPCTNL_MSG_TIMEOUT_DEFAULT_SET,
 					  l3num, l4proto, timeouts);
 	if (ret <= 0) {
@@ -507,7 +499,8 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl,
 		err = -ENOMEM;
 		goto err;
 	}
-	ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
+	ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid,
+			      MSG_DONTWAIT);
 	if (ret > 0)
 		ret = 0;
 
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index d5f458d0ff3d..81630600b4ef 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -845,10 +845,8 @@ static struct notifier_block nfulnl_rtnl_notifier = {
 	.notifier_call	= nfulnl_rcv_nl_event,
 };
 
-static int nfulnl_recv_unsupp(struct net *net, struct sock *ctnl,
-			      struct sk_buff *skb, const struct nlmsghdr *nlh,
-			      const struct nlattr * const nfqa[],
-			      struct netlink_ext_ack *extack)
+static int nfulnl_recv_unsupp(struct sk_buff *skb, const struct nfnl_info *info,
+			      const struct nlattr * const nfula[])
 {
 	return -ENOTSUPP;
 }
@@ -869,18 +867,16 @@ static const struct nla_policy nfula_cfg_policy[NFULA_CFG_MAX+1] = {
 	[NFULA_CFG_FLAGS]	= { .type = NLA_U16 },
 };
 
-static int nfulnl_recv_config(struct net *net, struct sock *ctnl,
-			      struct sk_buff *skb, const struct nlmsghdr *nlh,
-			      const struct nlattr * const nfula[],
-			      struct netlink_ext_ack *extack)
+static int nfulnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info,
+			      const struct nlattr * const nfula[])
 {
-	struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	struct nfnl_log_net *log = nfnl_log_pernet(info->net);
+	struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
 	u_int16_t group_num = ntohs(nfmsg->res_id);
-	struct nfulnl_instance *inst;
 	struct nfulnl_msg_config_cmd *cmd = NULL;
-	struct nfnl_log_net *log = nfnl_log_pernet(net);
-	int ret = 0;
+	struct nfulnl_instance *inst;
 	u16 flags = 0;
+	int ret = 0;
 
 	if (nfula[NFULA_CFG_CMD]) {
 		u_int8_t pf = nfmsg->nfgen_family;
@@ -889,9 +885,9 @@ static int nfulnl_recv_config(struct net *net, struct sock *ctnl,
 		/* Commands without queue context */
 		switch (cmd->command) {
 		case NFULNL_CFG_CMD_PF_BIND:
-			return nf_log_bind_pf(net, pf, &nfulnl_logger);
+			return nf_log_bind_pf(info->net, pf, &nfulnl_logger);
 		case NFULNL_CFG_CMD_PF_UNBIND:
-			nf_log_unbind_pf(net, pf);
+			nf_log_unbind_pf(info->net, pf);
 			return 0;
 		}
 	}
@@ -932,7 +928,7 @@ static int nfulnl_recv_config(struct net *net, struct sock *ctnl,
 				goto out_put;
 			}
 
-			inst = instance_create(net, group_num,
+			inst = instance_create(info->net, group_num,
 					       NETLINK_CB(skb).portid,
 					       sk_user_ns(NETLINK_CB(skb).sk));
 			if (IS_ERR(inst)) {
diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c
index 916a3c7f9eaf..1fd537ef4496 100644
--- a/net/netfilter/nfnetlink_osf.c
+++ b/net/netfilter/nfnetlink_osf.c
@@ -292,10 +292,9 @@ static const struct nla_policy nfnl_osf_policy[OSF_ATTR_MAX + 1] = {
 	[OSF_ATTR_FINGER]	= { .len = sizeof(struct nf_osf_user_finger) },
 };
 
-static int nfnl_osf_add_callback(struct net *net, struct sock *ctnl,
-				 struct sk_buff *skb, const struct nlmsghdr *nlh,
-				 const struct nlattr * const osf_attrs[],
-				 struct netlink_ext_ack *extack)
+static int nfnl_osf_add_callback(struct sk_buff *skb,
+				 const struct nfnl_info *info,
+				 const struct nlattr * const osf_attrs[])
 {
 	struct nf_osf_user_finger *f;
 	struct nf_osf_finger *kf = NULL, *sf;
@@ -307,7 +306,7 @@ static int nfnl_osf_add_callback(struct net *net, struct sock *ctnl,
 	if (!osf_attrs[OSF_ATTR_FINGER])
 		return -EINVAL;
 
-	if (!(nlh->nlmsg_flags & NLM_F_CREATE))
+	if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
 		return -EINVAL;
 
 	f = nla_data(osf_attrs[OSF_ATTR_FINGER]);
@@ -325,7 +324,7 @@ static int nfnl_osf_add_callback(struct net *net, struct sock *ctnl,
 		kfree(kf);
 		kf = NULL;
 
-		if (nlh->nlmsg_flags & NLM_F_EXCL)
+		if (info->nlh->nlmsg_flags & NLM_F_EXCL)
 			err = -EEXIST;
 		break;
 	}
@@ -339,11 +338,9 @@ static int nfnl_osf_add_callback(struct net *net, struct sock *ctnl,
 	return err;
 }
 
-static int nfnl_osf_remove_callback(struct net *net, struct sock *ctnl,
-				    struct sk_buff *skb,
-				    const struct nlmsghdr *nlh,
-				    const struct nlattr * const osf_attrs[],
-				    struct netlink_ext_ack *extack)
+static int nfnl_osf_remove_callback(struct sk_buff *skb,
+				    const struct nfnl_info *info,
+				    const struct nlattr * const osf_attrs[])
 {
 	struct nf_osf_user_finger *f;
 	struct nf_osf_finger *sf;
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 37e81d895e61..9d7e06d85199 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -1245,16 +1245,14 @@ static const struct nf_queue_handler nfqh = {
 	.nf_hook_drop	= nfqnl_nf_hook_drop,
 };
 
-static int nfqnl_recv_config(struct net *net, struct sock *ctnl,
-			     struct sk_buff *skb, const struct nlmsghdr *nlh,
-			     const struct nlattr * const nfqa[],
-			     struct netlink_ext_ack *extack)
+static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info,
+			     const struct nlattr * const nfqa[])
 {
-	struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	struct nfnl_queue_net *q = nfnl_queue_pernet(info->net);
+	struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
 	u_int16_t queue_num = ntohs(nfmsg->res_id);
-	struct nfqnl_instance *queue;
 	struct nfqnl_msg_config_cmd *cmd = NULL;
-	struct nfnl_queue_net *q = nfnl_queue_pernet(net);
+	struct nfqnl_instance *queue;
 	__u32 flags = 0, mask = 0;
 	int ret = 0;
 
-- 
cgit v1.2.3


From 797d49805ddc6595b2fafe3e9ceff7f562be1f2c Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 23 Apr 2021 00:17:10 +0200
Subject: netfilter: nfnetlink: pass struct nfnl_info to rcu callbacks

Update rcu callbacks to use the nfnl_info structure.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nfnetlink.h |   6 +-
 net/netfilter/nf_tables_api.c       | 152 ++++++++++++++++++------------------
 net/netfilter/nfnetlink.c           |   5 +-
 net/netfilter/nfnetlink_queue.c     |  40 ++++------
 net/netfilter/nft_compat.c          |  24 +++---
 5 files changed, 107 insertions(+), 120 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h
index 1baa3205b199..c11f2f99eac4 100644
--- a/include/linux/netfilter/nfnetlink.h
+++ b/include/linux/netfilter/nfnetlink.h
@@ -17,10 +17,8 @@ struct nfnl_info {
 struct nfnl_callback {
 	int (*call)(struct sk_buff *skb, const struct nfnl_info *info,
 		    const struct nlattr * const cda[]);
-	int (*call_rcu)(struct net *net, struct sock *nl, struct sk_buff *skb,
-			const struct nlmsghdr *nlh,
-			const struct nlattr * const cda[],
-			struct netlink_ext_ack *extack);
+	int (*call_rcu)(struct sk_buff *skb, const struct nfnl_info *info,
+			const struct nlattr * const cda[]);
 	int (*call_batch)(struct net *net, struct sock *nl, struct sk_buff *skb,
 			  const struct nlmsghdr *nlh,
 			  const struct nlattr * const cda[],
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 155b85553fcc..f7c4e6f14130 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -858,25 +858,25 @@ static int nft_netlink_dump_start_rcu(struct sock *nlsk, struct sk_buff *skb,
 }
 
 /* called with rcu_read_lock held */
-static int nf_tables_gettable(struct net *net, struct sock *nlsk,
-			      struct sk_buff *skb, const struct nlmsghdr *nlh,
-			      const struct nlattr * const nla[],
-			      struct netlink_ext_ack *extack)
+static int nf_tables_gettable(struct sk_buff *skb, const struct nfnl_info *info,
+			      const struct nlattr * const nla[])
 {
-	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
-	u8 genmask = nft_genmask_cur(net);
+	const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
+	struct netlink_ext_ack *extack = info->extack;
+	u8 genmask = nft_genmask_cur(info->net);
+	int family = nfmsg->nfgen_family;
 	const struct nft_table *table;
+	struct net *net = info->net;
 	struct sk_buff *skb2;
-	int family = nfmsg->nfgen_family;
 	int err;
 
-	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
 		struct netlink_dump_control c = {
 			.dump = nf_tables_dump_tables,
 			.module = THIS_MODULE,
 		};
 
-		return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
+		return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
 	}
 
 	table = nft_table_lookup(net, nla[NFTA_TABLE_NAME], family, genmask, 0);
@@ -890,8 +890,8 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk,
 		return -ENOMEM;
 
 	err = nf_tables_fill_table_info(skb2, net, NETLINK_CB(skb).portid,
-					nlh->nlmsg_seq, NFT_MSG_NEWTABLE, 0,
-					family, table);
+					info->nlh->nlmsg_seq, NFT_MSG_NEWTABLE,
+					0, family, table);
 	if (err < 0)
 		goto err_fill_table_info;
 
@@ -1623,26 +1623,26 @@ done:
 }
 
 /* called with rcu_read_lock held */
-static int nf_tables_getchain(struct net *net, struct sock *nlsk,
-			      struct sk_buff *skb, const struct nlmsghdr *nlh,
-			      const struct nlattr * const nla[],
-			      struct netlink_ext_ack *extack)
+static int nf_tables_getchain(struct sk_buff *skb, const struct nfnl_info *info,
+			      const struct nlattr * const nla[])
 {
-	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
-	u8 genmask = nft_genmask_cur(net);
+	const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
+	struct netlink_ext_ack *extack = info->extack;
+	u8 genmask = nft_genmask_cur(info->net);
+	int family = nfmsg->nfgen_family;
 	const struct nft_chain *chain;
+	struct net *net = info->net;
 	struct nft_table *table;
 	struct sk_buff *skb2;
-	int family = nfmsg->nfgen_family;
 	int err;
 
-	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
 		struct netlink_dump_control c = {
 			.dump = nf_tables_dump_chains,
 			.module = THIS_MODULE,
 		};
 
-		return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
+		return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
 	}
 
 	table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask, 0);
@@ -1662,8 +1662,8 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk,
 		return -ENOMEM;
 
 	err = nf_tables_fill_chain_info(skb2, net, NETLINK_CB(skb).portid,
-					nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, 0,
-					family, table, chain);
+					info->nlh->nlmsg_seq, NFT_MSG_NEWCHAIN,
+					0, family, table, chain);
 	if (err < 0)
 		goto err_fill_chain_info;
 
@@ -3076,21 +3076,21 @@ static int nf_tables_dump_rules_done(struct netlink_callback *cb)
 }
 
 /* called with rcu_read_lock held */
-static int nf_tables_getrule(struct net *net, struct sock *nlsk,
-			     struct sk_buff *skb, const struct nlmsghdr *nlh,
-			     const struct nlattr * const nla[],
-			     struct netlink_ext_ack *extack)
+static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info,
+			     const struct nlattr * const nla[])
 {
-	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
-	u8 genmask = nft_genmask_cur(net);
+	const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
+	struct netlink_ext_ack *extack = info->extack;
+	u8 genmask = nft_genmask_cur(info->net);
+	int family = nfmsg->nfgen_family;
 	const struct nft_chain *chain;
 	const struct nft_rule *rule;
+	struct net *net = info->net;
 	struct nft_table *table;
 	struct sk_buff *skb2;
-	int family = nfmsg->nfgen_family;
 	int err;
 
-	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
 		struct netlink_dump_control c = {
 			.start= nf_tables_dump_rules_start,
 			.dump = nf_tables_dump_rules,
@@ -3099,7 +3099,7 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
 			.data = (void *)nla,
 		};
 
-		return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
+		return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
 	}
 
 	table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask, 0);
@@ -3125,7 +3125,7 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
 		return -ENOMEM;
 
 	err = nf_tables_fill_rule_info(skb2, net, NETLINK_CB(skb).portid,
-				       nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0,
+				       info->nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0,
 				       family, table, chain, rule, NULL);
 	if (err < 0)
 		goto err_fill_rule_info;
@@ -4045,25 +4045,25 @@ static int nf_tables_dump_sets_done(struct netlink_callback *cb)
 }
 
 /* called with rcu_read_lock held */
-static int nf_tables_getset(struct net *net, struct sock *nlsk,
-			    struct sk_buff *skb, const struct nlmsghdr *nlh,
-			    const struct nlattr * const nla[],
-			    struct netlink_ext_ack *extack)
+static int nf_tables_getset(struct sk_buff *skb, const struct nfnl_info *info,
+			    const struct nlattr * const nla[])
 {
-	u8 genmask = nft_genmask_cur(net);
+	const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
+	struct netlink_ext_ack *extack = info->extack;
+	u8 genmask = nft_genmask_cur(info->net);
+	struct net *net = info->net;
 	const struct nft_set *set;
-	struct nft_ctx ctx;
 	struct sk_buff *skb2;
-	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	struct nft_ctx ctx;
 	int err;
 
 	/* Verify existence before starting dump */
-	err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, extack,
+	err = nft_ctx_init_from_setattr(&ctx, net, skb, info->nlh, nla, extack,
 					genmask, 0);
 	if (err < 0)
 		return err;
 
-	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
 		struct netlink_dump_control c = {
 			.start = nf_tables_dump_sets_start,
 			.dump = nf_tables_dump_sets,
@@ -4072,7 +4072,7 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk,
 			.module = THIS_MODULE,
 		};
 
-		return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
+		return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
 	}
 
 	/* Only accept unspec with dump */
@@ -5063,18 +5063,19 @@ err_fill_setelem:
 }
 
 /* called with rcu_read_lock held */
-static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
-				struct sk_buff *skb, const struct nlmsghdr *nlh,
-				const struct nlattr * const nla[],
-				struct netlink_ext_ack *extack)
+static int nf_tables_getsetelem(struct sk_buff *skb,
+				const struct nfnl_info *info,
+				const struct nlattr * const nla[])
 {
-	u8 genmask = nft_genmask_cur(net);
+	struct netlink_ext_ack *extack = info->extack;
+	u8 genmask = nft_genmask_cur(info->net);
+	struct net *net = info->net;
 	struct nft_set *set;
 	struct nlattr *attr;
 	struct nft_ctx ctx;
 	int rem, err = 0;
 
-	err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack,
+	err = nft_ctx_init_from_elemattr(&ctx, net, skb, info->nlh, nla, extack,
 					 genmask, NETLINK_CB(skb).portid);
 	if (err < 0)
 		return err;
@@ -5083,7 +5084,7 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
 	if (IS_ERR(set))
 		return PTR_ERR(set);
 
-	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
 		struct netlink_dump_control c = {
 			.start = nf_tables_dump_set_start,
 			.dump = nf_tables_dump_set,
@@ -5096,7 +5097,7 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
 		};
 
 		c.data = &dump_ctx;
-		return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
+		return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
 	}
 
 	if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS])
@@ -6416,22 +6417,22 @@ static int nf_tables_dump_obj_done(struct netlink_callback *cb)
 }
 
 /* called with rcu_read_lock held */
-static int nf_tables_getobj(struct net *net, struct sock *nlsk,
-			    struct sk_buff *skb, const struct nlmsghdr *nlh,
-			    const struct nlattr * const nla[],
-			    struct netlink_ext_ack *extack)
+static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info,
+			    const struct nlattr * const nla[])
 {
-	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
-	u8 genmask = nft_genmask_cur(net);
+	const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
+	struct netlink_ext_ack *extack = info->extack;
+	u8 genmask = nft_genmask_cur(info->net);
 	int family = nfmsg->nfgen_family;
 	const struct nft_table *table;
+	struct net *net = info->net;
 	struct nft_object *obj;
 	struct sk_buff *skb2;
 	bool reset = false;
 	u32 objtype;
 	int err;
 
-	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
 		struct netlink_dump_control c = {
 			.start = nf_tables_dump_obj_start,
 			.dump = nf_tables_dump_obj,
@@ -6440,7 +6441,7 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk,
 			.data = (void *)nla,
 		};
 
-		return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
+		return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
 	}
 
 	if (!nla[NFTA_OBJ_NAME] ||
@@ -6464,7 +6465,7 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk,
 	if (!skb2)
 		return -ENOMEM;
 
-	if (NFNL_MSG_TYPE(nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET)
+	if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET)
 		reset = true;
 
 	if (reset) {
@@ -6483,7 +6484,7 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk,
 	}
 
 	err = nf_tables_fill_obj_info(skb2, net, NETLINK_CB(skb).portid,
-				      nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0,
+				      info->nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0,
 				      family, table, obj, reset);
 	if (err < 0)
 		goto err_fill_obj_info;
@@ -7320,21 +7321,20 @@ static int nf_tables_dump_flowtable_done(struct netlink_callback *cb)
 }
 
 /* called with rcu_read_lock held */
-static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
-				  struct sk_buff *skb,
-				  const struct nlmsghdr *nlh,
-				  const struct nlattr * const nla[],
-				  struct netlink_ext_ack *extack)
+static int nf_tables_getflowtable(struct sk_buff *skb,
+				  const struct nfnl_info *info,
+				  const struct nlattr * const nla[])
 {
-	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
-	u8 genmask = nft_genmask_cur(net);
+	const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
+	u8 genmask = nft_genmask_cur(info->net);
 	int family = nfmsg->nfgen_family;
 	struct nft_flowtable *flowtable;
 	const struct nft_table *table;
+	struct net *net = info->net;
 	struct sk_buff *skb2;
 	int err;
 
-	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
 		struct netlink_dump_control c = {
 			.start = nf_tables_dump_flowtable_start,
 			.dump = nf_tables_dump_flowtable,
@@ -7343,7 +7343,7 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
 			.data = (void *)nla,
 		};
 
-		return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
+		return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
 	}
 
 	if (!nla[NFTA_FLOWTABLE_NAME])
@@ -7364,7 +7364,7 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
 		return -ENOMEM;
 
 	err = nf_tables_fill_flowtable_info(skb2, net, NETLINK_CB(skb).portid,
-					    nlh->nlmsg_seq,
+					    info->nlh->nlmsg_seq,
 					    NFT_MSG_NEWFLOWTABLE, 0, family,
 					    flowtable, &flowtable->hook_list);
 	if (err < 0)
@@ -7526,10 +7526,8 @@ err:
 			  -ENOBUFS);
 }
 
-static int nf_tables_getgen(struct net *net, struct sock *nlsk,
-			    struct sk_buff *skb, const struct nlmsghdr *nlh,
-			    const struct nlattr * const nla[],
-			    struct netlink_ext_ack *extack)
+static int nf_tables_getgen(struct sk_buff *skb, const struct nfnl_info *info,
+			    const struct nlattr * const nla[])
 {
 	struct sk_buff *skb2;
 	int err;
@@ -7538,12 +7536,12 @@ static int nf_tables_getgen(struct net *net, struct sock *nlsk,
 	if (skb2 == NULL)
 		return -ENOMEM;
 
-	err = nf_tables_fill_gen_info(skb2, net, NETLINK_CB(skb).portid,
-				      nlh->nlmsg_seq);
+	err = nf_tables_fill_gen_info(skb2, info->net, NETLINK_CB(skb).portid,
+				      info->nlh->nlmsg_seq);
 	if (err < 0)
 		goto err_fill_gen_info;
 
-	return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
+	return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
 
 err_fill_gen_info:
 	kfree_skb(skb2);
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 5f04b67bf47e..7920f6c4ff69 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -274,9 +274,8 @@ replay:
 		}
 
 		if (nc->call_rcu) {
-			err = nc->call_rcu(net, nfnlnet->nfnl, skb, nlh,
-					   (const struct nlattr **)cda,
-					   extack);
+			err = nc->call_rcu(skb, &info,
+					   (const struct nlattr **)cda);
 			rcu_read_unlock();
 		} else {
 			rcu_read_unlock();
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 9d7e06d85199..ede9252c8de1 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -1046,20 +1046,18 @@ static int nfq_id_after(unsigned int id, unsigned int max)
 	return (int)(id - max) > 0;
 }
 
-static int nfqnl_recv_verdict_batch(struct net *net, struct sock *ctnl,
-				    struct sk_buff *skb,
-				    const struct nlmsghdr *nlh,
-			            const struct nlattr * const nfqa[],
-				    struct netlink_ext_ack *extack)
+static int nfqnl_recv_verdict_batch(struct sk_buff *skb,
+				    const struct nfnl_info *info,
+				    const struct nlattr * const nfqa[])
 {
-	struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	struct nfnl_queue_net *q = nfnl_queue_pernet(info->net);
+	struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
+	u16 queue_num = ntohs(nfmsg->res_id);
 	struct nf_queue_entry *entry, *tmp;
-	unsigned int verdict, maxid;
 	struct nfqnl_msg_verdict_hdr *vhdr;
 	struct nfqnl_instance *queue;
+	unsigned int verdict, maxid;
 	LIST_HEAD(batch_list);
-	u16 queue_num = ntohs(nfmsg->res_id);
-	struct nfnl_queue_net *q = nfnl_queue_pernet(net);
 
 	queue = verdict_instance_lookup(q, queue_num,
 					NETLINK_CB(skb).portid);
@@ -1158,22 +1156,19 @@ static int nfqa_parse_bridge(struct nf_queue_entry *entry,
 	return 0;
 }
 
-static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl,
-			      struct sk_buff *skb,
-			      const struct nlmsghdr *nlh,
-			      const struct nlattr * const nfqa[],
-			      struct netlink_ext_ack *extack)
+static int nfqnl_recv_verdict(struct sk_buff *skb, const struct nfnl_info *info,
+			      const struct nlattr * const nfqa[])
 {
-	struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	struct nfnl_queue_net *q = nfnl_queue_pernet(info->net);
+	struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
 	u_int16_t queue_num = ntohs(nfmsg->res_id);
 	struct nfqnl_msg_verdict_hdr *vhdr;
+	enum ip_conntrack_info ctinfo;
 	struct nfqnl_instance *queue;
-	unsigned int verdict;
 	struct nf_queue_entry *entry;
-	enum ip_conntrack_info ctinfo;
 	struct nfnl_ct_hook *nfnl_ct;
 	struct nf_conn *ct = NULL;
-	struct nfnl_queue_net *q = nfnl_queue_pernet(net);
+	unsigned int verdict;
 	int err;
 
 	queue = verdict_instance_lookup(q, queue_num,
@@ -1196,7 +1191,8 @@ static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl,
 
 	if (nfqa[NFQA_CT]) {
 		if (nfnl_ct != NULL)
-			ct = nfqnl_ct_parse(nfnl_ct, nlh, nfqa, entry, &ctinfo);
+			ct = nfqnl_ct_parse(nfnl_ct, info->nlh, nfqa, entry,
+					    &ctinfo);
 	}
 
 	if (entry->state.pf == PF_BRIDGE) {
@@ -1224,10 +1220,8 @@ static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl,
 	return 0;
 }
 
-static int nfqnl_recv_unsupp(struct net *net, struct sock *ctnl,
-			     struct sk_buff *skb, const struct nlmsghdr *nlh,
-			     const struct nlattr * const nfqa[],
-			     struct netlink_ext_ack *extack)
+static int nfqnl_recv_unsupp(struct sk_buff *skb, const struct nfnl_info *info,
+			     const struct nlattr * const cda[])
 {
 	return -ENOTSUPP;
 }
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index b8dbd20a6a4c..4c0657245d5a 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -613,17 +613,15 @@ nla_put_failure:
 	return -1;
 }
 
-static int nfnl_compat_get_rcu(struct net *net, struct sock *nfnl,
-			       struct sk_buff *skb, const struct nlmsghdr *nlh,
-			       const struct nlattr * const tb[],
-			       struct netlink_ext_ack *extack)
+static int nfnl_compat_get_rcu(struct sk_buff *skb,
+			       const struct nfnl_info *info,
+			       const struct nlattr * const tb[])
 {
-	int ret = 0, target;
 	struct nfgenmsg *nfmsg;
-	const char *fmt;
-	const char *name;
-	u32 rev;
+	const char *name, *fmt;
 	struct sk_buff *skb2;
+	int ret = 0, target;
+	u32 rev;
 
 	if (tb[NFTA_COMPAT_NAME] == NULL ||
 	    tb[NFTA_COMPAT_REV] == NULL ||
@@ -634,7 +632,7 @@ static int nfnl_compat_get_rcu(struct net *net, struct sock *nfnl,
 	rev = ntohl(nla_get_be32(tb[NFTA_COMPAT_REV]));
 	target = ntohl(nla_get_be32(tb[NFTA_COMPAT_TYPE]));
 
-	nfmsg = nlmsg_data(nlh);
+	nfmsg = nlmsg_data(info->nlh);
 
 	switch(nfmsg->nfgen_family) {
 	case AF_INET:
@@ -673,8 +671,8 @@ static int nfnl_compat_get_rcu(struct net *net, struct sock *nfnl,
 
 	/* include the best revision for this extension in the message */
 	if (nfnl_compat_fill_info(skb2, NETLINK_CB(skb).portid,
-				  nlh->nlmsg_seq,
-				  NFNL_MSG_TYPE(nlh->nlmsg_type),
+				  info->nlh->nlmsg_seq,
+				  NFNL_MSG_TYPE(info->nlh->nlmsg_type),
 				  NFNL_MSG_COMPAT_GET,
 				  nfmsg->nfgen_family,
 				  name, ret, target) <= 0) {
@@ -682,8 +680,8 @@ static int nfnl_compat_get_rcu(struct net *net, struct sock *nfnl,
 		goto out_put;
 	}
 
-	ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid,
-				MSG_DONTWAIT);
+	ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid,
+			      MSG_DONTWAIT);
 	if (ret > 0)
 		ret = 0;
 out_put:
-- 
cgit v1.2.3


From 7dab8ee3b6e7ec856a616d07ebb9ebd736c92520 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 23 Apr 2021 00:17:11 +0200
Subject: netfilter: nfnetlink: pass struct nfnl_info to batch callbacks

Update batch callbacks to use the nfnl_info structure. Rename the
struct nft_expr_info variables previously called info to expr_info,
since that name now clashes with the new nfnl_info parameter.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nfnetlink.h |   6 +-
 net/netfilter/nf_tables_api.c       | 338 ++++++++++++++++++------------------
 net/netfilter/nfnetlink.c           |  14 +-
 3 files changed, 182 insertions(+), 176 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h
index c11f2f99eac4..df0e3254c57b 100644
--- a/include/linux/netfilter/nfnetlink.h
+++ b/include/linux/netfilter/nfnetlink.h
@@ -19,10 +19,8 @@ struct nfnl_callback {
 		    const struct nlattr * const cda[]);
 	int (*call_rcu)(struct sk_buff *skb, const struct nfnl_info *info,
 			const struct nlattr * const cda[]);
-	int (*call_batch)(struct net *net, struct sock *nl, struct sk_buff *skb,
-			  const struct nlmsghdr *nlh,
-			  const struct nlattr * const cda[],
-			  struct netlink_ext_ack *extack);
+	int (*call_batch)(struct sk_buff *skb, const struct nfnl_info *info,
+			  const struct nlattr * const cda[]);
 	const struct nla_policy *policy;	/* netlink attribute policy */
 	const u_int16_t attr_count;		/* number of nlattr's */
 };
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index f7c4e6f14130..280ca136df56 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1055,15 +1055,15 @@ static int nft_objname_hash_cmp(struct rhashtable_compare_arg *arg,
 	return strcmp(obj->key.name, k->name);
 }
 
-static int nf_tables_newtable(struct net *net, struct sock *nlsk,
-			      struct sk_buff *skb, const struct nlmsghdr *nlh,
-			      const struct nlattr * const nla[],
-			      struct netlink_ext_ack *extack)
+static int nf_tables_newtable(struct sk_buff *skb, const struct nfnl_info *info,
+			      const struct nlattr * const nla[])
 {
-	struct nftables_pernet *nft_net = nft_pernet(net);
-	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
-	u8 genmask = nft_genmask_next(net);
+	struct nftables_pernet *nft_net = nft_pernet(info->net);
+	const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
+	struct netlink_ext_ack *extack = info->extack;
+	u8 genmask = nft_genmask_next(info->net);
 	int family = nfmsg->nfgen_family;
+	struct net *net = info->net;
 	const struct nlattr *attr;
 	struct nft_table *table;
 	struct nft_ctx ctx;
@@ -1078,14 +1078,15 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
 		if (PTR_ERR(table) != -ENOENT)
 			return PTR_ERR(table);
 	} else {
-		if (nlh->nlmsg_flags & NLM_F_EXCL) {
+		if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
 			NL_SET_BAD_ATTR(extack, attr);
 			return -EEXIST;
 		}
-		if (nlh->nlmsg_flags & NLM_F_REPLACE)
+		if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
 			return -EOPNOTSUPP;
 
-		nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
+		nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
+
 		return nf_tables_updtable(&ctx);
 	}
 
@@ -1126,7 +1127,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
 	if (table->flags & NFT_TABLE_F_OWNER)
 		table->nlpid = NETLINK_CB(skb).portid;
 
-	nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
+	nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
 	err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE);
 	if (err < 0)
 		goto err_trans;
@@ -1250,19 +1251,19 @@ out:
 	return err;
 }
 
-static int nf_tables_deltable(struct net *net, struct sock *nlsk,
-			      struct sk_buff *skb, const struct nlmsghdr *nlh,
-			      const struct nlattr * const nla[],
-			      struct netlink_ext_ack *extack)
+static int nf_tables_deltable(struct sk_buff *skb, const struct nfnl_info *info,
+			      const struct nlattr * const nla[])
 {
-	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
-	u8 genmask = nft_genmask_next(net);
+	const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
+	struct netlink_ext_ack *extack = info->extack;
+	u8 genmask = nft_genmask_next(info->net);
 	int family = nfmsg->nfgen_family;
+	struct net *net = info->net;
 	const struct nlattr *attr;
 	struct nft_table *table;
 	struct nft_ctx ctx;
 
-	nft_ctx_init(&ctx, net, skb, nlh, 0, NULL, NULL, nla);
+	nft_ctx_init(&ctx, net, skb, info->nlh, 0, NULL, NULL, nla);
 	if (family == AF_UNSPEC ||
 	    (!nla[NFTA_TABLE_NAME] && !nla[NFTA_TABLE_HANDLE]))
 		return nft_flush(&ctx, family);
@@ -1281,7 +1282,7 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk,
 		return PTR_ERR(table);
 	}
 
-	if (nlh->nlmsg_flags & NLM_F_NONREC &&
+	if (info->nlh->nlmsg_flags & NLM_F_NONREC &&
 	    table->use > 0)
 		return -EBUSY;
 
@@ -2350,16 +2351,16 @@ static struct nft_chain *nft_chain_lookup_byid(const struct net *net,
 	return ERR_PTR(-ENOENT);
 }
 
-static int nf_tables_newchain(struct net *net, struct sock *nlsk,
-			      struct sk_buff *skb, const struct nlmsghdr *nlh,
-			      const struct nlattr * const nla[],
-			      struct netlink_ext_ack *extack)
+static int nf_tables_newchain(struct sk_buff *skb, const struct nfnl_info *info,
+			      const struct nlattr * const nla[])
 {
-	struct nftables_pernet *nft_net = nft_pernet(net);
-	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
-	u8 genmask = nft_genmask_next(net);
+	struct nftables_pernet *nft_net = nft_pernet(info->net);
+	const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
+	struct netlink_ext_ack *extack = info->extack;
+	u8 genmask = nft_genmask_next(info->net);
 	int family = nfmsg->nfgen_family;
 	struct nft_chain *chain = NULL;
+	struct net *net = info->net;
 	const struct nlattr *attr;
 	struct nft_table *table;
 	u8 policy = NF_ACCEPT;
@@ -2431,14 +2432,14 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
 	if (flags & ~NFT_CHAIN_FLAGS)
 		return -EOPNOTSUPP;
 
-	nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla);
+	nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla);
 
 	if (chain != NULL) {
-		if (nlh->nlmsg_flags & NLM_F_EXCL) {
+		if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
 			NL_SET_BAD_ATTR(extack, attr);
 			return -EEXIST;
 		}
-		if (nlh->nlmsg_flags & NLM_F_REPLACE)
+		if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
 			return -EOPNOTSUPP;
 
 		flags |= chain->flags & NFT_CHAIN_BASE;
@@ -2449,14 +2450,14 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
 	return nf_tables_addchain(&ctx, family, genmask, policy, flags);
 }
 
-static int nf_tables_delchain(struct net *net, struct sock *nlsk,
-			      struct sk_buff *skb, const struct nlmsghdr *nlh,
-			      const struct nlattr * const nla[],
-			      struct netlink_ext_ack *extack)
+static int nf_tables_delchain(struct sk_buff *skb, const struct nfnl_info *info,
+			      const struct nlattr * const nla[])
 {
-	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
-	u8 genmask = nft_genmask_next(net);
+	const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
+	struct netlink_ext_ack *extack = info->extack;
+	u8 genmask = nft_genmask_next(info->net);
 	int family = nfmsg->nfgen_family;
+	struct net *net = info->net;
 	const struct nlattr *attr;
 	struct nft_table *table;
 	struct nft_chain *chain;
@@ -2486,11 +2487,11 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk,
 		return PTR_ERR(chain);
 	}
 
-	if (nlh->nlmsg_flags & NLM_F_NONREC &&
+	if (info->nlh->nlmsg_flags & NLM_F_NONREC &&
 	    chain->use > 0)
 		return -EBUSY;
 
-	nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla);
+	nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla);
 
 	use = chain->use;
 	list_for_each_entry(rule, &chain->rules, list) {
@@ -2713,15 +2714,15 @@ err1:
 }
 
 static int nf_tables_newexpr(const struct nft_ctx *ctx,
-			     const struct nft_expr_info *info,
+			     const struct nft_expr_info *expr_info,
 			     struct nft_expr *expr)
 {
-	const struct nft_expr_ops *ops = info->ops;
+	const struct nft_expr_ops *ops = expr_info->ops;
 	int err;
 
 	expr->ops = ops;
 	if (ops->init) {
-		err = ops->init(ctx, expr, (const struct nlattr **)info->tb);
+		err = ops->init(ctx, expr, (const struct nlattr **)expr_info->tb);
 		if (err < 0)
 			goto err1;
 	}
@@ -2745,21 +2746,21 @@ static void nf_tables_expr_destroy(const struct nft_ctx *ctx,
 static struct nft_expr *nft_expr_init(const struct nft_ctx *ctx,
 				      const struct nlattr *nla)
 {
-	struct nft_expr_info info;
+	struct nft_expr_info expr_info;
 	struct nft_expr *expr;
 	struct module *owner;
 	int err;
 
-	err = nf_tables_expr_parse(ctx, nla, &info);
+	err = nf_tables_expr_parse(ctx, nla, &expr_info);
 	if (err < 0)
 		goto err1;
 
 	err = -ENOMEM;
-	expr = kzalloc(info.ops->size, GFP_KERNEL);
+	expr = kzalloc(expr_info.ops->size, GFP_KERNEL);
 	if (expr == NULL)
 		goto err2;
 
-	err = nf_tables_newexpr(ctx, &info, expr);
+	err = nf_tables_newexpr(ctx, &expr_info, expr);
 	if (err < 0)
 		goto err3;
 
@@ -2767,9 +2768,9 @@ static struct nft_expr *nft_expr_init(const struct nft_ctx *ctx,
 err3:
 	kfree(expr);
 err2:
-	owner = info.ops->type->owner;
-	if (info.ops->type->release_ops)
-		info.ops->type->release_ops(info.ops);
+	owner = expr_info.ops->type->owner;
+	if (expr_info.ops->type->release_ops)
+		expr_info.ops->type->release_ops(expr_info.ops);
 
 	module_put(owner);
 err1:
@@ -3216,28 +3217,28 @@ static struct nft_rule *nft_rule_lookup_byid(const struct net *net,
 
 #define NFT_RULE_MAXEXPRS	128
 
-static int nf_tables_newrule(struct net *net, struct sock *nlsk,
-			     struct sk_buff *skb, const struct nlmsghdr *nlh,
-			     const struct nlattr * const nla[],
-			     struct netlink_ext_ack *extack)
+static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info,
+			     const struct nlattr * const nla[])
 {
-	struct nftables_pernet *nft_net = nft_pernet(net);
-	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
-	u8 genmask = nft_genmask_next(net);
-	struct nft_expr_info *info = NULL;
+	struct nftables_pernet *nft_net = nft_pernet(info->net);
+	const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
+	struct netlink_ext_ack *extack = info->extack;
+	unsigned int size, i, n, ulen = 0, usize = 0;
+	u8 genmask = nft_genmask_next(info->net);
+	struct nft_rule *rule, *old_rule = NULL;
+	struct nft_expr_info *expr_info = NULL;
 	int family = nfmsg->nfgen_family;
+	struct net *net = info->net;
 	struct nft_flow_rule *flow;
+	struct nft_userdata *udata;
 	struct nft_table *table;
 	struct nft_chain *chain;
-	struct nft_rule *rule, *old_rule = NULL;
-	struct nft_userdata *udata;
-	struct nft_trans *trans = NULL;
+	struct nft_trans *trans;
+	u64 handle, pos_handle;
 	struct nft_expr *expr;
 	struct nft_ctx ctx;
 	struct nlattr *tmp;
-	unsigned int size, i, n, ulen = 0, usize = 0;
 	int err, rem;
-	u64 handle, pos_handle;
 
 	lockdep_assert_held(&nft_net->commit_mutex);
 
@@ -3276,17 +3277,17 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
 			return PTR_ERR(rule);
 		}
 
-		if (nlh->nlmsg_flags & NLM_F_EXCL) {
+		if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
 			NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
 			return -EEXIST;
 		}
-		if (nlh->nlmsg_flags & NLM_F_REPLACE)
+		if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
 			old_rule = rule;
 		else
 			return -EOPNOTSUPP;
 	} else {
-		if (!(nlh->nlmsg_flags & NLM_F_CREATE) ||
-		    nlh->nlmsg_flags & NLM_F_REPLACE)
+		if (!(info->nlh->nlmsg_flags & NLM_F_CREATE) ||
+		    info->nlh->nlmsg_flags & NLM_F_REPLACE)
 			return -EINVAL;
 		handle = nf_tables_alloc_handle(table);
 
@@ -3309,15 +3310,15 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
 		}
 	}
 
-	nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla);
+	nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla);
 
 	n = 0;
 	size = 0;
 	if (nla[NFTA_RULE_EXPRESSIONS]) {
-		info = kvmalloc_array(NFT_RULE_MAXEXPRS,
-				      sizeof(struct nft_expr_info),
-				      GFP_KERNEL);
-		if (!info)
+		expr_info = kvmalloc_array(NFT_RULE_MAXEXPRS,
+					   sizeof(struct nft_expr_info),
+					   GFP_KERNEL);
+		if (!expr_info)
 			return -ENOMEM;
 
 		nla_for_each_nested(tmp, nla[NFTA_RULE_EXPRESSIONS], rem) {
@@ -3326,10 +3327,10 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
 				goto err1;
 			if (n == NFT_RULE_MAXEXPRS)
 				goto err1;
-			err = nf_tables_expr_parse(&ctx, tmp, &info[n]);
+			err = nf_tables_expr_parse(&ctx, tmp, &expr_info[n]);
 			if (err < 0)
 				goto err1;
-			size += info[n].ops->size;
+			size += expr_info[n].ops->size;
 			n++;
 		}
 	}
@@ -3363,20 +3364,20 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
 
 	expr = nft_expr_first(rule);
 	for (i = 0; i < n; i++) {
-		err = nf_tables_newexpr(&ctx, &info[i], expr);
+		err = nf_tables_newexpr(&ctx, &expr_info[i], expr);
 		if (err < 0) {
-			NL_SET_BAD_ATTR(extack, info[i].attr);
+			NL_SET_BAD_ATTR(extack, expr_info[i].attr);
 			goto err2;
 		}
 
-		if (info[i].ops->validate)
+		if (expr_info[i].ops->validate)
 			nft_validate_state_update(net, NFT_VALIDATE_NEED);
 
-		info[i].ops = NULL;
+		expr_info[i].ops = NULL;
 		expr = nft_expr_next(expr);
 	}
 
-	if (nlh->nlmsg_flags & NLM_F_REPLACE) {
+	if (info->nlh->nlmsg_flags & NLM_F_REPLACE) {
 		trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule);
 		if (trans == NULL) {
 			err = -ENOMEM;
@@ -3396,7 +3397,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
 			goto err2;
 		}
 
-		if (nlh->nlmsg_flags & NLM_F_APPEND) {
+		if (info->nlh->nlmsg_flags & NLM_F_APPEND) {
 			if (old_rule)
 				list_add_rcu(&rule->list, &old_rule->list);
 			else
@@ -3408,7 +3409,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
 				list_add_rcu(&rule->list, &chain->rules);
 		}
 	}
-	kvfree(info);
+	kvfree(expr_info);
 	chain->use++;
 
 	if (nft_net->validate_state == NFT_VALIDATE_DO)
@@ -3427,13 +3428,14 @@ err2:
 	nf_tables_rule_release(&ctx, rule);
 err1:
 	for (i = 0; i < n; i++) {
-		if (info[i].ops) {
-			module_put(info[i].ops->type->owner);
-			if (info[i].ops->type->release_ops)
-				info[i].ops->type->release_ops(info[i].ops);
+		if (expr_info[i].ops) {
+			module_put(expr_info[i].ops->type->owner);
+			if (expr_info[i].ops->type->release_ops)
+				expr_info[i].ops->type->release_ops(expr_info[i].ops);
 		}
 	}
-	kvfree(info);
+	kvfree(expr_info);
+
 	return err;
 }
 
@@ -3454,17 +3456,17 @@ static struct nft_rule *nft_rule_lookup_byid(const struct net *net,
 	return ERR_PTR(-ENOENT);
 }
 
-static int nf_tables_delrule(struct net *net, struct sock *nlsk,
-			     struct sk_buff *skb, const struct nlmsghdr *nlh,
-			     const struct nlattr * const nla[],
-			     struct netlink_ext_ack *extack)
+static int nf_tables_delrule(struct sk_buff *skb, const struct nfnl_info *info,
+			     const struct nlattr * const nla[])
 {
-	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
-	u8 genmask = nft_genmask_next(net);
-	struct nft_table *table;
+	const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
+	struct netlink_ext_ack *extack = info->extack;
+	int family = nfmsg->nfgen_family, err = 0;
+	u8 genmask = nft_genmask_next(info->net);
 	struct nft_chain *chain = NULL;
+	struct net *net = info->net;
+	struct nft_table *table;
 	struct nft_rule *rule;
-	int family = nfmsg->nfgen_family, err = 0;
 	struct nft_ctx ctx;
 
 	table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask,
@@ -3485,7 +3487,7 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk,
 			return -EOPNOTSUPP;
 	}
 
-	nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla);
+	nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla);
 
 	if (chain) {
 		if (nla[NFTA_RULE_HANDLE]) {
@@ -4166,28 +4168,27 @@ static int nf_tables_set_desc_parse(struct nft_set_desc *desc,
 	return err;
 }
 
-static int nf_tables_newset(struct net *net, struct sock *nlsk,
-			    struct sk_buff *skb, const struct nlmsghdr *nlh,
-			    const struct nlattr * const nla[],
-			    struct netlink_ext_ack *extack)
+static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info,
+			    const struct nlattr * const nla[])
 {
-	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
-	u8 genmask = nft_genmask_next(net);
+	const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
+	u32 ktype, dtype, flags, policy, gc_int, objtype;
+	struct netlink_ext_ack *extack = info->extack;
+	u8 genmask = nft_genmask_next(info->net);
 	int family = nfmsg->nfgen_family;
 	const struct nft_set_ops *ops;
 	struct nft_expr *expr = NULL;
+	struct net *net = info->net;
+	struct nft_set_desc desc;
 	struct nft_table *table;
+	unsigned char *udata;
 	struct nft_set *set;
 	struct nft_ctx ctx;
-	char *name;
-	u64 size;
 	u64 timeout;
-	u32 ktype, dtype, flags, policy, gc_int, objtype;
-	struct nft_set_desc desc;
-	unsigned char *udata;
+	char *name;
+	int err, i;
 	u16 udlen;
-	int err;
-	int i;
+	u64 size;
 
 	if (nla[NFTA_SET_TABLE] == NULL ||
 	    nla[NFTA_SET_NAME] == NULL ||
@@ -4295,7 +4296,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
 		return PTR_ERR(table);
 	}
 
-	nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
+	nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
 
 	set = nft_set_lookup(table, nla[NFTA_SET_NAME], genmask);
 	if (IS_ERR(set)) {
@@ -4304,17 +4305,17 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
 			return PTR_ERR(set);
 		}
 	} else {
-		if (nlh->nlmsg_flags & NLM_F_EXCL) {
+		if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
 			NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]);
 			return -EEXIST;
 		}
-		if (nlh->nlmsg_flags & NLM_F_REPLACE)
+		if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
 			return -EOPNOTSUPP;
 
 		return 0;
 	}
 
-	if (!(nlh->nlmsg_flags & NLM_F_CREATE))
+	if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
 		return -ENOENT;
 
 	ops = nft_select_set_ops(&ctx, nla, &desc, policy);
@@ -4448,13 +4449,13 @@ static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set)
 	kvfree(set);
 }
 
-static int nf_tables_delset(struct net *net, struct sock *nlsk,
-			    struct sk_buff *skb, const struct nlmsghdr *nlh,
-			    const struct nlattr * const nla[],
-			    struct netlink_ext_ack *extack)
+static int nf_tables_delset(struct sk_buff *skb, const struct nfnl_info *info,
+			    const struct nlattr * const nla[])
 {
-	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
-	u8 genmask = nft_genmask_next(net);
+	const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
+	struct netlink_ext_ack *extack = info->extack;
+	u8 genmask = nft_genmask_next(info->net);
+	struct net *net = info->net;
 	const struct nlattr *attr;
 	struct nft_set *set;
 	struct nft_ctx ctx;
@@ -4465,7 +4466,7 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk,
 	if (nla[NFTA_SET_TABLE] == NULL)
 		return -EINVAL;
 
-	err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, extack,
+	err = nft_ctx_init_from_setattr(&ctx, net, skb, info->nlh, nla, extack,
 					genmask, NETLINK_CB(skb).portid);
 	if (err < 0)
 		return err;
@@ -4483,7 +4484,8 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk,
 		return PTR_ERR(set);
 	}
 	if (set->use ||
-	    (nlh->nlmsg_flags & NLM_F_NONREC && atomic_read(&set->nelems) > 0)) {
+	    (info->nlh->nlmsg_flags & NLM_F_NONREC &&
+	     atomic_read(&set->nelems) > 0)) {
 		NL_SET_BAD_ATTR(extack, attr);
 		return -EBUSY;
 	}
@@ -5654,13 +5656,14 @@ err_set_elem_expr_clone:
 	return err;
 }
 
-static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
-				struct sk_buff *skb, const struct nlmsghdr *nlh,
-				const struct nlattr * const nla[],
-				struct netlink_ext_ack *extack)
+static int nf_tables_newsetelem(struct sk_buff *skb,
+				const struct nfnl_info *info,
+				const struct nlattr * const nla[])
 {
-	struct nftables_pernet *nft_net = nft_pernet(net);
-	u8 genmask = nft_genmask_next(net);
+	struct nftables_pernet *nft_net = nft_pernet(info->net);
+	struct netlink_ext_ack *extack = info->extack;
+	u8 genmask = nft_genmask_next(info->net);
+	struct net *net = info->net;
 	const struct nlattr *attr;
 	struct nft_set *set;
 	struct nft_ctx ctx;
@@ -5669,7 +5672,7 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
 	if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL)
 		return -EINVAL;
 
-	err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack,
+	err = nft_ctx_init_from_elemattr(&ctx, net, skb, info->nlh, nla, extack,
 					 genmask, NETLINK_CB(skb).portid);
 	if (err < 0)
 		return err;
@@ -5683,7 +5686,7 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
 		return -EBUSY;
 
 	nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
-		err = nft_add_set_elem(&ctx, set, attr, nlh->nlmsg_flags);
+		err = nft_add_set_elem(&ctx, set, attr, info->nlh->nlmsg_flags);
 		if (err < 0)
 			return err;
 	}
@@ -5866,18 +5869,19 @@ err1:
 	return err;
 }
 
-static int nf_tables_delsetelem(struct net *net, struct sock *nlsk,
-				struct sk_buff *skb, const struct nlmsghdr *nlh,
-				const struct nlattr * const nla[],
-				struct netlink_ext_ack *extack)
+static int nf_tables_delsetelem(struct sk_buff *skb,
+				const struct nfnl_info *info,
+				const struct nlattr * const nla[])
 {
-	u8 genmask = nft_genmask_next(net);
+	struct netlink_ext_ack *extack = info->extack;
+	u8 genmask = nft_genmask_next(info->net);
+	struct net *net = info->net;
 	const struct nlattr *attr;
 	struct nft_set *set;
 	struct nft_ctx ctx;
 	int rem, err = 0;
 
-	err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack,
+	err = nft_ctx_init_from_elemattr(&ctx, net, skb, info->nlh, nla, extack,
 					 genmask, NETLINK_CB(skb).portid);
 	if (err < 0)
 		return err;
@@ -6161,15 +6165,15 @@ err_free_trans:
 	return err;
 }
 
-static int nf_tables_newobj(struct net *net, struct sock *nlsk,
-			    struct sk_buff *skb, const struct nlmsghdr *nlh,
-			    const struct nlattr * const nla[],
-			    struct netlink_ext_ack *extack)
+static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info,
+			    const struct nlattr * const nla[])
 {
-	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
+	struct netlink_ext_ack *extack = info->extack;
+	u8 genmask = nft_genmask_next(info->net);
 	const struct nft_object_type *type;
-	u8 genmask = nft_genmask_next(net);
 	int family = nfmsg->nfgen_family;
+	struct net *net = info->net;
 	struct nft_table *table;
 	struct nft_object *obj;
 	struct nft_ctx ctx;
@@ -6197,20 +6201,20 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk,
 			return err;
 		}
 	} else {
-		if (nlh->nlmsg_flags & NLM_F_EXCL) {
+		if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
 			NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]);
 			return -EEXIST;
 		}
-		if (nlh->nlmsg_flags & NLM_F_REPLACE)
+		if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
 			return -EOPNOTSUPP;
 
 		type = __nft_obj_type_get(objtype);
-		nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
+		nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
 
 		return nf_tables_updobj(&ctx, type, nla[NFTA_OBJ_DATA], obj);
 	}
 
-	nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
+	nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
 
 	type = nft_obj_type_get(net, objtype);
 	if (IS_ERR(type))
@@ -6507,14 +6511,14 @@ static void nft_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj)
 	kfree(obj);
 }
 
-static int nf_tables_delobj(struct net *net, struct sock *nlsk,
-			    struct sk_buff *skb, const struct nlmsghdr *nlh,
-			    const struct nlattr * const nla[],
-			    struct netlink_ext_ack *extack)
+static int nf_tables_delobj(struct sk_buff *skb, const struct nfnl_info *info,
+			    const struct nlattr * const nla[])
 {
-	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
-	u8 genmask = nft_genmask_next(net);
+	const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
+	struct netlink_ext_ack *extack = info->extack;
+	u8 genmask = nft_genmask_next(info->net);
 	int family = nfmsg->nfgen_family;
+	struct net *net = info->net;
 	const struct nlattr *attr;
 	struct nft_table *table;
 	struct nft_object *obj;
@@ -6550,7 +6554,7 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk,
 		return -EBUSY;
 	}
 
-	nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
+	nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
 
 	return nft_delobj(&ctx, obj);
 }
@@ -6937,19 +6941,19 @@ err_flowtable_update_hook:
 
 }
 
-static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
-				  struct sk_buff *skb,
-				  const struct nlmsghdr *nlh,
-				  const struct nlattr * const nla[],
-				  struct netlink_ext_ack *extack)
+static int nf_tables_newflowtable(struct sk_buff *skb,
+				  const struct nfnl_info *info,
+				  const struct nlattr * const nla[])
 {
-	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
+	struct netlink_ext_ack *extack = info->extack;
 	struct nft_flowtable_hook flowtable_hook;
+	u8 genmask = nft_genmask_next(info->net);
 	const struct nf_flowtable_type *type;
-	u8 genmask = nft_genmask_next(net);
 	int family = nfmsg->nfgen_family;
 	struct nft_flowtable *flowtable;
 	struct nft_hook *hook, *next;
+	struct net *net = info->net;
 	struct nft_table *table;
 	struct nft_ctx ctx;
 	int err;
@@ -6975,17 +6979,17 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
 			return err;
 		}
 	} else {
-		if (nlh->nlmsg_flags & NLM_F_EXCL) {
+		if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
 			NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]);
 			return -EEXIST;
 		}
 
-		nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
+		nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
 
-		return nft_flowtable_update(&ctx, nlh, flowtable);
+		return nft_flowtable_update(&ctx, info->nlh, flowtable);
 	}
 
-	nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
+	nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
 
 	flowtable = kzalloc(sizeof(*flowtable), GFP_KERNEL);
 	if (!flowtable)
@@ -7126,16 +7130,16 @@ err_flowtable_del_hook:
 	return err;
 }
 
-static int nf_tables_delflowtable(struct net *net, struct sock *nlsk,
-				  struct sk_buff *skb,
-				  const struct nlmsghdr *nlh,
-				  const struct nlattr * const nla[],
-				  struct netlink_ext_ack *extack)
+static int nf_tables_delflowtable(struct sk_buff *skb,
+				  const struct nfnl_info *info,
+				  const struct nlattr * const nla[])
 {
-	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
-	u8 genmask = nft_genmask_next(net);
+	const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh);
+	struct netlink_ext_ack *extack = info->extack;
+	u8 genmask = nft_genmask_next(info->net);
 	int family = nfmsg->nfgen_family;
 	struct nft_flowtable *flowtable;
+	struct net *net = info->net;
 	const struct nlattr *attr;
 	struct nft_table *table;
 	struct nft_ctx ctx;
@@ -7165,7 +7169,7 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk,
 		return PTR_ERR(flowtable);
 	}
 
-	nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
+	nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
 
 	if (nla[NFTA_FLOWTABLE_HOOK])
 		return nft_delflowtable_hook(&ctx, flowtable);
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 7920f6c4ff69..e62c5af4b631 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -469,10 +469,17 @@ replay_abort:
 
 		{
 			int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
+			struct nfnl_net *nfnlnet = nfnl_pernet(net);
 			u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
 			struct nlattr *cda[NFNL_MAX_ATTR_COUNT + 1];
 			struct nlattr *attr = (void *)nlh + min_len;
 			int attrlen = nlh->nlmsg_len - min_len;
+			struct nfnl_info info = {
+				.net	= net,
+				.sk	= nfnlnet->nfnl,
+				.nlh	= nlh,
+				.extack	= &extack,
+			};
 
 			/* Sanity-check NFTA_MAX_ATTR */
 			if (ss->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT) {
@@ -488,11 +495,8 @@ replay_abort:
 				goto ack;
 
 			if (nc->call_batch) {
-				struct nfnl_net *nfnlnet = nfnl_pernet(net);
-
-				err = nc->call_batch(net, nfnlnet->nfnl, skb, nlh,
-						     (const struct nlattr **)cda,
-						     &extack);
+				err = nc->call_batch(skb, &info,
+						     (const struct nlattr **)cda);
 			}
 
 			/* The lock was released to autoload some module, we
-- 
cgit v1.2.3


From 50f2db9e368f73ecbbaa92da365183fa953aaba7 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 23 Apr 2021 00:17:12 +0200
Subject: netfilter: nfnetlink: consolidate callback types

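Add enum nfnl_callback_type to identify the callback type, so that struct
nfnl_callback provides one single .call callback plus a .type field instead
of the separate .call, .call_rcu and .call_batch callbacks.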

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nfnetlink.h  | 16 ++++---
 net/netfilter/ipset/ip_set_core.c    | 16 +++++++
 net/netfilter/nf_conntrack_netlink.c | 88 +++++++++++++++++++++++++-----------
 net/netfilter/nf_tables_api.c        | 69 ++++++++++++++++++----------
 net/netfilter/nfnetlink.c            | 37 +++++++++------
 net/netfilter/nfnetlink_acct.c       | 36 ++++++++++-----
 net/netfilter/nfnetlink_cthelper.c   | 27 +++++++----
 net/netfilter/nfnetlink_cttimeout.c  | 45 ++++++++++++------
 net/netfilter/nfnetlink_log.c        | 16 +++++--
 net/netfilter/nfnetlink_osf.c        |  2 +
 net/netfilter/nfnetlink_queue.c      | 34 +++++++++-----
 net/netfilter/nft_compat.c           |  9 ++--
 12 files changed, 271 insertions(+), 124 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h
index df0e3254c57b..515ce53aa20d 100644
--- a/include/linux/netfilter/nfnetlink.h
+++ b/include/linux/netfilter/nfnetlink.h
@@ -14,15 +14,19 @@ struct nfnl_info {
 	struct netlink_ext_ack	*extack;
 };
 
+enum nfnl_callback_type {
+	NFNL_CB_UNSPEC	= 0,
+	NFNL_CB_MUTEX,
+	NFNL_CB_RCU,
+	NFNL_CB_BATCH,
+};
+
 struct nfnl_callback {
 	int (*call)(struct sk_buff *skb, const struct nfnl_info *info,
 		    const struct nlattr * const cda[]);
-	int (*call_rcu)(struct sk_buff *skb, const struct nfnl_info *info,
-			const struct nlattr * const cda[]);
-	int (*call_batch)(struct sk_buff *skb, const struct nfnl_info *info,
-			  const struct nlattr * const cda[]);
-	const struct nla_policy *policy;	/* netlink attribute policy */
-	const u_int16_t attr_count;		/* number of nlattr's */
+	const struct nla_policy	*policy;
+	enum nfnl_callback_type	type;
+	__u16			attr_count;
 };
 
 enum nfnl_abort_action {
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index bf9902c1daa8..de2d20c37cda 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -2108,80 +2108,96 @@ nlmsg_failure:
 static const struct nfnl_callback ip_set_netlink_subsys_cb[IPSET_MSG_MAX] = {
 	[IPSET_CMD_NONE]	= {
 		.call		= ip_set_none,
+		.type		= NFNL_CB_MUTEX,
 		.attr_count	= IPSET_ATTR_CMD_MAX,
 	},
 	[IPSET_CMD_CREATE]	= {
 		.call		= ip_set_create,
+		.type		= NFNL_CB_MUTEX,
 		.attr_count	= IPSET_ATTR_CMD_MAX,
 		.policy		= ip_set_create_policy,
 	},
 	[IPSET_CMD_DESTROY]	= {
 		.call		= ip_set_destroy,
+		.type		= NFNL_CB_MUTEX,
 		.attr_count	= IPSET_ATTR_CMD_MAX,
 		.policy		= ip_set_setname_policy,
 	},
 	[IPSET_CMD_FLUSH]	= {
 		.call		= ip_set_flush,
+		.type		= NFNL_CB_MUTEX,
 		.attr_count	= IPSET_ATTR_CMD_MAX,
 		.policy		= ip_set_setname_policy,
 	},
 	[IPSET_CMD_RENAME]	= {
 		.call		= ip_set_rename,
+		.type		= NFNL_CB_MUTEX,
 		.attr_count	= IPSET_ATTR_CMD_MAX,
 		.policy		= ip_set_setname2_policy,
 	},
 	[IPSET_CMD_SWAP]	= {
 		.call		= ip_set_swap,
+		.type		= NFNL_CB_MUTEX,
 		.attr_count	= IPSET_ATTR_CMD_MAX,
 		.policy		= ip_set_setname2_policy,
 	},
 	[IPSET_CMD_LIST]	= {
 		.call		= ip_set_dump,
+		.type		= NFNL_CB_MUTEX,
 		.attr_count	= IPSET_ATTR_CMD_MAX,
 		.policy		= ip_set_dump_policy,
 	},
 	[IPSET_CMD_SAVE]	= {
 		.call		= ip_set_dump,
+		.type		= NFNL_CB_MUTEX,
 		.attr_count	= IPSET_ATTR_CMD_MAX,
 		.policy		= ip_set_setname_policy,
 	},
 	[IPSET_CMD_ADD]	= {
 		.call		= ip_set_uadd,
+		.type		= NFNL_CB_MUTEX,
 		.attr_count	= IPSET_ATTR_CMD_MAX,
 		.policy		= ip_set_adt_policy,
 	},
 	[IPSET_CMD_DEL]	= {
 		.call		= ip_set_udel,
+		.type		= NFNL_CB_MUTEX,
 		.attr_count	= IPSET_ATTR_CMD_MAX,
 		.policy		= ip_set_adt_policy,
 	},
 	[IPSET_CMD_TEST]	= {
 		.call		= ip_set_utest,
+		.type		= NFNL_CB_MUTEX,
 		.attr_count	= IPSET_ATTR_CMD_MAX,
 		.policy		= ip_set_adt_policy,
 	},
 	[IPSET_CMD_HEADER]	= {
 		.call		= ip_set_header,
+		.type		= NFNL_CB_MUTEX,
 		.attr_count	= IPSET_ATTR_CMD_MAX,
 		.policy		= ip_set_setname_policy,
 	},
 	[IPSET_CMD_TYPE]	= {
 		.call		= ip_set_type,
+		.type		= NFNL_CB_MUTEX,
 		.attr_count	= IPSET_ATTR_CMD_MAX,
 		.policy		= ip_set_type_policy,
 	},
 	[IPSET_CMD_PROTOCOL]	= {
 		.call		= ip_set_protocol,
+		.type		= NFNL_CB_MUTEX,
 		.attr_count	= IPSET_ATTR_CMD_MAX,
 		.policy		= ip_set_protocol_policy,
 	},
 	[IPSET_CMD_GET_BYNAME]	= {
 		.call		= ip_set_byname,
+		.type		= NFNL_CB_MUTEX,
 		.attr_count	= IPSET_ATTR_CMD_MAX,
 		.policy		= ip_set_setname_policy,
 	},
 	[IPSET_CMD_GET_BYINDEX]	= {
 		.call		= ip_set_byindex,
+		.type		= NFNL_CB_MUTEX,
 		.attr_count	= IPSET_ATTR_CMD_MAX,
 		.policy		= ip_set_index_policy,
 	},
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 5147a63b3d1b..8690fc07030f 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -3751,35 +3751,71 @@ static struct nf_exp_event_notifier ctnl_notifier_exp = {
 #endif
 
 static const struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = {
-	[IPCTNL_MSG_CT_NEW]		= { .call = ctnetlink_new_conntrack,
-					    .attr_count = CTA_MAX,
-					    .policy = ct_nla_policy },
-	[IPCTNL_MSG_CT_GET] 		= { .call = ctnetlink_get_conntrack,
-					    .attr_count = CTA_MAX,
-					    .policy = ct_nla_policy },
-	[IPCTNL_MSG_CT_DELETE]  	= { .call = ctnetlink_del_conntrack,
-					    .attr_count = CTA_MAX,
-					    .policy = ct_nla_policy },
-	[IPCTNL_MSG_CT_GET_CTRZERO] 	= { .call = ctnetlink_get_conntrack,
-					    .attr_count = CTA_MAX,
-					    .policy = ct_nla_policy },
-	[IPCTNL_MSG_CT_GET_STATS_CPU]	= { .call = ctnetlink_stat_ct_cpu },
-	[IPCTNL_MSG_CT_GET_STATS]	= { .call = ctnetlink_stat_ct },
-	[IPCTNL_MSG_CT_GET_DYING]	= { .call = ctnetlink_get_ct_dying },
-	[IPCTNL_MSG_CT_GET_UNCONFIRMED]	= { .call = ctnetlink_get_ct_unconfirmed },
+	[IPCTNL_MSG_CT_NEW]	= {
+		.call		= ctnetlink_new_conntrack,
+		.type		= NFNL_CB_MUTEX,
+		.attr_count	= CTA_MAX,
+		.policy		= ct_nla_policy
+	},
+	[IPCTNL_MSG_CT_GET]	= {
+		.call		= ctnetlink_get_conntrack,
+		.type		= NFNL_CB_MUTEX,
+		.attr_count	= CTA_MAX,
+		.policy		= ct_nla_policy
+	},
+	[IPCTNL_MSG_CT_DELETE]	= {
+		.call		= ctnetlink_del_conntrack,
+		.type		= NFNL_CB_MUTEX,
+		.attr_count	= CTA_MAX,
+		.policy		= ct_nla_policy
+	},
+	[IPCTNL_MSG_CT_GET_CTRZERO] = {
+		.call		= ctnetlink_get_conntrack,
+		.type		= NFNL_CB_MUTEX,
+		.attr_count	= CTA_MAX,
+		.policy		= ct_nla_policy
+	},
+	[IPCTNL_MSG_CT_GET_STATS_CPU] = {
+		.call		= ctnetlink_stat_ct_cpu,
+		.type		= NFNL_CB_MUTEX,
+	},
+	[IPCTNL_MSG_CT_GET_STATS] = {
+		.call		= ctnetlink_stat_ct,
+		.type		= NFNL_CB_MUTEX,
+	},
+	[IPCTNL_MSG_CT_GET_DYING] = {
+		.call		= ctnetlink_get_ct_dying,
+		.type		= NFNL_CB_MUTEX,
+	},
+	[IPCTNL_MSG_CT_GET_UNCONFIRMED]	= {
+		.call		= ctnetlink_get_ct_unconfirmed,
+		.type		= NFNL_CB_MUTEX,
+	},
 };
 
 static const struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = {
-	[IPCTNL_MSG_EXP_GET]		= { .call = ctnetlink_get_expect,
-					    .attr_count = CTA_EXPECT_MAX,
-					    .policy = exp_nla_policy },
-	[IPCTNL_MSG_EXP_NEW]		= { .call = ctnetlink_new_expect,
-					    .attr_count = CTA_EXPECT_MAX,
-					    .policy = exp_nla_policy },
-	[IPCTNL_MSG_EXP_DELETE]		= { .call = ctnetlink_del_expect,
-					    .attr_count = CTA_EXPECT_MAX,
-					    .policy = exp_nla_policy },
-	[IPCTNL_MSG_EXP_GET_STATS_CPU]	= { .call = ctnetlink_stat_exp_cpu },
+	[IPCTNL_MSG_EXP_GET] = {
+		.call		= ctnetlink_get_expect,
+		.type		= NFNL_CB_MUTEX,
+		.attr_count	= CTA_EXPECT_MAX,
+		.policy		= exp_nla_policy
+	},
+	[IPCTNL_MSG_EXP_NEW] = {
+		.call		= ctnetlink_new_expect,
+		.type		= NFNL_CB_MUTEX,
+		.attr_count	= CTA_EXPECT_MAX,
+		.policy		= exp_nla_policy
+	},
+	[IPCTNL_MSG_EXP_DELETE] = {
+		.call		= ctnetlink_del_expect,
+		.type		= NFNL_CB_MUTEX,
+		.attr_count	= CTA_EXPECT_MAX,
+		.policy		= exp_nla_policy
+	},
+	[IPCTNL_MSG_EXP_GET_STATS_CPU] = {
+		.call		= ctnetlink_stat_exp_cpu,
+		.type		= NFNL_CB_MUTEX,
+	},
 };
 
 static const struct nfnetlink_subsystem ctnl_subsys = {
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 280ca136df56..1050f23c0d29 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -7554,115 +7554,138 @@ err_fill_gen_info:
 
 static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
 	[NFT_MSG_NEWTABLE] = {
-		.call_batch	= nf_tables_newtable,
+		.call		= nf_tables_newtable,
+		.type		= NFNL_CB_BATCH,
 		.attr_count	= NFTA_TABLE_MAX,
 		.policy		= nft_table_policy,
 	},
 	[NFT_MSG_GETTABLE] = {
-		.call_rcu	= nf_tables_gettable,
+		.call		= nf_tables_gettable,
+		.type		= NFNL_CB_RCU,
 		.attr_count	= NFTA_TABLE_MAX,
 		.policy		= nft_table_policy,
 	},
 	[NFT_MSG_DELTABLE] = {
-		.call_batch	= nf_tables_deltable,
+		.call		= nf_tables_deltable,
+		.type		= NFNL_CB_BATCH,
 		.attr_count	= NFTA_TABLE_MAX,
 		.policy		= nft_table_policy,
 	},
 	[NFT_MSG_NEWCHAIN] = {
-		.call_batch	= nf_tables_newchain,
+		.call		= nf_tables_newchain,
+		.type		= NFNL_CB_BATCH,
 		.attr_count	= NFTA_CHAIN_MAX,
 		.policy		= nft_chain_policy,
 	},
 	[NFT_MSG_GETCHAIN] = {
-		.call_rcu	= nf_tables_getchain,
+		.call		= nf_tables_getchain,
+		.type		= NFNL_CB_RCU,
 		.attr_count	= NFTA_CHAIN_MAX,
 		.policy		= nft_chain_policy,
 	},
 	[NFT_MSG_DELCHAIN] = {
-		.call_batch	= nf_tables_delchain,
+		.call		= nf_tables_delchain,
+		.type		= NFNL_CB_BATCH,
 		.attr_count	= NFTA_CHAIN_MAX,
 		.policy		= nft_chain_policy,
 	},
 	[NFT_MSG_NEWRULE] = {
-		.call_batch	= nf_tables_newrule,
+		.call		= nf_tables_newrule,
+		.type		= NFNL_CB_BATCH,
 		.attr_count	= NFTA_RULE_MAX,
 		.policy		= nft_rule_policy,
 	},
 	[NFT_MSG_GETRULE] = {
-		.call_rcu	= nf_tables_getrule,
+		.call		= nf_tables_getrule,
+		.type		= NFNL_CB_RCU,
 		.attr_count	= NFTA_RULE_MAX,
 		.policy		= nft_rule_policy,
 	},
 	[NFT_MSG_DELRULE] = {
-		.call_batch	= nf_tables_delrule,
+		.call		= nf_tables_delrule,
+		.type		= NFNL_CB_BATCH,
 		.attr_count	= NFTA_RULE_MAX,
 		.policy		= nft_rule_policy,
 	},
 	[NFT_MSG_NEWSET] = {
-		.call_batch	= nf_tables_newset,
+		.call		= nf_tables_newset,
+		.type		= NFNL_CB_BATCH,
 		.attr_count	= NFTA_SET_MAX,
 		.policy		= nft_set_policy,
 	},
 	[NFT_MSG_GETSET] = {
-		.call_rcu	= nf_tables_getset,
+		.call		= nf_tables_getset,
+		.type		= NFNL_CB_RCU,
 		.attr_count	= NFTA_SET_MAX,
 		.policy		= nft_set_policy,
 	},
 	[NFT_MSG_DELSET] = {
-		.call_batch	= nf_tables_delset,
+		.call		= nf_tables_delset,
+		.type		= NFNL_CB_BATCH,
 		.attr_count	= NFTA_SET_MAX,
 		.policy		= nft_set_policy,
 	},
 	[NFT_MSG_NEWSETELEM] = {
-		.call_batch	= nf_tables_newsetelem,
+		.call		= nf_tables_newsetelem,
+		.type		= NFNL_CB_BATCH,
 		.attr_count	= NFTA_SET_ELEM_LIST_MAX,
 		.policy		= nft_set_elem_list_policy,
 	},
 	[NFT_MSG_GETSETELEM] = {
-		.call_rcu	= nf_tables_getsetelem,
+		.call		= nf_tables_getsetelem,
+		.type		= NFNL_CB_RCU,
 		.attr_count	= NFTA_SET_ELEM_LIST_MAX,
 		.policy		= nft_set_elem_list_policy,
 	},
 	[NFT_MSG_DELSETELEM] = {
-		.call_batch	= nf_tables_delsetelem,
+		.call		= nf_tables_delsetelem,
+		.type		= NFNL_CB_BATCH,
 		.attr_count	= NFTA_SET_ELEM_LIST_MAX,
 		.policy		= nft_set_elem_list_policy,
 	},
 	[NFT_MSG_GETGEN] = {
-		.call_rcu	= nf_tables_getgen,
+		.call		= nf_tables_getgen,
+		.type		= NFNL_CB_RCU,
 	},
 	[NFT_MSG_NEWOBJ] = {
-		.call_batch	= nf_tables_newobj,
+		.call		= nf_tables_newobj,
+		.type		= NFNL_CB_BATCH,
 		.attr_count	= NFTA_OBJ_MAX,
 		.policy		= nft_obj_policy,
 	},
 	[NFT_MSG_GETOBJ] = {
-		.call_rcu	= nf_tables_getobj,
+		.call		= nf_tables_getobj,
+		.type		= NFNL_CB_RCU,
 		.attr_count	= NFTA_OBJ_MAX,
 		.policy		= nft_obj_policy,
 	},
 	[NFT_MSG_DELOBJ] = {
-		.call_batch	= nf_tables_delobj,
+		.call		= nf_tables_delobj,
+		.type		= NFNL_CB_BATCH,
 		.attr_count	= NFTA_OBJ_MAX,
 		.policy		= nft_obj_policy,
 	},
 	[NFT_MSG_GETOBJ_RESET] = {
-		.call_rcu	= nf_tables_getobj,
+		.call		= nf_tables_getobj,
+		.type		= NFNL_CB_RCU,
 		.attr_count	= NFTA_OBJ_MAX,
 		.policy		= nft_obj_policy,
 	},
 	[NFT_MSG_NEWFLOWTABLE] = {
-		.call_batch	= nf_tables_newflowtable,
+		.call		= nf_tables_newflowtable,
+		.type		= NFNL_CB_BATCH,
 		.attr_count	= NFTA_FLOWTABLE_MAX,
 		.policy		= nft_flowtable_policy,
 	},
 	[NFT_MSG_GETFLOWTABLE] = {
-		.call_rcu	= nf_tables_getflowtable,
+		.call		= nf_tables_getflowtable,
+		.type		= NFNL_CB_RCU,
 		.attr_count	= NFTA_FLOWTABLE_MAX,
 		.policy		= nft_flowtable_policy,
 	},
 	[NFT_MSG_DELFLOWTABLE] = {
-		.call_batch	= nf_tables_delflowtable,
+		.call		= nf_tables_delflowtable,
+		.type		= NFNL_CB_BATCH,
 		.attr_count	= NFTA_FLOWTABLE_MAX,
 		.policy		= nft_flowtable_policy,
 	},
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index e62c5af4b631..d7a9628b6cee 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -273,23 +273,30 @@ replay:
 			return err;
 		}
 
-		if (nc->call_rcu) {
-			err = nc->call_rcu(skb, &info,
-					   (const struct nlattr **)cda);
+		if (!nc->call) {
 			rcu_read_unlock();
-		} else {
+			return -EINVAL;
+		}
+
+		switch (nc->type) {
+		case NFNL_CB_RCU:
+			err = nc->call(skb, &info, (const struct nlattr **)cda);
+			rcu_read_unlock();
+			break;
+		case NFNL_CB_MUTEX:
 			rcu_read_unlock();
 			nfnl_lock(subsys_id);
 			if (nfnl_dereference_protected(subsys_id) != ss ||
 			    nfnetlink_find_client(type, ss) != nc) {
 				err = -EAGAIN;
-			} else if (nc->call) {
-				err = nc->call(skb, &info,
-					       (const struct nlattr **)cda);
-			} else {
-				err = -EINVAL;
+				break;
 			}
+			err = nc->call(skb, &info, (const struct nlattr **)cda);
 			nfnl_unlock(subsys_id);
+			break;
+		default:
+			err = -EINVAL;
+			break;
 		}
 		if (err == -EAGAIN)
 			goto replay;
@@ -467,12 +474,17 @@ replay_abort:
 			goto ack;
 		}
 
+		if (nc->type != NFNL_CB_BATCH) {
+			err = -EINVAL;
+			goto ack;
+		}
+
 		{
 			int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
 			struct nfnl_net *nfnlnet = nfnl_pernet(net);
-			u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
 			struct nlattr *cda[NFNL_MAX_ATTR_COUNT + 1];
 			struct nlattr *attr = (void *)nlh + min_len;
+			u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
 			int attrlen = nlh->nlmsg_len - min_len;
 			struct nfnl_info info = {
 				.net	= net,
@@ -494,10 +506,7 @@ replay_abort:
 			if (err < 0)
 				goto ack;
 
-			if (nc->call_batch) {
-				err = nc->call_batch(skb, &info,
-						     (const struct nlattr **)cda);
-			}
+			err = nc->call(skb, &info, (const struct nlattr **)cda);
 
 			/* The lock was released to autoload some module, we
 			 * have to abort and start from scratch using the
diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c
index 9cb4b21b8e95..3c8cf8748cfb 100644
--- a/net/netfilter/nfnetlink_acct.c
+++ b/net/netfilter/nfnetlink_acct.c
@@ -382,18 +382,30 @@ static const struct nla_policy nfnl_acct_policy[NFACCT_MAX+1] = {
 };
 
 static const struct nfnl_callback nfnl_acct_cb[NFNL_MSG_ACCT_MAX] = {
-	[NFNL_MSG_ACCT_NEW]		= { .call = nfnl_acct_new,
-					    .attr_count = NFACCT_MAX,
-					    .policy = nfnl_acct_policy },
-	[NFNL_MSG_ACCT_GET] 		= { .call = nfnl_acct_get,
-					    .attr_count = NFACCT_MAX,
-					    .policy = nfnl_acct_policy },
-	[NFNL_MSG_ACCT_GET_CTRZERO] 	= { .call = nfnl_acct_get,
-					    .attr_count = NFACCT_MAX,
-					    .policy = nfnl_acct_policy },
-	[NFNL_MSG_ACCT_DEL]		= { .call = nfnl_acct_del,
-					    .attr_count = NFACCT_MAX,
-					    .policy = nfnl_acct_policy },
+	[NFNL_MSG_ACCT_NEW] = {
+		.call		= nfnl_acct_new,
+		.type		= NFNL_CB_MUTEX,
+		.attr_count	= NFACCT_MAX,
+		.policy		= nfnl_acct_policy
+	},
+	[NFNL_MSG_ACCT_GET] = {
+		.call		= nfnl_acct_get,
+		.type		= NFNL_CB_MUTEX,
+		.attr_count	= NFACCT_MAX,
+		.policy		= nfnl_acct_policy
+	},
+	[NFNL_MSG_ACCT_GET_CTRZERO] = {
+		.call		= nfnl_acct_get,
+		.type		= NFNL_CB_MUTEX,
+		.attr_count	= NFACCT_MAX,
+		.policy		= nfnl_acct_policy
+	},
+	[NFNL_MSG_ACCT_DEL] = {
+		.call		= nfnl_acct_del,
+		.type		= NFNL_CB_MUTEX,
+		.attr_count	= NFACCT_MAX,
+		.policy		= nfnl_acct_policy
+	},
 };
 
 static const struct nfnetlink_subsystem nfnl_acct_subsys = {
diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c
index 3d1a5215177b..322ac5dd5402 100644
--- a/net/netfilter/nfnetlink_cthelper.c
+++ b/net/netfilter/nfnetlink_cthelper.c
@@ -737,15 +737,24 @@ static const struct nla_policy nfnl_cthelper_policy[NFCTH_MAX+1] = {
 };
 
 static const struct nfnl_callback nfnl_cthelper_cb[NFNL_MSG_CTHELPER_MAX] = {
-	[NFNL_MSG_CTHELPER_NEW]		= { .call = nfnl_cthelper_new,
-					    .attr_count = NFCTH_MAX,
-					    .policy = nfnl_cthelper_policy },
-	[NFNL_MSG_CTHELPER_GET]		= { .call = nfnl_cthelper_get,
-					    .attr_count = NFCTH_MAX,
-					    .policy = nfnl_cthelper_policy },
-	[NFNL_MSG_CTHELPER_DEL]		= { .call = nfnl_cthelper_del,
-					    .attr_count = NFCTH_MAX,
-					    .policy = nfnl_cthelper_policy },
+	[NFNL_MSG_CTHELPER_NEW]	= {
+		.call		= nfnl_cthelper_new,
+		.type		= NFNL_CB_MUTEX,
+		.attr_count	= NFCTH_MAX,
+		.policy		= nfnl_cthelper_policy
+	},
+	[NFNL_MSG_CTHELPER_GET] = {
+		.call		= nfnl_cthelper_get,
+		.type		= NFNL_CB_MUTEX,
+		.attr_count	= NFCTH_MAX,
+		.policy		= nfnl_cthelper_policy
+	},
+	[NFNL_MSG_CTHELPER_DEL]	= {
+		.call		= nfnl_cthelper_del,
+		.type		= NFNL_CB_MUTEX,
+		.attr_count	= NFCTH_MAX,
+		.policy		= nfnl_cthelper_policy
+	},
 };
 
 static const struct nfnetlink_subsystem nfnl_cthelper_subsys = {
diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index 994f3172bf42..38848ad68899 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -546,21 +546,36 @@ static void ctnl_timeout_put(struct nf_ct_timeout *t)
 }
 
 static const struct nfnl_callback cttimeout_cb[IPCTNL_MSG_TIMEOUT_MAX] = {
-	[IPCTNL_MSG_TIMEOUT_NEW]	= { .call = cttimeout_new_timeout,
-					    .attr_count = CTA_TIMEOUT_MAX,
-					    .policy = cttimeout_nla_policy },
-	[IPCTNL_MSG_TIMEOUT_GET]	= { .call = cttimeout_get_timeout,
-					    .attr_count = CTA_TIMEOUT_MAX,
-					    .policy = cttimeout_nla_policy },
-	[IPCTNL_MSG_TIMEOUT_DELETE]	= { .call = cttimeout_del_timeout,
-					    .attr_count = CTA_TIMEOUT_MAX,
-					    .policy = cttimeout_nla_policy },
-	[IPCTNL_MSG_TIMEOUT_DEFAULT_SET]= { .call = cttimeout_default_set,
-					    .attr_count = CTA_TIMEOUT_MAX,
-					    .policy = cttimeout_nla_policy },
-	[IPCTNL_MSG_TIMEOUT_DEFAULT_GET]= { .call = cttimeout_default_get,
-					    .attr_count = CTA_TIMEOUT_MAX,
-					    .policy = cttimeout_nla_policy },
+	[IPCTNL_MSG_TIMEOUT_NEW] = {
+		.call		= cttimeout_new_timeout,
+		.type		= NFNL_CB_MUTEX,
+		.attr_count	= CTA_TIMEOUT_MAX,
+		.policy		= cttimeout_nla_policy
+	},
+	[IPCTNL_MSG_TIMEOUT_GET] = {
+		.call		= cttimeout_get_timeout,
+		.type		= NFNL_CB_MUTEX,
+		.attr_count	= CTA_TIMEOUT_MAX,
+		.policy		= cttimeout_nla_policy
+	},
+	[IPCTNL_MSG_TIMEOUT_DELETE] = {
+		.call		= cttimeout_del_timeout,
+		.type		= NFNL_CB_MUTEX,
+		.attr_count	= CTA_TIMEOUT_MAX,
+		.policy		= cttimeout_nla_policy
+	},
+	[IPCTNL_MSG_TIMEOUT_DEFAULT_SET] = {
+		.call		= cttimeout_default_set,
+		.type		= NFNL_CB_MUTEX,
+		.attr_count	= CTA_TIMEOUT_MAX,
+		.policy		= cttimeout_nla_policy
+	},
+	[IPCTNL_MSG_TIMEOUT_DEFAULT_GET] = {
+		.call		= cttimeout_default_get,
+		.type		= NFNL_CB_MUTEX,
+		.attr_count	= CTA_TIMEOUT_MAX,
+		.policy		= cttimeout_nla_policy
+	},
 };
 
 static const struct nfnetlink_subsystem cttimeout_subsys = {
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 81630600b4ef..587086b18c36 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -989,11 +989,17 @@ out:
 }
 
 static const struct nfnl_callback nfulnl_cb[NFULNL_MSG_MAX] = {
-	[NFULNL_MSG_PACKET]	= { .call = nfulnl_recv_unsupp,
-				    .attr_count = NFULA_MAX, },
-	[NFULNL_MSG_CONFIG]	= { .call = nfulnl_recv_config,
-				    .attr_count = NFULA_CFG_MAX,
-				    .policy = nfula_cfg_policy },
+	[NFULNL_MSG_PACKET]	= {
+		.call		= nfulnl_recv_unsupp,
+		.type		= NFNL_CB_MUTEX,
+		.attr_count	= NFULA_MAX,
+	},
+	[NFULNL_MSG_CONFIG]	= {
+		.call		= nfulnl_recv_config,
+		.type		= NFNL_CB_MUTEX,
+		.attr_count	= NFULA_CFG_MAX,
+		.policy		= nfula_cfg_policy
+	},
 };
 
 static const struct nfnetlink_subsystem nfulnl_subsys = {
diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c
index 1fd537ef4496..e8f8875c6884 100644
--- a/net/netfilter/nfnetlink_osf.c
+++ b/net/netfilter/nfnetlink_osf.c
@@ -374,11 +374,13 @@ static int nfnl_osf_remove_callback(struct sk_buff *skb,
 static const struct nfnl_callback nfnl_osf_callbacks[OSF_MSG_MAX] = {
 	[OSF_MSG_ADD]	= {
 		.call		= nfnl_osf_add_callback,
+		.type		= NFNL_CB_MUTEX,
 		.attr_count	= OSF_ATTR_MAX,
 		.policy		= nfnl_osf_policy,
 	},
 	[OSF_MSG_REMOVE]	= {
 		.call		= nfnl_osf_remove_callback,
+		.type		= NFNL_CB_MUTEX,
 		.attr_count	= OSF_ATTR_MAX,
 		.policy		= nfnl_osf_policy,
 	},
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index ede9252c8de1..f37a575ebd7f 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -1365,17 +1365,29 @@ err_out_unlock:
 }
 
 static const struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = {
-	[NFQNL_MSG_PACKET]	= { .call_rcu = nfqnl_recv_unsupp,
-				    .attr_count = NFQA_MAX, },
-	[NFQNL_MSG_VERDICT]	= { .call_rcu = nfqnl_recv_verdict,
-				    .attr_count = NFQA_MAX,
-				    .policy = nfqa_verdict_policy },
-	[NFQNL_MSG_CONFIG]	= { .call = nfqnl_recv_config,
-				    .attr_count = NFQA_CFG_MAX,
-				    .policy = nfqa_cfg_policy },
-	[NFQNL_MSG_VERDICT_BATCH]={ .call_rcu = nfqnl_recv_verdict_batch,
-				    .attr_count = NFQA_MAX,
-				    .policy = nfqa_verdict_batch_policy },
+	[NFQNL_MSG_PACKET]	= {
+		.call		= nfqnl_recv_unsupp,
+		.type		= NFNL_CB_RCU,
+		.attr_count	= NFQA_MAX,
+	},
+	[NFQNL_MSG_VERDICT]	= {
+		.call		= nfqnl_recv_verdict,
+		.type		= NFNL_CB_RCU,
+		.attr_count	= NFQA_MAX,
+		.policy		= nfqa_verdict_policy
+	},
+	[NFQNL_MSG_CONFIG]	= {
+		.call		= nfqnl_recv_config,
+		.type		= NFNL_CB_MUTEX,
+		.attr_count	= NFQA_CFG_MAX,
+		.policy		= nfqa_cfg_policy
+	},
+	[NFQNL_MSG_VERDICT_BATCH] = {
+		.call		= nfqnl_recv_verdict_batch,
+		.type		= NFNL_CB_RCU,
+		.attr_count	= NFQA_MAX,
+		.policy		= nfqa_verdict_batch_policy
+	},
 };
 
 static const struct nfnetlink_subsystem nfqnl_subsys = {
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 4c0657245d5a..5415ab14400d 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -698,9 +698,12 @@ static const struct nla_policy nfnl_compat_policy_get[NFTA_COMPAT_MAX+1] = {
 };
 
 static const struct nfnl_callback nfnl_nft_compat_cb[NFNL_MSG_COMPAT_MAX] = {
-	[NFNL_MSG_COMPAT_GET]		= { .call_rcu = nfnl_compat_get_rcu,
-					    .attr_count = NFTA_COMPAT_MAX,
-					    .policy = nfnl_compat_policy_get },
+	[NFNL_MSG_COMPAT_GET]	= {
+		.call		= nfnl_compat_get_rcu,
+		.type		= NFNL_CB_RCU,
+		.attr_count	= NFTA_COMPAT_MAX,
+		.policy		= nfnl_compat_policy_get
+	},
 };
 
 static const struct nfnetlink_subsystem nfnl_compat_subsys = {
-- 
cgit v1.2.3


From 47a6959fa331fe892a4fc3b48ca08e92045c6bda Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 26 Apr 2021 12:14:40 +0200
Subject: netfilter: allow to turn off xtables compat layer

The compat layer needs to parse untrusted input (the ruleset)
to translate it to a 64bit compatible format.

We had a number of bugs in this department in the past, so allow users
to turn this feature off.

Add the CONFIG_NETFILTER_XTABLES_COMPAT Kconfig knob and make it default
to y to keep the existing behaviour.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h        | 12 ++++++------
 include/linux/netfilter_arp/arp_tables.h  |  2 +-
 include/linux/netfilter_ipv4/ip_tables.h  |  2 +-
 include/linux/netfilter_ipv6/ip6_tables.h |  2 +-
 net/bridge/netfilter/ebt_limit.c          |  4 ++--
 net/bridge/netfilter/ebt_mark.c           |  4 ++--
 net/bridge/netfilter/ebt_mark_m.c         |  4 ++--
 net/bridge/netfilter/ebtables.c           | 12 ++++++------
 net/ipv4/netfilter/arp_tables.c           | 16 ++++++++--------
 net/ipv4/netfilter/ip_tables.c            | 16 ++++++++--------
 net/ipv4/netfilter/ipt_CLUSTERIP.c        |  8 ++++----
 net/ipv6/netfilter/ip6_tables.c           | 16 ++++++++--------
 net/netfilter/Kconfig                     | 10 ++++++++++
 net/netfilter/x_tables.c                  | 16 ++++++++--------
 net/netfilter/xt_limit.c                  |  6 +++---
 15 files changed, 70 insertions(+), 60 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index a52cc22f806a..07c6ad8f2a02 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -158,7 +158,7 @@ struct xt_match {
 
 	/* Called when entry of this type deleted. */
 	void (*destroy)(const struct xt_mtdtor_param *);
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 	/* Called when userspace align differs from kernel space one */
 	void (*compat_from_user)(void *dst, const void *src);
 	int (*compat_to_user)(void __user *dst, const void *src);
@@ -169,7 +169,7 @@ struct xt_match {
 	const char *table;
 	unsigned int matchsize;
 	unsigned int usersize;
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 	unsigned int compatsize;
 #endif
 	unsigned int hooks;
@@ -199,7 +199,7 @@ struct xt_target {
 
 	/* Called when entry of this type deleted. */
 	void (*destroy)(const struct xt_tgdtor_param *);
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 	/* Called when userspace align differs from kernel space one */
 	void (*compat_from_user)(void *dst, const void *src);
 	int (*compat_to_user)(void __user *dst, const void *src);
@@ -210,7 +210,7 @@ struct xt_target {
 	const char *table;
 	unsigned int targetsize;
 	unsigned int usersize;
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 	unsigned int compatsize;
 #endif
 	unsigned int hooks;
@@ -452,7 +452,7 @@ xt_get_per_cpu_counter(struct xt_counters *cnt, unsigned int cpu)
 
 struct nf_hook_ops *xt_hook_ops_alloc(const struct xt_table *, nf_hookfn *);
 
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 #include <net/compat.h>
 
 struct compat_xt_entry_match {
@@ -533,5 +533,5 @@ int xt_compat_check_entry_offsets(const void *base, const char *elems,
 				  unsigned int target_offset,
 				  unsigned int next_offset);
 
-#endif /* CONFIG_COMPAT */
+#endif /* CONFIG_NETFILTER_XTABLES_COMPAT */
 #endif /* _X_TABLES_H */
diff --git a/include/linux/netfilter_arp/arp_tables.h b/include/linux/netfilter_arp/arp_tables.h
index a0474b4e7782..2aab9612f6ab 100644
--- a/include/linux/netfilter_arp/arp_tables.h
+++ b/include/linux/netfilter_arp/arp_tables.h
@@ -59,7 +59,7 @@ extern unsigned int arpt_do_table(struct sk_buff *skb,
 				  const struct nf_hook_state *state,
 				  struct xt_table *table);
 
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 #include <net/compat.h>
 
 struct compat_arpt_entry {
diff --git a/include/linux/netfilter_ipv4/ip_tables.h b/include/linux/netfilter_ipv4/ip_tables.h
index 0fdab3246ef5..8d09bfe850dc 100644
--- a/include/linux/netfilter_ipv4/ip_tables.h
+++ b/include/linux/netfilter_ipv4/ip_tables.h
@@ -67,7 +67,7 @@ extern unsigned int ipt_do_table(struct sk_buff *skb,
 				 const struct nf_hook_state *state,
 				 struct xt_table *table);
 
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 #include <net/compat.h>
 
 struct compat_ipt_entry {
diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h
index 11d0e725fe79..79e73fd7d965 100644
--- a/include/linux/netfilter_ipv6/ip6_tables.h
+++ b/include/linux/netfilter_ipv6/ip6_tables.h
@@ -33,7 +33,7 @@ extern unsigned int ip6t_do_table(struct sk_buff *skb,
 				  const struct nf_hook_state *state,
 				  struct xt_table *table);
 
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 #include <net/compat.h>
 
 struct compat_ip6t_entry {
diff --git a/net/bridge/netfilter/ebt_limit.c b/net/bridge/netfilter/ebt_limit.c
index fa199556e122..e16183bd1bb8 100644
--- a/net/bridge/netfilter/ebt_limit.c
+++ b/net/bridge/netfilter/ebt_limit.c
@@ -87,7 +87,7 @@ static int ebt_limit_mt_check(const struct xt_mtchk_param *par)
 }
 
 
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 /*
  * no conversion function needed --
  * only avg/burst have meaningful values in userspace.
@@ -107,7 +107,7 @@ static struct xt_match ebt_limit_mt_reg __read_mostly = {
 	.checkentry	= ebt_limit_mt_check,
 	.matchsize	= sizeof(struct ebt_limit_info),
 	.usersize	= offsetof(struct ebt_limit_info, prev),
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 	.compatsize	= sizeof(struct ebt_compat_limit_info),
 #endif
 	.me		= THIS_MODULE,
diff --git a/net/bridge/netfilter/ebt_mark.c b/net/bridge/netfilter/ebt_mark.c
index 21fd3d3d77f6..8cf653c72fd8 100644
--- a/net/bridge/netfilter/ebt_mark.c
+++ b/net/bridge/netfilter/ebt_mark.c
@@ -53,7 +53,7 @@ static int ebt_mark_tg_check(const struct xt_tgchk_param *par)
 		return -EINVAL;
 	return 0;
 }
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 struct compat_ebt_mark_t_info {
 	compat_ulong_t mark;
 	compat_uint_t target;
@@ -87,7 +87,7 @@ static struct xt_target ebt_mark_tg_reg __read_mostly = {
 	.target		= ebt_mark_tg,
 	.checkentry	= ebt_mark_tg_check,
 	.targetsize	= sizeof(struct ebt_mark_t_info),
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 	.compatsize	= sizeof(struct compat_ebt_mark_t_info),
 	.compat_from_user = mark_tg_compat_from_user,
 	.compat_to_user	= mark_tg_compat_to_user,
diff --git a/net/bridge/netfilter/ebt_mark_m.c b/net/bridge/netfilter/ebt_mark_m.c
index 81fb59dec499..5872e73c741e 100644
--- a/net/bridge/netfilter/ebt_mark_m.c
+++ b/net/bridge/netfilter/ebt_mark_m.c
@@ -37,7 +37,7 @@ static int ebt_mark_mt_check(const struct xt_mtchk_param *par)
 }
 
 
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 struct compat_ebt_mark_m_info {
 	compat_ulong_t mark, mask;
 	uint8_t invert, bitmask;
@@ -75,7 +75,7 @@ static struct xt_match ebt_mark_mt_reg __read_mostly = {
 	.match		= ebt_mark_mt,
 	.checkentry	= ebt_mark_mt_check,
 	.matchsize	= sizeof(struct ebt_mark_m_info),
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 	.compatsize	= sizeof(struct compat_ebt_mark_m_info),
 	.compat_from_user = mark_mt_compat_from_user,
 	.compat_to_user	= mark_mt_compat_to_user,
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index a04596bb2a6e..f022deb3721e 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -47,7 +47,7 @@ struct ebt_pernet {
 static unsigned int ebt_pernet_id __read_mostly;
 static DEFINE_MUTEX(ebt_mutex);
 
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 static void ebt_standard_compat_from_user(void *dst, const void *src)
 {
 	int v = *(compat_int_t *)src;
@@ -73,7 +73,7 @@ static struct xt_target ebt_standard_target = {
 	.revision   = 0,
 	.family     = NFPROTO_BRIDGE,
 	.targetsize = sizeof(int),
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 	.compatsize = sizeof(compat_int_t),
 	.compat_from_user = ebt_standard_compat_from_user,
 	.compat_to_user =  ebt_standard_compat_to_user,
@@ -1502,7 +1502,7 @@ static int copy_everything_to_user(struct ebt_table *t, void __user *user,
 	   ebt_entry_to_user, entries, tmp.entries);
 }
 
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 /* 32 bit-userspace compatibility definitions. */
 struct compat_ebt_replace {
 	char name[EBT_TABLE_MAXNAMELEN];
@@ -2367,7 +2367,7 @@ static int do_ebt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 	/* try real handler in case userland supplied needed padding */
 	if (in_compat_syscall() &&
 	    ((cmd != EBT_SO_GET_INFO && cmd != EBT_SO_GET_INIT_INFO) ||
@@ -2434,7 +2434,7 @@ static int do_ebt_set_ctl(struct sock *sk, int cmd, sockptr_t arg,
 
 	switch (cmd) {
 	case EBT_SO_SET_ENTRIES:
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 		if (in_compat_syscall())
 			ret = compat_do_replace(net, arg, len);
 		else
@@ -2442,7 +2442,7 @@ static int do_ebt_set_ctl(struct sock *sk, int cmd, sockptr_t arg,
 			ret = do_replace(net, arg, len);
 		break;
 	case EBT_SO_SET_COUNTERS:
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 		if (in_compat_syscall())
 			ret = compat_update_counters(net, arg, len);
 		else
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index b1bb6a7e2dd7..cf20316094d0 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -713,7 +713,7 @@ static int copy_entries_to_user(unsigned int total_size,
 	return ret;
 }
 
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 static void compat_standard_from_user(void *dst, const void *src)
 {
 	int v = *(compat_int_t *)src;
@@ -800,7 +800,7 @@ static int get_info(struct net *net, void __user *user, const int *len)
 		return -EFAULT;
 
 	name[XT_TABLE_MAXNAMELEN-1] = '\0';
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 	if (in_compat_syscall())
 		xt_compat_lock(NFPROTO_ARP);
 #endif
@@ -808,7 +808,7 @@ static int get_info(struct net *net, void __user *user, const int *len)
 	if (!IS_ERR(t)) {
 		struct arpt_getinfo info;
 		const struct xt_table_info *private = t->private;
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 		struct xt_table_info tmp;
 
 		if (in_compat_syscall()) {
@@ -835,7 +835,7 @@ static int get_info(struct net *net, void __user *user, const int *len)
 		module_put(t->me);
 	} else
 		ret = PTR_ERR(t);
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 	if (in_compat_syscall())
 		xt_compat_unlock(NFPROTO_ARP);
 #endif
@@ -1044,7 +1044,7 @@ static int do_add_counters(struct net *net, sockptr_t arg, unsigned int len)
 	return ret;
 }
 
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 struct compat_arpt_replace {
 	char				name[XT_TABLE_MAXNAMELEN];
 	u32				valid_hooks;
@@ -1412,7 +1412,7 @@ static int do_arpt_set_ctl(struct sock *sk, int cmd, sockptr_t arg,
 
 	switch (cmd) {
 	case ARPT_SO_SET_REPLACE:
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 		if (in_compat_syscall())
 			ret = compat_do_replace(sock_net(sk), arg, len);
 		else
@@ -1444,7 +1444,7 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len
 		break;
 
 	case ARPT_SO_GET_ENTRIES:
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 		if (in_compat_syscall())
 			ret = compat_get_entries(sock_net(sk), user, len);
 		else
@@ -1580,7 +1580,7 @@ static struct xt_target arpt_builtin_tg[] __read_mostly = {
 		.name             = XT_STANDARD_TARGET,
 		.targetsize       = sizeof(int),
 		.family           = NFPROTO_ARP,
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 		.compatsize       = sizeof(compat_int_t),
 		.compat_from_user = compat_standard_from_user,
 		.compat_to_user   = compat_standard_to_user,
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index d6caaed5dd45..13acb687c19a 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -868,7 +868,7 @@ copy_entries_to_user(unsigned int total_size,
 	return ret;
 }
 
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 static void compat_standard_from_user(void *dst, const void *src)
 {
 	int v = *(compat_int_t *)src;
@@ -957,7 +957,7 @@ static int get_info(struct net *net, void __user *user, const int *len)
 		return -EFAULT;
 
 	name[XT_TABLE_MAXNAMELEN-1] = '\0';
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 	if (in_compat_syscall())
 		xt_compat_lock(AF_INET);
 #endif
@@ -965,7 +965,7 @@ static int get_info(struct net *net, void __user *user, const int *len)
 	if (!IS_ERR(t)) {
 		struct ipt_getinfo info;
 		const struct xt_table_info *private = t->private;
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 		struct xt_table_info tmp;
 
 		if (in_compat_syscall()) {
@@ -993,7 +993,7 @@ static int get_info(struct net *net, void __user *user, const int *len)
 		module_put(t->me);
 	} else
 		ret = PTR_ERR(t);
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 	if (in_compat_syscall())
 		xt_compat_unlock(AF_INET);
 #endif
@@ -1199,7 +1199,7 @@ do_add_counters(struct net *net, sockptr_t arg, unsigned int len)
 	return ret;
 }
 
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 struct compat_ipt_replace {
 	char			name[XT_TABLE_MAXNAMELEN];
 	u32			valid_hooks;
@@ -1621,7 +1621,7 @@ do_ipt_set_ctl(struct sock *sk, int cmd, sockptr_t arg, unsigned int len)
 
 	switch (cmd) {
 	case IPT_SO_SET_REPLACE:
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 		if (in_compat_syscall())
 			ret = compat_do_replace(sock_net(sk), arg, len);
 		else
@@ -1654,7 +1654,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 		break;
 
 	case IPT_SO_GET_ENTRIES:
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 		if (in_compat_syscall())
 			ret = compat_get_entries(sock_net(sk), user, len);
 		else
@@ -1846,7 +1846,7 @@ static struct xt_target ipt_builtin_tg[] __read_mostly = {
 		.name             = XT_STANDARD_TARGET,
 		.targetsize       = sizeof(int),
 		.family           = NFPROTO_IPV4,
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 		.compatsize       = sizeof(compat_int_t),
 		.compat_from_user = compat_standard_from_user,
 		.compat_to_user   = compat_standard_to_user,
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index a8b980ad11d4..8f7ca67475b7 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -541,7 +541,7 @@ static void clusterip_tg_destroy(const struct xt_tgdtor_param *par)
 	nf_ct_netns_put(par->net, par->family);
 }
 
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 struct compat_ipt_clusterip_tgt_info
 {
 	u_int32_t	flags;
@@ -553,7 +553,7 @@ struct compat_ipt_clusterip_tgt_info
 	u_int32_t	hash_initval;
 	compat_uptr_t	config;
 };
-#endif /* CONFIG_COMPAT */
+#endif /* CONFIG_NETFILTER_XTABLES_COMPAT */
 
 static struct xt_target clusterip_tg_reg __read_mostly = {
 	.name		= "CLUSTERIP",
@@ -563,9 +563,9 @@ static struct xt_target clusterip_tg_reg __read_mostly = {
 	.destroy	= clusterip_tg_destroy,
 	.targetsize	= sizeof(struct ipt_clusterip_tgt_info),
 	.usersize	= offsetof(struct ipt_clusterip_tgt_info, config),
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 	.compatsize	= sizeof(struct compat_ipt_clusterip_tgt_info),
-#endif /* CONFIG_COMPAT */
+#endif /* CONFIG_NETFILTER_XTABLES_COMPAT */
 	.me		= THIS_MODULE
 };
 
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index e763716ffa25..e810a23baf99 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -884,7 +884,7 @@ copy_entries_to_user(unsigned int total_size,
 	return ret;
 }
 
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 static void compat_standard_from_user(void *dst, const void *src)
 {
 	int v = *(compat_int_t *)src;
@@ -973,7 +973,7 @@ static int get_info(struct net *net, void __user *user, const int *len)
 		return -EFAULT;
 
 	name[XT_TABLE_MAXNAMELEN-1] = '\0';
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 	if (in_compat_syscall())
 		xt_compat_lock(AF_INET6);
 #endif
@@ -981,7 +981,7 @@ static int get_info(struct net *net, void __user *user, const int *len)
 	if (!IS_ERR(t)) {
 		struct ip6t_getinfo info;
 		const struct xt_table_info *private = t->private;
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 		struct xt_table_info tmp;
 
 		if (in_compat_syscall()) {
@@ -1009,7 +1009,7 @@ static int get_info(struct net *net, void __user *user, const int *len)
 		module_put(t->me);
 	} else
 		ret = PTR_ERR(t);
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 	if (in_compat_syscall())
 		xt_compat_unlock(AF_INET6);
 #endif
@@ -1215,7 +1215,7 @@ do_add_counters(struct net *net, sockptr_t arg, unsigned int len)
 	return ret;
 }
 
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 struct compat_ip6t_replace {
 	char			name[XT_TABLE_MAXNAMELEN];
 	u32			valid_hooks;
@@ -1630,7 +1630,7 @@ do_ip6t_set_ctl(struct sock *sk, int cmd, sockptr_t arg, unsigned int len)
 
 	switch (cmd) {
 	case IP6T_SO_SET_REPLACE:
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 		if (in_compat_syscall())
 			ret = compat_do_replace(sock_net(sk), arg, len);
 		else
@@ -1663,7 +1663,7 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 		break;
 
 	case IP6T_SO_GET_ENTRIES:
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 		if (in_compat_syscall())
 			ret = compat_get_entries(sock_net(sk), user, len);
 		else
@@ -1853,7 +1853,7 @@ static struct xt_target ip6t_builtin_tg[] __read_mostly = {
 		.name             = XT_STANDARD_TARGET,
 		.targetsize       = sizeof(int),
 		.family           = NFPROTO_IPV6,
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 		.compatsize       = sizeof(compat_int_t),
 		.compat_from_user = compat_standard_from_user,
 		.compat_to_user   = compat_standard_to_user,
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index fcd8682704c4..56a2531a3402 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -728,6 +728,16 @@ config NETFILTER_XTABLES
 
 if NETFILTER_XTABLES
 
+config NETFILTER_XTABLES_COMPAT
+	bool "Netfilter Xtables 32bit support"
+	depends on COMPAT
+	default y
+	help
+	   This option provides a translation layer to run 32bit arp,ip(6),ebtables
+	   binaries on 64bit kernels.
+
+	   If unsure, say N.
+
 comment "Xtables combined modules"
 
 config NETFILTER_XT_MARK
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index ef37deff8405..84e58ee501a4 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -52,7 +52,7 @@ struct xt_af {
 	struct mutex mutex;
 	struct list_head match;
 	struct list_head target;
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 	struct mutex compat_mutex;
 	struct compat_delta *compat_tab;
 	unsigned int number; /* number of slots in compat_tab[] */
@@ -647,7 +647,7 @@ static bool error_tg_ok(unsigned int usersize, unsigned int kernsize,
 	return usersize == kernsize && strnlen(msg, msglen) < msglen;
 }
 
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta)
 {
 	struct xt_af *xp = &xt[af];
@@ -850,7 +850,7 @@ int xt_compat_check_entry_offsets(const void *base, const char *elems,
 				    __alignof__(struct compat_xt_entry_match));
 }
 EXPORT_SYMBOL(xt_compat_check_entry_offsets);
-#endif /* CONFIG_COMPAT */
+#endif /* CONFIG_NETFILTER_XTABLES_COMPAT */
 
 /**
  * xt_check_entry_offsets - validate arp/ip/ip6t_entry
@@ -868,7 +868,7 @@ EXPORT_SYMBOL(xt_compat_check_entry_offsets);
  * match structures are aligned, and that the last structure ends where
  * the target structure begins.
  *
- * Also see xt_compat_check_entry_offsets for CONFIG_COMPAT version.
+ * Also see xt_compat_check_entry_offsets for CONFIG_NETFILTER_XTABLES_COMPAT version.
  *
  * The arp/ip/ip6t_entry structure @base must have passed following tests:
  * - it must point to a valid memory location
@@ -1059,7 +1059,7 @@ void *xt_copy_counters(sockptr_t arg, unsigned int len,
 	void *mem;
 	u64 size;
 
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 	if (in_compat_syscall()) {
 		/* structures only differ in size due to alignment */
 		struct compat_xt_counters_info compat_tmp;
@@ -1106,7 +1106,7 @@ void *xt_copy_counters(sockptr_t arg, unsigned int len,
 }
 EXPORT_SYMBOL_GPL(xt_copy_counters);
 
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 int xt_compat_target_offset(const struct xt_target *target)
 {
 	u_int16_t csize = target->compatsize ? : target->targetsize;
@@ -1293,7 +1293,7 @@ void xt_table_unlock(struct xt_table *table)
 }
 EXPORT_SYMBOL_GPL(xt_table_unlock);
 
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 void xt_compat_lock(u_int8_t af)
 {
 	mutex_lock(&xt[af].compat_mutex);
@@ -1931,7 +1931,7 @@ static int __init xt_init(void)
 
 	for (i = 0; i < NFPROTO_NUMPROTO; i++) {
 		mutex_init(&xt[i].mutex);
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 		mutex_init(&xt[i].compat_mutex);
 		xt[i].compat_tab = NULL;
 #endif
diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c
index bd1dea9c7b88..24d4afb9988d 100644
--- a/net/netfilter/xt_limit.c
+++ b/net/netfilter/xt_limit.c
@@ -134,7 +134,7 @@ static void limit_mt_destroy(const struct xt_mtdtor_param *par)
 	kfree(info->master);
 }
 
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 struct compat_xt_rateinfo {
 	u_int32_t avg;
 	u_int32_t burst;
@@ -176,7 +176,7 @@ static int limit_mt_compat_to_user(void __user *dst, const void *src)
 	};
 	return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0;
 }
-#endif /* CONFIG_COMPAT */
+#endif /* CONFIG_NETFILTER_XTABLES_COMPAT */
 
 static struct xt_match limit_mt_reg __read_mostly = {
 	.name             = "limit",
@@ -186,7 +186,7 @@ static struct xt_match limit_mt_reg __read_mostly = {
 	.checkentry       = limit_mt_check,
 	.destroy          = limit_mt_destroy,
 	.matchsize        = sizeof(struct xt_rateinfo),
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
 	.compatsize       = sizeof(struct compat_xt_rateinfo),
 	.compat_from_user = limit_mt_compat_from_user,
 	.compat_to_user   = limit_mt_compat_to_user,
-- 
cgit v1.2.3


From 63fa73e2151848ed5930dfe0040c823ffe1f2cc4 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@kernel.org>
Date: Mon, 26 Apr 2021 09:44:47 -0300
Subject: net: Fix typo in comment about ancillary data

Ingo sent typo fixes for tools/ and this resulted in a warning when
building the perf/core branch that will be sent upstream in the next
merge window:

  Warning: Kernel ABI header at 'tools/perf/trace/beauty/include/linux/socket.h' differs from latest version at 'include/linux/socket.h'
  diff -u tools/perf/trace/beauty/include/linux/socket.h include/linux/socket.h

Fix the typo on the kernel file to address this.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/socket.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 385894b4a8bb..b8fc5c53ba6f 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -85,7 +85,7 @@ struct mmsghdr {
 
 /*
  *	POSIX 1003.1g - ancillary data object information
- *	Ancillary data consits of a sequence of pairs of
+ *	Ancillary data consists of a sequence of pairs of
  *	(cmsghdr, cmsg_data[])
  */
 
-- 
cgit v1.2.3


From 3d14ec1fe61aebe3da85a9b8f2c3d61e43d522e6 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Sun, 25 Apr 2021 22:02:38 +0100
Subject: iov_iter: Four fixes for ITER_XARRAY

Fix four things[1] in the patch that adds ITER_XARRAY[2]:

 (1) Remove the address_space struct predeclaration.  This is a holdover
     from when it was ITER_MAPPING.

 (2) Fix _copy_mc_to_iter() so that the xarray segment updates count and
     iov_offset in the iterator before returning.

 (3) Fix iov_iter_alignment() to not loop in the xarray case.  Because the
     middle pages are all whole pages, only the end pages need be
     considered - and this can be reduced to just looking at the start
     position in the xarray and the iteration size.

 (4) Fix iov_iter_advance() to limit the size of the advance to no more
     than the remaining iteration size.

Reported-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Al Viro <viro@zeniv.linux.org.uk>
Tested-by: Jeff Layton <jlayton@redhat.com>
Tested-by: Dave Wysochanski <dwysocha@redhat.com>
Link: https://lore.kernel.org/r/YIVrJT8GwLI0Wlgx@zeniv-ca.linux.org.uk [1]
Link: https://lore.kernel.org/r/161918448151.3145707.11541538916600921083.stgit@warthog.procyon.org.uk [2]
---
 include/linux/uio.h | 1 -
 lib/iov_iter.c      | 5 +++++
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 5f5ffc45d4aa..d3ec87706d75 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -10,7 +10,6 @@
 #include <uapi/linux/uio.h>
 
 struct page;
-struct address_space;
 struct pipe_inode_info;
 
 struct kvec {
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 44fa726a8323..61228a6c69f8 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -791,6 +791,8 @@ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
 			curr_addr = (unsigned long) from;
 			bytes = curr_addr - s_addr - rem;
 			rcu_read_unlock();
+			i->iov_offset += bytes;
+			i->count -= bytes;
 			return bytes;
 		}
 		})
@@ -1147,6 +1149,7 @@ void iov_iter_advance(struct iov_iter *i, size_t size)
 		return;
 	}
 	if (unlikely(iov_iter_is_xarray(i))) {
+		size = min(size, i->count);
 		i->iov_offset += size;
 		i->count -= size;
 		return;
@@ -1346,6 +1349,8 @@ unsigned long iov_iter_alignment(const struct iov_iter *i)
 			return size | i->iov_offset;
 		return size;
 	}
+	if (unlikely(iov_iter_is_xarray(i)))
+		return (i->xarray_start + i->iov_offset) | i->count;
 	iterate_all_kinds(i, size, v,
 		(res |= (unsigned long)v.iov_base | v.iov_len, 0),
 		res |= v.bv_offset | v.bv_len,
-- 
cgit v1.2.3


From c4b364ce1270d689ee5010001344b8eae3685f32 Mon Sep 17 00:00:00 2001
From: Yangbo Lu <yangbo.lu@nxp.com>
Date: Tue, 27 Apr 2021 12:22:00 +0800
Subject: net: dsa: free skb->cb usage in core driver

Free skb->cb usage in the core driver and let device drivers decide
whether to use it. The reason for having DSA_SKB_CB(skb)->clone was
that dsa_skb_tx_timestamp(), which may set the clone pointer, is called
before p->xmit(), which would use the clone if any, and the device
driver has no way to initialize the clone pointer.

This patch just puts memset(skb->cb, 0, sizeof(skb->cb)) at the
beginning of dsa_slave_xmit(). Some future features, like one-step
timestamping, may need more bytes of skb->cb to use in
dsa_skb_tx_timestamp() and p->xmit().

Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
Acked-by: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/ocelot/felix.c         |  2 +-
 drivers/net/dsa/sja1105/sja1105_main.c |  2 +-
 drivers/net/dsa/sja1105/sja1105_ptp.c  |  4 ++--
 drivers/net/ethernet/mscc/ocelot.c     |  6 +++---
 drivers/net/ethernet/mscc/ocelot_net.c |  2 +-
 include/linux/dsa/sja1105.h            |  3 ++-
 include/net/dsa.h                      | 14 --------------
 include/soc/mscc/ocelot.h              |  8 ++++++++
 net/dsa/slave.c                        |  2 +-
 net/dsa/tag_ocelot.c                   |  8 ++++----
 net/dsa/tag_ocelot_8021q.c             |  8 ++++----
 11 files changed, 27 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c
index fe7e8bad90df..b28280b6e91a 100644
--- a/drivers/net/dsa/ocelot/felix.c
+++ b/drivers/net/dsa/ocelot/felix.c
@@ -1408,7 +1408,7 @@ static void felix_txtstamp(struct dsa_switch *ds, int port,
 			return;
 
 		ocelot_port_add_txtstamp_skb(ocelot, port, clone);
-		DSA_SKB_CB(skb)->clone = clone;
+		OCELOT_SKB_CB(skb)->clone = clone;
 	}
 }
 
diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c
index d9c198ca0197..405024b637d6 100644
--- a/drivers/net/dsa/sja1105/sja1105_main.c
+++ b/drivers/net/dsa/sja1105/sja1105_main.c
@@ -3137,7 +3137,7 @@ static void sja1105_port_deferred_xmit(struct kthread_work *work)
 	struct sk_buff *skb;
 
 	while ((skb = skb_dequeue(&sp->xmit_queue)) != NULL) {
-		struct sk_buff *clone = DSA_SKB_CB(skb)->clone;
+		struct sk_buff *clone = SJA1105_SKB_CB(skb)->clone;
 
 		mutex_lock(&priv->mgmt_lock);
 
diff --git a/drivers/net/dsa/sja1105/sja1105_ptp.c b/drivers/net/dsa/sja1105/sja1105_ptp.c
index a5140084000d..0bc566b9e958 100644
--- a/drivers/net/dsa/sja1105/sja1105_ptp.c
+++ b/drivers/net/dsa/sja1105/sja1105_ptp.c
@@ -432,7 +432,7 @@ bool sja1105_port_rxtstamp(struct dsa_switch *ds, int port,
 }
 
 /* Called from dsa_skb_tx_timestamp. This callback is just to clone
- * the skb and have it available in DSA_SKB_CB in the .port_deferred_xmit
+ * the skb and have it available in SJA1105_SKB_CB in the .port_deferred_xmit
  * callback, where we will timestamp it synchronously.
  */
 void sja1105_port_txtstamp(struct dsa_switch *ds, int port, struct sk_buff *skb)
@@ -448,7 +448,7 @@ void sja1105_port_txtstamp(struct dsa_switch *ds, int port, struct sk_buff *skb)
 	if (!clone)
 		return;
 
-	DSA_SKB_CB(skb)->clone = clone;
+	SJA1105_SKB_CB(skb)->clone = clone;
 }
 
 static int sja1105_ptp_reset(struct dsa_switch *ds)
diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
index 8d06ffaf318a..7da2dd1632b1 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -538,8 +538,8 @@ void ocelot_port_add_txtstamp_skb(struct ocelot *ocelot, int port,
 	spin_lock(&ocelot_port->ts_id_lock);
 
 	skb_shinfo(clone)->tx_flags |= SKBTX_IN_PROGRESS;
-	/* Store timestamp ID in cb[0] of sk_buff */
-	clone->cb[0] = ocelot_port->ts_id;
+	/* Store timestamp ID in OCELOT_SKB_CB(clone)->ts_id */
+	OCELOT_SKB_CB(clone)->ts_id = ocelot_port->ts_id;
 	ocelot_port->ts_id = (ocelot_port->ts_id + 1) % 4;
 	skb_queue_tail(&ocelot_port->tx_skbs, clone);
 
@@ -604,7 +604,7 @@ void ocelot_get_txtstamp(struct ocelot *ocelot)
 		spin_lock_irqsave(&port->tx_skbs.lock, flags);
 
 		skb_queue_walk_safe(&port->tx_skbs, skb, skb_tmp) {
-			if (skb->cb[0] != id)
+			if (OCELOT_SKB_CB(skb)->ts_id != id)
 				continue;
 			__skb_unlink(skb, &port->tx_skbs);
 			skb_match = skb;
diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c
index 36f32a4d9b0f..789a5fba146c 100644
--- a/drivers/net/ethernet/mscc/ocelot_net.c
+++ b/drivers/net/ethernet/mscc/ocelot_net.c
@@ -520,7 +520,7 @@ static netdev_tx_t ocelot_port_xmit(struct sk_buff *skb, struct net_device *dev)
 
 			ocelot_port_add_txtstamp_skb(ocelot, port, clone);
 
-			rew_op |= clone->cb[0] << 3;
+			rew_op |= OCELOT_SKB_CB(clone)->ts_id << 3;
 		}
 	}
 
diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h
index dd93735ae228..1eb84562b311 100644
--- a/include/linux/dsa/sja1105.h
+++ b/include/linux/dsa/sja1105.h
@@ -47,11 +47,12 @@ struct sja1105_tagger_data {
 };
 
 struct sja1105_skb_cb {
+	struct sk_buff *clone;
 	u32 meta_tstamp;
 };
 
 #define SJA1105_SKB_CB(skb) \
-	((struct sja1105_skb_cb *)DSA_SKB_CB_PRIV(skb))
+	((struct sja1105_skb_cb *)((skb)->cb))
 
 struct sja1105_port {
 	u16 subvlan_map[DSA_8021Q_N_SUBVLAN];
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 73ce6ce38aa1..e1a2610a0e06 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -117,20 +117,6 @@ struct dsa_netdevice_ops {
 #define MODULE_ALIAS_DSA_TAG_DRIVER(__proto)				\
 	MODULE_ALIAS(DSA_TAG_DRIVER_ALIAS __stringify(__proto##_VALUE))
 
-struct dsa_skb_cb {
-	struct sk_buff *clone;
-};
-
-struct __dsa_skb_cb {
-	struct dsa_skb_cb cb;
-	u8 priv[48 - sizeof(struct dsa_skb_cb)];
-};
-
-#define DSA_SKB_CB(skb) ((struct dsa_skb_cb *)((skb)->cb))
-
-#define DSA_SKB_CB_PRIV(skb)			\
-	((void *)(skb)->cb + offsetof(struct __dsa_skb_cb, priv))
-
 struct dsa_switch_tree {
 	struct list_head	list;
 
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index 68cdc7ceaf4d..f075aaf70eee 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -689,6 +689,14 @@ struct ocelot_policer {
 	u32 burst; /* bytes */
 };
 
+struct ocelot_skb_cb {
+	struct sk_buff *clone;
+	u8 ts_id;
+};
+
+#define OCELOT_SKB_CB(skb) \
+	((struct ocelot_skb_cb *)((skb)->cb))
+
 #define ocelot_read_ix(ocelot, reg, gi, ri) __ocelot_read_ix(ocelot, reg, reg##_GSZ * (gi) + reg##_RSZ * (ri))
 #define ocelot_read_gix(ocelot, reg, gi) __ocelot_read_ix(ocelot, reg, reg##_GSZ * (gi))
 #define ocelot_read_rix(ocelot, reg, ri) __ocelot_read_ix(ocelot, reg, reg##_RSZ * (ri))
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 85e51f46a9d5..8c0f3c6ab365 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -614,7 +614,7 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	dev_sw_netstats_tx_add(dev, 1, skb->len);
 
-	DSA_SKB_CB(skb)->clone = NULL;
+	memset(skb->cb, 0, sizeof(skb->cb));
 
 	/* Handle tx timestamp if any */
 	dsa_skb_tx_timestamp(p, skb);
diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c
index f9df9cac81c5..1100a16f1032 100644
--- a/net/dsa/tag_ocelot.c
+++ b/net/dsa/tag_ocelot.c
@@ -15,11 +15,11 @@ static void ocelot_xmit_ptp(struct dsa_port *dp, void *injection,
 	ocelot_port = ocelot->ports[dp->index];
 	rew_op = ocelot_port->ptp_cmd;
 
-	/* Retrieve timestamp ID populated inside skb->cb[0] of the
-	 * clone by ocelot_port_add_txtstamp_skb
+	/* Retrieve timestamp ID populated inside OCELOT_SKB_CB(clone)->ts_id
+	 * by ocelot_port_add_txtstamp_skb
 	 */
 	if (ocelot_port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP)
-		rew_op |= clone->cb[0] << 3;
+		rew_op |= OCELOT_SKB_CB(clone)->ts_id << 3;
 
 	ocelot_ifh_set_rew_op(injection, rew_op);
 }
@@ -28,7 +28,7 @@ static void ocelot_xmit_common(struct sk_buff *skb, struct net_device *netdev,
 			       __be32 ifh_prefix, void **ifh)
 {
 	struct dsa_port *dp = dsa_slave_to_port(netdev);
-	struct sk_buff *clone = DSA_SKB_CB(skb)->clone;
+	struct sk_buff *clone = OCELOT_SKB_CB(skb)->clone;
 	struct dsa_switch *ds = dp->ds;
 	void *injection;
 	__be32 *prefix;
diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c
index 5f3e8e124a82..a001a7e3f575 100644
--- a/net/dsa/tag_ocelot_8021q.c
+++ b/net/dsa/tag_ocelot_8021q.c
@@ -28,11 +28,11 @@ static struct sk_buff *ocelot_xmit_ptp(struct dsa_port *dp,
 	ocelot_port = ocelot->ports[port];
 	rew_op = ocelot_port->ptp_cmd;
 
-	/* Retrieve timestamp ID populated inside skb->cb[0] of the
-	 * clone by ocelot_port_add_txtstamp_skb
+	/* Retrieve timestamp ID populated inside OCELOT_SKB_CB(clone)->ts_id
+	 * by ocelot_port_add_txtstamp_skb
 	 */
 	if (ocelot_port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP)
-		rew_op |= clone->cb[0] << 3;
+		rew_op |= OCELOT_SKB_CB(clone)->ts_id << 3;
 
 	ocelot_port_inject_frame(ocelot, port, 0, rew_op, skb);
 
@@ -46,7 +46,7 @@ static struct sk_buff *ocelot_xmit(struct sk_buff *skb,
 	u16 tx_vid = dsa_8021q_tx_vid(dp->ds, dp->index);
 	u16 queue_mapping = skb_get_queue_mapping(skb);
 	u8 pcp = netdev_txq_to_tc(netdev, queue_mapping);
-	struct sk_buff *clone = DSA_SKB_CB(skb)->clone;
+	struct sk_buff *clone = OCELOT_SKB_CB(skb)->clone;
 
 	/* TX timestamping was requested, so inject through MMIO */
 	if (clone)
-- 
cgit v1.2.3


From 800fcab8230f622544a12403977b5b7259a076f8 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Tue, 27 Apr 2021 09:09:07 +0200
Subject: net: phy: Add support for microchip SMI0 MDIO bus

SMI0 is a mangled version of MDIO. The main low level difference is
the MDIO C22 OP code is always 0, not 0x2 or 0x1 for Read/Write. The
read/write information is instead encoded in the PHY address.

Extend the bit-bang code to allow the op code to be overridden, but
default to normal C22 values. Add an extra compatible to the mdio-gpio
driver, and when this compatible is present, set the op codes to 0.

A higher level driver, sitting on top of the basic MDIO bus driver can
then implement the rest of the microchip SMI0 oddities.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Michael Grzeschik <m.grzeschik@pengutronix.de>
Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/mdio/mdio-bitbang.c | 8 ++++++--
 drivers/net/mdio/mdio-gpio.c    | 8 ++++++++
 include/linux/mdio-bitbang.h    | 3 +++
 3 files changed, 17 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/mdio/mdio-bitbang.c b/drivers/net/mdio/mdio-bitbang.c
index 0f457c436335..07609114a26b 100644
--- a/drivers/net/mdio/mdio-bitbang.c
+++ b/drivers/net/mdio/mdio-bitbang.c
@@ -158,7 +158,7 @@ int mdiobb_read(struct mii_bus *bus, int phy, int reg)
 		reg = mdiobb_cmd_addr(ctrl, phy, reg);
 		mdiobb_cmd(ctrl, MDIO_C45_READ, phy, reg);
 	} else
-		mdiobb_cmd(ctrl, MDIO_READ, phy, reg);
+		mdiobb_cmd(ctrl, ctrl->op_c22_read, phy, reg);
 
 	ctrl->ops->set_mdio_dir(ctrl, 0);
 
@@ -190,7 +190,7 @@ int mdiobb_write(struct mii_bus *bus, int phy, int reg, u16 val)
 		reg = mdiobb_cmd_addr(ctrl, phy, reg);
 		mdiobb_cmd(ctrl, MDIO_C45_WRITE, phy, reg);
 	} else
-		mdiobb_cmd(ctrl, MDIO_WRITE, phy, reg);
+		mdiobb_cmd(ctrl, ctrl->op_c22_write, phy, reg);
 
 	/* send the turnaround (10) */
 	mdiobb_send_bit(ctrl, 1);
@@ -217,6 +217,10 @@ struct mii_bus *alloc_mdio_bitbang(struct mdiobb_ctrl *ctrl)
 	bus->read = mdiobb_read;
 	bus->write = mdiobb_write;
 	bus->priv = ctrl;
+	if (!ctrl->override_op_c22) {
+		ctrl->op_c22_read = MDIO_READ;
+		ctrl->op_c22_write = MDIO_WRITE;
+	}
 
 	return bus;
 }
diff --git a/drivers/net/mdio/mdio-gpio.c b/drivers/net/mdio/mdio-gpio.c
index 56c8f914f893..0fb3c2de0845 100644
--- a/drivers/net/mdio/mdio-gpio.c
+++ b/drivers/net/mdio/mdio-gpio.c
@@ -132,6 +132,13 @@ static struct mii_bus *mdio_gpio_bus_init(struct device *dev,
 		new_bus->phy_ignore_ta_mask = pdata->phy_ignore_ta_mask;
 	}
 
+	if (dev->of_node &&
+	    of_device_is_compatible(dev->of_node, "microchip,mdio-smi0")) {
+		bitbang->ctrl.op_c22_read = 0;
+		bitbang->ctrl.op_c22_write = 0;
+		bitbang->ctrl.override_op_c22 = 1;
+	}
+
 	dev_set_drvdata(dev, new_bus);
 
 	return new_bus;
@@ -196,6 +203,7 @@ static int mdio_gpio_remove(struct platform_device *pdev)
 
 static const struct of_device_id mdio_gpio_of_match[] = {
 	{ .compatible = "virtual,mdio-gpio", },
+	{ .compatible = "microchip,mdio-smi0" },
 	{ /* sentinel */ }
 };
 MODULE_DEVICE_TABLE(of, mdio_gpio_of_match);
diff --git a/include/linux/mdio-bitbang.h b/include/linux/mdio-bitbang.h
index aca4dc037b70..373630fe5c28 100644
--- a/include/linux/mdio-bitbang.h
+++ b/include/linux/mdio-bitbang.h
@@ -33,6 +33,9 @@ struct mdiobb_ops {
 
 struct mdiobb_ctrl {
 	const struct mdiobb_ops *ops;
+	unsigned int override_op_c22;
+	u8 op_c22_read;
+	u8 op_c22_write;
 };
 
 int mdiobb_read(struct mii_bus *bus, int phy, int reg);
-- 
cgit v1.2.3


From 76d6a13383b8e3ff20a9cf52aa9c3de39e485632 Mon Sep 17 00:00:00 2001
From: Florent Revest <revest@chromium.org>
Date: Tue, 27 Apr 2021 19:43:12 +0200
Subject: seq_file: Add a seq_bprintf function

Similarly to seq_buf_bprintf in lib/seq_buf.c, this function writes a
printf formatted string with arguments provided in a "binary
representation" built by functions such as vbin_printf.

Signed-off-by: Florent Revest <revest@chromium.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210427174313.860948-2-revest@chromium.org
---
 fs/seq_file.c            | 18 ++++++++++++++++++
 include/linux/seq_file.h |  4 ++++
 2 files changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/fs/seq_file.c b/fs/seq_file.c
index cb11a34fb871..5059248f2d64 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -412,6 +412,24 @@ void seq_printf(struct seq_file *m, const char *f, ...)
 }
 EXPORT_SYMBOL(seq_printf);
 
+#ifdef CONFIG_BINARY_PRINTF
+void seq_bprintf(struct seq_file *m, const char *f, const u32 *binary)
+{
+	int len;
+
+	if (m->count < m->size) {
+		len = bstr_printf(m->buf + m->count, m->size - m->count, f,
+				  binary);
+		if (m->count + len < m->size) {
+			m->count += len;
+			return;
+		}
+	}
+	seq_set_overflow(m);
+}
+EXPORT_SYMBOL(seq_bprintf);
+#endif /* CONFIG_BINARY_PRINTF */
+
 /**
  *	mangle_path -	mangle and copy path to buffer beginning
  *	@s: buffer start
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index b83b3ae3c877..723b1fa1177e 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -146,6 +146,10 @@ void *__seq_open_private(struct file *, const struct seq_operations *, int);
 int seq_open_private(struct file *, const struct seq_operations *, int);
 int seq_release_private(struct inode *, struct file *);
 
+#ifdef CONFIG_BINARY_PRINTF
+void seq_bprintf(struct seq_file *m, const char *f, const u32 *binary);
+#endif
+
 #define DEFINE_SEQ_ATTRIBUTE(__name)					\
 static int __name ## _open(struct inode *inode, struct file *file)	\
 {									\
-- 
cgit v1.2.3


From 527139d738d7f2e9f929c752eebf3cbf0f74c754 Mon Sep 17 00:00:00 2001
From: Krzysztof Wilczyński <kw@linux.com>
Date: Fri, 16 Apr 2021 20:58:38 +0000
Subject: PCI/sysfs: Convert "rom" to static attribute
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The "rom" sysfs attribute allows access to the PCI Option ROM.  Previously
it was dynamically created either by pci_bus_add_device() or the
pci_sysfs_init() initcall, but since it doesn't need to be created or
removed dynamically, we can use a static attribute so the device model
takes care of addition and removal automatically.

Convert "rom" to a static attribute and use the .is_bin_visible() callback
to set the correct object size based on the ROM size.

Remove "rom_attr" from the struct pci_dev since it is no longer needed.

This attribute was added in the pre-git era by https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git/commit/drivers/pci/pci-sysfs.c?id=f6d553444da2

[bhelgaas: commit log]
Suggested-by: Oliver O'Halloran <oohall@gmail.com>
Link: https://lore.kernel.org/r/20210416205856.3234481-3-kw@linux.com
Signed-off-by: Krzysztof Wilczyński <kw@linux.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/pci-sysfs.c | 67 +++++++++++++++++++++----------------------------
 include/linux/pci.h     |  1 -
 2 files changed, 29 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index dc14daf404f5..fa8373685140 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -1306,6 +1306,33 @@ static ssize_t pci_read_rom(struct file *filp, struct kobject *kobj,
 
 	return count;
 }
+static BIN_ATTR(rom, 0600, pci_read_rom, pci_write_rom, 0);
+
+static struct bin_attribute *pci_dev_rom_attrs[] = {
+	&bin_attr_rom,
+	NULL,
+};
+
+static umode_t pci_dev_rom_attr_is_visible(struct kobject *kobj,
+					   struct bin_attribute *a, int n)
+{
+	struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj));
+	size_t rom_size;
+
+	/* If the device has a ROM, try to expose it in sysfs. */
+	rom_size = pci_resource_len(pdev, PCI_ROM_RESOURCE);
+	if (!rom_size)
+		return 0;
+
+	a->size = rom_size;
+
+	return a->attr.mode;
+}
+
+static const struct attribute_group pci_dev_rom_attr_group = {
+	.bin_attrs = pci_dev_rom_attrs,
+	.is_bin_visible = pci_dev_rom_attr_is_visible,
+};
 
 static ssize_t reset_store(struct device *dev, struct device_attribute *attr,
 			   const char *buf, size_t count)
@@ -1352,8 +1379,6 @@ error:
 int __must_check pci_create_sysfs_dev_files(struct pci_dev *pdev)
 {
 	int retval;
-	int rom_size;
-	struct bin_attribute *attr;
 
 	if (!sysfs_initialized)
 		return -EACCES;
@@ -1362,43 +1387,15 @@ int __must_check pci_create_sysfs_dev_files(struct pci_dev *pdev)
 	if (retval)
 		goto err;
 
-	/* If the device has a ROM, try to expose it in sysfs. */
-	rom_size = pci_resource_len(pdev, PCI_ROM_RESOURCE);
-	if (rom_size) {
-		attr = kzalloc(sizeof(*attr), GFP_ATOMIC);
-		if (!attr) {
-			retval = -ENOMEM;
-			goto err_resource_files;
-		}
-		sysfs_bin_attr_init(attr);
-		attr->size = rom_size;
-		attr->attr.name = "rom";
-		attr->attr.mode = 0600;
-		attr->read = pci_read_rom;
-		attr->write = pci_write_rom;
-		retval = sysfs_create_bin_file(&pdev->dev.kobj, attr);
-		if (retval) {
-			kfree(attr);
-			goto err_resource_files;
-		}
-		pdev->rom_attr = attr;
-	}
-
 	/* add sysfs entries for various capabilities */
 	retval = pci_create_capabilities_sysfs(pdev);
 	if (retval)
-		goto err_rom_file;
+		goto err_resource_files;
 
 	pci_create_firmware_label_files(pdev);
 
 	return 0;
 
-err_rom_file:
-	if (pdev->rom_attr) {
-		sysfs_remove_bin_file(&pdev->dev.kobj, pdev->rom_attr);
-		kfree(pdev->rom_attr);
-		pdev->rom_attr = NULL;
-	}
 err_resource_files:
 	pci_remove_resource_files(pdev);
 err:
@@ -1427,13 +1424,6 @@ void pci_remove_sysfs_dev_files(struct pci_dev *pdev)
 
 	pci_remove_capabilities_sysfs(pdev);
 	pci_remove_resource_files(pdev);
-
-	if (pdev->rom_attr) {
-		sysfs_remove_bin_file(&pdev->dev.kobj, pdev->rom_attr);
-		kfree(pdev->rom_attr);
-		pdev->rom_attr = NULL;
-	}
-
 	pci_remove_firmware_label_files(pdev);
 }
 
@@ -1526,6 +1516,7 @@ static const struct attribute_group pci_dev_group = {
 const struct attribute_group *pci_dev_groups[] = {
 	&pci_dev_group,
 	&pci_dev_config_attr_group,
+	&pci_dev_rom_attr_group,
 	NULL,
 };
 
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 86c799c97b77..45f1fef80b50 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -458,7 +458,6 @@ struct pci_dev {
 
 	u32		saved_config_space[16]; /* Config space saved at suspend time */
 	struct hlist_head saved_cap_space;
-	struct bin_attribute *rom_attr;		/* Attribute descriptor for sysfs ROM entry */
 	int		rom_attr_enabled;	/* Display of ROM attribute enabled? */
 	struct bin_attribute *res_attr[DEVICE_COUNT_RESOURCE]; /* sysfs file for resources */
 	struct bin_attribute *res_attr_wc[DEVICE_COUNT_RESOURCE]; /* sysfs file for WC mapping of resources */
-- 
cgit v1.2.3


From 48cac3f4a96ddf08df8e53809ed066de0dc93915 Mon Sep 17 00:00:00 2001
From: Florent Revest <revest@chromium.org>
Date: Tue, 27 Apr 2021 19:43:13 +0200
Subject: bpf: Implement formatted output helpers with bstr_printf

BPF has three formatted output helpers: bpf_trace_printk, bpf_seq_printf
and bpf_snprintf. Their signatures specify that all arguments are
provided from the BPF world as u64s (in an array or as registers). All
of these helpers are currently implemented by calling functions such as
snprintf() whose signatures take a variable number of arguments, then
placed in a va_list by the compiler to call vsnprintf().

"d9c9e4db bpf: Factorize bpf_trace_printk and bpf_seq_printf" introduced
a bpf_printf_prepare function that fills an array of u64 sanitized
arguments with an array of "modifiers" which indicate what the "real"
size of each argument should be (given by the format specifier). The
BPF_CAST_FMT_ARG macro consumes these arrays and casts each argument to
its real size. However, the C promotion rules implicitly cast them all
back to u64s. Therefore, the arguments given to snprintf are u64s and
the va_list constructed by the compiler will use 64 bits for each
argument. On 64 bit machines, this happens to work well because 32 bit
arguments in va_lists need to occupy 64 bits anyway, but on 32 bit
architectures this breaks the layout of the va_list expected by the
called function and mangles values.

In "88a5c690b6 bpf: fix bpf_trace_printk on 32 bit archs", this problem
had been solved for bpf_trace_printk only with a "horrid workaround"
that emitted multiple calls to trace_printk where each call had
different argument types and generated different va_list layouts. One of
the calls would be dynamically chosen at runtime. This was ok with the 3
arguments that bpf_trace_printk takes but bpf_seq_printf and
bpf_snprintf accept up to 12 arguments. Because this approach scales
code exponentially, it is not a viable option anymore.

Because the promotion rules are part of the language and because the
construction of a va_list is an arch-specific ABI, it's best to just
avoid variadic arguments and va_lists altogether. Thankfully the
kernel's snprintf() has an alternative in the form of bstr_printf() that
accepts arguments in a "binary buffer representation". These binary
buffers are currently created by vbin_printf and used in the tracing
subsystem to split the cost of printing into two parts: a fast one that
only dereferences and remembers values, and a slower one, called later,
that does the pretty-printing.

This patch refactors bpf_printf_prepare to construct binary buffers of
arguments consumable by bstr_printf() instead of arrays of arguments and
modifiers. This gets rid of BPF_CAST_FMT_ARG and greatly simplifies the
bpf_printf_prepare usage but there are a few gotchas that change how
bpf_printf_prepare needs to do things.

Currently, bpf_printf_prepare uses a per cpu temporary buffer as a
generic storage for strings and IP addresses. With this refactoring, the
temporary buffers now hold all the arguments in a structured binary
format.

To comply with the format expected by bstr_printf, certain format
specifiers also need to be pre-formatted: %pB and %pi6/%pi4/%pI4/%pI6.
Because vsnprintf subroutines for these specifiers are hard to expose,
we pre-format these arguments with calls to snprintf().

Reported-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Florent Revest <revest@chromium.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20210427174313.860948-3-revest@chromium.org
---
 include/linux/bpf.h      |  22 +-----
 init/Kconfig             |   1 +
 kernel/bpf/helpers.c     | 188 +++++++++++++++++++++++++----------------------
 kernel/bpf/verifier.c    |   2 +-
 kernel/trace/bpf_trace.c |  34 +++------
 5 files changed, 115 insertions(+), 132 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index ad4bcf1cadbb..b33f199c4cc2 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2081,24 +2081,8 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
 struct btf_id_set;
 bool btf_id_set_contains(const struct btf_id_set *set, u32 id);
 
-enum bpf_printf_mod_type {
-	BPF_PRINTF_INT,
-	BPF_PRINTF_LONG,
-	BPF_PRINTF_LONG_LONG,
-};
-
-/* Workaround for getting va_list handling working with different argument type
- * combinations generically for 32 and 64 bit archs.
- */
-#define BPF_CAST_FMT_ARG(arg_nb, args, mod)				\
-	(mod[arg_nb] == BPF_PRINTF_LONG_LONG ||				\
-	 (mod[arg_nb] == BPF_PRINTF_LONG && __BITS_PER_LONG == 64)	\
-	  ? (u64)args[arg_nb]						\
-	  : (u32)args[arg_nb])
-
-int bpf_printf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
-		       u64 *final_args, enum bpf_printf_mod_type *mod,
-		       u32 num_args);
-void bpf_printf_cleanup(void);
+int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
+			u32 **bin_buf, u32 num_args);
+void bpf_bprintf_cleanup(void);
 
 #endif /* _LINUX_BPF_H */
diff --git a/init/Kconfig b/init/Kconfig
index 5deae45b8d81..0d82a1f838cc 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1708,6 +1708,7 @@ config BPF_SYSCALL
 	select BPF
 	select IRQ_WORK
 	select TASKS_TRACE_RCU
+	select BINARY_PRINTF
 	select NET_SOCK_MSG if INET
 	default n
 	help
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 85b26ca5aacd..544773970dbc 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -707,9 +707,6 @@ static int try_get_fmt_tmp_buf(char **tmp_buf)
 	struct bpf_printf_buf *bufs;
 	int used;
 
-	if (*tmp_buf)
-		return 0;
-
 	preempt_disable();
 	used = this_cpu_inc_return(bpf_printf_buf_used);
 	if (WARN_ON_ONCE(used > 1)) {
@@ -723,7 +720,7 @@ static int try_get_fmt_tmp_buf(char **tmp_buf)
 	return 0;
 }
 
-void bpf_printf_cleanup(void)
+void bpf_bprintf_cleanup(void)
 {
 	if (this_cpu_read(bpf_printf_buf_used)) {
 		this_cpu_dec(bpf_printf_buf_used);
@@ -732,43 +729,45 @@ void bpf_printf_cleanup(void)
 }
 
 /*
- * bpf_parse_fmt_str - Generic pass on format strings for printf-like helpers
+ * bpf_bprintf_prepare - Generic pass on format strings for bprintf-like helpers
  *
  * Returns a negative value if fmt is an invalid format string or 0 otherwise.
  *
  * This can be used in two ways:
- * - Format string verification only: when final_args and mod are NULL
+ * - Format string verification only: when bin_args is NULL
  * - Arguments preparation: in addition to the above verification, it writes in
- *   final_args a copy of raw_args where pointers from BPF have been sanitized
- *   into pointers safe to use by snprintf. This also writes in the mod array
- *   the size requirement of each argument, usable by BPF_CAST_FMT_ARG for ex.
+ *   bin_args a binary representation of arguments usable by bstr_printf where
+ *   pointers from BPF have been sanitized.
  *
  * In argument preparation mode, if 0 is returned, safe temporary buffers are
- * allocated and bpf_printf_cleanup should be called to free them after use.
+ * allocated and bpf_bprintf_cleanup should be called to free them after use.
  */
-int bpf_printf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
-			u64 *final_args, enum bpf_printf_mod_type *mod,
-			u32 num_args)
+int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
+			u32 **bin_args, u32 num_args)
 {
-	char *unsafe_ptr = NULL, *tmp_buf = NULL, *fmt_end;
-	size_t tmp_buf_len = MAX_PRINTF_BUF_LEN;
-	int err, i, num_spec = 0, copy_size;
-	enum bpf_printf_mod_type cur_mod;
+	char *unsafe_ptr = NULL, *tmp_buf = NULL, *tmp_buf_end, *fmt_end;
+	size_t sizeof_cur_arg, sizeof_cur_ip;
+	int err, i, num_spec = 0;
 	u64 cur_arg;
-	char fmt_ptype;
-
-	if (!!final_args != !!mod)
-		return -EINVAL;
+	char fmt_ptype, cur_ip[16], ip_spec[] = "%pXX";
 
 	fmt_end = strnchr(fmt, fmt_size, 0);
 	if (!fmt_end)
 		return -EINVAL;
 	fmt_size = fmt_end - fmt;
 
+	if (bin_args) {
+		if (num_args && try_get_fmt_tmp_buf(&tmp_buf))
+			return -EBUSY;
+
+		tmp_buf_end = tmp_buf + MAX_PRINTF_BUF_LEN;
+		*bin_args = (u32 *)tmp_buf;
+	}
+
 	for (i = 0; i < fmt_size; i++) {
 		if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) {
 			err = -EINVAL;
-			goto cleanup;
+			goto out;
 		}
 
 		if (fmt[i] != '%')
@@ -781,7 +780,7 @@ int bpf_printf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
 
 		if (num_spec >= num_args) {
 			err = -EINVAL;
-			goto cleanup;
+			goto out;
 		}
 
 		/* The string is zero-terminated so if fmt[i] != 0, we can
@@ -800,7 +799,7 @@ int bpf_printf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
 		}
 
 		if (fmt[i] == 'p') {
-			cur_mod = BPF_PRINTF_LONG;
+			sizeof_cur_arg = sizeof(long);
 
 			if ((fmt[i + 1] == 'k' || fmt[i + 1] == 'u') &&
 			    fmt[i + 2] == 's') {
@@ -811,117 +810,140 @@ int bpf_printf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
 
 			if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) ||
 			    ispunct(fmt[i + 1]) || fmt[i + 1] == 'K' ||
-			    fmt[i + 1] == 'x' || fmt[i + 1] == 'B' ||
-			    fmt[i + 1] == 's' || fmt[i + 1] == 'S') {
+			    fmt[i + 1] == 'x' || fmt[i + 1] == 's' ||
+			    fmt[i + 1] == 'S') {
 				/* just kernel pointers */
-				if (final_args)
+				if (tmp_buf)
 					cur_arg = raw_args[num_spec];
-				goto fmt_next;
+				i++;
+				goto nocopy_fmt;
+			}
+
+			if (fmt[i + 1] == 'B') {
+				if (tmp_buf)  {
+					err = snprintf(tmp_buf,
+						       (tmp_buf_end - tmp_buf),
+						       "%pB",
+						       (void *)(long)raw_args[num_spec]);
+					tmp_buf += (err + 1);
+				}
+
+				i++;
+				num_spec++;
+				continue;
 			}
 
 			/* only support "%pI4", "%pi4", "%pI6" and "%pi6". */
 			if ((fmt[i + 1] != 'i' && fmt[i + 1] != 'I') ||
 			    (fmt[i + 2] != '4' && fmt[i + 2] != '6')) {
 				err = -EINVAL;
-				goto cleanup;
+				goto out;
 			}
 
 			i += 2;
-			if (!final_args)
-				goto fmt_next;
+			if (!tmp_buf)
+				goto nocopy_fmt;
 
-			if (try_get_fmt_tmp_buf(&tmp_buf)) {
-				err = -EBUSY;
-				goto out;
-			}
-
-			copy_size = (fmt[i + 2] == '4') ? 4 : 16;
-			if (tmp_buf_len < copy_size) {
+			sizeof_cur_ip = (fmt[i] == '4') ? 4 : 16;
+			if (tmp_buf_end - tmp_buf < sizeof_cur_ip) {
 				err = -ENOSPC;
-				goto cleanup;
+				goto out;
 			}
 
 			unsafe_ptr = (char *)(long)raw_args[num_spec];
-			err = copy_from_kernel_nofault(tmp_buf, unsafe_ptr,
-						       copy_size);
+			err = copy_from_kernel_nofault(cur_ip, unsafe_ptr,
+						       sizeof_cur_ip);
 			if (err < 0)
-				memset(tmp_buf, 0, copy_size);
-			cur_arg = (u64)(long)tmp_buf;
-			tmp_buf += copy_size;
-			tmp_buf_len -= copy_size;
+				memset(cur_ip, 0, sizeof_cur_ip);
+
+			/* hack: bstr_printf expects IP addresses to be
+			 * pre-formatted as strings, ironically, the easiest way
+			 * to do that is to call snprintf.
+			 */
+			ip_spec[2] = fmt[i - 1];
+			ip_spec[3] = fmt[i];
+			err = snprintf(tmp_buf, tmp_buf_end - tmp_buf,
+				       ip_spec, &cur_ip);
 
-			goto fmt_next;
+			tmp_buf += err + 1;
+			num_spec++;
+
+			continue;
 		} else if (fmt[i] == 's') {
-			cur_mod = BPF_PRINTF_LONG;
 			fmt_ptype = fmt[i];
 fmt_str:
 			if (fmt[i + 1] != 0 &&
 			    !isspace(fmt[i + 1]) &&
 			    !ispunct(fmt[i + 1])) {
 				err = -EINVAL;
-				goto cleanup;
-			}
-
-			if (!final_args)
-				goto fmt_next;
-
-			if (try_get_fmt_tmp_buf(&tmp_buf)) {
-				err = -EBUSY;
 				goto out;
 			}
 
-			if (!tmp_buf_len) {
+			if (!tmp_buf)
+				goto nocopy_fmt;
+
+			if (tmp_buf_end == tmp_buf) {
 				err = -ENOSPC;
-				goto cleanup;
+				goto out;
 			}
 
 			unsafe_ptr = (char *)(long)raw_args[num_spec];
 			err = bpf_trace_copy_string(tmp_buf, unsafe_ptr,
-						    fmt_ptype, tmp_buf_len);
+						    fmt_ptype,
+						    tmp_buf_end - tmp_buf);
 			if (err < 0) {
 				tmp_buf[0] = '\0';
 				err = 1;
 			}
 
-			cur_arg = (u64)(long)tmp_buf;
 			tmp_buf += err;
-			tmp_buf_len -= err;
+			num_spec++;
 
-			goto fmt_next;
+			continue;
 		}
 
-		cur_mod = BPF_PRINTF_INT;
+		sizeof_cur_arg = sizeof(int);
 
 		if (fmt[i] == 'l') {
-			cur_mod = BPF_PRINTF_LONG;
+			sizeof_cur_arg = sizeof(long);
 			i++;
 		}
 		if (fmt[i] == 'l') {
-			cur_mod = BPF_PRINTF_LONG_LONG;
+			sizeof_cur_arg = sizeof(long long);
 			i++;
 		}
 
 		if (fmt[i] != 'i' && fmt[i] != 'd' && fmt[i] != 'u' &&
 		    fmt[i] != 'x' && fmt[i] != 'X') {
 			err = -EINVAL;
-			goto cleanup;
+			goto out;
 		}
 
-		if (final_args)
+		if (tmp_buf)
 			cur_arg = raw_args[num_spec];
-fmt_next:
-		if (final_args) {
-			mod[num_spec] = cur_mod;
-			final_args[num_spec] = cur_arg;
+nocopy_fmt:
+		if (tmp_buf) {
+			tmp_buf = PTR_ALIGN(tmp_buf, sizeof(u32));
+			if (tmp_buf_end - tmp_buf < sizeof_cur_arg) {
+				err = -ENOSPC;
+				goto out;
+			}
+
+			if (sizeof_cur_arg == 8) {
+				*(u32 *)tmp_buf = *(u32 *)&cur_arg;
+				*(u32 *)(tmp_buf + 4) = *((u32 *)&cur_arg + 1);
+			} else {
+				*(u32 *)tmp_buf = (u32)(long)cur_arg;
+			}
+			tmp_buf += sizeof_cur_arg;
 		}
 		num_spec++;
 	}
 
 	err = 0;
-cleanup:
-	if (err)
-		bpf_printf_cleanup();
 out:
+	if (err)
+		bpf_bprintf_cleanup();
 	return err;
 }
 
@@ -930,9 +952,8 @@ out:
 BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt,
 	   const void *, data, u32, data_len)
 {
-	enum bpf_printf_mod_type mod[MAX_SNPRINTF_VARARGS];
-	u64 args[MAX_SNPRINTF_VARARGS];
 	int err, num_args;
+	u32 *bin_args;
 
 	if (data_len % 8 || data_len > MAX_SNPRINTF_VARARGS * 8 ||
 	    (data_len && !data))
@@ -942,22 +963,13 @@ BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt,
 	/* ARG_PTR_TO_CONST_STR guarantees that fmt is zero-terminated so we
 	 * can safely give an unbounded size.
 	 */
-	err = bpf_printf_prepare(fmt, UINT_MAX, data, args, mod, num_args);
+	err = bpf_bprintf_prepare(fmt, UINT_MAX, data, &bin_args, num_args);
 	if (err < 0)
 		return err;
 
-	/* Maximumly we can have MAX_SNPRINTF_VARARGS parameters, just give
-	 * all of them to snprintf().
-	 */
-	err = snprintf(str, str_size, fmt, BPF_CAST_FMT_ARG(0, args, mod),
-		BPF_CAST_FMT_ARG(1, args, mod), BPF_CAST_FMT_ARG(2, args, mod),
-		BPF_CAST_FMT_ARG(3, args, mod), BPF_CAST_FMT_ARG(4, args, mod),
-		BPF_CAST_FMT_ARG(5, args, mod), BPF_CAST_FMT_ARG(6, args, mod),
-		BPF_CAST_FMT_ARG(7, args, mod), BPF_CAST_FMT_ARG(8, args, mod),
-		BPF_CAST_FMT_ARG(9, args, mod), BPF_CAST_FMT_ARG(10, args, mod),
-		BPF_CAST_FMT_ARG(11, args, mod));
-
-	bpf_printf_cleanup();
+	err = bstr_printf(str, str_size, fmt, bin_args);
+
+	bpf_bprintf_cleanup();
 
 	return err + 1;
 }
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9145f88b2a0a..8fd552c16763 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5946,7 +5946,7 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
 	/* We are also guaranteed that fmt+fmt_map_off is NULL terminated, we
 	 * can focus on validating the format specifiers.
 	 */
-	err = bpf_printf_prepare(fmt, UINT_MAX, NULL, NULL, NULL, num_args);
+	err = bpf_bprintf_prepare(fmt, UINT_MAX, NULL, NULL, num_args);
 	if (err < 0)
 		verbose(env, "Invalid format string\n");
 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 0e67d12a8f40..d2d7cf6cfe83 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -381,27 +381,23 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
 	   u64, arg2, u64, arg3)
 {
 	u64 args[MAX_TRACE_PRINTK_VARARGS] = { arg1, arg2, arg3 };
-	enum bpf_printf_mod_type mod[MAX_TRACE_PRINTK_VARARGS];
+	u32 *bin_args;
 	static char buf[BPF_TRACE_PRINTK_SIZE];
 	unsigned long flags;
 	int ret;
 
-	ret = bpf_printf_prepare(fmt, fmt_size, args, args, mod,
-				 MAX_TRACE_PRINTK_VARARGS);
+	ret = bpf_bprintf_prepare(fmt, fmt_size, args, &bin_args,
+				  MAX_TRACE_PRINTK_VARARGS);
 	if (ret < 0)
 		return ret;
 
 	raw_spin_lock_irqsave(&trace_printk_lock, flags);
-	ret = snprintf(buf, sizeof(buf), fmt, BPF_CAST_FMT_ARG(0, args, mod),
-		BPF_CAST_FMT_ARG(1, args, mod), BPF_CAST_FMT_ARG(2, args, mod));
-	/* snprintf() will not append null for zero-length strings */
-	if (ret == 0)
-		buf[0] = '\0';
+	ret = bstr_printf(buf, sizeof(buf), fmt, bin_args);
 
 	trace_bpf_trace_printk(buf);
 	raw_spin_unlock_irqrestore(&trace_printk_lock, flags);
 
-	bpf_printf_cleanup();
+	bpf_bprintf_cleanup();
 
 	return ret;
 }
@@ -435,31 +431,21 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
 BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size,
 	   const void *, data, u32, data_len)
 {
-	enum bpf_printf_mod_type mod[MAX_SEQ_PRINTF_VARARGS];
-	u64 args[MAX_SEQ_PRINTF_VARARGS];
 	int err, num_args;
+	u32 *bin_args;
 
 	if (data_len & 7 || data_len > MAX_SEQ_PRINTF_VARARGS * 8 ||
 	    (data_len && !data))
 		return -EINVAL;
 	num_args = data_len / 8;
 
-	err = bpf_printf_prepare(fmt, fmt_size, data, args, mod, num_args);
+	err = bpf_bprintf_prepare(fmt, fmt_size, data, &bin_args, num_args);
 	if (err < 0)
 		return err;
 
-	/* Maximumly we can have MAX_SEQ_PRINTF_VARARGS parameter, just give
-	 * all of them to seq_printf().
-	 */
-	seq_printf(m, fmt, BPF_CAST_FMT_ARG(0, args, mod),
-		BPF_CAST_FMT_ARG(1, args, mod), BPF_CAST_FMT_ARG(2, args, mod),
-		BPF_CAST_FMT_ARG(3, args, mod), BPF_CAST_FMT_ARG(4, args, mod),
-		BPF_CAST_FMT_ARG(5, args, mod), BPF_CAST_FMT_ARG(6, args, mod),
-		BPF_CAST_FMT_ARG(7, args, mod), BPF_CAST_FMT_ARG(8, args, mod),
-		BPF_CAST_FMT_ARG(9, args, mod), BPF_CAST_FMT_ARG(10, args, mod),
-		BPF_CAST_FMT_ARG(11, args, mod));
-
-	bpf_printf_cleanup();
+	seq_bprintf(m, fmt, bin_args);
+
+	bpf_bprintf_cleanup();
 
 	return seq_has_overflowed(m) ? -EOVERFLOW : 0;
 }
-- 
cgit v1.2.3


From d991bb1c8da842a2a0b9dc83b1005e655783f861 Mon Sep 17 00:00:00 2001
From: Luc Van Oostenryck <luc.vanoostenryck@gmail.com>
Date: Thu, 29 Apr 2021 22:53:50 -0700
Subject: include/linux/compiler-gcc.h: sparse can do constant folding of
 __builtin_bswap*()

Sparse can do constant folding of __builtin_bswap*() since 2017.  Also, a
much more recent version of Sparse is needed anyway, see commit 6ec4476ac825
("Raise gcc version requirement to 4.9").

So, remove the comment about sparse not being yet able to constant fold
__builtin_bswap*() and remove the corresponding test of __CHECKER__.

Link: https://lkml.kernel.org/r/20210226092236.99369-1-luc.vanoostenryck@gmail.com
Signed-off-by: Luc Van Oostenryck <luc.vanoostenryck@gmail.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Miguel Ojeda <ojeda@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compiler-gcc.h | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 48750243db4c..5d97ef738a57 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -90,15 +90,11 @@
  */
 #define asm_volatile_goto(x...)	do { asm goto(x); asm (""); } while (0)
 
-/*
- * sparse (__CHECKER__) pretends to be gcc, but can't do constant
- * folding in __builtin_bswap*() (yet), so don't set these for it.
- */
-#if defined(CONFIG_ARCH_USE_BUILTIN_BSWAP) && !defined(__CHECKER__)
+#if defined(CONFIG_ARCH_USE_BUILTIN_BSWAP)
 #define __HAVE_BUILTIN_BSWAP32__
 #define __HAVE_BUILTIN_BSWAP64__
 #define __HAVE_BUILTIN_BSWAP16__
-#endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP && !__CHECKER__ */
+#endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */
 
 #if GCC_VERSION >= 70000
 #define KASAN_ABI_VERSION 5
-- 
cgit v1.2.3


From 8e9b16c47680f6e7d6e5864a37f313f905a91cf5 Mon Sep 17 00:00:00 2001
From: Sergei Trofimovich <slyfox@gentoo.org>
Date: Thu, 29 Apr 2021 22:55:08 -0700
Subject: mm: page_owner: detect page_owner recursion via task_struct

Before this change, page_owner recursion was detected by fetching a
backtrace and inspecting it for the current instruction pointer.
This approach has a few problems:

 - it is slightly slow as it requires extra backtrace and a linear stack
   scan of the result

 - the check comes too late if fetching the backtrace itself requires
   memory allocation (ia64's unwinder does).

To simplify recursion tracking let's use page_owner recursion flag in
'struct task_struct'.

The change makes page_owner=on work on ia64 by avoiding infinite
recursion in:
  kmalloc()
  -> __set_page_owner()
  -> save_stack()
  -> unwind() [ia64-specific]
  -> build_script()
  -> kmalloc()
  -> __set_page_owner() [we short-circuit here]
  -> save_stack()
  -> unwind() [recursion]

Link: https://lkml.kernel.org/r/20210402115342.1463781-1-slyfox@gentoo.org
Signed-off-by: Sergei Trofimovich <slyfox@gentoo.org>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ben Segall <bsegall@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h |  4 ++++
 mm/page_owner.c       | 32 ++++++++++----------------------
 2 files changed, 14 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d7d07aa0facf..9c25c8e67030 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -841,6 +841,10 @@ struct task_struct {
 	/* Stalled due to lack of memory */
 	unsigned			in_memstall:1;
 #endif
+#ifdef CONFIG_PAGE_OWNER
+	/* Used by page_owner=on to detect recursion in page tracking. */
+	unsigned			in_page_owner:1;
+#endif
 
 	unsigned long			atomic_flags; /* Flags requiring atomic access. */
 
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 5c941ca9861d..9661d5320a07 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -98,42 +98,30 @@ static inline struct page_owner *get_page_owner(struct page_ext *page_ext)
 	return (void *)page_ext + page_owner_ops.offset;
 }
 
-static inline bool check_recursive_alloc(unsigned long *entries,
-					 unsigned int nr_entries,
-					 unsigned long ip)
-{
-	unsigned int i;
-
-	for (i = 0; i < nr_entries; i++) {
-		if (entries[i] == ip)
-			return true;
-	}
-	return false;
-}
-
 static noinline depot_stack_handle_t save_stack(gfp_t flags)
 {
 	unsigned long entries[PAGE_OWNER_STACK_DEPTH];
 	depot_stack_handle_t handle;
 	unsigned int nr_entries;
 
-	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2);
-
 	/*
-	 * We need to check recursion here because our request to
-	 * stackdepot could trigger memory allocation to save new
-	 * entry. New memory allocation would reach here and call
-	 * stack_depot_save_entries() again if we don't catch it. There is
-	 * still not enough memory in stackdepot so it would try to
-	 * allocate memory again and loop forever.
+	 * Avoid recursion.
+	 *
+	 * Sometimes page metadata allocation tracking requires more
+	 * memory to be allocated:
+	 * - when new stack trace is saved to stack depot
+	 * - when backtrace itself is calculated (ia64)
 	 */
-	if (check_recursive_alloc(entries, nr_entries, _RET_IP_))
+	if (current->in_page_owner)
 		return dummy_handle;
+	current->in_page_owner = 1;
 
+	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2);
 	handle = stack_depot_save(entries, nr_entries, flags);
 	if (!handle)
 		handle = failure_handle;
 
+	current->in_page_owner = 0;
 	return handle;
 }
 
-- 
cgit v1.2.3


From 63135aa3866db99fd923b716c5ff2e468879624a Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 29 Apr 2021 22:55:18 -0700
Subject: mm: provide filemap_range_needs_writeback() helper

Patch series "Improve IOCB_NOWAIT O_DIRECT reads", v3.

An internal workload complained because it was using too much CPU, and
when I took a look, we had a lot of io_uring workers going to town.

For an async buffered read like workload, I am normally expecting _zero_
offloads to a worker thread, but this one had tons of them.  I'd drop
caches and things would look good again, but then a minute later we'd
regress back to using workers.  Turns out that every minute something
was reading parts of the device, which would add page cache for that
inode.  I put patches like these in for our kernel, and the problem was
solved.

Don't -EAGAIN IOCB_NOWAIT dio reads just because we have page cache
entries for the given range.  This causes unnecessary work on the
caller's side, when the IO could have been issued totally fine without
blocking on writeback when there is none.

This patch (of 3):

For O_DIRECT reads/writes, we check if we need to issue a call to
filemap_write_and_wait_range() to issue and/or wait for writeback for any
page in the given range.  The existing mechanism just checks for a page in
the range, which is suboptimal for IOCB_NOWAIT as we'll fallback to the
slow path (and needing retry) if there's just a clean page cache page in
the range.

Provide filemap_range_needs_writeback() which tries a little harder to
check if we actually need to issue and/or wait for writeback in the range.

Link: https://lkml.kernel.org/r/20210224164455.1096727-1-axboe@kernel.dk
Link: https://lkml.kernel.org/r/20210224164455.1096727-2-axboe@kernel.dk
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/fs.h |  2 ++
 mm/filemap.c       | 43 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index bf4e90d3ab18..12766edee81f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2878,6 +2878,8 @@ static inline int filemap_fdatawait(struct address_space *mapping)
 
 extern bool filemap_range_has_page(struct address_space *, loff_t lstart,
 				  loff_t lend);
+extern bool filemap_range_needs_writeback(struct address_space *,
+					  loff_t lstart, loff_t lend);
 extern int filemap_write_and_wait_range(struct address_space *mapping,
 				        loff_t lstart, loff_t lend);
 extern int __filemap_fdatawrite_range(struct address_space *mapping,
diff --git a/mm/filemap.c b/mm/filemap.c
index 151090fdcf29..1750742b5689 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -635,6 +635,49 @@ static bool mapping_needs_writeback(struct address_space *mapping)
 	return mapping->nrpages;
 }
 
+/**
+ * filemap_range_needs_writeback - check if range potentially needs writeback
+ * @mapping:           address space within which to check
+ * @start_byte:        offset in bytes where the range starts
+ * @end_byte:          offset in bytes where the range ends (inclusive)
+ *
+ * Find at least one page in the range supplied that is dirty, locked, or
+ * under writeback. Used by O_DIRECT read/write with IOCB_NOWAIT, to see
+ * if the caller needs to do filemap_write_and_wait_range() before
+ * proceeding.
+ *
+ * Return: %true if the caller should do filemap_write_and_wait_range() before
+ * doing O_DIRECT to a page in this range, %false otherwise.
+ */
+bool filemap_range_needs_writeback(struct address_space *mapping,
+				   loff_t start_byte, loff_t end_byte)
+{
+	XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
+	pgoff_t max = end_byte >> PAGE_SHIFT;
+	struct page *page;
+
+	if (!mapping_needs_writeback(mapping))
+		return false;
+	if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
+	    !mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
+		return false;
+	if (end_byte < start_byte)
+		return false;
+
+	rcu_read_lock();
+	xas_for_each(&xas, page, max) {
+		if (xas_retry(&xas, page))
+			continue;
+		if (xa_is_value(page))
+			continue;
+		if (PageDirty(page) || PageLocked(page) || PageWriteback(page))
+			break;
+	}
+	rcu_read_unlock();
+	return page != NULL;
+}
+EXPORT_SYMBOL_GPL(filemap_range_needs_writeback);
+
 /**
  * filemap_write_and_wait_range - write out & wait on a file range
  * @mapping:	the address_space for the pages
-- 
cgit v1.2.3


From 1c824a680b1b67ad43c0908f11a70bcf37af56d5 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Thu, 29 Apr 2021 22:55:32 -0700
Subject: mm: page-writeback: simplify memcg handling in
 test_clear_page_writeback()

Page writeback doesn't hold a page reference, which allows truncate to
free a page the second PageWriteback is cleared.  This used to require
special attention in test_clear_page_writeback(), where we had to be
careful not to rely on the unstable page->memcg binding and look up all
the necessary information before clearing the writeback flag.

Since commit 073861ed77b6 ("mm: fix VM_BUG_ON(PageTail) and
BUG_ON(PageWriteback)") test_clear_page_writeback() is called with an
explicit reference on the page, and this dance is no longer needed.

Use unlock_page_memcg() and dec_lruvec_page_state() directly.

This removes the last user of the lock_page_memcg() return value, so change
it to void.  Touch up the comments in there as well.  This also removes
the last extern user of __unlock_page_memcg(), so make it static.  Further,
it removes the last user of dec_lruvec_state(), so delete it, along with a
few other unused helpers.

Link: https://lkml.kernel.org/r/YCQbYAWg4nvBFL6h@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Hugh Dickins <hughd@google.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 10 ++--------
 include/linux/vmstat.h     | 24 +++---------------------
 mm/memcontrol.c            | 36 +++++++++++-------------------------
 mm/page-writeback.c        |  9 +++------
 4 files changed, 19 insertions(+), 60 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0c04d39a7967..ae448d955a87 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -867,8 +867,7 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg);
 extern bool cgroup_memory_noswap;
 #endif
 
-struct mem_cgroup *lock_page_memcg(struct page *page);
-void __unlock_page_memcg(struct mem_cgroup *memcg);
+void lock_page_memcg(struct page *page);
 void unlock_page_memcg(struct page *page);
 
 /*
@@ -1289,12 +1288,7 @@ mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
 {
 }
 
-static inline struct mem_cgroup *lock_page_memcg(struct page *page)
-{
-	return NULL;
-}
-
-static inline void __unlock_page_memcg(struct mem_cgroup *memcg)
+static inline void lock_page_memcg(struct page *page)
 {
 }
 
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 506d625163a1..3299cd69e4ca 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -512,16 +512,10 @@ static inline void mod_lruvec_page_state(struct page *page,
 
 #endif /* CONFIG_MEMCG */
 
-static inline void __inc_lruvec_state(struct lruvec *lruvec,
-				      enum node_stat_item idx)
-{
-	__mod_lruvec_state(lruvec, idx, 1);
-}
-
-static inline void __dec_lruvec_state(struct lruvec *lruvec,
-				      enum node_stat_item idx)
+static inline void inc_lruvec_state(struct lruvec *lruvec,
+				    enum node_stat_item idx)
 {
-	__mod_lruvec_state(lruvec, idx, -1);
+	mod_lruvec_state(lruvec, idx, 1);
 }
 
 static inline void __inc_lruvec_page_state(struct page *page,
@@ -536,18 +530,6 @@ static inline void __dec_lruvec_page_state(struct page *page,
 	__mod_lruvec_page_state(page, idx, -1);
 }
 
-static inline void inc_lruvec_state(struct lruvec *lruvec,
-				    enum node_stat_item idx)
-{
-	mod_lruvec_state(lruvec, idx, 1);
-}
-
-static inline void dec_lruvec_state(struct lruvec *lruvec,
-				    enum node_stat_item idx)
-{
-	mod_lruvec_state(lruvec, idx, -1);
-}
-
 static inline void inc_lruvec_page_state(struct page *page,
 					 enum node_stat_item idx)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e064ac0d850a..06caac775abb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2118,11 +2118,10 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
  * This function protects unlocked LRU pages from being moved to
  * another cgroup.
  *
- * It ensures lifetime of the returned memcg. Caller is responsible
- * for the lifetime of the page; __unlock_page_memcg() is available
- * when @page might get freed inside the locked section.
+ * It ensures lifetime of the locked memcg. Caller is responsible
+ * for the lifetime of the page.
  */
-struct mem_cgroup *lock_page_memcg(struct page *page)
+void lock_page_memcg(struct page *page)
 {
 	struct page *head = compound_head(page); /* rmap on tail pages */
 	struct mem_cgroup *memcg;
@@ -2132,21 +2131,15 @@ struct mem_cgroup *lock_page_memcg(struct page *page)
 	 * The RCU lock is held throughout the transaction.  The fast
 	 * path can get away without acquiring the memcg->move_lock
 	 * because page moving starts with an RCU grace period.
-	 *
-	 * The RCU lock also protects the memcg from being freed when
-	 * the page state that is going to change is the only thing
-	 * preventing the page itself from being freed. E.g. writeback
-	 * doesn't hold a page reference and relies on PG_writeback to
-	 * keep off truncation, migration and so forth.
          */
 	rcu_read_lock();
 
 	if (mem_cgroup_disabled())
-		return NULL;
+		return;
 again:
 	memcg = page_memcg(head);
 	if (unlikely(!memcg))
-		return NULL;
+		return;
 
 #ifdef CONFIG_PROVE_LOCKING
 	local_irq_save(flags);
@@ -2155,7 +2148,7 @@ again:
 #endif
 
 	if (atomic_read(&memcg->moving_account) <= 0)
-		return memcg;
+		return;
 
 	spin_lock_irqsave(&memcg->move_lock, flags);
 	if (memcg != page_memcg(head)) {
@@ -2164,24 +2157,17 @@ again:
 	}
 
 	/*
-	 * When charge migration first begins, we can have locked and
-	 * unlocked page stat updates happening concurrently.  Track
-	 * the task who has the lock for unlock_page_memcg().
+	 * When charge migration first begins, we can have multiple
+	 * critical sections holding the fast-path RCU lock and one
+	 * holding the slowpath move_lock. Track the task who has the
+	 * move_lock for unlock_page_memcg().
 	 */
 	memcg->move_lock_task = current;
 	memcg->move_lock_flags = flags;
-
-	return memcg;
 }
 EXPORT_SYMBOL(lock_page_memcg);
 
-/**
- * __unlock_page_memcg - unlock and unpin a memcg
- * @memcg: the memcg
- *
- * Unlock and unpin a memcg returned by lock_page_memcg().
- */
-void __unlock_page_memcg(struct mem_cgroup *memcg)
+static void __unlock_page_memcg(struct mem_cgroup *memcg)
 {
 	if (memcg && memcg->move_lock_task == current) {
 		unsigned long flags = memcg->move_lock_flags;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 9e35b636a393..5e761fb62800 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2722,12 +2722,9 @@ EXPORT_SYMBOL(clear_page_dirty_for_io);
 int test_clear_page_writeback(struct page *page)
 {
 	struct address_space *mapping = page_mapping(page);
-	struct mem_cgroup *memcg;
-	struct lruvec *lruvec;
 	int ret;
 
-	memcg = lock_page_memcg(page);
-	lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
+	lock_page_memcg(page);
 	if (mapping && mapping_use_writeback_tags(mapping)) {
 		struct inode *inode = mapping->host;
 		struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -2755,11 +2752,11 @@ int test_clear_page_writeback(struct page *page)
 		ret = TestClearPageWriteback(page);
 	}
 	if (ret) {
-		dec_lruvec_state(lruvec, NR_WRITEBACK);
+		dec_lruvec_page_state(page, NR_WRITEBACK);
 		dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
 		inc_node_page_state(page, NR_WRITTEN);
 	}
-	__unlock_page_memcg(memcg);
+	unlock_page_memcg(page);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 842ca547f706b1e05ccf3026a0ab15d24772a188 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 29 Apr 2021 22:55:35 -0700
Subject: mm: move page_mapping_file to pagemap.h

page_mapping_file() is only used by some architectures, and then it
is usually only used in one place.  Make it a static inline function
so other architectures don't have to carry this dead code.

Link: https://lkml.kernel.org/r/20210317123011.350118-1-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
Cc: Huang Ying <ying.huang@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/mm/copypage-v4mc.c   |  1 +
 arch/arm/mm/copypage-v6.c     |  1 +
 arch/arm/mm/copypage-xscale.c |  1 +
 arch/csky/abiv1/cacheflush.c  |  1 +
 arch/mips/mm/cache.c          |  1 +
 arch/nios2/mm/cacheflush.c    |  1 +
 arch/sh/mm/cache-sh4.c        |  1 +
 arch/sh/mm/cache-sh7705.c     |  1 +
 arch/sparc/mm/tlb.c           |  1 +
 include/linux/mm.h            |  1 -
 include/linux/pagemap.h       | 10 ++++++++++
 mm/util.c                     | 10 ----------
 12 files changed, 19 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mm/copypage-v4mc.c b/arch/arm/mm/copypage-v4mc.c
index 44f7292ec27b..f1da3b439b96 100644
--- a/arch/arm/mm/copypage-v4mc.c
+++ b/arch/arm/mm/copypage-v4mc.c
@@ -13,6 +13,7 @@
 #include <linux/init.h>
 #include <linux/mm.h>
 #include <linux/highmem.h>
+#include <linux/pagemap.h>
 
 #include <asm/tlbflush.h>
 #include <asm/cacheflush.h>
diff --git a/arch/arm/mm/copypage-v6.c b/arch/arm/mm/copypage-v6.c
index 6a769a6c314e..d8a115de5507 100644
--- a/arch/arm/mm/copypage-v6.c
+++ b/arch/arm/mm/copypage-v6.c
@@ -8,6 +8,7 @@
 #include <linux/spinlock.h>
 #include <linux/mm.h>
 #include <linux/highmem.h>
+#include <linux/pagemap.h>
 
 #include <asm/shmparam.h>
 #include <asm/tlbflush.h>
diff --git a/arch/arm/mm/copypage-xscale.c b/arch/arm/mm/copypage-xscale.c
index eb5d338657d1..bcb485620a05 100644
--- a/arch/arm/mm/copypage-xscale.c
+++ b/arch/arm/mm/copypage-xscale.c
@@ -13,6 +13,7 @@
 #include <linux/init.h>
 #include <linux/mm.h>
 #include <linux/highmem.h>
+#include <linux/pagemap.h>
 
 #include <asm/tlbflush.h>
 #include <asm/cacheflush.h>
diff --git a/arch/csky/abiv1/cacheflush.c b/arch/csky/abiv1/cacheflush.c
index 9f1fe80cc847..07ff17ea33de 100644
--- a/arch/csky/abiv1/cacheflush.c
+++ b/arch/csky/abiv1/cacheflush.c
@@ -4,6 +4,7 @@
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/fs.h>
+#include <linux/pagemap.h>
 #include <linux/syscalls.h>
 #include <linux/spinlock.h>
 #include <asm/page.h>
diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c
index 7719d632df8d..a7bf0c80371c 100644
--- a/arch/mips/mm/cache.c
+++ b/arch/mips/mm/cache.c
@@ -15,6 +15,7 @@
 #include <linux/syscalls.h>
 #include <linux/mm.h>
 #include <linux/highmem.h>
+#include <linux/pagemap.h>
 
 #include <asm/cacheflush.h>
 #include <asm/processor.h>
diff --git a/arch/nios2/mm/cacheflush.c b/arch/nios2/mm/cacheflush.c
index 65de1bd6a760..6aa9257c3ede 100644
--- a/arch/nios2/mm/cacheflush.c
+++ b/arch/nios2/mm/cacheflush.c
@@ -11,6 +11,7 @@
 #include <linux/sched.h>
 #include <linux/mm.h>
 #include <linux/fs.h>
+#include <linux/pagemap.h>
 
 #include <asm/cacheflush.h>
 #include <asm/cpuinfo.h>
diff --git a/arch/sh/mm/cache-sh4.c b/arch/sh/mm/cache-sh4.c
index ddfa9685f1ef..72c2e1b46c08 100644
--- a/arch/sh/mm/cache-sh4.c
+++ b/arch/sh/mm/cache-sh4.c
@@ -16,6 +16,7 @@
 #include <linux/mutex.h>
 #include <linux/fs.h>
 #include <linux/highmem.h>
+#include <linux/pagemap.h>
 #include <asm/mmu_context.h>
 #include <asm/cache_insns.h>
 #include <asm/cacheflush.h>
diff --git a/arch/sh/mm/cache-sh7705.c b/arch/sh/mm/cache-sh7705.c
index 4c67b3d88775..9b63a53a5e46 100644
--- a/arch/sh/mm/cache-sh7705.c
+++ b/arch/sh/mm/cache-sh7705.c
@@ -13,6 +13,7 @@
 #include <linux/mman.h>
 #include <linux/mm.h>
 #include <linux/fs.h>
+#include <linux/pagemap.h>
 #include <linux/threads.h>
 #include <asm/addrspace.h>
 #include <asm/page.h>
diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c
index 20ee14739333..9a725547578e 100644
--- a/arch/sparc/mm/tlb.c
+++ b/arch/sparc/mm/tlb.c
@@ -9,6 +9,7 @@
 #include <linux/mm.h>
 #include <linux/swap.h>
 #include <linux/preempt.h>
+#include <linux/pagemap.h>
 
 #include <asm/tlbflush.h>
 #include <asm/cacheflush.h>
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 21115933b9b8..2e5c207e702c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1629,7 +1629,6 @@ static inline pgoff_t page_index(struct page *page)
 
 bool page_mapped(struct page *page);
 struct address_space *page_mapping(struct page *page);
-struct address_space *page_mapping_file(struct page *page);
 
 /*
  * Return true only if the page has been allocated with
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 4686f9ab0636..469fa7ffcf96 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -157,6 +157,16 @@ static inline void filemap_nr_thps_dec(struct address_space *mapping)
 
 void release_pages(struct page **pages, int nr);
 
+/*
+ * For file cache pages, return the address_space, otherwise return NULL
+ */
+static inline struct address_space *page_mapping_file(struct page *page)
+{
+	if (unlikely(PageSwapCache(page)))
+		return NULL;
+	return page_mapping(page);
+}
+
 /*
  * speculatively take a reference to a page.
  * If the page is free (_refcount == 0), then _refcount is untouched, and 0
diff --git a/mm/util.c b/mm/util.c
index c37e24d5fa43..083c5c417cfc 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -711,16 +711,6 @@ struct address_space *page_mapping(struct page *page)
 }
 EXPORT_SYMBOL(page_mapping);
 
-/*
- * For file cache pages, return the address_space, otherwise return NULL
- */
-struct address_space *page_mapping_file(struct page *page)
-{
-	if (unlikely(PageSwapCache(page)))
-		return NULL;
-	return page_mapping(page);
-}
-
 /* Slow path of page_mapcount() for compound pages */
 int __page_mapcount(struct page *page)
 {
-- 
cgit v1.2.3


From 458a4f788f8602e5701b3d8c2fb6b021310a7301 Mon Sep 17 00:00:00 2001
From: Joao Martins <joao.m.martins@oracle.com>
Date: Thu, 29 Apr 2021 22:55:50 -0700
Subject: mm/gup: add a range variant of unpin_user_pages_dirty_lock()

Add an unpin_user_page_range_dirty_lock() API which takes a starting page
and how many consecutive pages we want to unpin and optionally dirty.

To that end, define another iterator for_each_compound_range() that
operates in page ranges as opposed to page array.

For users (like RDMA mr_dereg) where each sg represents a contiguous set
of pages, we're able to unpin pages more efficiently without having to
supply an array of pages, which is what happens today with
unpin_user_pages().

Link: https://lkml.kernel.org/r/20210212130843.13865-4-joao.m.martins@oracle.com
Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Doug Ledford <dledford@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h |  2 ++
 mm/gup.c           | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2e5c207e702c..702c2a7379d6 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1265,6 +1265,8 @@ static inline void put_page(struct page *page)
 void unpin_user_page(struct page *page);
 void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
 				 bool make_dirty);
+void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
+				      bool make_dirty);
 void unpin_user_pages(struct page **pages, unsigned long npages);
 
 /**
diff --git a/mm/gup.c b/mm/gup.c
index de1b75ef44da..66522ae28d09 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -213,6 +213,32 @@ void unpin_user_page(struct page *page)
 }
 EXPORT_SYMBOL(unpin_user_page);
 
+static inline void compound_range_next(unsigned long i, unsigned long npages,
+				       struct page **list, struct page **head,
+				       unsigned int *ntails)
+{
+	struct page *next, *page;
+	unsigned int nr = 1;
+
+	if (i >= npages)
+		return;
+
+	next = *list + i;
+	page = compound_head(next);
+	if (PageCompound(page) && compound_order(page) >= 1)
+		nr = min_t(unsigned int,
+			   page + compound_nr(page) - next, npages - i);
+
+	*head = page;
+	*ntails = nr;
+}
+
+#define for_each_compound_range(__i, __list, __npages, __head, __ntails) \
+	for (__i = 0, \
+	     compound_range_next(__i, __npages, __list, &(__head), &(__ntails)); \
+	     __i < __npages; __i += __ntails, \
+	     compound_range_next(__i, __npages, __list, &(__head), &(__ntails)))
+
 static inline void compound_next(unsigned long i, unsigned long npages,
 				 struct page **list, struct page **head,
 				 unsigned int *ntails)
@@ -301,6 +327,42 @@ void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
 }
 EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
 
+/**
+ * unpin_user_page_range_dirty_lock() - release and optionally dirty
+ * gup-pinned page range
+ *
+ * @page:  the starting page of a range maybe marked dirty, and definitely released.
+ * @npages: number of consecutive pages to release.
+ * @make_dirty: whether to mark the pages dirty
+ *
+ * "gup-pinned page range" refers to a range of pages that has had one of the
+ * pin_user_pages() variants called on that page.
+ *
+ * For the page range defined by [page .. page+npages), make that range (or
+ * its head pages, if a compound page) dirty, if @make_dirty is true, and if the
+ * page range was previously listed as clean.
+ *
+ * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
+ * required, then the caller should a) verify that this is really correct,
+ * because _lock() is usually required, and b) hand code it:
+ * set_page_dirty_lock(), unpin_user_page().
+ *
+ */
+void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
+				      bool make_dirty)
+{
+	unsigned long index;
+	struct page *head;
+	unsigned int ntails;
+
+	for_each_compound_range(index, &page, npages, head, ntails) {
+		if (make_dirty && !PageDirty(head))
+			set_page_dirty_lock(head);
+		put_compound_head(head, ntails, FOLL_PIN);
+	}
+}
+EXPORT_SYMBOL(unpin_user_page_range_dirty_lock);
+
 /**
  * unpin_user_pages() - release an array of gup-pinned pages.
  * @pages:  array of pages to be marked dirty and released.
-- 
cgit v1.2.3


From 4066c119483af8e86a75447fd35be1d2553d370f Mon Sep 17 00:00:00 2001
From: Yang Shi <shy828301@gmail.com>
Date: Thu, 29 Apr 2021 22:55:56 -0700
Subject: mm: gup: remove FOLL_SPLIT

Since commit 5a52c9df62b4 ("uprobe: use FOLL_SPLIT_PMD instead of
FOLL_SPLIT") and commit ba925fa35057 ("s390/gmap: improve THP splitting")
FOLL_SPLIT has not been used anymore.  Remove the dead code.

Link: https://lkml.kernel.org/r/20210330203900.9222-1-shy828301@gmail.com
Signed-off-by: Yang Shi <shy828301@gmail.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/vm/transhuge.rst |  5 -----
 include/linux/mm.h             |  1 -
 mm/gup.c                       | 28 ++--------------------------
 3 files changed, 2 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/vm/transhuge.rst b/Documentation/vm/transhuge.rst
index 0ed23e59abe5..216db1d67d04 100644
--- a/Documentation/vm/transhuge.rst
+++ b/Documentation/vm/transhuge.rst
@@ -53,11 +53,6 @@ prevent the page from being split by anyone.
    of handling GUP on hugetlbfs will also work fine on transparent
    hugepage backed mappings.
 
-In case you can't handle compound pages if they're returned by
-follow_page, the FOLL_SPLIT bit can be specified as a parameter to
-follow_page, so that it will split the hugepages before returning
-them.
-
 Graceful fallback
 =================
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 702c2a7379d6..64be3baf861a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2791,7 +2791,6 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 #define FOLL_NOWAIT	0x20	/* if a disk transfer is needed, start the IO
 				 * and return without waiting upon it */
 #define FOLL_POPULATE	0x40	/* fault in page */
-#define FOLL_SPLIT	0x80	/* don't return transhuge pages, split them */
 #define FOLL_HWPOISON	0x100	/* check page is hwpoisoned */
 #define FOLL_NUMA	0x200	/* force NUMA hinting page fault */
 #define FOLL_MIGRATION	0x400	/* wait for page to replace migration entry */
diff --git a/mm/gup.c b/mm/gup.c
index 66522ae28d09..71e546e279fc 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -516,18 +516,6 @@ retry:
 		}
 	}
 
-	if (flags & FOLL_SPLIT && PageTransCompound(page)) {
-		get_page(page);
-		pte_unmap_unlock(ptep, ptl);
-		lock_page(page);
-		ret = split_huge_page(page);
-		unlock_page(page);
-		put_page(page);
-		if (ret)
-			return ERR_PTR(ret);
-		goto retry;
-	}
-
 	/* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
 	if (unlikely(!try_grab_page(page, flags))) {
 		page = ERR_PTR(-ENOMEM);
@@ -672,7 +660,7 @@ retry_locked:
 		spin_unlock(ptl);
 		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
 	}
-	if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
+	if (flags & FOLL_SPLIT_PMD) {
 		int ret;
 		page = pmd_page(*pmd);
 		if (is_huge_zero_page(page)) {
@@ -681,19 +669,7 @@ retry_locked:
 			split_huge_pmd(vma, pmd, address);
 			if (pmd_trans_unstable(pmd))
 				ret = -EBUSY;
-		} else if (flags & FOLL_SPLIT) {
-			if (unlikely(!try_get_page(page))) {
-				spin_unlock(ptl);
-				return ERR_PTR(-ENOMEM);
-			}
-			spin_unlock(ptl);
-			lock_page(page);
-			ret = split_huge_page(page);
-			unlock_page(page);
-			put_page(page);
-			if (pmd_none(*pmd))
-				return no_page_table(vma, flags);
-		} else {  /* flags & FOLL_SPLIT_PMD */
+		} else {
 			spin_unlock(ptl);
 			split_huge_pmd(vma, pmd, address);
 			ret = pte_alloc(mm, pmd) ? -ENOMEM : 0;
-- 
cgit v1.2.3


From a3747b53b1771a787fea71d86a2fc39aea337685 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Thu, 29 Apr 2021 22:56:14 -0700
Subject: mm: memcontrol: kill mem_cgroup_nodeinfo()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

No need to encapsulate a simple struct member access.

Link: https://lkml.kernel.org/r/20210209163304.77088-3-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Michal Koutný <mkoutny@suse.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h |  8 +-------
 mm/memcontrol.c            | 21 +++++++++++----------
 2 files changed, 12 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index ae448d955a87..149dc2fd97a6 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -602,12 +602,6 @@ void mem_cgroup_uncharge_list(struct list_head *page_list);
 
 void mem_cgroup_migrate(struct page *oldpage, struct page *newpage);
 
-static struct mem_cgroup_per_node *
-mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid)
-{
-	return memcg->nodeinfo[nid];
-}
-
 /**
  * mem_cgroup_lruvec - get the lru list vector for a memcg & node
  * @memcg: memcg of the wanted lruvec
@@ -631,7 +625,7 @@ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
 	if (!memcg)
 		memcg = root_mem_cgroup;
 
-	mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
+	mz = memcg->nodeinfo[pgdat->node_id];
 	lruvec = &mz->lruvec;
 out:
 	/*
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5be0e8834a5a..b4a73c41312e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -414,13 +414,14 @@ static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
 					 int size, int old_size)
 {
 	struct memcg_shrinker_map *new, *old;
+	struct mem_cgroup_per_node *pn;
 	int nid;
 
 	lockdep_assert_held(&memcg_shrinker_map_mutex);
 
 	for_each_node(nid) {
-		old = rcu_dereference_protected(
-			mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true);
+		pn = memcg->nodeinfo[nid];
+		old = rcu_dereference_protected(pn->shrinker_map, true);
 		/* Not yet online memcg */
 		if (!old)
 			return 0;
@@ -433,7 +434,7 @@ static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
 		memset(new->map, (int)0xff, old_size);
 		memset((void *)new->map + old_size, 0, size - old_size);
 
-		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new);
+		rcu_assign_pointer(pn->shrinker_map, new);
 		call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
 	}
 
@@ -450,7 +451,7 @@ static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
 		return;
 
 	for_each_node(nid) {
-		pn = mem_cgroup_nodeinfo(memcg, nid);
+		pn = memcg->nodeinfo[nid];
 		map = rcu_dereference_protected(pn->shrinker_map, true);
 		kvfree(map);
 		rcu_assign_pointer(pn->shrinker_map, NULL);
@@ -713,7 +714,7 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
 	int nid;
 
 	for_each_node(nid) {
-		mz = mem_cgroup_nodeinfo(memcg, nid);
+		mz = memcg->nodeinfo[nid];
 		mctz = soft_limit_tree_node(nid);
 		if (mctz)
 			mem_cgroup_remove_exceeded(mz, mctz);
@@ -796,7 +797,7 @@ parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
 	parent = parent_mem_cgroup(pn->memcg);
 	if (!parent)
 		return NULL;
-	return mem_cgroup_nodeinfo(parent, nid);
+	return parent->nodeinfo[nid];
 }
 
 void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
@@ -1136,7 +1137,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 	if (reclaim) {
 		struct mem_cgroup_per_node *mz;
 
-		mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
+		mz = root->nodeinfo[reclaim->pgdat->node_id];
 		iter = &mz->iter;
 
 		if (prev && reclaim->generation != iter->generation)
@@ -1238,7 +1239,7 @@ static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
 	int nid;
 
 	for_each_node(nid) {
-		mz = mem_cgroup_nodeinfo(from, nid);
+		mz = from->nodeinfo[nid];
 		iter = &mz->iter;
 		cmpxchg(&iter->position, dead_memcg, NULL);
 	}
@@ -2397,7 +2398,7 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
 				struct mem_cgroup_per_node *pn;
 				long x;
 
-				pn = mem_cgroup_nodeinfo(memcg, nid);
+				pn = memcg->nodeinfo[nid];
 				lstatc = per_cpu_ptr(pn->lruvec_stat_cpu, cpu);
 
 				x = lstatc->count[i];
@@ -4098,7 +4099,7 @@ static int memcg_stat_show(struct seq_file *m, void *v)
 		unsigned long file_cost = 0;
 
 		for_each_online_pgdat(pgdat) {
-			mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
+			mz = memcg->nodeinfo[pgdat->node_id];
 
 			anon_cost += mz->lruvec.anon_cost;
 			file_cost += mz->lruvec.file_cost;
-- 
cgit v1.2.3


From a18e6e6e150a98b9ce3e9acabeff407e7b6ba0c0 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Thu, 29 Apr 2021 22:56:17 -0700
Subject: mm: memcontrol: privatize memcg_page_state query functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There are no users outside of the memory controller itself. The rest
of the kernel cares either about node or lruvec stats.

Link: https://lkml.kernel.org/r/20210209163304.77088-4-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Michal Koutný <mkoutny@suse.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 44 --------------------------------------------
 mm/memcontrol.c            | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 149dc2fd97a6..9a02aabd0000 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -864,39 +864,6 @@ extern bool cgroup_memory_noswap;
 void lock_page_memcg(struct page *page);
 void unlock_page_memcg(struct page *page);
 
-/*
- * idx can be of type enum memcg_stat_item or node_stat_item.
- * Keep in sync with memcg_exact_page_state().
- */
-static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
-{
-	long x = atomic_long_read(&memcg->vmstats[idx]);
-#ifdef CONFIG_SMP
-	if (x < 0)
-		x = 0;
-#endif
-	return x;
-}
-
-/*
- * idx can be of type enum memcg_stat_item or node_stat_item.
- * Keep in sync with memcg_exact_page_state().
- */
-static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg,
-						   int idx)
-{
-	long x = 0;
-	int cpu;
-
-	for_each_possible_cpu(cpu)
-		x += per_cpu(memcg->vmstats_local->stat[idx], cpu);
-#ifdef CONFIG_SMP
-	if (x < 0)
-		x = 0;
-#endif
-	return x;
-}
-
 void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val);
 
 /* idx can be of type enum memcg_stat_item or node_stat_item */
@@ -1322,17 +1289,6 @@ static inline void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
 {
 }
 
-static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
-{
-	return 0;
-}
-
-static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg,
-						   int idx)
-{
-	return 0;
-}
-
 static inline void __mod_memcg_state(struct mem_cgroup *memcg,
 				     int idx,
 				     int nr)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b4a73c41312e..56d1c6e58c3b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -789,6 +789,38 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
 	__this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
 }
 
+/*
+ * idx can be of type enum memcg_stat_item or node_stat_item.
+ * Keep in sync with memcg_exact_page_state().
+ */
+static unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
+{
+	long x = atomic_long_read(&memcg->vmstats[idx]);
+#ifdef CONFIG_SMP
+	if (x < 0)
+		x = 0;
+#endif
+	return x;
+}
+
+/*
+ * idx can be of type enum memcg_stat_item or node_stat_item.
+ * Keep in sync with memcg_exact_page_state().
+ */
+static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
+{
+	long x = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		x += per_cpu(memcg->vmstats_local->stat[idx], cpu);
+#ifdef CONFIG_SMP
+	if (x < 0)
+		x = 0;
+#endif
+	return x;
+}
+
 static struct mem_cgroup_per_node *
 parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
 {
-- 
cgit v1.2.3


From 2d146aa3aa842d7f5065802556b4f9a2c6e8ef12 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Thu, 29 Apr 2021 22:56:26 -0700
Subject: mm: memcontrol: switch to rstat
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the memory controller's custom hierarchical stats code with the
generic rstat infrastructure provided by the cgroup core.

The current implementation does batched upward propagation from the
write side (i.e.  as stats change).  The per-cpu batches introduce an
error, which is multiplied by the number of subgroups in a tree.  In
systems with many CPUs and sizable cgroup trees, the error can be large
enough to confuse users (e.g.  32 batch pages * 32 CPUs * 32 subgroups
results in an error of up to 128M per stat item).  This can entirely
swallow allocation bursts inside a workload that the user is expecting
to see reflected in the statistics.

In the past, we've done read-side aggregation, where a memory.stat read
would have to walk the entire subtree and add up per-cpu counts.  This
became problematic with lazily-freed cgroups: we could have large
subtrees where most cgroups were entirely idle.  Hence the switch to
change-driven upward propagation.  Unfortunately, it needed to trade
accuracy for speed due to the write side being so hot.

Rstat combines the best of both worlds: from the write side, it cheaply
maintains a queue of cgroups that have pending changes, so that the read
side can do selective tree aggregation.  This way the reported stats
will always be precise and as recent as can be, while the aggregation can
skip over potentially large numbers of idle cgroups.

The way rstat works is that it implements a tree for tracking cgroups
with pending local changes, as well as a flush function that walks the
tree upwards.  The controller then drives this by 1) telling rstat when
a local cgroup stat changes (e.g.  mod_memcg_state) and 2) when a flush
is required to get up-to-date hierarchy stats for a given subtree (e.g.
when memory.stat is read).  The controller also provides a flush
callback that is called during the rstat flush walk for each cgroup and
aggregates its local per-cpu counters and propagates them upwards.

This adds a second vmstats to struct mem_cgroup (MEMCG_NR_STAT +
NR_VM_EVENT_ITEMS) to track pending subtree deltas during upward
aggregation.  It removes 3 words from the per-cpu data.  It eliminates
memcg_exact_page_state(), since memcg_page_state() is now exact.

[akpm@linux-foundation.org: merge fix]
[hannes@cmpxchg.org: fix a sleep in atomic section problem]
  Link: https://lkml.kernel.org/r/20210315234100.64307-1-hannes@cmpxchg.org

Link: https://lkml.kernel.org/r/20210209163304.77088-7-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Roman Gushchin <guro@fb.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Michal Koutný <mkoutny@suse.com>
Acked-by: Balbir Singh <bsingharora@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h |  67 ++++++++------
 mm/memcontrol.c            | 218 ++++++++++++++++++---------------------------
 2 files changed, 127 insertions(+), 158 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 9a02aabd0000..74910ce9a3f9 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -76,10 +76,27 @@ enum mem_cgroup_events_target {
 };
 
 struct memcg_vmstats_percpu {
-	long stat[MEMCG_NR_STAT];
-	unsigned long events[NR_VM_EVENT_ITEMS];
-	unsigned long nr_page_events;
-	unsigned long targets[MEM_CGROUP_NTARGETS];
+	/* Local (CPU and cgroup) page state & events */
+	long			state[MEMCG_NR_STAT];
+	unsigned long		events[NR_VM_EVENT_ITEMS];
+
+	/* Delta calculation for lockless upward propagation */
+	long			state_prev[MEMCG_NR_STAT];
+	unsigned long		events_prev[NR_VM_EVENT_ITEMS];
+
+	/* Cgroup1: threshold notifications & softlimit tree updates */
+	unsigned long		nr_page_events;
+	unsigned long		targets[MEM_CGROUP_NTARGETS];
+};
+
+struct memcg_vmstats {
+	/* Aggregated (CPU and subtree) page state & events */
+	long			state[MEMCG_NR_STAT];
+	unsigned long		events[NR_VM_EVENT_ITEMS];
+
+	/* Pending child counts during tree propagation */
+	long			state_pending[MEMCG_NR_STAT];
+	unsigned long		events_pending[NR_VM_EVENT_ITEMS];
 };
 
 struct mem_cgroup_reclaim_iter {
@@ -287,8 +304,8 @@ struct mem_cgroup {
 
 	MEMCG_PADDING(_pad1_);
 
-	atomic_long_t		vmstats[MEMCG_NR_STAT];
-	atomic_long_t		vmevents[NR_VM_EVENT_ITEMS];
+	/* memory.stat */
+	struct memcg_vmstats	vmstats;
 
 	/* memory.events */
 	atomic_long_t		memory_events[MEMCG_NR_MEMORY_EVENTS];
@@ -315,10 +332,6 @@ struct mem_cgroup {
 	atomic_t		moving_account;
 	struct task_struct	*move_lock_task;
 
-	/* Legacy local VM stats and events */
-	struct memcg_vmstats_percpu __percpu *vmstats_local;
-
-	/* Subtree VM stats and events (batched updates) */
 	struct memcg_vmstats_percpu __percpu *vmstats_percpu;
 
 #ifdef CONFIG_CGROUP_WRITEBACK
@@ -939,10 +952,6 @@ static inline void mod_memcg_lruvec_state(struct lruvec *lruvec,
 	local_irq_restore(flags);
 }
 
-unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
-						gfp_t gfp_mask,
-						unsigned long *total_scanned);
-
 void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
 			  unsigned long count);
 
@@ -1023,6 +1032,10 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm,
 
 void split_page_memcg(struct page *head, unsigned int nr);
 
+unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
+						gfp_t gfp_mask,
+						unsigned long *total_scanned);
+
 #else /* CONFIG_MEMCG */
 
 #define MEM_CGROUP_ID_SHIFT	0
@@ -1131,6 +1144,10 @@ static inline bool lruvec_holds_page_lru_lock(struct page *page,
 	return lruvec == &pgdat->__lruvec;
 }
 
+static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+{
+}
+
 static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
 {
 	return NULL;
@@ -1334,18 +1351,6 @@ static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
 	mod_node_page_state(page_pgdat(page), idx, val);
 }
 
-static inline
-unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
-					    gfp_t gfp_mask,
-					    unsigned long *total_scanned)
-{
-	return 0;
-}
-
-static inline void split_page_memcg(struct page *head, unsigned int nr)
-{
-}
-
 static inline void count_memcg_events(struct mem_cgroup *memcg,
 				      enum vm_event_item idx,
 				      unsigned long count)
@@ -1368,8 +1373,16 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
 {
 }
 
-static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+static inline void split_page_memcg(struct page *head, unsigned int nr)
+{
+}
+
+static inline
+unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
+					    gfp_t gfp_mask,
+					    unsigned long *total_scanned)
 {
+	return 0;
 }
 #endif /* CONFIG_MEMCG */
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 56d1c6e58c3b..b323588223ac 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -765,37 +765,17 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
  */
 void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
 {
-	long x, threshold = MEMCG_CHARGE_BATCH;
-
 	if (mem_cgroup_disabled())
 		return;
 
-	if (memcg_stat_item_in_bytes(idx))
-		threshold <<= PAGE_SHIFT;
-
-	x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
-	if (unlikely(abs(x) > threshold)) {
-		struct mem_cgroup *mi;
-
-		/*
-		 * Batch local counters to keep them in sync with
-		 * the hierarchical ones.
-		 */
-		__this_cpu_add(memcg->vmstats_local->stat[idx], x);
-		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
-			atomic_long_add(x, &mi->vmstats[idx]);
-		x = 0;
-	}
-	__this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
+	__this_cpu_add(memcg->vmstats_percpu->state[idx], val);
+	cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
 }
 
-/*
- * idx can be of type enum memcg_stat_item or node_stat_item.
- * Keep in sync with memcg_exact_page_state().
- */
+/* idx can be of type enum memcg_stat_item or node_stat_item. */
 static unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
 {
-	long x = atomic_long_read(&memcg->vmstats[idx]);
+	long x = READ_ONCE(memcg->vmstats.state[idx]);
 #ifdef CONFIG_SMP
 	if (x < 0)
 		x = 0;
@@ -803,17 +783,14 @@ static unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
 	return x;
 }
 
-/*
- * idx can be of type enum memcg_stat_item or node_stat_item.
- * Keep in sync with memcg_exact_page_state().
- */
+/* idx can be of type enum memcg_stat_item or node_stat_item. */
 static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
 {
 	long x = 0;
 	int cpu;
 
 	for_each_possible_cpu(cpu)
-		x += per_cpu(memcg->vmstats_local->stat[idx], cpu);
+		x += per_cpu(memcg->vmstats_percpu->state[idx], cpu);
 #ifdef CONFIG_SMP
 	if (x < 0)
 		x = 0;
@@ -936,30 +913,16 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
 void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
 			  unsigned long count)
 {
-	unsigned long x;
-
 	if (mem_cgroup_disabled())
 		return;
 
-	x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
-	if (unlikely(x > MEMCG_CHARGE_BATCH)) {
-		struct mem_cgroup *mi;
-
-		/*
-		 * Batch local counters to keep them in sync with
-		 * the hierarchical ones.
-		 */
-		__this_cpu_add(memcg->vmstats_local->events[idx], x);
-		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
-			atomic_long_add(x, &mi->vmevents[idx]);
-		x = 0;
-	}
-	__this_cpu_write(memcg->vmstats_percpu->events[idx], x);
+	__this_cpu_add(memcg->vmstats_percpu->events[idx], count);
+	cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
 }
 
 static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
 {
-	return atomic_long_read(&memcg->vmevents[event]);
+	return READ_ONCE(memcg->vmstats.events[event]);
 }
 
 static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
@@ -968,7 +931,7 @@ static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
 	int cpu;
 
 	for_each_possible_cpu(cpu)
-		x += per_cpu(memcg->vmstats_local->events[event], cpu);
+		x += per_cpu(memcg->vmstats_percpu->events[event], cpu);
 	return x;
 }
 
@@ -1604,6 +1567,7 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
 	 *
 	 * Current memory state:
 	 */
+	cgroup_rstat_flush(memcg->css.cgroup);
 
 	for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
 		u64 size;
@@ -2409,22 +2373,11 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
 	drain_stock(stock);
 
 	for_each_mem_cgroup(memcg) {
-		struct memcg_vmstats_percpu *statc;
 		int i;
 
-		statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
-
-		for (i = 0; i < MEMCG_NR_STAT; i++) {
+		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
 			int nid;
 
-			if (statc->stat[i]) {
-				mod_memcg_state(memcg, i, statc->stat[i]);
-				statc->stat[i] = 0;
-			}
-
-			if (i >= NR_VM_NODE_STAT_ITEMS)
-				continue;
-
 			for_each_node(nid) {
 				struct batched_lruvec_stat *lstatc;
 				struct mem_cgroup_per_node *pn;
@@ -2443,13 +2396,6 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
 				}
 			}
 		}
-
-		for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
-			if (statc->events[i]) {
-				count_memcg_events(memcg, i, statc->events[i]);
-				statc->events[i] = 0;
-			}
-		}
 	}
 
 	return 0;
@@ -3572,6 +3518,7 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
 	unsigned long val;
 
 	if (mem_cgroup_is_root(memcg)) {
+		cgroup_rstat_flush(memcg->css.cgroup);
 		val = memcg_page_state(memcg, NR_FILE_PAGES) +
 			memcg_page_state(memcg, NR_ANON_MAPPED);
 		if (swap)
@@ -3636,26 +3583,15 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
 	}
 }
 
-static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
+static void memcg_flush_lruvec_page_state(struct mem_cgroup *memcg)
 {
-	unsigned long stat[MEMCG_NR_STAT] = {0};
-	struct mem_cgroup *mi;
-	int node, cpu, i;
-
-	for_each_online_cpu(cpu)
-		for (i = 0; i < MEMCG_NR_STAT; i++)
-			stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
-
-	for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
-		for (i = 0; i < MEMCG_NR_STAT; i++)
-			atomic_long_add(stat[i], &mi->vmstats[i]);
+	int node;
 
 	for_each_node(node) {
 		struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
+		unsigned long stat[NR_VM_NODE_STAT_ITEMS] = { 0 };
 		struct mem_cgroup_per_node *pi;
-
-		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
-			stat[i] = 0;
+		int cpu, i;
 
 		for_each_online_cpu(cpu)
 			for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
@@ -3668,25 +3604,6 @@ static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
 	}
 }
 
-static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
-{
-	unsigned long events[NR_VM_EVENT_ITEMS];
-	struct mem_cgroup *mi;
-	int cpu, i;
-
-	for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
-		events[i] = 0;
-
-	for_each_online_cpu(cpu)
-		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
-			events[i] += per_cpu(memcg->vmstats_percpu->events[i],
-					     cpu);
-
-	for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
-		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
-			atomic_long_add(events[i], &mi->vmevents[i]);
-}
-
 #ifdef CONFIG_MEMCG_KMEM
 static int memcg_online_kmem(struct mem_cgroup *memcg)
 {
@@ -4003,6 +3920,8 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
 	int nid;
 	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
 
+	cgroup_rstat_flush(memcg->css.cgroup);
+
 	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
 		seq_printf(m, "%s=%lu", stat->name,
 			   mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
@@ -4073,6 +3992,8 @@ static int memcg_stat_show(struct seq_file *m, void *v)
 
 	BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
 
+	cgroup_rstat_flush(memcg->css.cgroup);
+
 	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
 		unsigned long nr;
 
@@ -4549,22 +4470,6 @@ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
 	return &memcg->cgwb_domain;
 }
 
-/*
- * idx can be of type enum memcg_stat_item or node_stat_item.
- * Keep in sync with memcg_exact_page().
- */
-static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
-{
-	long x = atomic_long_read(&memcg->vmstats[idx]);
-	int cpu;
-
-	for_each_online_cpu(cpu)
-		x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx];
-	if (x < 0)
-		x = 0;
-	return x;
-}
-
 /**
  * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
  * @wb: bdi_writeback in question
@@ -4590,13 +4495,14 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
 	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
 	struct mem_cgroup *parent;
 
-	*pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
+	cgroup_rstat_flush_irqsafe(memcg->css.cgroup);
 
-	*pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
-	*pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
-			memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
-	*pheadroom = PAGE_COUNTER_MAX;
+	*pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
+	*pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
+	*pfilepages = memcg_page_state(memcg, NR_INACTIVE_FILE) +
+			memcg_page_state(memcg, NR_ACTIVE_FILE);
 
+	*pheadroom = PAGE_COUNTER_MAX;
 	while ((parent = parent_mem_cgroup(memcg))) {
 		unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
 					    READ_ONCE(memcg->memory.high));
@@ -5228,7 +5134,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 	for_each_node(node)
 		free_mem_cgroup_per_node_info(memcg, node);
 	free_percpu(memcg->vmstats_percpu);
-	free_percpu(memcg->vmstats_local);
 	kfree(memcg);
 }
 
@@ -5236,11 +5141,10 @@ static void mem_cgroup_free(struct mem_cgroup *memcg)
 {
 	memcg_wb_domain_exit(memcg);
 	/*
-	 * Flush percpu vmstats and vmevents to guarantee the value correctness
-	 * on parent's and all ancestor levels.
+	 * Flush percpu lruvec stats to guarantee the value
+	 * correctness on parent's and all ancestor levels.
 	 */
-	memcg_flush_percpu_vmstats(memcg);
-	memcg_flush_percpu_vmevents(memcg);
+	memcg_flush_lruvec_page_state(memcg);
 	__mem_cgroup_free(memcg);
 }
 
@@ -5267,11 +5171,6 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 		goto fail;
 	}
 
-	memcg->vmstats_local = alloc_percpu_gfp(struct memcg_vmstats_percpu,
-						GFP_KERNEL_ACCOUNT);
-	if (!memcg->vmstats_local)
-		goto fail;
-
 	memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu,
 						 GFP_KERNEL_ACCOUNT);
 	if (!memcg->vmstats_percpu)
@@ -5471,6 +5370,62 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
 	memcg_wb_domain_size_changed(memcg);
 }
 
+static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+	struct memcg_vmstats_percpu *statc;
+	long delta, v;
+	int i;
+
+	statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
+
+	for (i = 0; i < MEMCG_NR_STAT; i++) {
+		/*
+		 * Collect the aggregated propagation counts of groups
+		 * below us. We're in a per-cpu loop here and this is
+		 * a global counter, so the first cycle will get them.
+		 */
+		delta = memcg->vmstats.state_pending[i];
+		if (delta)
+			memcg->vmstats.state_pending[i] = 0;
+
+		/* Add CPU changes on this level since the last flush */
+		v = READ_ONCE(statc->state[i]);
+		if (v != statc->state_prev[i]) {
+			delta += v - statc->state_prev[i];
+			statc->state_prev[i] = v;
+		}
+
+		if (!delta)
+			continue;
+
+		/* Aggregate counts on this level and propagate upwards */
+		memcg->vmstats.state[i] += delta;
+		if (parent)
+			parent->vmstats.state_pending[i] += delta;
+	}
+
+	for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
+		delta = memcg->vmstats.events_pending[i];
+		if (delta)
+			memcg->vmstats.events_pending[i] = 0;
+
+		v = READ_ONCE(statc->events[i]);
+		if (v != statc->events_prev[i]) {
+			delta += v - statc->events_prev[i];
+			statc->events_prev[i] = v;
+		}
+
+		if (!delta)
+			continue;
+
+		memcg->vmstats.events[i] += delta;
+		if (parent)
+			parent->vmstats.events_pending[i] += delta;
+	}
+}
+
 #ifdef CONFIG_MMU
 /* Handlers for move charge at task migration. */
 static int mem_cgroup_do_precharge(unsigned long count)
@@ -6524,6 +6479,7 @@ struct cgroup_subsys memory_cgrp_subsys = {
 	.css_released = mem_cgroup_css_released,
 	.css_free = mem_cgroup_css_free,
 	.css_reset = mem_cgroup_css_reset,
+	.css_rstat_flush = mem_cgroup_css_rstat_flush,
 	.can_attach = mem_cgroup_can_attach,
 	.cancel_attach = mem_cgroup_cancel_attach,
 	.post_attach = mem_cgroup_move_task,
-- 
cgit v1.2.3


From 0add0c77a9bd0ce7cd3b53894fb08154881402a4 Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeelb@google.com>
Date: Thu, 29 Apr 2021 22:56:36 -0700
Subject: memcg: charge before adding to swapcache on swapin

Currently the kernel adds the page, allocated for swapin, to the
swapcache before charging the page.  This is fine but now we want a
per-memcg swapcache stat which is essential for folks who want to
transparently migrate from cgroup v1's memsw to cgroup v2's memory and
swap counters.  In addition charging a page before exposing it to other
parts of the kernel is a step in the right direction.

To correctly maintain the per-memcg swapcache stat, this patch charges
the page before adding it to the swapcache.  One challenge with this
approach is the failure case of add_to_swap_cache(), in which we need to
undo the mem_cgroup_charge().  Specifically, undoing
mem_cgroup_uncharge_swap() is not simple.

To resolve the issue, this patch decouples the charging for swapin pages
from mem_cgroup_charge().  Two new functions are introduced,
mem_cgroup_swapin_charge_page() for just charging the swapin page and
mem_cgroup_swapin_uncharge_swap() for uncharging the swap slot once the
page has been successfully added to the swapcache.

[shakeelb@google.com: set page->private before calling swap_readpage]
  Link: https://lkml.kernel.org/r/20210318015959.2986837-1-shakeelb@google.com

Link: https://lkml.kernel.org/r/20210305212639.775498-1-shakeelb@google.com
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Hugh Dickins <hughd@google.com>
Tested-by: Heiko Carstens <hca@linux.ibm.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h |  13 +++++
 mm/memcontrol.c            | 117 ++++++++++++++++++++++++++++-----------------
 mm/memory.c                |  16 +++----
 mm/swap_state.c            |  13 +++--
 4 files changed, 100 insertions(+), 59 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 74910ce9a3f9..e946c96daa32 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -609,6 +609,9 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg)
 }
 
 int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask);
+int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
+				  gfp_t gfp, swp_entry_t entry);
+void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry);
 
 void mem_cgroup_uncharge(struct page *page);
 void mem_cgroup_uncharge_list(struct list_head *page_list);
@@ -1112,6 +1115,16 @@ static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
 	return 0;
 }
 
+static inline int mem_cgroup_swapin_charge_page(struct page *page,
+			struct mm_struct *mm, gfp_t gfp, swp_entry_t entry)
+{
+	return 0;
+}
+
+static inline void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry)
+{
+}
+
 static inline void mem_cgroup_uncharge(struct page *page)
 {
 }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5582e1531b43..6d0863db3e41 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6644,6 +6644,27 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
 			atomic_long_read(&parent->memory.children_low_usage)));
 }
 
+static int __mem_cgroup_charge(struct page *page, struct mem_cgroup *memcg,
+			       gfp_t gfp)
+{
+	unsigned int nr_pages = thp_nr_pages(page);
+	int ret;
+
+	ret = try_charge(memcg, gfp, nr_pages);
+	if (ret)
+		goto out;
+
+	css_get(&memcg->css);
+	commit_charge(page, memcg);
+
+	local_irq_disable();
+	mem_cgroup_charge_statistics(memcg, page, nr_pages);
+	memcg_check_events(memcg, page);
+	local_irq_enable();
+out:
+	return ret;
+}
+
 /**
  * mem_cgroup_charge - charge a newly allocated page to a cgroup
  * @page: page to charge
@@ -6653,55 +6674,71 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
  * Try to charge @page to the memcg that @mm belongs to, reclaiming
  * pages according to @gfp_mask if necessary.
  *
+ * Do not use this for pages allocated for swapin.
+ *
  * Returns 0 on success. Otherwise, an error code is returned.
  */
 int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
 {
-	unsigned int nr_pages = thp_nr_pages(page);
-	struct mem_cgroup *memcg = NULL;
-	int ret = 0;
+	struct mem_cgroup *memcg;
+	int ret;
 
 	if (mem_cgroup_disabled())
-		goto out;
+		return 0;
 
-	if (PageSwapCache(page)) {
-		swp_entry_t ent = { .val = page_private(page), };
-		unsigned short id;
+	memcg = get_mem_cgroup_from_mm(mm);
+	ret = __mem_cgroup_charge(page, memcg, gfp_mask);
+	css_put(&memcg->css);
 
-		/*
-		 * Every swap fault against a single page tries to charge the
-		 * page, bail as early as possible.  shmem_unuse() encounters
-		 * already charged pages, too.  page and memcg binding is
-		 * protected by the page lock, which serializes swap cache
-		 * removal, which in turn serializes uncharging.
-		 */
-		VM_BUG_ON_PAGE(!PageLocked(page), page);
-		if (page_memcg(compound_head(page)))
-			goto out;
+	return ret;
+}
 
-		id = lookup_swap_cgroup_id(ent);
-		rcu_read_lock();
-		memcg = mem_cgroup_from_id(id);
-		if (memcg && !css_tryget_online(&memcg->css))
-			memcg = NULL;
-		rcu_read_unlock();
-	}
+/**
+ * mem_cgroup_swapin_charge_page - charge a newly allocated page for swapin
+ * @page: page to charge
+ * @mm: mm context of the victim
+ * @gfp: reclaim mode
+ * @entry: swap entry for which the page is allocated
+ *
+ * This function charges a page allocated for swapin. Please call this before
+ * adding the page to the swapcache.
+ *
+ * Returns 0 on success. Otherwise, an error code is returned.
+ */
+int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
+				  gfp_t gfp, swp_entry_t entry)
+{
+	struct mem_cgroup *memcg;
+	unsigned short id;
+	int ret;
 
-	if (!memcg)
+	if (mem_cgroup_disabled())
+		return 0;
+
+	id = lookup_swap_cgroup_id(entry);
+	rcu_read_lock();
+	memcg = mem_cgroup_from_id(id);
+	if (!memcg || !css_tryget_online(&memcg->css))
 		memcg = get_mem_cgroup_from_mm(mm);
+	rcu_read_unlock();
 
-	ret = try_charge(memcg, gfp_mask, nr_pages);
-	if (ret)
-		goto out_put;
+	ret = __mem_cgroup_charge(page, memcg, gfp);
 
-	css_get(&memcg->css);
-	commit_charge(page, memcg);
-
-	local_irq_disable();
-	mem_cgroup_charge_statistics(memcg, page, nr_pages);
-	memcg_check_events(memcg, page);
-	local_irq_enable();
+	css_put(&memcg->css);
+	return ret;
+}
 
+/*
+ * mem_cgroup_swapin_uncharge_swap - uncharge swap slot
+ * @entry: swap entry for which the page is charged
+ *
+ * Call this function after successfully adding the charged page to swapcache.
+ *
+ * Note: This function assumes the page for which swap slot is being uncharged
+ * is order 0 page.
+ */
+void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry)
+{
 	/*
 	 * Cgroup1's unified memory+swap counter has been charged with the
 	 * new swapcache page, finish the transfer by uncharging the swap
@@ -6714,20 +6751,14 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
 	 * correspond 1:1 to page and swap slot lifetimes: we charge the
 	 * page to memory here, and uncharge swap when the slot is freed.
 	 */
-	if (do_memsw_account() && PageSwapCache(page)) {
-		swp_entry_t entry = { .val = page_private(page) };
+	if (!mem_cgroup_disabled() && do_memsw_account()) {
 		/*
 		 * The swap entry might not get freed for a long time,
 		 * let's not wait for it.  The page already received a
 		 * memory+swap charge, drop the swap entry duplicate.
 		 */
-		mem_cgroup_uncharge_swap(entry, nr_pages);
+		mem_cgroup_uncharge_swap(entry, 1);
 	}
-
-out_put:
-	css_put(&memcg->css);
-out:
-	return ret;
 }
 
 struct uncharge_gather {
diff --git a/mm/memory.c b/mm/memory.c
index 550405fc3b5e..3196fa5e7f7d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3309,28 +3309,26 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 			page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
 							vmf->address);
 			if (page) {
-				int err;
-
 				__SetPageLocked(page);
 				__SetPageSwapBacked(page);
-				set_page_private(page, entry.val);
 
-				/* Tell memcg to use swap ownership records */
-				SetPageSwapCache(page);
-				err = mem_cgroup_charge(page, vma->vm_mm,
-							GFP_KERNEL);
-				ClearPageSwapCache(page);
-				if (err) {
+				if (mem_cgroup_swapin_charge_page(page,
+					vma->vm_mm, GFP_KERNEL, entry)) {
 					ret = VM_FAULT_OOM;
 					goto out_page;
 				}
+				mem_cgroup_swapin_uncharge_swap(entry);
 
 				shadow = get_shadow_from_swap_cache(entry);
 				if (shadow)
 					workingset_refault(page, shadow);
 
 				lru_cache_add(page);
+
+				/* To provide entry to swap_readpage() */
+				set_page_private(page, entry.val);
 				swap_readpage(page, true);
+				set_page_private(page, 0);
 			}
 		} else {
 			page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 3cdee7b11da9..fb7efa08fe57 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -497,16 +497,14 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 	__SetPageLocked(page);
 	__SetPageSwapBacked(page);
 
-	/* May fail (-ENOMEM) if XArray node allocation failed. */
-	if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) {
-		put_swap_page(page, entry);
+	if (mem_cgroup_swapin_charge_page(page, NULL, gfp_mask, entry))
 		goto fail_unlock;
-	}
 
-	if (mem_cgroup_charge(page, NULL, gfp_mask)) {
-		delete_from_swap_cache(page);
+	/* May fail (-ENOMEM) if XArray node allocation failed. */
+	if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow))
 		goto fail_unlock;
-	}
+
+	mem_cgroup_swapin_uncharge_swap(entry);
 
 	if (shadow)
 		workingset_refault(page, shadow);
@@ -517,6 +515,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 	return page;
 
 fail_unlock:
+	put_swap_page(page, entry);
 	unlock_page(page);
 	put_page(page);
 	return NULL;
-- 
cgit v1.2.3


From b4e0b68fbd9d1fd7e31cbe8adca3ad6cf556e2ee Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Thu, 29 Apr 2021 22:56:52 -0700
Subject: mm: memcontrol: use obj_cgroup APIs to charge kmem pages

Since Roman's series "The new cgroup slab memory controller" was applied,
all slab objects are charged via the new APIs of obj_cgroup.  The new
APIs introduce a struct obj_cgroup to charge slab objects.  It prevents
long-living objects from pinning the original memory cgroup in
memory.  But there are still some corner-case objects (e.g. allocations
larger than an order-1 page on SLUB) which are not charged via the new
APIs.  Those objects (including the pages which are allocated directly
from the buddy allocator) are charged as kmem pages, which still hold a
reference to the memory cgroup.

We want to reuse the obj_cgroup APIs to charge the kmem pages.  If we do
that, we should store an object cgroup pointer to page->memcg_data for
the kmem pages.

Finally, page->memcg_data will have 3 different meanings.

  1) For the slab pages, page->memcg_data points to an object cgroups
     vector.

  2) For the kmem pages (exclude the slab pages), page->memcg_data
     points to an object cgroup.

  3) For the user pages (e.g. the LRU pages), page->memcg_data points
     to a memory cgroup.

We do not change the behavior of page_memcg() and page_memcg_rcu().  They
are also suitable for LRU pages and kmem pages.  Why?

Because the problem of memory allocations pinning memcgs for a long time
exists at a larger scale and is causing recurring problems in the real
world: page cache doesn't get reclaimed for a long time, or is used by the
second, third, fourth, ...  instance of the same job that was restarted
into a new cgroup every time.  Unreclaimable dying cgroups pile up, waste
memory, and make page reclaim very inefficient.

We can convert LRU pages and most other raw memcg pins to the objcg
direction to fix this problem, and then the page->memcg will always point
to an object cgroup pointer.  At that time, LRU pages and kmem pages will
be treated the same.  The implementation of page_memcg() will remove the
kmem page check.

This patch aims to charge the kmem pages by using the new APIs of
obj_cgroup.  Finally, the page->memcg_data of the kmem page points to an
object cgroup.  We can use the __page_objcg() to get the object cgroup
associated with a kmem page.  Or we can use page_memcg() to get the memory
cgroup associated with a kmem page, but caller must ensure that the
returned memcg won't be released (e.g.  acquire the rcu_read_lock or
css_set_lock).

  Link: https://lkml.kernel.org/r/20210401030141.37061-1-songmuchun@bytedance.com

Link: https://lkml.kernel.org/r/20210319163821.20704-6-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
[songmuchun@bytedance.com: fix forget to obtain the ref to objcg in split_page_memcg]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 120 ++++++++++++++++++++++++++++++++++++---------
 mm/memcontrol.c            | 116 +++++++++++++++++++++----------------------
 2 files changed, 155 insertions(+), 81 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index e946c96daa32..78ca34c935ab 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -371,6 +371,62 @@ enum page_memcg_data_flags {
 
 #define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1)
 
+static inline bool PageMemcgKmem(struct page *page);
+
+/*
+ * After the initialization objcg->memcg is always pointing at
+ * a valid memcg, but can be atomically swapped to the parent memcg.
+ *
+ * The caller must ensure that the returned memcg won't be released:
+ * e.g. acquire the rcu_read_lock or css_set_lock.
+ */
+static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
+{
+	return READ_ONCE(objcg->memcg);
+}
+
+/*
+ * __page_memcg - get the memory cgroup associated with a non-kmem page
+ * @page: a pointer to the page struct
+ *
+ * Returns a pointer to the memory cgroup associated with the page,
+ * or NULL. This function assumes that the page is known to have a
+ * proper memory cgroup pointer. It's not safe to call this function
+ * against some type of pages, e.g. slab pages or ex-slab pages or
+ * kmem pages.
+ */
+static inline struct mem_cgroup *__page_memcg(struct page *page)
+{
+	unsigned long memcg_data = page->memcg_data;
+
+	VM_BUG_ON_PAGE(PageSlab(page), page);
+	VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page);
+	VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page);
+
+	return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+}
+
+/*
+ * __page_objcg - get the object cgroup associated with a kmem page
+ * @page: a pointer to the page struct
+ *
+ * Returns a pointer to the object cgroup associated with the page,
+ * or NULL. This function assumes that the page is known to have a
+ * proper object cgroup pointer. It's not safe to call this function
+ * against some type of pages, e.g. slab pages or ex-slab pages or
+ * LRU pages.
+ */
+static inline struct obj_cgroup *__page_objcg(struct page *page)
+{
+	unsigned long memcg_data = page->memcg_data;
+
+	VM_BUG_ON_PAGE(PageSlab(page), page);
+	VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page);
+	VM_BUG_ON_PAGE(!(memcg_data & MEMCG_DATA_KMEM), page);
+
+	return (struct obj_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+}
+
 /*
  * page_memcg - get the memory cgroup associated with a page
  * @page: a pointer to the page struct
@@ -380,20 +436,23 @@ enum page_memcg_data_flags {
  * proper memory cgroup pointer. It's not safe to call this function
  * against some type of pages, e.g. slab pages or ex-slab pages.
  *
- * Any of the following ensures page and memcg binding stability:
+ * For a non-kmem page any of the following ensures page and memcg binding
+ * stability:
+ *
  * - the page lock
  * - LRU isolation
  * - lock_page_memcg()
  * - exclusive reference
+ *
+ * For a kmem page a caller should hold an rcu read lock to protect memcg
+ * associated with a kmem page from being released.
  */
 static inline struct mem_cgroup *page_memcg(struct page *page)
 {
-	unsigned long memcg_data = page->memcg_data;
-
-	VM_BUG_ON_PAGE(PageSlab(page), page);
-	VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page);
-
-	return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+	if (PageMemcgKmem(page))
+		return obj_cgroup_memcg(__page_objcg(page));
+	else
+		return __page_memcg(page);
 }
 
 /*
@@ -407,11 +466,19 @@ static inline struct mem_cgroup *page_memcg(struct page *page)
  */
 static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
 {
+	unsigned long memcg_data = READ_ONCE(page->memcg_data);
+
 	VM_BUG_ON_PAGE(PageSlab(page), page);
 	WARN_ON_ONCE(!rcu_read_lock_held());
 
-	return (struct mem_cgroup *)(READ_ONCE(page->memcg_data) &
-				     ~MEMCG_DATA_FLAGS_MASK);
+	if (memcg_data & MEMCG_DATA_KMEM) {
+		struct obj_cgroup *objcg;
+
+		objcg = (void *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+		return obj_cgroup_memcg(objcg);
+	}
+
+	return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
 }
 
 /*
@@ -419,15 +486,21 @@ static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
  * @page: a pointer to the page struct
  *
  * Returns a pointer to the memory cgroup associated with the page,
- * or NULL. This function unlike page_memcg() can take any  page
+ * or NULL. This function unlike page_memcg() can take any page
  * as an argument. It has to be used in cases when it's not known if a page
- * has an associated memory cgroup pointer or an object cgroups vector.
+ * has an associated memory cgroup pointer or an object cgroups vector or
+ * an object cgroup.
+ *
+ * For a non-kmem page any of the following ensures page and memcg binding
+ * stability:
  *
- * Any of the following ensures page and memcg binding stability:
  * - the page lock
  * - LRU isolation
  * - lock_page_memcg()
  * - exclusive reference
+ *
+ * For a kmem page a caller should hold an rcu read lock to protect memcg
+ * associated with a kmem page from being released.
  */
 static inline struct mem_cgroup *page_memcg_check(struct page *page)
 {
@@ -440,6 +513,13 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page)
 	if (memcg_data & MEMCG_DATA_OBJCGS)
 		return NULL;
 
+	if (memcg_data & MEMCG_DATA_KMEM) {
+		struct obj_cgroup *objcg;
+
+		objcg = (void *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+		return obj_cgroup_memcg(objcg);
+	}
+
 	return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
 }
 
@@ -718,21 +798,15 @@ static inline void obj_cgroup_get(struct obj_cgroup *objcg)
 	percpu_ref_get(&objcg->refcnt);
 }
 
-static inline void obj_cgroup_put(struct obj_cgroup *objcg)
+static inline void obj_cgroup_get_many(struct obj_cgroup *objcg,
+				       unsigned long nr)
 {
-	percpu_ref_put(&objcg->refcnt);
+	percpu_ref_get_many(&objcg->refcnt, nr);
 }
 
-/*
- * After the initialization objcg->memcg is always pointing at
- * a valid memcg, but can be atomically swapped to the parent memcg.
- *
- * The caller must ensure that the returned memcg won't be released:
- * e.g. acquire the rcu_read_lock or css_set_lock.
- */
-static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
+static inline void obj_cgroup_put(struct obj_cgroup *objcg)
 {
-	return READ_ONCE(objcg->memcg);
+	percpu_ref_put(&objcg->refcnt);
 }
 
 static inline void mem_cgroup_put(struct mem_cgroup *memcg)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 85fd2d86d23f..f3ddf94c9485 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -865,18 +865,22 @@ void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
 			     int val)
 {
 	struct page *head = compound_head(page); /* rmap on tail pages */
-	struct mem_cgroup *memcg = page_memcg(head);
+	struct mem_cgroup *memcg;
 	pg_data_t *pgdat = page_pgdat(page);
 	struct lruvec *lruvec;
 
+	rcu_read_lock();
+	memcg = page_memcg(head);
 	/* Untracked pages have no memcg, no lruvec. Update only the node */
 	if (!memcg) {
+		rcu_read_unlock();
 		__mod_node_page_state(pgdat, idx, val);
 		return;
 	}
 
 	lruvec = mem_cgroup_lruvec(memcg, pgdat);
 	__mod_lruvec_state(lruvec, idx, val);
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL(__mod_lruvec_page_state);
 
@@ -1051,20 +1055,6 @@ static __always_inline struct mem_cgroup *active_memcg(void)
 		return current->active_memcg;
 }
 
-static __always_inline struct mem_cgroup *get_active_memcg(void)
-{
-	struct mem_cgroup *memcg;
-
-	rcu_read_lock();
-	memcg = active_memcg();
-	/* remote memcg must hold a ref. */
-	if (memcg && WARN_ON_ONCE(!css_tryget(&memcg->css)))
-		memcg = root_mem_cgroup;
-	rcu_read_unlock();
-
-	return memcg;
-}
-
 static __always_inline bool memcg_kmem_bypass(void)
 {
 	/* Allow remote memcg charging from any context. */
@@ -1078,20 +1068,6 @@ static __always_inline bool memcg_kmem_bypass(void)
 	return false;
 }
 
-/**
- * If active memcg is set, do not fallback to current->mm->memcg.
- */
-static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
-{
-	if (memcg_kmem_bypass())
-		return NULL;
-
-	if (unlikely(active_memcg()))
-		return get_active_memcg();
-
-	return get_mem_cgroup_from_mm(current->mm);
-}
-
 /**
  * mem_cgroup_iter - iterate over memory cgroup hierarchy
  * @root: hierarchy root
@@ -3121,18 +3097,18 @@ static void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_page
  */
 int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
 {
-	struct mem_cgroup *memcg;
+	struct obj_cgroup *objcg;
 	int ret = 0;
 
-	memcg = get_mem_cgroup_from_current();
-	if (memcg && !mem_cgroup_is_root(memcg)) {
-		ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
+	objcg = get_obj_cgroup_from_current();
+	if (objcg) {
+		ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order);
 		if (!ret) {
-			page->memcg_data = (unsigned long)memcg |
+			page->memcg_data = (unsigned long)objcg |
 				MEMCG_DATA_KMEM;
 			return 0;
 		}
-		css_put(&memcg->css);
+		obj_cgroup_put(objcg);
 	}
 	return ret;
 }
@@ -3144,16 +3120,16 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
  */
 void __memcg_kmem_uncharge_page(struct page *page, int order)
 {
-	struct mem_cgroup *memcg = page_memcg(page);
+	struct obj_cgroup *objcg;
 	unsigned int nr_pages = 1 << order;
 
-	if (!memcg)
+	if (!PageMemcgKmem(page))
 		return;
 
-	VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
-	__memcg_kmem_uncharge(memcg, nr_pages);
+	objcg = __page_objcg(page);
+	obj_cgroup_uncharge_pages(objcg, nr_pages);
 	page->memcg_data = 0;
-	css_put(&memcg->css);
+	obj_cgroup_put(objcg);
 }
 
 static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
@@ -3294,7 +3270,11 @@ void split_page_memcg(struct page *head, unsigned int nr)
 
 	for (i = 1; i < nr; i++)
 		head[i].memcg_data = head->memcg_data;
-	css_get_many(&memcg->css, nr - 1);
+
+	if (PageMemcgKmem(head))
+		obj_cgroup_get_many(__page_objcg(head), nr - 1);
+	else
+		css_get_many(&memcg->css, nr - 1);
 }
 
 #ifdef CONFIG_MEMCG_SWAP
@@ -6788,7 +6768,7 @@ void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry)
 
 struct uncharge_gather {
 	struct mem_cgroup *memcg;
-	unsigned long nr_pages;
+	unsigned long nr_memory;
 	unsigned long pgpgout;
 	unsigned long nr_kmem;
 	struct page *dummy_page;
@@ -6803,10 +6783,10 @@ static void uncharge_batch(const struct uncharge_gather *ug)
 {
 	unsigned long flags;
 
-	if (!mem_cgroup_is_root(ug->memcg)) {
-		page_counter_uncharge(&ug->memcg->memory, ug->nr_pages);
+	if (ug->nr_memory) {
+		page_counter_uncharge(&ug->memcg->memory, ug->nr_memory);
 		if (do_memsw_account())
-			page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages);
+			page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory);
 		if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
 			page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
 		memcg_oom_recover(ug->memcg);
@@ -6814,7 +6794,7 @@ static void uncharge_batch(const struct uncharge_gather *ug)
 
 	local_irq_save(flags);
 	__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
-	__this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages);
+	__this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory);
 	memcg_check_events(ug->memcg, ug->dummy_page);
 	local_irq_restore(flags);
 
@@ -6825,40 +6805,60 @@ static void uncharge_batch(const struct uncharge_gather *ug)
 static void uncharge_page(struct page *page, struct uncharge_gather *ug)
 {
 	unsigned long nr_pages;
+	struct mem_cgroup *memcg;
+	struct obj_cgroup *objcg;
 
 	VM_BUG_ON_PAGE(PageLRU(page), page);
 
-	if (!page_memcg(page))
-		return;
-
 	/*
 	 * Nobody should be changing or seriously looking at
-	 * page_memcg(page) at this point, we have fully
+	 * page memcg or objcg at this point, we have fully
 	 * exclusive access to the page.
 	 */
+	if (PageMemcgKmem(page)) {
+		objcg = __page_objcg(page);
+		/*
+		 * This get matches the put at the end of the function and
+		 * kmem pages do not hold memcg references anymore.
+		 */
+		memcg = get_mem_cgroup_from_objcg(objcg);
+	} else {
+		memcg = __page_memcg(page);
+	}
 
-	if (ug->memcg != page_memcg(page)) {
+	if (!memcg)
+		return;
+
+	if (ug->memcg != memcg) {
 		if (ug->memcg) {
 			uncharge_batch(ug);
 			uncharge_gather_clear(ug);
 		}
-		ug->memcg = page_memcg(page);
+		ug->memcg = memcg;
 		ug->dummy_page = page;
 
 		/* pairs with css_put in uncharge_batch */
-		css_get(&ug->memcg->css);
+		css_get(&memcg->css);
 	}
 
 	nr_pages = compound_nr(page);
-	ug->nr_pages += nr_pages;
 
-	if (PageMemcgKmem(page))
+	if (PageMemcgKmem(page)) {
+		ug->nr_memory += nr_pages;
 		ug->nr_kmem += nr_pages;
-	else
+
+		page->memcg_data = 0;
+		obj_cgroup_put(objcg);
+	} else {
+		/* LRU pages aren't accounted at the root level */
+		if (!mem_cgroup_is_root(memcg))
+			ug->nr_memory += nr_pages;
 		ug->pgpgout++;
 
-	page->memcg_data = 0;
-	css_put(&ug->memcg->css);
+		page->memcg_data = 0;
+	}
+
+	css_put(&memcg->css);
 }
 
 /**
-- 
cgit v1.2.3


From bd290e1e75d8a8b2d87031b63db56ae165677870 Mon Sep 17 00:00:00 2001
From: Muchun Song <songmuchun@bytedance.com>
Date: Thu, 29 Apr 2021 22:56:58 -0700
Subject: mm: memcontrol: move PageMemcgKmem to the scope of CONFIG_MEMCG_KMEM

A page can only be marked as kmem when CONFIG_MEMCG_KMEM is enabled.
So move PageMemcgKmem() into the scope of CONFIG_MEMCG_KMEM.

As a bonus, on !CONFIG_MEMCG_KMEM builds some code can be compiled out.

Link: https://lkml.kernel.org/r/20210319163821.20704-8-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 78ca34c935ab..bcf92b99f001 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -523,6 +523,7 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page)
 	return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
 }
 
+#ifdef CONFIG_MEMCG_KMEM
 /*
  * PageMemcgKmem - check if the page has MemcgKmem flag set
  * @page: a pointer to the page struct
@@ -537,7 +538,6 @@ static inline bool PageMemcgKmem(struct page *page)
 	return page->memcg_data & MEMCG_DATA_KMEM;
 }
 
-#ifdef CONFIG_MEMCG_KMEM
 /*
  * page_objcgs - get the object cgroups vector associated with a page
  * @page: a pointer to the page struct
@@ -579,6 +579,11 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page)
 }
 
 #else
+static inline bool PageMemcgKmem(struct page *page)
+{
+	return false;
+}
+
 static inline struct obj_cgroup **page_objcgs(struct page *page)
 {
 	return NULL;
-- 
cgit v1.2.3


From a10e995749a6c65833edd201c55665e5d44d14fc Mon Sep 17 00:00:00 2001
From: Wan Jiabing <wanjiabing@vivo.com>
Date: Thu, 29 Apr 2021 22:57:01 -0700
Subject: linux/memcontrol.h: remove duplicate struct declaration

struct mem_cgroup is declared twice.  It is already covered by the forward
struct declarations, so remove the duplicate.

Link: https://lkml.kernel.org/r/20210330020246.2265371-1-wanjiabing@vivo.com
Signed-off-by: Wan Jiabing <wanjiabing@vivo.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index bcf92b99f001..5904716f29ba 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1123,8 +1123,6 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
 #define MEM_CGROUP_ID_SHIFT	0
 #define MEM_CGROUP_ID_MAX	0
 
-struct mem_cgroup;
-
 static inline struct mem_cgroup *page_memcg(struct page *page)
 {
 	return NULL;
-- 
cgit v1.2.3


From 74ffa5a3e68504dd289135b1cf0422c19ffb3f2e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 29 Apr 2021 22:57:29 -0700
Subject: mm: add remap_pfn_range_notrack

Patch series "add remap_pfn_range_notrack instead of reinventing it in i915", v2.

i915 has some reason to want to avoid the track_pfn_remap overhead in
remap_pfn_range.  Add a function to the core VM to do just that rather
than reinventing the functionality poorly in the driver.

Note that the remap_io_sg path does get exercised when using Xorg on my
Thinkpad X1, so this should be considered lightly tested; I've not managed
to hit the remap_io_mapping path at all.

This patch (of 4):

Add a version of remap_pfn_range that does not call track_pfn_remap.  This
will be used to fix horrible abuses of VM internals in the i915 driver.

Link: https://lkml.kernel.org/r/20210326055505.1424432-1-hch@lst.de
Link: https://lkml.kernel.org/r/20210326055505.1424432-2-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h |  2 ++
 mm/memory.c        | 51 +++++++++++++++++++++++++++++++--------------------
 2 files changed, 33 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 64be3baf861a..a8335cecf706 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2732,6 +2732,8 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
 struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
 int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
 			unsigned long pfn, unsigned long size, pgprot_t);
+int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
+		unsigned long pfn, unsigned long size, pgprot_t prot);
 int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
 int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
 			struct page **pages, unsigned long *num);
diff --git a/mm/memory.c b/mm/memory.c
index f95d12032b34..8dc69471be6c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2260,26 +2260,17 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
 	return 0;
 }
 
-/**
- * remap_pfn_range - remap kernel memory to userspace
- * @vma: user vma to map to
- * @addr: target page aligned user address to start at
- * @pfn: page frame number of kernel physical memory address
- * @size: size of mapping area
- * @prot: page protection flags for this mapping
- *
- * Note: this is only safe if the mm semaphore is held when called.
- *
- * Return: %0 on success, negative error code otherwise.
+/*
+ * Variant of remap_pfn_range that does not call track_pfn_remap.  The caller
+ * must have pre-validated the caching bits of the pgprot_t.
  */
-int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
-		    unsigned long pfn, unsigned long size, pgprot_t prot)
+int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
+		unsigned long pfn, unsigned long size, pgprot_t prot)
 {
 	pgd_t *pgd;
 	unsigned long next;
 	unsigned long end = addr + PAGE_ALIGN(size);
 	struct mm_struct *mm = vma->vm_mm;
-	unsigned long remap_pfn = pfn;
 	int err;
 
 	if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
@@ -2309,10 +2300,6 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 		vma->vm_pgoff = pfn;
 	}
 
-	err = track_pfn_remap(vma, &prot, remap_pfn, addr, PAGE_ALIGN(size));
-	if (err)
-		return -EINVAL;
-
 	vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
 
 	BUG_ON(addr >= end);
@@ -2324,12 +2311,36 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 		err = remap_p4d_range(mm, pgd, addr, next,
 				pfn + (addr >> PAGE_SHIFT), prot);
 		if (err)
-			break;
+			return err;
 	} while (pgd++, addr = next, addr != end);
 
+	return 0;
+}
+
+/**
+ * remap_pfn_range - remap kernel memory to userspace
+ * @vma: user vma to map to
+ * @addr: target page aligned user address to start at
+ * @pfn: page frame number of kernel physical memory address
+ * @size: size of mapping area
+ * @prot: page protection flags for this mapping
+ *
+ * Note: this is only safe if the mm semaphore is held when called.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+		    unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+	int err;
+
+	err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
 	if (err)
-		untrack_pfn(vma, remap_pfn, PAGE_ALIGN(size));
+		return -EINVAL;
 
+	err = remap_pfn_range_notrack(vma, addr, pfn, size, prot);
+	if (err)
+		untrack_pfn(vma, pfn, PAGE_ALIGN(size));
 	return err;
 }
 EXPORT_SYMBOL(remap_pfn_range);
-- 
cgit v1.2.3


From 1fbaf8fc12a0136c7e62e7ad6fe886fe1749912c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 29 Apr 2021 22:57:32 -0700
Subject: mm: add a io_mapping_map_user helper

Add a helper that calls remap_pfn_range_notrack for a struct io_mapping,
relying on the pgprot pre-validation done when creating the mapping instead
of doing it at runtime.

Link: https://lkml.kernel.org/r/20210326055505.1424432-3-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/io-mapping.h |  3 +++
 mm/Kconfig                 |  3 +++
 mm/Makefile                |  1 +
 mm/io-mapping.c            | 29 +++++++++++++++++++++++++++++
 4 files changed, 36 insertions(+)
 create mode 100644 mm/io-mapping.c

(limited to 'include/linux')

diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h
index c093e81310a9..e9743cfd8585 100644
--- a/include/linux/io-mapping.h
+++ b/include/linux/io-mapping.h
@@ -220,3 +220,6 @@ io_mapping_free(struct io_mapping *iomap)
 }
 
 #endif /* _LINUX_IO_MAPPING_H */
+
+int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma,
+		unsigned long addr, unsigned long pfn, unsigned long size);
diff --git a/mm/Kconfig b/mm/Kconfig
index d0808a23e54b..8a27c7c91428 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -871,4 +871,7 @@ config MAPPING_DIRTY_HELPERS
 config KMAP_LOCAL
 	bool
 
+# struct io_mapping based helpers.  Selected by drivers that need them
+config IO_MAPPING
+	bool
 endmenu
diff --git a/mm/Makefile b/mm/Makefile
index 72227b24a616..c0135e385984 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -120,3 +120,4 @@ obj-$(CONFIG_MEMFD_CREATE) += memfd.o
 obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o
 obj-$(CONFIG_PTDUMP_CORE) += ptdump.o
 obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
+obj-$(CONFIG_IO_MAPPING) += io-mapping.o
diff --git a/mm/io-mapping.c b/mm/io-mapping.c
new file mode 100644
index 000000000000..01b362799930
--- /dev/null
+++ b/mm/io-mapping.c
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/mm.h>
+#include <linux/io-mapping.h>
+
+/**
+ * io_mapping_map_user - remap an I/O mapping to userspace
+ * @iomap: the source io_mapping
+ * @vma: user vma to map to
+ * @addr: target user address to start at
+ * @pfn: page frame number of the memory to map
+ * @size: size of map area
+ *
+ * Note: this is only safe if the mm semaphore is held when called.
+ */
+int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma,
+		unsigned long addr, unsigned long pfn, unsigned long size)
+{
+	vm_flags_t expected_flags = VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
+
+	if (WARN_ON_ONCE((vma->vm_flags & expected_flags) != expected_flags))
+		return -EINVAL;
+
+	/* We rely on prevalidation of the io-mapping to skip track_pfn(). */
+	return remap_pfn_range_notrack(vma, addr, pfn, size,
+		__pgprot((pgprot_val(iomap->prot) & _PAGE_CACHE_MASK) |
+			 (pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK)));
+}
+EXPORT_SYMBOL_GPL(io_mapping_map_user);
-- 
cgit v1.2.3


From 14d071134c740cfe61c09fc506fd3ab052beea10 Mon Sep 17 00:00:00 2001
From: Brian Geffon <bgeffon@google.com>
Date: Thu, 29 Apr 2021 22:57:48 -0700
Subject: Revert "mremap: don't allow MREMAP_DONTUNMAP on special_mappings and
 aio"

This reverts commit cd544fd1dc9293c6702fab6effa63dac1cc67e99.

As discussed in [1] this commit was a no-op because the mapping type was
checked in vma_to_resize before move_vma is ever called.  This meant that
vm_ops->mremap() would never be called on such mappings.  Furthermore,
we've since expanded support of MREMAP_DONTUNMAP to non-anonymous
mappings, and these special mappings are still protected by the existing
check of !VM_DONTEXPAND and !VM_PFNMAP which will result in a -EINVAL.

1. https://lkml.org/lkml/2020/12/28/2340

Link: https://lkml.kernel.org/r/20210323182520.2712101-2-bgeffon@google.com
Signed-off-by: Brian Geffon <bgeffon@google.com>
Acked-by: Hugh Dickins <hughd@google.com>
Reviewed-by: Dmitry Safonov <0x7f454c46@gmail.com>
Cc: Alejandro Colomar <alx.manpages@gmail.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: "Kirill A . Shutemov" <kirill@shutemov.name>
Cc: Lokesh Gidra <lokeshgidra@google.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: "Michael S . Tsirkin" <mst@redhat.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Sonny Rao <sonnyrao@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 2 +-
 fs/aio.c                                  | 5 +----
 include/linux/mm.h                        | 2 +-
 mm/mmap.c                                 | 6 +-----
 mm/mremap.c                               | 2 +-
 5 files changed, 5 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
index 935af2ac6b1a..05a89e33fde2 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -1458,7 +1458,7 @@ static int pseudo_lock_dev_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-static int pseudo_lock_dev_mremap(struct vm_area_struct *area, unsigned long flags)
+static int pseudo_lock_dev_mremap(struct vm_area_struct *area)
 {
 	/* Not supported */
 	return -EINVAL;
diff --git a/fs/aio.c b/fs/aio.c
index 1f32da13d39e..76ce0cc3ee4e 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -323,16 +323,13 @@ static void aio_free_ring(struct kioctx *ctx)
 	}
 }
 
-static int aio_ring_mremap(struct vm_area_struct *vma, unsigned long flags)
+static int aio_ring_mremap(struct vm_area_struct *vma)
 {
 	struct file *file = vma->vm_file;
 	struct mm_struct *mm = vma->vm_mm;
 	struct kioctx_table *table;
 	int i, res = -EINVAL;
 
-	if (flags & MREMAP_DONTUNMAP)
-		return -EINVAL;
-
 	spin_lock(&mm->ioctx_lock);
 	rcu_read_lock();
 	table = rcu_dereference(mm->ioctx_table);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a8335cecf706..93097dbd9604 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -580,7 +580,7 @@ struct vm_operations_struct {
 	void (*close)(struct vm_area_struct * area);
 	/* Called any time before splitting to check if it's allowed */
 	int (*may_split)(struct vm_area_struct *area, unsigned long addr);
-	int (*mremap)(struct vm_area_struct *area, unsigned long flags);
+	int (*mremap)(struct vm_area_struct *area);
 	/*
 	 * Called by mprotect() to make driver-specific permission
 	 * checks before mprotect() is finalised.   The VMA must not
diff --git a/mm/mmap.c b/mm/mmap.c
index 1d96a21acb2f..347ef9b83bb5 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3409,14 +3409,10 @@ static const char *special_mapping_name(struct vm_area_struct *vma)
 	return ((struct vm_special_mapping *)vma->vm_private_data)->name;
 }
 
-static int special_mapping_mremap(struct vm_area_struct *new_vma,
-				  unsigned long flags)
+static int special_mapping_mremap(struct vm_area_struct *new_vma)
 {
 	struct vm_special_mapping *sm = new_vma->vm_private_data;
 
-	if (flags & MREMAP_DONTUNMAP)
-		return -EINVAL;
-
 	if (WARN_ON_ONCE(current->mm != new_vma->vm_mm))
 		return -EFAULT;
 
diff --git a/mm/mremap.c b/mm/mremap.c
index db5b8b28c2dd..d22629ff8f3c 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -545,7 +545,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 	if (moved_len < old_len) {
 		err = -ENOMEM;
 	} else if (vma->vm_ops && vma->vm_ops->mremap) {
-		err = vma->vm_ops->mremap(new_vma, flags);
+		err = vma->vm_ops->mremap(new_vma);
 	}
 
 	if (unlikely(err)) {
-- 
cgit v1.2.3


From bbc180a5adb05ee8053fab7a0c0bd56c5964240e Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Thu, 29 Apr 2021 22:58:26 -0700
Subject: mm: HUGE_VMAP arch support cleanup

This changes the awkward approach where architectures provide init
functions to determine which levels they can provide large mappings for,
to one where the arch is queried for each call.

This removes code and indirection, and allows constant-folding of dead
code for unsupported levels.

This also adds a prot argument to the arch query.  This is unused
currently but could help with some architectures (e.g., some powerpc
processors can't map uncacheable memory with large pages).

Link: https://lkml.kernel.org/r/20210317062402.533919-7-npiggin@gmail.com
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Reviewed-by: Ding Tianhong <dingtianhong@huawei.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com> [arm64]
Cc: Will Deacon <will@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm64/include/asm/vmalloc.h         |  8 +++
 arch/arm64/mm/mmu.c                      | 10 ++--
 arch/powerpc/include/asm/vmalloc.h       |  8 +++
 arch/powerpc/mm/book3s64/radix_pgtable.c |  8 +--
 arch/x86/include/asm/vmalloc.h           |  7 +++
 arch/x86/mm/ioremap.c                    | 12 ++--
 include/linux/io.h                       |  9 ---
 include/linux/vmalloc.h                  |  6 ++
 init/main.c                              |  1 -
 mm/debug_vm_pgtable.c                    |  4 +-
 mm/ioremap.c                             | 94 ++++++++++++++------------------
 11 files changed, 87 insertions(+), 80 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/include/asm/vmalloc.h b/arch/arm64/include/asm/vmalloc.h
index 2ca708ab9b20..597b40405319 100644
--- a/arch/arm64/include/asm/vmalloc.h
+++ b/arch/arm64/include/asm/vmalloc.h
@@ -1,4 +1,12 @@
 #ifndef _ASM_ARM64_VMALLOC_H
 #define _ASM_ARM64_VMALLOC_H
 
+#include <asm/page.h>
+
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+bool arch_vmap_p4d_supported(pgprot_t prot);
+bool arch_vmap_pud_supported(pgprot_t prot);
+bool arch_vmap_pmd_supported(pgprot_t prot);
+#endif
+
 #endif /* _ASM_ARM64_VMALLOC_H */
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index d563335ad43f..8436e0755361 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1339,12 +1339,12 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot)
 	return dt_virt;
 }
 
-int __init arch_ioremap_p4d_supported(void)
+bool arch_vmap_p4d_supported(pgprot_t prot)
 {
-	return 0;
+	return false;
 }
 
-int __init arch_ioremap_pud_supported(void)
+bool arch_vmap_pud_supported(pgprot_t prot)
 {
 	/*
 	 * Only 4k granule supports level 1 block mappings.
@@ -1354,9 +1354,9 @@ int __init arch_ioremap_pud_supported(void)
 	       !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
 }
 
-int __init arch_ioremap_pmd_supported(void)
+bool arch_vmap_pmd_supported(pgprot_t prot)
 {
-	/* See arch_ioremap_pud_supported() */
+	/* See arch_vmap_pud_supported() */
 	return !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
 }
 
diff --git a/arch/powerpc/include/asm/vmalloc.h b/arch/powerpc/include/asm/vmalloc.h
index b992dfaaa161..105abb73f075 100644
--- a/arch/powerpc/include/asm/vmalloc.h
+++ b/arch/powerpc/include/asm/vmalloc.h
@@ -1,4 +1,12 @@
 #ifndef _ASM_POWERPC_VMALLOC_H
 #define _ASM_POWERPC_VMALLOC_H
 
+#include <asm/page.h>
+
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+bool arch_vmap_p4d_supported(pgprot_t prot);
+bool arch_vmap_pud_supported(pgprot_t prot);
+bool arch_vmap_pmd_supported(pgprot_t prot);
+#endif
+
 #endif /* _ASM_POWERPC_VMALLOC_H */
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 98f0b243c1ab..743807fc210f 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -1082,13 +1082,13 @@ void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
 	set_pte_at(mm, addr, ptep, pte);
 }
 
-int __init arch_ioremap_pud_supported(void)
+bool arch_vmap_pud_supported(pgprot_t prot)
 {
 	/* HPT does not cope with large pages in the vmalloc area */
 	return radix_enabled();
 }
 
-int __init arch_ioremap_pmd_supported(void)
+bool arch_vmap_pmd_supported(pgprot_t prot)
 {
 	return radix_enabled();
 }
@@ -1182,7 +1182,7 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
 	return 1;
 }
 
-int __init arch_ioremap_p4d_supported(void)
+bool arch_vmap_p4d_supported(pgprot_t prot)
 {
-	return 0;
+	return false;
 }
diff --git a/arch/x86/include/asm/vmalloc.h b/arch/x86/include/asm/vmalloc.h
index 29837740b520..094ea2b565f3 100644
--- a/arch/x86/include/asm/vmalloc.h
+++ b/arch/x86/include/asm/vmalloc.h
@@ -1,6 +1,13 @@
 #ifndef _ASM_X86_VMALLOC_H
 #define _ASM_X86_VMALLOC_H
 
+#include <asm/page.h>
 #include <asm/pgtable_areas.h>
 
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+bool arch_vmap_p4d_supported(pgprot_t prot);
+bool arch_vmap_pud_supported(pgprot_t prot);
+bool arch_vmap_pmd_supported(pgprot_t prot);
+#endif
+
 #endif /* _ASM_X86_VMALLOC_H */
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 9e5ccc56f8e0..fbaf0c447986 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -481,24 +481,26 @@ void iounmap(volatile void __iomem *addr)
 }
 EXPORT_SYMBOL(iounmap);
 
-int __init arch_ioremap_p4d_supported(void)
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+bool arch_vmap_p4d_supported(pgprot_t prot)
 {
-	return 0;
+	return false;
 }
 
-int __init arch_ioremap_pud_supported(void)
+bool arch_vmap_pud_supported(pgprot_t prot)
 {
 #ifdef CONFIG_X86_64
 	return boot_cpu_has(X86_FEATURE_GBPAGES);
 #else
-	return 0;
+	return false;
 #endif
 }
 
-int __init arch_ioremap_pmd_supported(void)
+bool arch_vmap_pmd_supported(pgprot_t prot)
 {
 	return boot_cpu_has(X86_FEATURE_PSE);
 }
+#endif
 
 /*
  * Convert a physical pointer to a virtual kernel pointer for /dev/mem
diff --git a/include/linux/io.h b/include/linux/io.h
index 61ff7d6278b6..9595151d800d 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
@@ -31,15 +31,6 @@ static inline int ioremap_page_range(unsigned long addr, unsigned long end,
 }
 #endif
 
-#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
-void __init ioremap_huge_init(void);
-int arch_ioremap_p4d_supported(void);
-int arch_ioremap_pud_supported(void);
-int arch_ioremap_pmd_supported(void);
-#else
-static inline void ioremap_huge_init(void) { }
-#endif
-
 /*
  * Managed iomap interface
  */
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 3de7be6dd17c..358c51c702c0 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -78,6 +78,12 @@ struct vmap_area {
 	};
 };
 
+#ifndef CONFIG_HAVE_ARCH_HUGE_VMAP
+static inline bool arch_vmap_p4d_supported(pgprot_t prot) { return false; }
+static inline bool arch_vmap_pud_supported(pgprot_t prot) { return false; }
+static inline bool arch_vmap_pmd_supported(pgprot_t prot) { return false; }
+#endif
+
 /*
  *	Highlevel APIs for driver use
  */
diff --git a/init/main.c b/init/main.c
index f498aac26e8c..ae96c79ad2d3 100644
--- a/init/main.c
+++ b/init/main.c
@@ -837,7 +837,6 @@ static void __init mm_init(void)
 	pgtable_init();
 	debug_objects_mem_init();
 	vmalloc_init();
-	ioremap_huge_init();
 	/* Should be run before the first non-init thread is created */
 	init_espfix_bsp();
 	/* Should be run after espfix64 is set up. */
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index a9bd6ce1ba02..05efe98a9ac2 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -247,7 +247,7 @@ static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
 {
 	pmd_t pmd;
 
-	if (!arch_ioremap_pmd_supported())
+	if (!arch_vmap_pmd_supported(prot))
 		return;
 
 	pr_debug("Validating PMD huge\n");
@@ -385,7 +385,7 @@ static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
 {
 	pud_t pud;
 
-	if (!arch_ioremap_pud_supported())
+	if (!arch_vmap_pud_supported(prot))
 		return;
 
 	pr_debug("Validating PUD huge\n");
diff --git a/mm/ioremap.c b/mm/ioremap.c
index 3f4d36f9745a..3264d0203785 100644
--- a/mm/ioremap.c
+++ b/mm/ioremap.c
@@ -16,49 +16,16 @@
 #include "pgalloc-track.h"
 
 #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
-static int __read_mostly ioremap_p4d_capable;
-static int __read_mostly ioremap_pud_capable;
-static int __read_mostly ioremap_pmd_capable;
-static int __read_mostly ioremap_huge_disabled;
+static unsigned int __ro_after_init iomap_max_page_shift = BITS_PER_LONG - 1;
 
 static int __init set_nohugeiomap(char *str)
 {
-	ioremap_huge_disabled = 1;
+	iomap_max_page_shift = PAGE_SHIFT;	/* cap at base pages: no huge iomap */
 	return 0;
 }
 early_param("nohugeiomap", set_nohugeiomap);
-
-void __init ioremap_huge_init(void)
-{
-	if (!ioremap_huge_disabled) {
-		if (arch_ioremap_p4d_supported())
-			ioremap_p4d_capable = 1;
-		if (arch_ioremap_pud_supported())
-			ioremap_pud_capable = 1;
-		if (arch_ioremap_pmd_supported())
-			ioremap_pmd_capable = 1;
-	}
-}
-
-static inline int ioremap_p4d_enabled(void)
-{
-	return ioremap_p4d_capable;
-}
-
-static inline int ioremap_pud_enabled(void)
-{
-	return ioremap_pud_capable;
-}
-
-static inline int ioremap_pmd_enabled(void)
-{
-	return ioremap_pmd_capable;
-}
-
-#else	/* !CONFIG_HAVE_ARCH_HUGE_VMAP */
-static inline int ioremap_p4d_enabled(void) { return 0; }
-static inline int ioremap_pud_enabled(void) { return 0; }
-static inline int ioremap_pmd_enabled(void) { return 0; }
+#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
+static const bool iomap_max_page_shift = PAGE_SHIFT;
 #endif	/* CONFIG_HAVE_ARCH_HUGE_VMAP */
 
 static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
@@ -82,9 +49,13 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 }
 
 static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
-			phys_addr_t phys_addr, pgprot_t prot)
+			phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift)
 {
-	if (!ioremap_pmd_enabled())
+	if (max_page_shift < PMD_SHIFT)
+		return 0;
+
+	if (!arch_vmap_pmd_supported(prot))
 		return 0;
 
 	if ((end - addr) != PMD_SIZE)
@@ -104,7 +75,7 @@ static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
 
 static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
 			phys_addr_t phys_addr, pgprot_t prot,
-			pgtbl_mod_mask *mask)
+			unsigned int max_page_shift, pgtbl_mod_mask *mask)
 {
 	pmd_t *pmd;
 	unsigned long next;
@@ -115,7 +86,8 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
 	do {
 		next = pmd_addr_end(addr, end);
 
-		if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot)) {
+		if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot,
+					max_page_shift)) {
 			*mask |= PGTBL_PMD_MODIFIED;
 			continue;
 		}
@@ -127,9 +99,13 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
 }
 
 static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
-			phys_addr_t phys_addr, pgprot_t prot)
+			phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift)
 {
-	if (!ioremap_pud_enabled())
+	if (max_page_shift < PUD_SHIFT)
+		return 0;
+
+	if (!arch_vmap_pud_supported(prot))
 		return 0;
 
 	if ((end - addr) != PUD_SIZE)
@@ -149,7 +125,7 @@ static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
 
 static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
 			phys_addr_t phys_addr, pgprot_t prot,
-			pgtbl_mod_mask *mask)
+			unsigned int max_page_shift, pgtbl_mod_mask *mask)
 {
 	pud_t *pud;
 	unsigned long next;
@@ -160,21 +136,27 @@ static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
 	do {
 		next = pud_addr_end(addr, end);
 
-		if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot)) {
+		if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot,
+					max_page_shift)) {
 			*mask |= PGTBL_PUD_MODIFIED;
 			continue;
 		}
 
-		if (vmap_pmd_range(pud, addr, next, phys_addr, prot, mask))
+		if (vmap_pmd_range(pud, addr, next, phys_addr, prot,
+					max_page_shift, mask))
 			return -ENOMEM;
 	} while (pud++, phys_addr += (next - addr), addr = next, addr != end);
 	return 0;
 }
 
 static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
-			phys_addr_t phys_addr, pgprot_t prot)
+			phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift)
 {
-	if (!ioremap_p4d_enabled())
+	if (max_page_shift < P4D_SHIFT)
+		return 0;
+
+	if (!arch_vmap_p4d_supported(prot))
 		return 0;
 
 	if ((end - addr) != P4D_SIZE)
@@ -194,7 +176,7 @@ static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
 
 static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
 			phys_addr_t phys_addr, pgprot_t prot,
-			pgtbl_mod_mask *mask)
+			unsigned int max_page_shift, pgtbl_mod_mask *mask)
 {
 	p4d_t *p4d;
 	unsigned long next;
@@ -205,19 +187,22 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
 	do {
 		next = p4d_addr_end(addr, end);
 
-		if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot)) {
+		if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot,
+					max_page_shift)) {
 			*mask |= PGTBL_P4D_MODIFIED;
 			continue;
 		}
 
-		if (vmap_pud_range(p4d, addr, next, phys_addr, prot, mask))
+		if (vmap_pud_range(p4d, addr, next, phys_addr, prot,
+					max_page_shift, mask))
 			return -ENOMEM;
 	} while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
 	return 0;
 }
 
 static int vmap_range(unsigned long addr, unsigned long end,
-			phys_addr_t phys_addr, pgprot_t prot)
+			phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift)
 {
 	pgd_t *pgd;
 	unsigned long start;
@@ -232,7 +217,8 @@ static int vmap_range(unsigned long addr, unsigned long end,
 	pgd = pgd_offset_k(addr);
 	do {
 		next = pgd_addr_end(addr, end);
-		err = vmap_p4d_range(pgd, addr, next, phys_addr, prot, &mask);
+		err = vmap_p4d_range(pgd, addr, next, phys_addr, prot,
+					max_page_shift, &mask);
 		if (err)
 			break;
 	} while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
@@ -248,7 +234,7 @@ static int vmap_range(unsigned long addr, unsigned long end,
 int ioremap_page_range(unsigned long addr,
 		       unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
 {
-	return vmap_range(addr, end, phys_addr, prot);
+	return vmap_range(addr, end, phys_addr, prot, iomap_max_page_shift);
 }
 
 #ifdef CONFIG_GENERIC_IOREMAP
-- 
cgit v1.2.3


From 6f680e70b6ff58c9670769534196800233685d55 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Thu, 29 Apr 2021 22:58:39 -0700
Subject: mm/vmalloc: provide fallback arch huge vmap support functions

If an architecture doesn't support a particular page table level as a huge
vmap page size then allow it to skip defining the support query function.

Link: https://lkml.kernel.org/r/20210317062402.533919-11-npiggin@gmail.com
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Suggested-by: Christoph Hellwig <hch@lst.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Ding Tianhong <dingtianhong@huawei.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Uladzislau Rezki (Sony) <urezki@gmail.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm64/include/asm/vmalloc.h   |  7 +++----
 arch/powerpc/include/asm/vmalloc.h |  7 +++----
 arch/x86/include/asm/vmalloc.h     | 13 +++++--------
 include/linux/vmalloc.h            | 24 ++++++++++++++++++++----
 4 files changed, 31 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/include/asm/vmalloc.h b/arch/arm64/include/asm/vmalloc.h
index fc9a12d6cc1a..7a22aeea9bb5 100644
--- a/arch/arm64/include/asm/vmalloc.h
+++ b/arch/arm64/include/asm/vmalloc.h
@@ -4,11 +4,8 @@
 #include <asm/page.h>
 
 #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
-static inline bool arch_vmap_p4d_supported(pgprot_t prot)
-{
-	return false;
-}
 
+#define arch_vmap_pud_supported arch_vmap_pud_supported
 static inline bool arch_vmap_pud_supported(pgprot_t prot)
 {
 	/*
@@ -19,11 +16,13 @@ static inline bool arch_vmap_pud_supported(pgprot_t prot)
 	       !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
 }
 
+#define arch_vmap_pmd_supported arch_vmap_pmd_supported
 static inline bool arch_vmap_pmd_supported(pgprot_t prot)
 {
 	/* See arch_vmap_pud_supported() */
 	return !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
 }
+
 #endif
 
 #endif /* _ASM_ARM64_VMALLOC_H */
diff --git a/arch/powerpc/include/asm/vmalloc.h b/arch/powerpc/include/asm/vmalloc.h
index 3f0c153befb0..4c69ece52a31 100644
--- a/arch/powerpc/include/asm/vmalloc.h
+++ b/arch/powerpc/include/asm/vmalloc.h
@@ -5,21 +5,20 @@
 #include <asm/page.h>
 
 #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
-static inline bool arch_vmap_p4d_supported(pgprot_t prot)
-{
-	return false;
-}
 
+#define arch_vmap_pud_supported arch_vmap_pud_supported
 static inline bool arch_vmap_pud_supported(pgprot_t prot)
 {
 	/* HPT does not cope with large pages in the vmalloc area */
 	return radix_enabled();
 }
 
+#define arch_vmap_pmd_supported arch_vmap_pmd_supported
 static inline bool arch_vmap_pmd_supported(pgprot_t prot)
 {
 	return radix_enabled();
 }
+
 #endif
 
 #endif /* _ASM_POWERPC_VMALLOC_H */
diff --git a/arch/x86/include/asm/vmalloc.h b/arch/x86/include/asm/vmalloc.h
index e714b00fc0ca..49ce331f3ac6 100644
--- a/arch/x86/include/asm/vmalloc.h
+++ b/arch/x86/include/asm/vmalloc.h
@@ -6,24 +6,21 @@
 #include <asm/pgtable_areas.h>
 
 #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
-static inline bool arch_vmap_p4d_supported(pgprot_t prot)
-{
-	return false;
-}
 
+#ifdef CONFIG_X86_64
+#define arch_vmap_pud_supported arch_vmap_pud_supported
 static inline bool arch_vmap_pud_supported(pgprot_t prot)
 {
-#ifdef CONFIG_X86_64
 	return boot_cpu_has(X86_FEATURE_GBPAGES);
-#else
-	return false;
-#endif
 }
+#endif
 
+#define arch_vmap_pmd_supported arch_vmap_pmd_supported
 static inline bool arch_vmap_pmd_supported(pgprot_t prot)
 {
 	return boot_cpu_has(X86_FEATURE_PSE);
 }
+
 #endif
 
 #endif /* _ASM_X86_VMALLOC_H */
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 358c51c702c0..eb5630be6783 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -78,10 +78,26 @@ struct vmap_area {
 	};
 };
 
-#ifndef CONFIG_HAVE_ARCH_HUGE_VMAP
-static inline bool arch_vmap_p4d_supported(pgprot_t prot) { return false; }
-static inline bool arch_vmap_pud_supported(pgprot_t prot) { return false; }
-static inline bool arch_vmap_pmd_supported(pgprot_t prot) { return false; }
+/* archs that select HAVE_ARCH_HUGE_VMAP should override one or more of these */
+#ifndef arch_vmap_p4d_supported
+static inline bool arch_vmap_p4d_supported(pgprot_t prot)
+{
+	return false;
+}
+#endif
+
+#ifndef arch_vmap_pud_supported
+static inline bool arch_vmap_pud_supported(pgprot_t prot)
+{
+	return false;
+}
+#endif
+
+#ifndef arch_vmap_pmd_supported
+static inline bool arch_vmap_pmd_supported(pgprot_t prot)
+{
+	return false;
+}
 #endif
 
 /*
-- 
cgit v1.2.3


From 5e9e3d777b99aabe2f91f793a52e870a02642160 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Thu, 29 Apr 2021 22:58:43 -0700
Subject: mm: move vmap_range from mm/ioremap.c to mm/vmalloc.c

This is a generic kernel virtual memory mapper, not specific to ioremap.

Code is unchanged other than making vmap_range non-static.

Link: https://lkml.kernel.org/r/20210317062402.533919-12-npiggin@gmail.com
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Ding Tianhong <dingtianhong@huawei.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Uladzislau Rezki (Sony) <urezki@gmail.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/vmalloc.h |   3 +
 mm/ioremap.c            | 203 ------------------------------------------------
 mm/vmalloc.c            | 202 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 205 insertions(+), 203 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index eb5630be6783..ae9eb07d30d4 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -189,6 +189,9 @@ extern struct vm_struct *remove_vm_area(const void *addr);
 extern struct vm_struct *find_vm_area(const void *addr);
 
 #ifdef CONFIG_MMU
+int vmap_range(unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift);
 extern int map_kernel_range_noflush(unsigned long start, unsigned long size,
 				    pgprot_t prot, struct page **pages);
 int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot,
diff --git a/mm/ioremap.c b/mm/ioremap.c
index 3264d0203785..d1dcc7e744ac 100644
--- a/mm/ioremap.c
+++ b/mm/ioremap.c
@@ -28,209 +28,6 @@ early_param("nohugeiomap", set_nohugeiomap);
 static const bool iomap_max_page_shift = PAGE_SHIFT;
 #endif	/* CONFIG_HAVE_ARCH_HUGE_VMAP */
 
-static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
-			phys_addr_t phys_addr, pgprot_t prot,
-			pgtbl_mod_mask *mask)
-{
-	pte_t *pte;
-	u64 pfn;
-
-	pfn = phys_addr >> PAGE_SHIFT;
-	pte = pte_alloc_kernel_track(pmd, addr, mask);
-	if (!pte)
-		return -ENOMEM;
-	do {
-		BUG_ON(!pte_none(*pte));
-		set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
-		pfn++;
-	} while (pte++, addr += PAGE_SIZE, addr != end);
-	*mask |= PGTBL_PTE_MODIFIED;
-	return 0;
-}
-
-static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
-			phys_addr_t phys_addr, pgprot_t prot,
-			unsigned int max_page_shift)
-{
-	if (max_page_shift < PMD_SHIFT)
-		return 0;
-
-	if (!arch_vmap_pmd_supported(prot))
-		return 0;
-
-	if ((end - addr) != PMD_SIZE)
-		return 0;
-
-	if (!IS_ALIGNED(addr, PMD_SIZE))
-		return 0;
-
-	if (!IS_ALIGNED(phys_addr, PMD_SIZE))
-		return 0;
-
-	if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
-		return 0;
-
-	return pmd_set_huge(pmd, phys_addr, prot);
-}
-
-static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
-			phys_addr_t phys_addr, pgprot_t prot,
-			unsigned int max_page_shift, pgtbl_mod_mask *mask)
-{
-	pmd_t *pmd;
-	unsigned long next;
-
-	pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
-	if (!pmd)
-		return -ENOMEM;
-	do {
-		next = pmd_addr_end(addr, end);
-
-		if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot,
-					max_page_shift)) {
-			*mask |= PGTBL_PMD_MODIFIED;
-			continue;
-		}
-
-		if (vmap_pte_range(pmd, addr, next, phys_addr, prot, mask))
-			return -ENOMEM;
-	} while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
-	return 0;
-}
-
-static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
-			phys_addr_t phys_addr, pgprot_t prot,
-			unsigned int max_page_shift)
-{
-	if (max_page_shift < PUD_SHIFT)
-		return 0;
-
-	if (!arch_vmap_pud_supported(prot))
-		return 0;
-
-	if ((end - addr) != PUD_SIZE)
-		return 0;
-
-	if (!IS_ALIGNED(addr, PUD_SIZE))
-		return 0;
-
-	if (!IS_ALIGNED(phys_addr, PUD_SIZE))
-		return 0;
-
-	if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
-		return 0;
-
-	return pud_set_huge(pud, phys_addr, prot);
-}
-
-static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
-			phys_addr_t phys_addr, pgprot_t prot,
-			unsigned int max_page_shift, pgtbl_mod_mask *mask)
-{
-	pud_t *pud;
-	unsigned long next;
-
-	pud = pud_alloc_track(&init_mm, p4d, addr, mask);
-	if (!pud)
-		return -ENOMEM;
-	do {
-		next = pud_addr_end(addr, end);
-
-		if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot,
-					max_page_shift)) {
-			*mask |= PGTBL_PUD_MODIFIED;
-			continue;
-		}
-
-		if (vmap_pmd_range(pud, addr, next, phys_addr, prot,
-					max_page_shift, mask))
-			return -ENOMEM;
-	} while (pud++, phys_addr += (next - addr), addr = next, addr != end);
-	return 0;
-}
-
-static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
-			phys_addr_t phys_addr, pgprot_t prot,
-			unsigned int max_page_shift)
-{
-	if (max_page_shift < P4D_SHIFT)
-		return 0;
-
-	if (!arch_vmap_p4d_supported(prot))
-		return 0;
-
-	if ((end - addr) != P4D_SIZE)
-		return 0;
-
-	if (!IS_ALIGNED(addr, P4D_SIZE))
-		return 0;
-
-	if (!IS_ALIGNED(phys_addr, P4D_SIZE))
-		return 0;
-
-	if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
-		return 0;
-
-	return p4d_set_huge(p4d, phys_addr, prot);
-}
-
-static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
-			phys_addr_t phys_addr, pgprot_t prot,
-			unsigned int max_page_shift, pgtbl_mod_mask *mask)
-{
-	p4d_t *p4d;
-	unsigned long next;
-
-	p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
-	if (!p4d)
-		return -ENOMEM;
-	do {
-		next = p4d_addr_end(addr, end);
-
-		if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot,
-					max_page_shift)) {
-			*mask |= PGTBL_P4D_MODIFIED;
-			continue;
-		}
-
-		if (vmap_pud_range(p4d, addr, next, phys_addr, prot,
-					max_page_shift, mask))
-			return -ENOMEM;
-	} while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
-	return 0;
-}
-
-static int vmap_range(unsigned long addr, unsigned long end,
-			phys_addr_t phys_addr, pgprot_t prot,
-			unsigned int max_page_shift)
-{
-	pgd_t *pgd;
-	unsigned long start;
-	unsigned long next;
-	int err;
-	pgtbl_mod_mask mask = 0;
-
-	might_sleep();
-	BUG_ON(addr >= end);
-
-	start = addr;
-	pgd = pgd_offset_k(addr);
-	do {
-		next = pgd_addr_end(addr, end);
-		err = vmap_p4d_range(pgd, addr, next, phys_addr, prot,
-					max_page_shift, &mask);
-		if (err)
-			break;
-	} while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
-
-	flush_cache_vmap(start, end);
-
-	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
-		arch_sync_kernel_mappings(start, end);
-
-	return err;
-}
-
 int ioremap_page_range(unsigned long addr,
 		       unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
 {
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ce4066b46955..5c81717f7e0e 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -68,6 +68,208 @@ static void free_work(struct work_struct *w)
 }
 
 /*** Page table manipulation functions ***/
+static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot,
+			pgtbl_mod_mask *mask)
+{
+	pte_t *pte;
+	u64 pfn;
+
+	pfn = phys_addr >> PAGE_SHIFT;
+	pte = pte_alloc_kernel_track(pmd, addr, mask);
+	if (!pte)
+		return -ENOMEM;
+	do {
+		BUG_ON(!pte_none(*pte));
+		set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
+		pfn++;
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+	*mask |= PGTBL_PTE_MODIFIED;
+	return 0;
+}
+
+static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift)
+{
+	if (max_page_shift < PMD_SHIFT)
+		return 0;
+
+	if (!arch_vmap_pmd_supported(prot))
+		return 0;
+
+	if ((end - addr) != PMD_SIZE)
+		return 0;
+
+	if (!IS_ALIGNED(addr, PMD_SIZE))
+		return 0;
+
+	if (!IS_ALIGNED(phys_addr, PMD_SIZE))
+		return 0;
+
+	if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
+		return 0;
+
+	return pmd_set_huge(pmd, phys_addr, prot);
+}
+
+static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift, pgtbl_mod_mask *mask)
+{
+	pmd_t *pmd;
+	unsigned long next;
+
+	pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
+	if (!pmd)
+		return -ENOMEM;
+	do {
+		next = pmd_addr_end(addr, end);
+
+		if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot,
+					max_page_shift)) {
+			*mask |= PGTBL_PMD_MODIFIED;
+			continue;
+		}
+
+		if (vmap_pte_range(pmd, addr, next, phys_addr, prot, mask))
+			return -ENOMEM;
+	} while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
+	return 0;
+}
+
+static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift)
+{
+	if (max_page_shift < PUD_SHIFT)
+		return 0;
+
+	if (!arch_vmap_pud_supported(prot))
+		return 0;
+
+	if ((end - addr) != PUD_SIZE)
+		return 0;
+
+	if (!IS_ALIGNED(addr, PUD_SIZE))
+		return 0;
+
+	if (!IS_ALIGNED(phys_addr, PUD_SIZE))
+		return 0;
+
+	if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
+		return 0;
+
+	return pud_set_huge(pud, phys_addr, prot);
+}
+
+static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift, pgtbl_mod_mask *mask)
+{
+	pud_t *pud;
+	unsigned long next;
+
+	pud = pud_alloc_track(&init_mm, p4d, addr, mask);
+	if (!pud)
+		return -ENOMEM;
+	do {
+		next = pud_addr_end(addr, end);
+
+		if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot,
+					max_page_shift)) {
+			*mask |= PGTBL_PUD_MODIFIED;
+			continue;
+		}
+
+		if (vmap_pmd_range(pud, addr, next, phys_addr, prot,
+					max_page_shift, mask))
+			return -ENOMEM;
+	} while (pud++, phys_addr += (next - addr), addr = next, addr != end);
+	return 0;
+}
+
+static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift)
+{
+	if (max_page_shift < P4D_SHIFT)
+		return 0;
+
+	if (!arch_vmap_p4d_supported(prot))
+		return 0;
+
+	if ((end - addr) != P4D_SIZE)
+		return 0;
+
+	if (!IS_ALIGNED(addr, P4D_SIZE))
+		return 0;
+
+	if (!IS_ALIGNED(phys_addr, P4D_SIZE))
+		return 0;
+
+	if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
+		return 0;
+
+	return p4d_set_huge(p4d, phys_addr, prot);
+}
+
+static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift, pgtbl_mod_mask *mask)
+{
+	p4d_t *p4d;
+	unsigned long next;
+
+	p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
+	if (!p4d)
+		return -ENOMEM;
+	do {
+		next = p4d_addr_end(addr, end);
+
+		if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot,
+					max_page_shift)) {
+			*mask |= PGTBL_P4D_MODIFIED;
+			continue;
+		}
+
+		if (vmap_pud_range(p4d, addr, next, phys_addr, prot,
+					max_page_shift, mask))
+			return -ENOMEM;
+	} while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
+	return 0;
+}
+
+int vmap_range(unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift)
+{
+	pgd_t *pgd;
+	unsigned long start;
+	unsigned long next;
+	int err;
+	pgtbl_mod_mask mask = 0;
+
+	might_sleep();
+	BUG_ON(addr >= end);
+
+	start = addr;
+	pgd = pgd_offset_k(addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		err = vmap_p4d_range(pgd, addr, next, phys_addr, prot,
+					max_page_shift, &mask);
+		if (err)
+			break;
+	} while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
+
+	flush_cache_vmap(start, end);
+
+	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
+		arch_sync_kernel_mappings(start, end);
+
+	return err;
+}
 
 static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 			     pgtbl_mod_mask *mask)
-- 
cgit v1.2.3


From 121e6f3258fe393e22c36f61a319be8a4f2c05ae Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Thu, 29 Apr 2021 22:58:49 -0700
Subject: mm/vmalloc: hugepage vmalloc mappings

Support huge page vmalloc mappings.  Config option HAVE_ARCH_HUGE_VMALLOC
enables support on architectures that define HAVE_ARCH_HUGE_VMAP and
support PMD-sized vmap mappings.

vmalloc will attempt to allocate PMD-sized pages if allocating PMD size or
larger, and fall back to small pages if that was unsuccessful.

Architectures must ensure that any arch specific vmalloc allocations that
require PAGE_SIZE mappings (e.g., module allocations vs strict module rwx)
use the VM_NO_HUGE_VMAP flag to inhibit larger mappings.

This can result in more internal fragmentation and memory overhead for a
given allocation, so a boot option, nohugevmalloc, is added to disable it.

[colin.king@canonical.com: fix read of uninitialized pointer area]
  Link: https://lkml.kernel.org/r/20210318155955.18220-1-colin.king@canonical.com

Link: https://lkml.kernel.org/r/20210317062402.533919-14-npiggin@gmail.com
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Ding Tianhong <dingtianhong@huawei.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Uladzislau Rezki (Sony) <urezki@gmail.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/Kconfig            |  11 +++
 include/linux/vmalloc.h |  21 +++++
 mm/page_alloc.c         |   5 +-
 mm/vmalloc.c            | 220 +++++++++++++++++++++++++++++++++++++-----------
 4 files changed, 209 insertions(+), 48 deletions(-)

(limited to 'include/linux')

diff --git a/arch/Kconfig b/arch/Kconfig
index 5e8f6680d4bf..bf27159be4d9 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -829,6 +829,17 @@ config HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
 config HAVE_ARCH_HUGE_VMAP
 	bool
 
+#
+#  Archs that select this would be capable of PMD-sized vmaps (i.e.,
+#  arch_vmap_pmd_supported() returns true), and they must make no assumptions
+#  that vmalloc memory is mapped with PAGE_SIZE ptes. The VM_NO_HUGE_VMAP flag
+#  can be used to prohibit arch-specific allocations from using hugepages to
+#  help with this (e.g., modules may require it).
+#
+config HAVE_ARCH_HUGE_VMALLOC
+	depends on HAVE_ARCH_HUGE_VMAP
+	bool
+
 config ARCH_WANT_HUGE_PMD_SHARE
 	bool
 
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index ae9eb07d30d4..b4c82f2d40dc 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -26,6 +26,7 @@ struct notifier_block;		/* in notifier.h */
 #define VM_KASAN		0x00000080      /* has allocated kasan shadow memory */
 #define VM_FLUSH_RESET_PERMS	0x00000100	/* reset direct map and flush TLB on unmap, can't be freed in atomic context */
 #define VM_MAP_PUT_PAGES	0x00000200	/* put pages and free array in vfree */
+#define VM_NO_HUGE_VMAP		0x00000400	/* force PAGE_SIZE pte mapping */
 
 /*
  * VM_KASAN is used slighly differently depending on CONFIG_KASAN_VMALLOC.
@@ -54,6 +55,9 @@ struct vm_struct {
 	unsigned long		size;
 	unsigned long		flags;
 	struct page		**pages;
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
+	unsigned int		page_order;
+#endif
 	unsigned int		nr_pages;
 	phys_addr_t		phys_addr;
 	const void		*caller;
@@ -188,6 +192,22 @@ void free_vm_area(struct vm_struct *area);
 extern struct vm_struct *remove_vm_area(const void *addr);
 extern struct vm_struct *find_vm_area(const void *addr);
 
+static inline bool is_vm_area_hugepages(const void *addr)
+{
+	/*
+	 * This may not 100% tell if the area is mapped with > PAGE_SIZE
+	 * page table entries, if for some reason the architecture indicates
+	 * larger sizes are available but decides not to use them, nothing
+	 * prevents that. This only indicates the size of the physical page
+	 * allocated in the vmalloc layer.
+	 */
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
+	return find_vm_area(addr)->page_order > 0;
+#else
+	return false;
+#endif
+}
+
 #ifdef CONFIG_MMU
 int vmap_range(unsigned long addr, unsigned long end,
 			phys_addr_t phys_addr, pgprot_t prot,
@@ -205,6 +225,7 @@ static inline void set_vm_flush_reset_perms(void *addr)
 	if (vm)
 		vm->flags |= VM_FLUSH_RESET_PERMS;
 }
+
 #else
 static inline int
 map_kernel_range_noflush(unsigned long start, unsigned long size,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 56a8103580d6..39ff5c604cef 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -72,6 +72,7 @@
 #include <linux/padata.h>
 #include <linux/khugepaged.h>
 #include <linux/buffer_head.h>
+#include <linux/vmalloc.h>
 
 #include <asm/sections.h>
 #include <asm/tlbflush.h>
@@ -8222,6 +8223,7 @@ void *__init alloc_large_system_hash(const char *tablename,
 	void *table = NULL;
 	gfp_t gfp_flags;
 	bool virt;
+	bool huge;
 
 	/* allow the kernel cmdline to have a say */
 	if (!numentries) {
@@ -8289,6 +8291,7 @@ void *__init alloc_large_system_hash(const char *tablename,
 		} else if (get_order(size) >= MAX_ORDER || hashdist) {
 			table = __vmalloc(size, gfp_flags);
 			virt = true;
+			huge = is_vm_area_hugepages(table);
 		} else {
 			/*
 			 * If bucketsize is not a power-of-two, we may free
@@ -8305,7 +8308,7 @@ void *__init alloc_large_system_hash(const char *tablename,
 
 	pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
 		tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
-		virt ? "vmalloc" : "linear");
+		virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear");
 
 	if (_hash_shift)
 		*_hash_shift = log2qty;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 41c1dbdd2677..59c815eb7e74 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -42,6 +42,19 @@
 #include "internal.h"
 #include "pgalloc-track.h"
 
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
+static bool __ro_after_init vmap_allow_huge = true;
+
+static int __init set_nohugevmalloc(char *str)
+{
+	vmap_allow_huge = false;
+	return 0;
+}
+early_param("nohugevmalloc", set_nohugevmalloc);
+#else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
+static const bool vmap_allow_huge = false;
+#endif	/* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
+
 bool is_vmalloc_addr(const void *x)
 {
 	unsigned long addr = (unsigned long)x;
@@ -483,31 +496,12 @@ static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
 	return 0;
 }
 
-/**
- * map_kernel_range_noflush - map kernel VM area with the specified pages
- * @addr: start of the VM area to map
- * @size: size of the VM area to map
- * @prot: page protection flags to use
- * @pages: pages to map
- *
- * Map PFN_UP(@size) pages at @addr.  The VM area @addr and @size specify should
- * have been allocated using get_vm_area() and its friends.
- *
- * NOTE:
- * This function does NOT do any cache flushing.  The caller is responsible for
- * calling flush_cache_vmap() on to-be-mapped areas before calling this
- * function.
- *
- * RETURNS:
- * 0 on success, -errno on failure.
- */
-int map_kernel_range_noflush(unsigned long addr, unsigned long size,
-			     pgprot_t prot, struct page **pages)
+static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
+		pgprot_t prot, struct page **pages)
 {
 	unsigned long start = addr;
-	unsigned long end = addr + size;
-	unsigned long next;
 	pgd_t *pgd;
+	unsigned long next;
 	int err = 0;
 	int nr = 0;
 	pgtbl_mod_mask mask = 0;
@@ -529,6 +523,66 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size,
 	return 0;
 }
 
+static int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
+		pgprot_t prot, struct page **pages, unsigned int page_shift)
+{
+	unsigned int i, nr = (end - addr) >> PAGE_SHIFT;
+
+	WARN_ON(page_shift < PAGE_SHIFT);
+
+	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
+			page_shift == PAGE_SHIFT)
+		return vmap_small_pages_range_noflush(addr, end, prot, pages);
+
+	for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
+		int err;
+
+		err = vmap_range_noflush(addr, addr + (1UL << page_shift),
+					__pa(page_address(pages[i])), prot,
+					page_shift);
+		if (err)
+			return err;
+
+		addr += 1UL << page_shift;
+	}
+
+	return 0;
+}
+
+static int vmap_pages_range(unsigned long addr, unsigned long end,
+		pgprot_t prot, struct page **pages, unsigned int page_shift)
+{
+	int err;
+
+	err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
+	flush_cache_vmap(addr, end);
+	return err;
+}
+
+/**
+ * map_kernel_range_noflush - map kernel VM area with the specified pages
+ * @addr: start of the VM area to map
+ * @size: size of the VM area to map
+ * @prot: page protection flags to use
+ * @pages: pages to map
+ *
+ * Map PFN_UP(@size) pages at @addr.  The VM area @addr and @size specify should
+ * have been allocated using get_vm_area() and its friends.
+ *
+ * NOTE:
+ * This function does NOT do any cache flushing.  The caller is responsible for
+ * calling flush_cache_vmap() on to-be-mapped areas before calling this
+ * function.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int map_kernel_range_noflush(unsigned long addr, unsigned long size,
+			     pgprot_t prot, struct page **pages)
+{
+	return vmap_pages_range_noflush(addr, addr + size, prot, pages, PAGE_SHIFT);
+}
+
 int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot,
 		struct page **pages)
 {
@@ -2112,6 +2166,24 @@ EXPORT_SYMBOL(vm_map_ram);
 
 static struct vm_struct *vmlist __initdata;
 
+static inline unsigned int vm_area_page_order(struct vm_struct *vm)
+{
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
+	return vm->page_order;
+#else
+	return 0;
+#endif
+}
+
+static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order)
+{
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
+	vm->page_order = order;
+#else
+	BUG_ON(order != 0);
+#endif
+}
+
 /**
  * vm_area_add_early - add vmap area early during boot
  * @vm: vm_struct to add
@@ -2422,6 +2494,7 @@ static inline void set_area_direct_map(const struct vm_struct *area,
 {
 	int i;
 
+	/* HUGE_VMALLOC passes small pages to set_direct_map */
 	for (i = 0; i < area->nr_pages; i++)
 		if (page_address(area->pages[i]))
 			set_direct_map(area->pages[i]);
@@ -2431,6 +2504,7 @@ static inline void set_area_direct_map(const struct vm_struct *area,
 static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
 {
 	unsigned long start = ULONG_MAX, end = 0;
+	unsigned int page_order = vm_area_page_order(area);
 	int flush_reset = area->flags & VM_FLUSH_RESET_PERMS;
 	int flush_dmap = 0;
 	int i;
@@ -2455,11 +2529,14 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
 	 * map. Find the start and end range of the direct mappings to make sure
 	 * the vm_unmap_aliases() flush includes the direct map.
 	 */
-	for (i = 0; i < area->nr_pages; i++) {
+	for (i = 0; i < area->nr_pages; i += 1U << page_order) {
 		unsigned long addr = (unsigned long)page_address(area->pages[i]);
 		if (addr) {
+			unsigned long page_size;
+
+			page_size = PAGE_SIZE << page_order;
 			start = min(addr, start);
-			end = max(addr + PAGE_SIZE, end);
+			end = max(addr + page_size, end);
 			flush_dmap = 1;
 		}
 	}
@@ -2500,13 +2577,14 @@ static void __vunmap(const void *addr, int deallocate_pages)
 	vm_remove_mappings(area, deallocate_pages);
 
 	if (deallocate_pages) {
+		unsigned int page_order = vm_area_page_order(area);
 		int i;
 
-		for (i = 0; i < area->nr_pages; i++) {
+		for (i = 0; i < area->nr_pages; i += 1U << page_order) {
 			struct page *page = area->pages[i];
 
 			BUG_ON(!page);
-			__free_pages(page, 0);
+			__free_pages(page, page_order);
 		}
 		atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
 
@@ -2697,15 +2775,19 @@ EXPORT_SYMBOL_GPL(vmap_pfn);
 #endif /* CONFIG_VMAP_PFN */
 
 static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
-				 pgprot_t prot, int node)
+				 pgprot_t prot, unsigned int page_shift,
+				 int node)
 {
 	const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
-	unsigned int nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
+	unsigned long addr = (unsigned long)area->addr;
+	unsigned long size = get_vm_area_size(area);
 	unsigned long array_size;
-	unsigned int i;
+	unsigned int nr_small_pages = size >> PAGE_SHIFT;
+	unsigned int page_order;
 	struct page **pages;
+	unsigned int i;
 
-	array_size = (unsigned long)nr_pages * sizeof(struct page *);
+	array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
 	gfp_mask |= __GFP_NOWARN;
 	if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
 		gfp_mask |= __GFP_HIGHMEM;
@@ -2724,30 +2806,38 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 	}
 
 	area->pages = pages;
-	area->nr_pages = nr_pages;
+	area->nr_pages = nr_small_pages;
+	set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
 
-	for (i = 0; i < area->nr_pages; i++) {
-		struct page *page;
+	page_order = vm_area_page_order(area);
 
-		if (node == NUMA_NO_NODE)
-			page = alloc_page(gfp_mask);
-		else
-			page = alloc_pages_node(node, gfp_mask, 0);
+	/*
+	 * Careful, we allocate and map page_order pages, but tracking is done
+	 * per PAGE_SIZE page so as to keep the vm_struct APIs independent of
+	 * the physical/mapped size.
+	 */
+	for (i = 0; i < area->nr_pages; i += 1U << page_order) {
+		struct page *page;
+		int p;
 
+		/* Compound pages required for remap_vmalloc_page */
+		page = alloc_pages_node(node, gfp_mask | __GFP_COMP, page_order);
 		if (unlikely(!page)) {
 			/* Successfully allocated i pages, free them in __vfree() */
 			area->nr_pages = i;
 			atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
 			goto fail;
 		}
-		area->pages[i] = page;
+
+		for (p = 0; p < (1U << page_order); p++)
+			area->pages[i + p] = page + p;
+
 		if (gfpflags_allow_blocking(gfp_mask))
 			cond_resched();
 	}
 	atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
 
-	if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area),
-			prot, pages) < 0)
+	if (vmap_pages_range(addr, addr + size, prot, pages, page_shift) < 0)
 		goto fail;
 
 	return area->addr;
@@ -2755,7 +2845,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 fail:
 	warn_alloc(gfp_mask, NULL,
 			  "vmalloc: allocation failure, allocated %ld of %ld bytes",
-			  (area->nr_pages*PAGE_SIZE), area->size);
+			  (area->nr_pages*PAGE_SIZE), size);
 	__vfree(area->addr);
 	return NULL;
 }
@@ -2786,19 +2876,45 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
 	struct vm_struct *area;
 	void *addr;
 	unsigned long real_size = size;
+	unsigned long real_align = align;
+	unsigned int shift = PAGE_SHIFT;
 
-	size = PAGE_ALIGN(size);
-	if (!size || (size >> PAGE_SHIFT) > totalram_pages())
+	if (!size || (size >> PAGE_SHIFT) > totalram_pages()) {
+		area = NULL;
 		goto fail;
+	}
+
+	if (vmap_allow_huge && !(vm_flags & VM_NO_HUGE_VMAP) &&
+			arch_vmap_pmd_supported(prot)) {
+		unsigned long size_per_node;
 
-	area = __get_vm_area_node(real_size, align, VM_ALLOC | VM_UNINITIALIZED |
+		/*
+		 * Try huge pages. Only try for PAGE_KERNEL allocations,
+		 * others like modules don't yet expect huge pages in
+		 * their allocations due to apply_to_page_range not
+		 * supporting them.
+		 */
+
+		size_per_node = size;
+		if (node == NUMA_NO_NODE)
+			size_per_node /= num_online_nodes();
+		if (size_per_node >= PMD_SIZE) {
+			shift = PMD_SHIFT;
+			align = max(real_align, 1UL << shift);
+			size = ALIGN(real_size, 1UL << shift);
+		}
+	}
+
+again:
+	size = PAGE_ALIGN(size);
+	area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
 				vm_flags, start, end, node, gfp_mask, caller);
 	if (!area)
 		goto fail;
 
-	addr = __vmalloc_area_node(area, gfp_mask, prot, node);
+	addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
 	if (!addr)
-		return NULL;
+		goto fail;
 
 	/*
 	 * In this function, newly allocated vm_struct has VM_UNINITIALIZED
@@ -2812,8 +2928,18 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
 	return addr;
 
 fail:
-	warn_alloc(gfp_mask, NULL,
+	if (shift > PAGE_SHIFT) {
+		shift = PAGE_SHIFT;
+		align = real_align;
+		size = real_size;
+		goto again;
+	}
+
+	if (!area) {
+		/* Warn for area allocation, page allocations already warn */
+		warn_alloc(gfp_mask, NULL,
 			  "vmalloc: allocation failure: %lu bytes", real_size);
+	}
 	return NULL;
 }
 
-- 
cgit v1.2.3


From b67177ecd956333029dbc1a4971a857fee0ccbb1 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Thu, 29 Apr 2021 22:58:53 -0700
Subject: mm/vmalloc: remove map_kernel_range
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "mm/vmalloc: cleanup after hugepage series", v2.

Christoph pointed out some overdue cleanups required after the huge
vmalloc series, and I had another failure error message improvement as
well.

This patch (of 5):

map_kernel_range is a shim around vmap_pages_range; get rid of it.

Move the main API comment from the _noflush variant to the normal variant,
and make _noflush internal to mm/.

Link: https://lkml.kernel.org/r/20210322021806.892164-1-npiggin@gmail.com
Link: https://lkml.kernel.org/r/20210322021806.892164-2-npiggin@gmail.com
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Uladzislau Rezki <urezki@gmail.com>
Cc: Cédric Le Goater <clg@kaod.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/core-api/cachetlb.rst |  2 +-
 include/linux/vmalloc.h             | 11 -------
 mm/internal.h                       |  6 ++++
 mm/percpu-vm.c                      |  5 +--
 mm/vmalloc.c                        | 65 ++++++++++++++++---------------------
 5 files changed, 38 insertions(+), 51 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/core-api/cachetlb.rst b/Documentation/core-api/cachetlb.rst
index a1582cc79f0f..756f7bcf8191 100644
--- a/Documentation/core-api/cachetlb.rst
+++ b/Documentation/core-api/cachetlb.rst
@@ -213,7 +213,7 @@ Here are the routines, one by one:
 	there will be no entries in the cache for the kernel address
 	space for virtual addresses in the range 'start' to 'end-1'.
 
-	The first of these two routines is invoked after map_kernel_range()
+	The first of these two routines is invoked after vmap_range()
 	has installed the page table entries.  The second is invoked
 	before unmap_kernel_range() deletes the page table entries.
 
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index b4c82f2d40dc..fb3b9989a4c5 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -212,10 +212,6 @@ static inline bool is_vm_area_hugepages(const void *addr)
 int vmap_range(unsigned long addr, unsigned long end,
 			phys_addr_t phys_addr, pgprot_t prot,
 			unsigned int max_page_shift);
-extern int map_kernel_range_noflush(unsigned long start, unsigned long size,
-				    pgprot_t prot, struct page **pages);
-int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot,
-		struct page **pages);
 extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size);
 extern void unmap_kernel_range(unsigned long addr, unsigned long size);
 static inline void set_vm_flush_reset_perms(void *addr)
@@ -227,13 +223,6 @@ static inline void set_vm_flush_reset_perms(void *addr)
 }
 
 #else
-static inline int
-map_kernel_range_noflush(unsigned long start, unsigned long size,
-			pgprot_t prot, struct page **pages)
-{
-	return size >> PAGE_SHIFT;
-}
-#define map_kernel_range map_kernel_range_noflush
 static inline void
 unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
 {
diff --git a/mm/internal.h b/mm/internal.h
index bbe900f9f095..58c3757c52d9 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -637,4 +637,10 @@ struct migration_target_control {
 	gfp_t gfp_mask;
 };
 
+/*
+ * mm/vmalloc.c
+ */
+int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
+                pgprot_t prot, struct page **pages, unsigned int page_shift);
+
 #endif	/* __MM_INTERNAL_H */
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index e46f7a6917f9..88a53eb68a94 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -8,6 +8,7 @@
  * Chunks are mapped into vmalloc areas and populated page by page.
  * This is the default chunk allocator.
  */
+#include "internal.h"
 
 static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
 				    unsigned int cpu, int page_idx)
@@ -192,8 +193,8 @@ static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
 static int __pcpu_map_pages(unsigned long addr, struct page **pages,
 			    int nr_pages)
 {
-	return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT,
-					PAGE_KERNEL, pages);
+	return vmap_pages_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT),
+					PAGE_KERNEL, pages, PAGE_SHIFT);
 }
 
 /**
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 59c815eb7e74..527781a3a0fe 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -523,7 +523,16 @@ static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
 	return 0;
 }
 
-static int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
+/*
+ * vmap_pages_range_noflush is similar to vmap_pages_range, but does not
+ * flush caches.
+ *
+ * The caller is responsible for calling flush_cache_vmap() after this
+ * function returns successfully and before the addresses are accessed.
+ *
+ * This is an internal function only. Do not use outside mm/.
+ */
+int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
 		pgprot_t prot, struct page **pages, unsigned int page_shift)
 {
 	unsigned int i, nr = (end - addr) >> PAGE_SHIFT;
@@ -549,48 +558,26 @@ static int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
 	return 0;
 }
 
-static int vmap_pages_range(unsigned long addr, unsigned long end,
-		pgprot_t prot, struct page **pages, unsigned int page_shift)
-{
-	int err;
-
-	err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
-	flush_cache_vmap(addr, end);
-	return err;
-}
-
 /**
- * map_kernel_range_noflush - map kernel VM area with the specified pages
+ * vmap_pages_range - map pages to a kernel virtual address
  * @addr: start of the VM area to map
- * @size: size of the VM area to map
+ * @end: end of the VM area to map (non-inclusive)
  * @prot: page protection flags to use
- * @pages: pages to map
- *
- * Map PFN_UP(@size) pages at @addr.  The VM area @addr and @size specify should
- * have been allocated using get_vm_area() and its friends.
- *
- * NOTE:
- * This function does NOT do any cache flushing.  The caller is responsible for
- * calling flush_cache_vmap() on to-be-mapped areas before calling this
- * function.
+ * @pages: pages to map (always PAGE_SIZE pages)
+ * @page_shift: maximum shift that the pages may be mapped with, @pages must
+ * be aligned and contiguous up to at least this shift.
  *
  * RETURNS:
  * 0 on success, -errno on failure.
  */
-int map_kernel_range_noflush(unsigned long addr, unsigned long size,
-			     pgprot_t prot, struct page **pages)
-{
-	return vmap_pages_range_noflush(addr, addr + size, prot, pages, PAGE_SHIFT);
-}
-
-int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot,
-		struct page **pages)
+static int vmap_pages_range(unsigned long addr, unsigned long end,
+		pgprot_t prot, struct page **pages, unsigned int page_shift)
 {
-	int ret;
+	int err;
 
-	ret = map_kernel_range_noflush(start, size, prot, pages);
-	flush_cache_vmap(start, start + size);
-	return ret;
+	err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
+	flush_cache_vmap(addr, end);
+	return err;
 }
 
 int is_vmalloc_or_module_addr(const void *x)
@@ -2156,10 +2143,12 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
 
 	kasan_unpoison_vmalloc(mem, size);
 
-	if (map_kernel_range(addr, size, PAGE_KERNEL, pages) < 0) {
+	if (vmap_pages_range(addr, addr + size, PAGE_KERNEL,
+				pages, PAGE_SHIFT) < 0) {
 		vm_unmap_ram(mem, count);
 		return NULL;
 	}
+
 	return mem;
 }
 EXPORT_SYMBOL(vm_map_ram);
@@ -2703,6 +2692,7 @@ void *vmap(struct page **pages, unsigned int count,
 	   unsigned long flags, pgprot_t prot)
 {
 	struct vm_struct *area;
+	unsigned long addr;
 	unsigned long size;		/* In bytes */
 
 	might_sleep();
@@ -2715,8 +2705,9 @@ void *vmap(struct page **pages, unsigned int count,
 	if (!area)
 		return NULL;
 
-	if (map_kernel_range((unsigned long)area->addr, size, pgprot_nx(prot),
-			pages) < 0) {
+	addr = (unsigned long)area->addr;
+	if (vmap_pages_range(addr, addr + size, pgprot_nx(prot),
+				pages, PAGE_SHIFT) < 0) {
 		vunmap(area->addr);
 		return NULL;
 	}
-- 
cgit v1.2.3


From 4ad0ae8c64ac8f81a3651bca11be7c3cb086df80 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Thu, 29 Apr 2021 22:59:01 -0700
Subject: mm/vmalloc: remove unmap_kernel_range
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

unmap_kernel_range is a shim around vunmap_range; get rid of it.

Move the main API comment from the _noflush variant to the normal
variant, and make _noflush internal to mm/.

[npiggin@gmail.com: fix nommu builds and a comment bug per sfr]
  Link: https://lkml.kernel.org/r/1617292598.m6g0knx24s.astroid@bobo.none
[akpm@linux-foundation.org: move vunmap_range_noflush() stub inside !CONFIG_MMU, not !CONFIG_NUMA]
[npiggin@gmail.com: fix nommu builds]
  Link: https://lkml.kernel.org/r/1617292497.o1uhq5ipxp.astroid@bobo.none

Link: https://lkml.kernel.org/r/20210322021806.892164-5-npiggin@gmail.com
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Cédric Le Goater <clg@kaod.org>
Cc: Uladzislau Rezki <urezki@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/core-api/cachetlb.rst |  2 +-
 arch/arm64/mm/init.c                |  2 +-
 arch/powerpc/kernel/isa-bridge.c    |  4 +--
 arch/powerpc/kernel/pci_64.c        |  2 +-
 arch/powerpc/mm/ioremap.c           |  2 +-
 drivers/pci/pci.c                   |  2 +-
 include/linux/vmalloc.h             |  8 +----
 mm/internal.h                       | 15 +++++++++-
 mm/percpu-vm.c                      |  2 +-
 mm/vmalloc.c                        | 59 ++++++++++++++++++-------------------
 10 files changed, 51 insertions(+), 47 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/core-api/cachetlb.rst b/Documentation/core-api/cachetlb.rst
index 756f7bcf8191..fe4290e26729 100644
--- a/Documentation/core-api/cachetlb.rst
+++ b/Documentation/core-api/cachetlb.rst
@@ -215,7 +215,7 @@ Here are the routines, one by one:
 
 	The first of these two routines is invoked after vmap_range()
 	has installed the page table entries.  The second is invoked
-	before unmap_kernel_range() deletes the page table entries.
+	before vunmap_range() deletes the page table entries.
 
 There exists another whole class of cpu cache issues which currently
 require a whole different set of interfaces to handle properly.
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 3685e12aba9b..470f92e6a542 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -521,7 +521,7 @@ void free_initmem(void)
 	 * prevents the region from being reused for kernel modules, which
 	 * is not supported by kallsyms.
 	 */
-	unmap_kernel_range((u64)__init_begin, (u64)(__init_end - __init_begin));
+	vunmap_range((u64)__init_begin, (u64)__init_end);
 }
 
 void dump_mem_limit(void)
diff --git a/arch/powerpc/kernel/isa-bridge.c b/arch/powerpc/kernel/isa-bridge.c
index 2257d24e6a26..39c625737c09 100644
--- a/arch/powerpc/kernel/isa-bridge.c
+++ b/arch/powerpc/kernel/isa-bridge.c
@@ -48,7 +48,7 @@ static void remap_isa_base(phys_addr_t pa, unsigned long size)
 	if (slab_is_available()) {
 		if (ioremap_page_range(ISA_IO_BASE, ISA_IO_BASE + size, pa,
 				pgprot_noncached(PAGE_KERNEL)))
-			unmap_kernel_range(ISA_IO_BASE, size);
+			vunmap_range(ISA_IO_BASE, ISA_IO_BASE + size);
 	} else {
 		early_ioremap_range(ISA_IO_BASE, pa, size,
 				pgprot_noncached(PAGE_KERNEL));
@@ -311,7 +311,7 @@ static void isa_bridge_remove(void)
 	isa_bridge_pcidev = NULL;
 
 	/* Unmap the ISA area */
-	unmap_kernel_range(ISA_IO_BASE, 0x10000);
+	vunmap_range(ISA_IO_BASE, ISA_IO_BASE + 0x10000);
 }
 
 /**
diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c
index 9312e6eda7ff..3fb7e572abed 100644
--- a/arch/powerpc/kernel/pci_64.c
+++ b/arch/powerpc/kernel/pci_64.c
@@ -140,7 +140,7 @@ void __iomem *ioremap_phb(phys_addr_t paddr, unsigned long size)
 	addr = (unsigned long)area->addr;
 	if (ioremap_page_range(addr, addr + size, paddr,
 			pgprot_noncached(PAGE_KERNEL))) {
-		unmap_kernel_range(addr, size);
+		vunmap_range(addr, addr + size);
 		return NULL;
 	}
 
diff --git a/arch/powerpc/mm/ioremap.c b/arch/powerpc/mm/ioremap.c
index b1a0aebe8c48..57342154d2b0 100644
--- a/arch/powerpc/mm/ioremap.c
+++ b/arch/powerpc/mm/ioremap.c
@@ -93,7 +93,7 @@ void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size,
 	if (!ret)
 		return (void __iomem *)area->addr + offset;
 
-	unmap_kernel_range(va, size);
+	vunmap_range(va, va + size);
 	free_vm_area(area);
 
 	return NULL;
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index e4d4e399004b..f4c26e6118ea 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4102,7 +4102,7 @@ void pci_unmap_iospace(struct resource *res)
 #if defined(PCI_IOBASE) && defined(CONFIG_MMU)
 	unsigned long vaddr = (unsigned long)PCI_IOBASE + res->start;
 
-	unmap_kernel_range(vaddr, resource_size(res));
+	vunmap_range(vaddr, vaddr + resource_size(res));
 #endif
 }
 EXPORT_SYMBOL(pci_unmap_iospace);
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index fb3b9989a4c5..394d03cc0e92 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -212,8 +212,7 @@ static inline bool is_vm_area_hugepages(const void *addr)
 int vmap_range(unsigned long addr, unsigned long end,
 			phys_addr_t phys_addr, pgprot_t prot,
 			unsigned int max_page_shift);
-extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size);
-extern void unmap_kernel_range(unsigned long addr, unsigned long size);
+void vunmap_range(unsigned long addr, unsigned long end);
 static inline void set_vm_flush_reset_perms(void *addr)
 {
 	struct vm_struct *vm = find_vm_area(addr);
@@ -223,11 +222,6 @@ static inline void set_vm_flush_reset_perms(void *addr)
 }
 
 #else
-static inline void
-unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
-{
-}
-#define unmap_kernel_range unmap_kernel_range_noflush
 static inline void set_vm_flush_reset_perms(void *addr)
 {
 }
diff --git a/mm/internal.h b/mm/internal.h
index 58c3757c52d9..42e30e71554a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -446,7 +446,9 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
 static inline void clear_page_mlock(struct page *page) { }
 static inline void mlock_vma_page(struct page *page) { }
 static inline void mlock_migrate_page(struct page *new, struct page *old) { }
-
+static inline void vunmap_range_noflush(unsigned long start, unsigned long end)
+{
+}
 #endif /* !CONFIG_MMU */
 
 /*
@@ -640,7 +642,18 @@ struct migration_target_control {
 /*
  * mm/vmalloc.c
  */
+#ifdef CONFIG_MMU
 int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
                 pgprot_t prot, struct page **pages, unsigned int page_shift);
+#else
+static inline
+int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
+                pgprot_t prot, struct page **pages, unsigned int page_shift)
+{
+	return -EINVAL;
+}
+#endif
+
+void vunmap_range_noflush(unsigned long start, unsigned long end);
 
 #endif	/* __MM_INTERNAL_H */
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 88a53eb68a94..8d3844bc0c7c 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -134,7 +134,7 @@ static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
 
 static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
 {
-	unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT);
+	vunmap_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT));
 }
 
 /**
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 527781a3a0fe..f7a53c19e84b 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -378,22 +378,20 @@ static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
 	} while (p4d++, addr = next, addr != end);
 }
 
-/**
- * unmap_kernel_range_noflush - unmap kernel VM area
- * @start: start of the VM area to unmap
- * @size: size of the VM area to unmap
+/*
+ * vunmap_range_noflush is similar to vunmap_range, but does not
+ * flush caches or TLBs.
  *
- * Unmap PFN_UP(@size) pages at @addr.  The VM area @addr and @size specify
- * should have been allocated using get_vm_area() and its friends.
+ * The caller is responsible for calling flush_cache_vunmap() before calling
+ * this function, and flush_tlb_kernel_range after it has returned
+ * successfully (and before the addresses are expected to cause a page fault
+ * or be re-mapped for something else, if TLB flushes are being delayed or
+ * coalesced).
  *
- * NOTE:
- * This function does NOT do any cache flushing.  The caller is responsible
- * for calling flush_cache_vunmap() on to-be-mapped areas before calling this
- * function and flush_tlb_kernel_range() after.
+ * This is an internal function only. Do not use outside mm/.
  */
-void unmap_kernel_range_noflush(unsigned long start, unsigned long size)
+void vunmap_range_noflush(unsigned long start, unsigned long end)
 {
-	unsigned long end = start + size;
 	unsigned long next;
 	pgd_t *pgd;
 	unsigned long addr = start;
@@ -414,6 +412,22 @@ void unmap_kernel_range_noflush(unsigned long start, unsigned long size)
 		arch_sync_kernel_mappings(start, end);
 }
 
+/**
+ * vunmap_range - unmap kernel virtual addresses
+ * @addr: start of the VM area to unmap
+ * @end: end of the VM area to unmap (non-inclusive)
+ *
+ * Clears any present PTEs in the virtual address range, flushes TLBs and
+ * caches. Any subsequent access to the address before it has been re-mapped
+ * is a kernel bug.
+ */
+void vunmap_range(unsigned long addr, unsigned long end)
+{
+	flush_cache_vunmap(addr, end);
+	vunmap_range_noflush(addr, end);
+	flush_tlb_kernel_range(addr, end);
+}
+
 static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
 		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
 		pgtbl_mod_mask *mask)
@@ -1712,7 +1726,7 @@ static void free_vmap_area_noflush(struct vmap_area *va)
 static void free_unmap_vmap_area(struct vmap_area *va)
 {
 	flush_cache_vunmap(va->va_start, va->va_end);
-	unmap_kernel_range_noflush(va->va_start, va->va_end - va->va_start);
+	vunmap_range_noflush(va->va_start, va->va_end);
 	if (debug_pagealloc_enabled_static())
 		flush_tlb_kernel_range(va->va_start, va->va_end);
 
@@ -1990,7 +2004,7 @@ static void vb_free(unsigned long addr, unsigned long size)
 	offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
 	vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr));
 
-	unmap_kernel_range_noflush(addr, size);
+	vunmap_range_noflush(addr, addr + size);
 
 	if (debug_pagealloc_enabled_static())
 		flush_tlb_kernel_range(addr, addr + size);
@@ -2307,23 +2321,6 @@ void __init vmalloc_init(void)
 	vmap_initialized = true;
 }
 
-/**
- * unmap_kernel_range - unmap kernel VM area and flush cache and TLB
- * @addr: start of the VM area to unmap
- * @size: size of the VM area to unmap
- *
- * Similar to unmap_kernel_range_noflush() but flushes vcache before
- * the unmapping and tlb after.
- */
-void unmap_kernel_range(unsigned long addr, unsigned long size)
-{
-	unsigned long end = addr + size;
-
-	flush_cache_vunmap(addr, end);
-	unmap_kernel_range_noflush(addr, size);
-	flush_tlb_kernel_range(addr, end);
-}
-
 static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
 	struct vmap_area *va, unsigned long flags, const void *caller)
 {
-- 
cgit v1.2.3


From 78f4841e34763079be0661744c1ca997be64eb56 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 29 Apr 2021 22:59:25 -0700
Subject: mm/doc: fix fault_flag_allow_retry_first kerneldoc

make htmldocs reports:

  include/linux/mm.h:496: warning: Function parameter or member 'flags' not described in 'fault_flag_allow_retry_first'

Add a description.

Link: https://lkml.kernel.org/r/20210322195022.2143603-1-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 93097dbd9604..382c33e7d906 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -485,6 +485,7 @@ extern pgprot_t protection_map[16];
 
 /**
  * fault_flag_allow_retry_first - check ALLOW_RETRY the first time
+ * @flags: Fault flags.
  *
  * This is mostly used for places where we want to try to avoid taking
  * the mmap_lock for too long a time when waiting for another condition
-- 
cgit v1.2.3


From 136dfc9949f84089217f84e6478471dabbf14ba7 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 29 Apr 2021 22:59:28 -0700
Subject: mm/doc: fix page_maybe_dma_pinned kerneldoc

make htmldocs reports:

  include/linux/mm.h:1341: warning: Excess function parameter 'Return' description in 'page_maybe_dma_pinned'

Fix a few other formatting nits while I'm editing this description.

Link: https://lkml.kernel.org/r/20210322195022.2143603-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 382c33e7d906..5acda8761935 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1271,10 +1271,11 @@ void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
 void unpin_user_pages(struct page **pages, unsigned long npages);
 
 /**
- * page_maybe_dma_pinned() - report if a page is pinned for DMA.
+ * page_maybe_dma_pinned - Report if a page is pinned for DMA.
+ * @page: The page.
  *
  * This function checks if a page has been pinned via a call to
- * pin_user_pages*().
+ * a function in the pin_user_pages() family.
  *
  * For non-huge pages, the return value is partially fuzzy: false is not fuzzy,
  * because it means "definitely not pinned for DMA", but true means "probably
@@ -1292,9 +1293,8 @@ void unpin_user_pages(struct page **pages, unsigned long npages);
  *
  * For more information, please see Documentation/core-api/pin_user_pages.rst.
  *
- * @page:	pointer to page to be queried.
- * @Return:	True, if it is likely that the page has been "dma-pinned".
- *		False, if the page is definitely not dma-pinned.
+ * Return: True, if it is likely that the page has been "dma-pinned".
+ * False, if the page is definitely not dma-pinned.
  */
 static inline bool page_maybe_dma_pinned(struct page *page)
 {
-- 
cgit v1.2.3


From da2f5eb3d344503c4d851bdf1ae2379167074413 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 29 Apr 2021 22:59:31 -0700
Subject: mm/doc: turn fault flags into an enum

The kernel-doc script complains about

  include/linux/mm.h:425: warning: wrong kernel-doc identifier on line:
   * Fault flag definitions.

I don't know how to document a series of #defines, so turn these
definitions into an enum and document that instead.

Link: https://lkml.kernel.org/r/20210322195022.2143603-3-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5acda8761935..64b2e3a0b94d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -432,8 +432,7 @@ extern unsigned int kobjsize(const void *objp);
 extern pgprot_t protection_map[16];
 
 /**
- * Fault flag definitions.
- *
+ * enum fault_flag - Fault flag definitions.
  * @FAULT_FLAG_WRITE: Fault was a write fault.
  * @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE.
  * @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked.
@@ -464,16 +463,18 @@ extern pgprot_t protection_map[16];
  * signals before a retry to make sure the continuous page faults can still be
  * interrupted if necessary.
  */
-#define FAULT_FLAG_WRITE			0x01
-#define FAULT_FLAG_MKWRITE			0x02
-#define FAULT_FLAG_ALLOW_RETRY			0x04
-#define FAULT_FLAG_RETRY_NOWAIT			0x08
-#define FAULT_FLAG_KILLABLE			0x10
-#define FAULT_FLAG_TRIED			0x20
-#define FAULT_FLAG_USER				0x40
-#define FAULT_FLAG_REMOTE			0x80
-#define FAULT_FLAG_INSTRUCTION  		0x100
-#define FAULT_FLAG_INTERRUPTIBLE		0x200
+enum fault_flag {
+	FAULT_FLAG_WRITE =		1 << 0,
+	FAULT_FLAG_MKWRITE =		1 << 1,
+	FAULT_FLAG_ALLOW_RETRY =	1 << 2,
+	FAULT_FLAG_RETRY_NOWAIT = 	1 << 3,
+	FAULT_FLAG_KILLABLE =		1 << 4,
+	FAULT_FLAG_TRIED = 		1 << 5,
+	FAULT_FLAG_USER =		1 << 6,
+	FAULT_FLAG_REMOTE =		1 << 7,
+	FAULT_FLAG_INSTRUCTION =	1 << 8,
+	FAULT_FLAG_INTERRUPTIBLE =	1 << 9,
+};
 
 /*
  * The default fault flags that should be used by most of the
@@ -496,7 +497,7 @@ extern pgprot_t protection_map[16];
  * Return: true if the page fault allows retry and this is the first
  * attempt of the fault handling; false otherwise.
  */
-static inline bool fault_flag_allow_retry_first(unsigned int flags)
+static inline bool fault_flag_allow_retry_first(enum fault_flag flags)
 {
 	return (flags & FAULT_FLAG_ALLOW_RETRY) &&
 	    (!(flags & FAULT_FLAG_TRIED));
@@ -531,7 +532,7 @@ struct vm_fault {
 		pgoff_t pgoff;			/* Logical page offset based on vma */
 		unsigned long address;		/* Faulting virtual address */
 	};
-	unsigned int flags;		/* FAULT_FLAG_xxx flags
+	enum fault_flag flags;		/* FAULT_FLAG_xxx flags
 					 * XXX: should really be 'const' */
 	pmd_t *pmd;			/* Pointer to pmd entry matching
 					 * the 'address' */
-- 
cgit v1.2.3


From 91ab1a41191ef2d4c6e123951a0f0c3876bd9376 Mon Sep 17 00:00:00 2001
From: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Date: Thu, 29 Apr 2021 22:59:40 -0700
Subject: pagewalk: prefix struct kernel-doc descriptions

The script './scripts/kernel-doc -none ./include/linux/pagewalk.h' reports:

  include/linux/pagewalk.h:37: warning: cannot understand function prototype: 'struct mm_walk_ops '
  include/linux/pagewalk.h:85: warning: cannot understand function prototype: 'struct mm_walk '

A kernel-doc description for a structure requires the struct name to be
prefixed with the keyword 'struct'.  So, do that so that no further
kernel-doc warnings are reported for this file.

Link: https://lkml.kernel.org/r/20210322122542.15072-3-lukas.bulwahn@gmail.com
Signed-off-by: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Cc: Joe Perches <joe@perches.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Ralf Ramsauer <ralf.ramsauer@oth-regensburg.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagewalk.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
index b1cb6b753abb..ac7b38ad5903 100644
--- a/include/linux/pagewalk.h
+++ b/include/linux/pagewalk.h
@@ -7,7 +7,7 @@
 struct mm_walk;
 
 /**
- * mm_walk_ops - callbacks for walk_page_range
+ * struct mm_walk_ops - callbacks for walk_page_range
  * @pgd_entry:		if set, called for each non-empty PGD (top-level) entry
  * @p4d_entry:		if set, called for each non-empty P4D entry
  * @pud_entry:		if set, called for each non-empty PUD entry
@@ -71,7 +71,7 @@ enum page_walk_action {
 };
 
 /**
- * mm_walk - walk_page_range data
+ * struct mm_walk - walk_page_range data
  * @ops:	operation to call during the walk
  * @mm:		mm_struct representing the target process of page table walk
  * @pgd:	pointer to PGD; only valid with no_vma (otherwise set to NULL)
-- 
cgit v1.2.3


From a064cb00d359bc464df6fd2ab6dfb8dc4b31e361 Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Thu, 29 Apr 2021 22:59:49 -0700
Subject: kasan: initialize shadow to TAG_INVALID for SW_TAGS

Currently, KASAN_SW_TAGS uses 0xFF as the default tag value for
unallocated memory.  The underlying idea is that since that memory hasn't
been allocated yet, it's only supposed to be dereferenced through a
pointer with the native 0xFF tag.

While this is a good idea in terms of consistency, practically it doesn't
bring any benefit.  Since the 0xFF pointer tag is a match-all tag, it
doesn't matter what tag the accessed memory has.  No accesses through
0xFF-tagged pointers are considered buggy by KASAN.

This patch changes the default tag value for unallocated memory to 0xFE,
which is the tag KASAN uses for inaccessible memory.  This doesn't affect
accesses through 0xFF-tagged pointer to this memory, but this allows KASAN
to detect wild and large out-of-bounds invalid memory accesses through
otherwise-tagged pointers.

This is a preparatory patch for the next one, which changes the tag-based
KASAN modes to not poison the boot memory.

Link: https://lkml.kernel.org/r/c8e93571c18b3528aac5eb33ade213bf133d10ad.1613692950.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Branislav Rankov <Branislav.Rankov@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Kevin Brodsky <kevin.brodsky@arm.com>
Cc: Marco Elver <elver@google.com>
Cc: Peter Collingbourne <pcc@google.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kasan.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index d53ea3c047bc..9f5faefd1744 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -30,7 +30,8 @@ struct kunit_kasan_expectation {
 /* Software KASAN implementations use shadow memory. */
 
 #ifdef CONFIG_KASAN_SW_TAGS
-#define KASAN_SHADOW_INIT 0xFF
+/* This matches KASAN_TAG_INVALID. */
+#define KASAN_SHADOW_INIT 0xFE
 #else
 #define KASAN_SHADOW_INIT 0
 #endif
-- 
cgit v1.2.3


From 1bb5eab30d68c1a3d9dbc822e1895e6c06dbe748 Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Thu, 29 Apr 2021 23:00:02 -0700
Subject: kasan, mm: integrate page_alloc init with HW_TAGS

This change uses the previously added memory initialization feature of
HW_TAGS KASAN routines for page_alloc memory when init_on_alloc/free is
enabled.

With this change, kernel_init_free_pages() is no longer called when both
HW_TAGS KASAN and init_on_alloc/free are enabled.  Instead, memory is
initialized in KASAN runtime.

To avoid discrepancies with which memory gets initialized that can be
caused by future changes, both KASAN and kernel_init_free_pages() hooks
are put together and a warning comment is added.

This patch changes the order in which memory initialization and page
poisoning hooks are called.  This doesn't lead to any side-effects, as
whenever page poisoning is enabled, memory initialization gets disabled.

Combining setting allocation tags with memory initialization improves
HW_TAGS KASAN performance when init_on_alloc/free is enabled.

[andreyknvl@google.com: fix for "integrate page_alloc init with HW_TAGS"]
  Link: https://lkml.kernel.org/r/65b6028dea2e9a6e8e2cb779b5115c09457363fc.1617122211.git.andreyknvl@google.com

Link: https://lkml.kernel.org/r/e77f0d5b1b20658ef0b8288625c74c2b3690e725.1615296150.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Reviewed-by: Marco Elver <elver@google.com>
Tested-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Sergei Trofimovich <slyfox@gentoo.org>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Branislav Rankov <Branislav.Rankov@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Kevin Brodsky <kevin.brodsky@arm.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Peter Collingbourne <pcc@google.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kasan.h | 30 ++++++++++++++++++++++--------
 mm/kasan/common.c     |  8 ++++----
 mm/mempool.c          |  4 ++--
 mm/page_alloc.c       | 41 +++++++++++++++++++++++++++++++----------
 4 files changed, 59 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 9f5faefd1744..30aa2bee8400 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -96,6 +96,11 @@ static __always_inline bool kasan_enabled(void)
 	return static_branch_likely(&kasan_flag_enabled);
 }
 
+static inline bool kasan_has_integrated_init(void)
+{
+	return kasan_enabled();
+}
+
 #else /* CONFIG_KASAN_HW_TAGS */
 
 static inline bool kasan_enabled(void)
@@ -103,6 +108,11 @@ static inline bool kasan_enabled(void)
 	return true;
 }
 
+static inline bool kasan_has_integrated_init(void)
+{
+	return false;
+}
+
 #endif /* CONFIG_KASAN_HW_TAGS */
 
 slab_flags_t __kasan_never_merge(void);
@@ -120,20 +130,20 @@ static __always_inline void kasan_unpoison_range(const void *addr, size_t size)
 		__kasan_unpoison_range(addr, size);
 }
 
-void __kasan_alloc_pages(struct page *page, unsigned int order);
+void __kasan_alloc_pages(struct page *page, unsigned int order, bool init);
 static __always_inline void kasan_alloc_pages(struct page *page,
-						unsigned int order)
+						unsigned int order, bool init)
 {
 	if (kasan_enabled())
-		__kasan_alloc_pages(page, order);
+		__kasan_alloc_pages(page, order, init);
 }
 
-void __kasan_free_pages(struct page *page, unsigned int order);
+void __kasan_free_pages(struct page *page, unsigned int order, bool init);
 static __always_inline void kasan_free_pages(struct page *page,
-						unsigned int order)
+						unsigned int order, bool init)
 {
 	if (kasan_enabled())
-		__kasan_free_pages(page, order);
+		__kasan_free_pages(page, order, init);
 }
 
 void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
@@ -277,13 +287,17 @@ static inline bool kasan_enabled(void)
 {
 	return false;
 }
+static inline bool kasan_has_integrated_init(void)
+{
+	return false;
+}
 static inline slab_flags_t kasan_never_merge(void)
 {
 	return 0;
 }
 static inline void kasan_unpoison_range(const void *address, size_t size) {}
-static inline void kasan_alloc_pages(struct page *page, unsigned int order) {}
-static inline void kasan_free_pages(struct page *page, unsigned int order) {}
+static inline void kasan_alloc_pages(struct page *page, unsigned int order, bool init) {}
+static inline void kasan_free_pages(struct page *page, unsigned int order, bool init) {}
 static inline void kasan_cache_create(struct kmem_cache *cache,
 				      unsigned int *size,
 				      slab_flags_t *flags) {}
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index b3d8eb154f59..efe58e58cc93 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -97,7 +97,7 @@ slab_flags_t __kasan_never_merge(void)
 	return 0;
 }
 
-void __kasan_alloc_pages(struct page *page, unsigned int order)
+void __kasan_alloc_pages(struct page *page, unsigned int order, bool init)
 {
 	u8 tag;
 	unsigned long i;
@@ -108,14 +108,14 @@ void __kasan_alloc_pages(struct page *page, unsigned int order)
 	tag = kasan_random_tag();
 	for (i = 0; i < (1 << order); i++)
 		page_kasan_tag_set(page + i, tag);
-	kasan_unpoison(page_address(page), PAGE_SIZE << order, false);
+	kasan_unpoison(page_address(page), PAGE_SIZE << order, init);
 }
 
-void __kasan_free_pages(struct page *page, unsigned int order)
+void __kasan_free_pages(struct page *page, unsigned int order, bool init)
 {
 	if (likely(!PageHighMem(page)))
 		kasan_poison(page_address(page), PAGE_SIZE << order,
-			     KASAN_FREE_PAGE, false);
+			     KASAN_FREE_PAGE, init);
 }
 
 /*
diff --git a/mm/mempool.c b/mm/mempool.c
index 79959fac27d7..fe19d290a301 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -106,7 +106,7 @@ static __always_inline void kasan_poison_element(mempool_t *pool, void *element)
 	if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
 		kasan_slab_free_mempool(element);
 	else if (pool->alloc == mempool_alloc_pages)
-		kasan_free_pages(element, (unsigned long)pool->pool_data);
+		kasan_free_pages(element, (unsigned long)pool->pool_data, false);
 }
 
 static void kasan_unpoison_element(mempool_t *pool, void *element)
@@ -114,7 +114,7 @@ static void kasan_unpoison_element(mempool_t *pool, void *element)
 	if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
 		kasan_unpoison_range(element, __ksize(element));
 	else if (pool->alloc == mempool_alloc_pages)
-		kasan_alloc_pages(element, (unsigned long)pool->pool_data);
+		kasan_alloc_pages(element, (unsigned long)pool->pool_data, false);
 }
 
 static __always_inline void add_element(mempool_t *pool, void *element)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 342cabf912a7..6314de5387f5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -397,14 +397,14 @@ static DEFINE_STATIC_KEY_TRUE(deferred_pages);
  * initialization is done, but this is not likely to happen.
  */
 static inline void kasan_free_nondeferred_pages(struct page *page, int order,
-							fpi_t fpi_flags)
+						bool init, fpi_t fpi_flags)
 {
 	if (static_branch_unlikely(&deferred_pages))
 		return;
 	if (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
 			(fpi_flags & FPI_SKIP_KASAN_POISON))
 		return;
-	kasan_free_pages(page, order);
+	kasan_free_pages(page, order, init);
 }
 
 /* Returns true if the struct page for the pfn is uninitialised */
@@ -456,12 +456,12 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
 }
 #else
 static inline void kasan_free_nondeferred_pages(struct page *page, int order,
-							fpi_t fpi_flags)
+						bool init, fpi_t fpi_flags)
 {
 	if (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
 			(fpi_flags & FPI_SKIP_KASAN_POISON))
 		return;
-	kasan_free_pages(page, order);
+	kasan_free_pages(page, order, init);
 }
 
 static inline bool early_page_uninitialised(unsigned long pfn)
@@ -1243,6 +1243,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
 			unsigned int order, bool check_free, fpi_t fpi_flags)
 {
 	int bad = 0;
+	bool init;
 
 	VM_BUG_ON_PAGE(PageTail(page), page);
 
@@ -1300,16 +1301,21 @@ static __always_inline bool free_pages_prepare(struct page *page,
 		debug_check_no_obj_freed(page_address(page),
 					   PAGE_SIZE << order);
 	}
-	if (want_init_on_free())
-		kernel_init_free_pages(page, 1 << order);
 
 	kernel_poison_pages(page, 1 << order);
 
 	/*
+	 * As memory initialization might be integrated into KASAN,
+	 * kasan_free_pages and kernel_init_free_pages must be
+	 * kept together to avoid discrepancies in behavior.
+	 *
 	 * With hardware tag-based KASAN, memory tags must be set before the
 	 * page becomes unavailable via debug_pagealloc or arch_free_page.
 	 */
-	kasan_free_nondeferred_pages(page, order, fpi_flags);
+	init = want_init_on_free();
+	if (init && !kasan_has_integrated_init())
+		kernel_init_free_pages(page, 1 << order);
+	kasan_free_nondeferred_pages(page, order, init, fpi_flags);
 
 	/*
 	 * arch_free_page() can make the page's contents inaccessible.  s390
@@ -2316,17 +2322,32 @@ static bool check_new_pages(struct page *page, unsigned int order)
 inline void post_alloc_hook(struct page *page, unsigned int order,
 				gfp_t gfp_flags)
 {
+	bool init;
+
 	set_page_private(page, 0);
 	set_page_refcounted(page);
 
 	arch_alloc_page(page, order);
 	debug_pagealloc_map_pages(page, 1 << order);
-	kasan_alloc_pages(page, order);
+
+	/*
+	 * Page unpoisoning must happen before memory initialization.
+	 * Otherwise, the poison pattern will be overwritten for __GFP_ZERO
+	 * allocations and the page unpoisoning code will complain.
+	 */
 	kernel_unpoison_pages(page, 1 << order);
-	set_page_owner(page, order, gfp_flags);
 
-	if (!want_init_on_free() && want_init_on_alloc(gfp_flags))
+	/*
+	 * As memory initialization might be integrated into KASAN,
+	 * kasan_alloc_pages and kernel_init_free_pages must be
+	 * kept together to avoid discrepancies in behavior.
+	 */
+	init = !want_init_on_free() && want_init_on_alloc(gfp_flags);
+	kasan_alloc_pages(page, order, init);
+	if (init && !kasan_has_integrated_init())
 		kernel_init_free_pages(page, 1 << order);
+
+	set_page_owner(page, order, gfp_flags);
 }
 
 static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
-- 
cgit v1.2.3


From da844b787245194cfd69f0f1d2fb1dd3640a8a6d Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Thu, 29 Apr 2021 23:00:06 -0700
Subject: kasan, mm: integrate slab init_on_alloc with HW_TAGS

This change uses the previously added memory initialization feature of
HW_TAGS KASAN routines for slab memory when init_on_alloc is enabled.

With this change, memory initialization memset() is no longer called when
both HW_TAGS KASAN and init_on_alloc are enabled.  Instead, memory is
initialized in KASAN runtime.

The memory initialization memset() is moved into slab_post_alloc_hook()
that currently directly follows the initialization loop.  A new argument
is added to slab_post_alloc_hook() that indicates whether to initialize
the memory or not.

To avoid discrepancies with which memory gets initialized that can be
caused by future changes, both KASAN hook and initialization memset() are
put together and a warning comment is added.

Combining setting allocation tags with memory initialization improves
HW_TAGS KASAN performance when init_on_alloc is enabled.

Link: https://lkml.kernel.org/r/c1292aeb5d519da221ec74a0684a949b027d7720.1615296150.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Reviewed-by: Marco Elver <elver@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Branislav Rankov <Branislav.Rankov@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Kevin Brodsky <kevin.brodsky@arm.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Peter Collingbourne <pcc@google.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kasan.h |  8 ++++----
 mm/kasan/common.c     |  4 ++--
 mm/slab.c             | 28 +++++++++++++---------------
 mm/slab.h             | 17 +++++++++++++----
 mm/slub.c             | 27 +++++++++++----------------
 5 files changed, 43 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 30aa2bee8400..629aee484b6c 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -226,12 +226,12 @@ static __always_inline void kasan_slab_free_mempool(void *ptr)
 }
 
 void * __must_check __kasan_slab_alloc(struct kmem_cache *s,
-				       void *object, gfp_t flags);
+				       void *object, gfp_t flags, bool init);
 static __always_inline void * __must_check kasan_slab_alloc(
-				struct kmem_cache *s, void *object, gfp_t flags)
+		struct kmem_cache *s, void *object, gfp_t flags, bool init)
 {
 	if (kasan_enabled())
-		return __kasan_slab_alloc(s, object, flags);
+		return __kasan_slab_alloc(s, object, flags, init);
 	return object;
 }
 
@@ -320,7 +320,7 @@ static inline bool kasan_slab_free(struct kmem_cache *s, void *object)
 static inline void kasan_kfree_large(void *ptr) {}
 static inline void kasan_slab_free_mempool(void *ptr) {}
 static inline void *kasan_slab_alloc(struct kmem_cache *s, void *object,
-				   gfp_t flags)
+				   gfp_t flags, bool init)
 {
 	return object;
 }
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index efe58e58cc93..ac0d4ed9c921 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -428,7 +428,7 @@ static void set_alloc_info(struct kmem_cache *cache, void *object,
 }
 
 void * __must_check __kasan_slab_alloc(struct kmem_cache *cache,
-					void *object, gfp_t flags)
+					void *object, gfp_t flags, bool init)
 {
 	u8 tag;
 	void *tagged_object;
@@ -453,7 +453,7 @@ void * __must_check __kasan_slab_alloc(struct kmem_cache *cache,
 	 * Unpoison the whole object.
 	 * For kmalloc() allocations, kasan_kmalloc() will do precise poisoning.
 	 */
-	kasan_unpoison(tagged_object, cache->object_size, false);
+	kasan_unpoison(tagged_object, cache->object_size, init);
 
 	/* Save alloc info (if possible) for non-kmalloc() allocations. */
 	if (kasan_stack_collection_enabled())
diff --git a/mm/slab.c b/mm/slab.c
index 4e212cda8693..84f183e9b31a 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3216,6 +3216,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_
 	void *ptr;
 	int slab_node = numa_mem_id();
 	struct obj_cgroup *objcg = NULL;
+	bool init = false;
 
 	flags &= gfp_allowed_mask;
 	cachep = slab_pre_alloc_hook(cachep, &objcg, 1, flags);
@@ -3254,12 +3255,10 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_
   out:
 	local_irq_restore(save_flags);
 	ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
-
-	if (unlikely(slab_want_init_on_alloc(flags, cachep)) && ptr)
-		memset(ptr, 0, cachep->object_size);
+	init = slab_want_init_on_alloc(flags, cachep);
 
 out_hooks:
-	slab_post_alloc_hook(cachep, objcg, flags, 1, &ptr);
+	slab_post_alloc_hook(cachep, objcg, flags, 1, &ptr, init);
 	return ptr;
 }
 
@@ -3301,6 +3300,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, size_t orig_size, unsigned lo
 	unsigned long save_flags;
 	void *objp;
 	struct obj_cgroup *objcg = NULL;
+	bool init = false;
 
 	flags &= gfp_allowed_mask;
 	cachep = slab_pre_alloc_hook(cachep, &objcg, 1, flags);
@@ -3317,12 +3317,10 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, size_t orig_size, unsigned lo
 	local_irq_restore(save_flags);
 	objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
 	prefetchw(objp);
-
-	if (unlikely(slab_want_init_on_alloc(flags, cachep)) && objp)
-		memset(objp, 0, cachep->object_size);
+	init = slab_want_init_on_alloc(flags, cachep);
 
 out:
-	slab_post_alloc_hook(cachep, objcg, flags, 1, &objp);
+	slab_post_alloc_hook(cachep, objcg, flags, 1, &objp, init);
 	return objp;
 }
 
@@ -3542,18 +3540,18 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 
 	cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_);
 
-	/* Clear memory outside IRQ disabled section */
-	if (unlikely(slab_want_init_on_alloc(flags, s)))
-		for (i = 0; i < size; i++)
-			memset(p[i], 0, s->object_size);
-
-	slab_post_alloc_hook(s, objcg, flags, size, p);
+	/*
+	 * memcg and kmem_cache debug support and memory initialization.
+	 * Done outside of the IRQ disabled section.
+	 */
+	slab_post_alloc_hook(s, objcg, flags, size, p,
+				slab_want_init_on_alloc(flags, s));
 	/* FIXME: Trace call missing. Christoph would like a bulk variant */
 	return size;
 error:
 	local_irq_enable();
 	cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_);
-	slab_post_alloc_hook(s, objcg, flags, i, p);
+	slab_post_alloc_hook(s, objcg, flags, i, p, false);
 	__kmem_cache_free_bulk(s, i, p);
 	return 0;
 }
diff --git a/mm/slab.h b/mm/slab.h
index c30ed35b3d5d..18c1927cd196 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -506,15 +506,24 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
 }
 
 static inline void slab_post_alloc_hook(struct kmem_cache *s,
-					struct obj_cgroup *objcg,
-					gfp_t flags, size_t size, void **p)
+					struct obj_cgroup *objcg, gfp_t flags,
+					size_t size, void **p, bool init)
 {
 	size_t i;
 
 	flags &= gfp_allowed_mask;
+
+	/*
+	 * As memory initialization might be integrated into KASAN,
+	 * kasan_slab_alloc and initialization memset must be
+	 * kept together to avoid discrepancies in behavior.
+	 *
+	 * As p[i] might get tagged, memset and kmemleak hook come after KASAN.
+	 */
 	for (i = 0; i < size; i++) {
-		p[i] = kasan_slab_alloc(s, p[i], flags);
-		/* As p[i] might get tagged, call kmemleak hook after KASAN. */
+		p[i] = kasan_slab_alloc(s, p[i], flags, init);
+		if (p[i] && init && !kasan_has_integrated_init())
+			memset(p[i], 0, s->object_size);
 		kmemleak_alloc_recursive(p[i], s->object_size, 1,
 					 s->flags, flags);
 	}
diff --git a/mm/slub.c b/mm/slub.c
index a178c738fc92..5cf35250f20c 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2823,6 +2823,7 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
 	struct page *page;
 	unsigned long tid;
 	struct obj_cgroup *objcg = NULL;
+	bool init = false;
 
 	s = slab_pre_alloc_hook(s, &objcg, 1, gfpflags);
 	if (!s)
@@ -2900,12 +2901,10 @@ redo:
 	}
 
 	maybe_wipe_obj_freeptr(s, object);
-
-	if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object)
-		memset(kasan_reset_tag(object), 0, s->object_size);
+	init = slab_want_init_on_alloc(gfpflags, s);
 
 out:
-	slab_post_alloc_hook(s, objcg, gfpflags, 1, &object);
+	slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init);
 
 	return object;
 }
@@ -3357,20 +3356,16 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 	c->tid = next_tid(c->tid);
 	local_irq_enable();
 
-	/* Clear memory outside IRQ disabled fastpath loop */
-	if (unlikely(slab_want_init_on_alloc(flags, s))) {
-		int j;
-
-		for (j = 0; j < i; j++)
-			memset(kasan_reset_tag(p[j]), 0, s->object_size);
-	}
-
-	/* memcg and kmem_cache debug support */
-	slab_post_alloc_hook(s, objcg, flags, size, p);
+	/*
+	 * memcg and kmem_cache debug support and memory initialization.
+	 * Done outside of the IRQ disabled fastpath loop.
+	 */
+	slab_post_alloc_hook(s, objcg, flags, size, p,
+				slab_want_init_on_alloc(flags, s));
 	return i;
 error:
 	local_irq_enable();
-	slab_post_alloc_hook(s, objcg, flags, i, p);
+	slab_post_alloc_hook(s, objcg, flags, i, p, false);
 	__kmem_cache_free_bulk(s, i, p);
 	return 0;
 }
@@ -3580,7 +3575,7 @@ static void early_kmem_cache_node_alloc(int node)
 	init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
 	init_tracking(kmem_cache_node, n);
 #endif
-	n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL);
+	n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false);
 	page->freelist = get_freepointer(kmem_cache_node, n);
 	page->inuse = 1;
 	page->frozen = 0;
-- 
cgit v1.2.3


From d57a964e09c22441e9fb497d1d7a5c1983a5d1fb Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Thu, 29 Apr 2021 23:00:09 -0700
Subject: kasan, mm: integrate slab init_on_free with HW_TAGS

This change uses the previously added memory initialization feature of
HW_TAGS KASAN routines for slab memory when init_on_free is enabled.

With this change, memory initialization memset() is no longer called when
both HW_TAGS KASAN and init_on_free are enabled.  Instead, memory is
initialized in KASAN runtime.

For SLUB, the memory initialization memset() is moved into
slab_free_hook() that currently directly follows the initialization loop.
A new argument is added to slab_free_hook() that indicates whether to
initialize the memory or not.

To avoid discrepancies in which memory gets initialized that future
changes might introduce, the KASAN hook and the initialization memset()
are kept together and a warning comment is added.

Combining setting allocation tags with memory initialization improves
HW_TAGS KASAN performance when init_on_free is enabled.

Link: https://lkml.kernel.org/r/190fd15c1886654afdec0d19ebebd5ade665b601.1615296150.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Reviewed-by: Marco Elver <elver@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Branislav Rankov <Branislav.Rankov@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Kevin Brodsky <kevin.brodsky@arm.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Peter Collingbourne <pcc@google.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kasan.h | 10 ++++++----
 mm/kasan/common.c     | 13 +++++++------
 mm/slab.c             | 15 +++++++++++----
 mm/slub.c             | 43 ++++++++++++++++++++++++-------------------
 4 files changed, 48 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 629aee484b6c..b1678a61e6a7 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -203,11 +203,13 @@ static __always_inline void * __must_check kasan_init_slab_obj(
 	return (void *)object;
 }
 
-bool __kasan_slab_free(struct kmem_cache *s, void *object, unsigned long ip);
-static __always_inline bool kasan_slab_free(struct kmem_cache *s, void *object)
+bool __kasan_slab_free(struct kmem_cache *s, void *object,
+			unsigned long ip, bool init);
+static __always_inline bool kasan_slab_free(struct kmem_cache *s,
+						void *object, bool init)
 {
 	if (kasan_enabled())
-		return __kasan_slab_free(s, object, _RET_IP_);
+		return __kasan_slab_free(s, object, _RET_IP_, init);
 	return false;
 }
 
@@ -313,7 +315,7 @@ static inline void *kasan_init_slab_obj(struct kmem_cache *cache,
 {
 	return (void *)object;
 }
-static inline bool kasan_slab_free(struct kmem_cache *s, void *object)
+static inline bool kasan_slab_free(struct kmem_cache *s, void *object, bool init)
 {
 	return false;
 }
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index ac0d4ed9c921..6bb87f2acd4e 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -322,8 +322,8 @@ void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache,
 	return (void *)object;
 }
 
-static inline bool ____kasan_slab_free(struct kmem_cache *cache,
-				void *object, unsigned long ip, bool quarantine)
+static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object,
+				unsigned long ip, bool quarantine, bool init)
 {
 	u8 tag;
 	void *tagged_object;
@@ -351,7 +351,7 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache,
 	}
 
 	kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE),
-			KASAN_KMALLOC_FREE, false);
+			KASAN_KMALLOC_FREE, init);
 
 	if ((IS_ENABLED(CONFIG_KASAN_GENERIC) && !quarantine))
 		return false;
@@ -362,9 +362,10 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache,
 	return kasan_quarantine_put(cache, object);
 }
 
-bool __kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip)
+bool __kasan_slab_free(struct kmem_cache *cache, void *object,
+				unsigned long ip, bool init)
 {
-	return ____kasan_slab_free(cache, object, ip, true);
+	return ____kasan_slab_free(cache, object, ip, true, init);
 }
 
 static inline bool ____kasan_kfree_large(void *ptr, unsigned long ip)
@@ -409,7 +410,7 @@ void __kasan_slab_free_mempool(void *ptr, unsigned long ip)
 			return;
 		kasan_poison(ptr, page_size(page), KASAN_FREE_PAGE, false);
 	} else {
-		____kasan_slab_free(page->slab_cache, ptr, ip, false);
+		____kasan_slab_free(page->slab_cache, ptr, ip, false, false);
 	}
 }
 
diff --git a/mm/slab.c b/mm/slab.c
index 84f183e9b31a..df45c437b394 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3425,17 +3425,24 @@ free_done:
 static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp,
 					 unsigned long caller)
 {
+	bool init;
+
 	if (is_kfence_address(objp)) {
 		kmemleak_free_recursive(objp, cachep->flags);
 		__kfence_free(objp);
 		return;
 	}
 
-	if (unlikely(slab_want_init_on_free(cachep)))
+	/*
+	 * As memory initialization might be integrated into KASAN,
+	 * kasan_slab_free and initialization memset must be
+	 * kept together to avoid discrepancies in behavior.
+	 */
+	init = slab_want_init_on_free(cachep);
+	if (init && !kasan_has_integrated_init())
 		memset(objp, 0, cachep->object_size);
-
-	/* Put the object into the quarantine, don't touch it for now. */
-	if (kasan_slab_free(cachep, objp))
+	/* KASAN might put objp into memory quarantine, delaying its reuse. */
+	if (kasan_slab_free(cachep, objp, init))
 		return;
 
 	/* Use KCSAN to help debug racy use-after-free. */
diff --git a/mm/slub.c b/mm/slub.c
index 5cf35250f20c..68123b21e65f 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1533,7 +1533,8 @@ static __always_inline void kfree_hook(void *x)
 	kasan_kfree_large(x);
 }
 
-static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x)
+static __always_inline bool slab_free_hook(struct kmem_cache *s,
+						void *x, bool init)
 {
 	kmemleak_free_recursive(x, s->flags);
 
@@ -1559,8 +1560,25 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x)
 		__kcsan_check_access(x, s->object_size,
 				     KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);
 
-	/* KASAN might put x into memory quarantine, delaying its reuse */
-	return kasan_slab_free(s, x);
+	/*
+	 * As memory initialization might be integrated into KASAN,
+	 * kasan_slab_free and initialization memset's must be
+	 * kept together to avoid discrepancies in behavior.
+	 *
+	 * The initialization memset's clear the object and the metadata,
+	 * but don't touch the SLAB redzone.
+	 */
+	if (init) {
+		int rsize;
+
+		if (!kasan_has_integrated_init())
+			memset(kasan_reset_tag(x), 0, s->object_size);
+		rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad : 0;
+		memset((char *)kasan_reset_tag(x) + s->inuse, 0,
+		       s->size - s->inuse - rsize);
+	}
+	/* KASAN might put x into memory quarantine, delaying its reuse. */
+	return kasan_slab_free(s, x, init);
 }
 
 static inline bool slab_free_freelist_hook(struct kmem_cache *s,
@@ -1570,10 +1588,9 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s,
 	void *object;
 	void *next = *head;
 	void *old_tail = *tail ? *tail : *head;
-	int rsize;
 
 	if (is_kfence_address(next)) {
-		slab_free_hook(s, next);
+		slab_free_hook(s, next, false);
 		return true;
 	}
 
@@ -1585,20 +1602,8 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s,
 		object = next;
 		next = get_freepointer(s, object);
 
-		if (slab_want_init_on_free(s)) {
-			/*
-			 * Clear the object and the metadata, but don't touch
-			 * the redzone.
-			 */
-			memset(kasan_reset_tag(object), 0, s->object_size);
-			rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad
-							   : 0;
-			memset((char *)kasan_reset_tag(object) + s->inuse, 0,
-			       s->size - s->inuse - rsize);
-
-		}
 		/* If object's reuse doesn't have to be delayed */
-		if (!slab_free_hook(s, object)) {
+		if (!slab_free_hook(s, object, slab_want_init_on_free(s))) {
 			/* Move object to the new freelist */
 			set_freepointer(s, object, *head);
 			*head = object;
@@ -3236,7 +3241,7 @@ int build_detached_freelist(struct kmem_cache *s, size_t size,
 	}
 
 	if (is_kfence_address(object)) {
-		slab_free_hook(df->s, object);
+		slab_free_hook(df->s, object, false);
 		__kfence_free(object);
 		p[size] = NULL; /* mark object processed */
 		return size;
-- 
cgit v1.2.3


From 1f9d03c5e999ed5a57fa4d8aec9fdf67a6234b80 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Thu, 29 Apr 2021 23:00:55 -0700
Subject: mm: move mem_init_print_info() into mm_init()

mem_init_print_info() is called from mem_init() on each architecture,
always with a NULL argument, so make it take no argument and move the
call into mm_init().

Link: https://lkml.kernel.org/r/20210317015210.33641-1-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>	[x86]
Reviewed-by: Christophe Leroy <christophe.leroy@c-s.fr>	[powerpc]
Acked-by: David Hildenbrand <david@redhat.com>
Tested-by: Anatoly Pugachev <matorola@gmail.com>	[sparc64]
Acked-by: Russell King <rmk+kernel@armlinux.org.uk>	[arm]
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Guo Ren <guoren@kernel.org>
Cc: Yoshinori Sato <ysato@users.osdn.me>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Jonas Bonn <jonas@southpole.se>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: "Peter Zijlstra" <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/mm/init.c             |  1 -
 arch/arc/mm/init.c               |  1 -
 arch/arm/mm/init.c               |  2 --
 arch/arm64/mm/init.c             |  2 --
 arch/csky/mm/init.c              |  1 -
 arch/h8300/mm/init.c             |  2 --
 arch/hexagon/mm/init.c           |  1 -
 arch/ia64/mm/init.c              |  1 -
 arch/m68k/mm/init.c              |  1 -
 arch/microblaze/mm/init.c        |  1 -
 arch/mips/loongson64/numa.c      |  1 -
 arch/mips/mm/init.c              |  1 -
 arch/mips/sgi-ip27/ip27-memory.c |  1 -
 arch/nds32/mm/init.c             |  1 -
 arch/nios2/mm/init.c             |  1 -
 arch/openrisc/mm/init.c          |  2 --
 arch/parisc/mm/init.c            |  2 --
 arch/powerpc/mm/mem.c            |  1 -
 arch/riscv/mm/init.c             |  1 -
 arch/s390/mm/init.c              |  2 --
 arch/sh/mm/init.c                |  1 -
 arch/sparc/mm/init_32.c          |  2 --
 arch/sparc/mm/init_64.c          |  1 -
 arch/um/kernel/mem.c             |  1 -
 arch/x86/mm/init_32.c            |  2 --
 arch/x86/mm/init_64.c            |  2 --
 arch/xtensa/mm/init.c            |  1 -
 include/linux/mm.h               |  2 +-
 init/main.c                      |  1 +
 mm/page_alloc.c                  | 10 +++++-----
 30 files changed, 7 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c
index 3c42b3147fd6..a97650a618f1 100644
--- a/arch/alpha/mm/init.c
+++ b/arch/alpha/mm/init.c
@@ -282,5 +282,4 @@ mem_init(void)
 	set_max_mapnr(max_low_pfn);
 	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
 	memblock_free_all();
-	mem_init_print_info(NULL);
 }
diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c
index ce07e697916c..33832e36bdb7 100644
--- a/arch/arc/mm/init.c
+++ b/arch/arc/mm/init.c
@@ -194,7 +194,6 @@ void __init mem_init(void)
 {
 	memblock_free_all();
 	highmem_init();
-	mem_init_print_info(NULL);
 }
 
 #ifdef CONFIG_HIGHMEM
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index 828a2561b229..7022b7b5c400 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -316,8 +316,6 @@ void __init mem_init(void)
 
 	free_highpages();
 
-	mem_init_print_info(NULL);
-
 	/*
 	 * Check boundaries twice: Some fundamental inconsistencies can
 	 * be detected at build time already.
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 470f92e6a542..ef031511ce29 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -491,8 +491,6 @@ void __init mem_init(void)
 	/* this will put all unused low memory onto the freelists */
 	memblock_free_all();
 
-	mem_init_print_info(NULL);
-
 	/*
 	 * Check boundaries twice: Some fundamental inconsistencies can be
 	 * detected at build time already.
diff --git a/arch/csky/mm/init.c b/arch/csky/mm/init.c
index 894050a8ce09..bf2004aa811a 100644
--- a/arch/csky/mm/init.c
+++ b/arch/csky/mm/init.c
@@ -107,7 +107,6 @@ void __init mem_init(void)
 			free_highmem_page(page);
 	}
 #endif
-	mem_init_print_info(NULL);
 }
 
 void free_initmem(void)
diff --git a/arch/h8300/mm/init.c b/arch/h8300/mm/init.c
index 1f3b345d68b9..f7bf4693e3b2 100644
--- a/arch/h8300/mm/init.c
+++ b/arch/h8300/mm/init.c
@@ -98,6 +98,4 @@ void __init mem_init(void)
 
 	/* this will put all low memory onto the freelists */
 	memblock_free_all();
-
-	mem_init_print_info(NULL);
 }
diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c
index f2e6c868e477..f01e91e10d95 100644
--- a/arch/hexagon/mm/init.c
+++ b/arch/hexagon/mm/init.c
@@ -55,7 +55,6 @@ void __init mem_init(void)
 {
 	/*  No idea where this is actually declared.  Seems to evade LXR.  */
 	memblock_free_all();
-	mem_init_print_info(NULL);
 
 	/*
 	 *  To-Do:  someone somewhere should wipe out the bootmem map
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 97a13eda81bf..064a967a7b6e 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -449,7 +449,6 @@ mem_init (void)
 	set_max_mapnr(max_low_pfn);
 	high_memory = __va(max_low_pfn * PAGE_SIZE);
 	memblock_free_all();
-	mem_init_print_info(NULL);
 
 	/*
 	 * For fsyscall entrpoints with no light-weight handler, use the ordinary
diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c
index 14c1e541451c..1759ab875d47 100644
--- a/arch/m68k/mm/init.c
+++ b/arch/m68k/mm/init.c
@@ -153,5 +153,4 @@ void __init mem_init(void)
 	/* this will put all memory onto the freelists */
 	memblock_free_all();
 	init_pointer_tables();
-	mem_init_print_info(NULL);
 }
diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c
index 05cf1fb3f5ff..ab55c70380a5 100644
--- a/arch/microblaze/mm/init.c
+++ b/arch/microblaze/mm/init.c
@@ -131,7 +131,6 @@ void __init mem_init(void)
 	highmem_setup();
 #endif
 
-	mem_init_print_info(NULL);
 	mem_init_done = 1;
 }
 
diff --git a/arch/mips/loongson64/numa.c b/arch/mips/loongson64/numa.c
index 8315c871c435..fa9b4a487a47 100644
--- a/arch/mips/loongson64/numa.c
+++ b/arch/mips/loongson64/numa.c
@@ -178,7 +178,6 @@ void __init mem_init(void)
 	high_memory = (void *) __va(get_num_physpages() << PAGE_SHIFT);
 	memblock_free_all();
 	setup_zero_pages();	/* This comes from node 0 */
-	mem_init_print_info(NULL);
 }
 
 /* All PCI device belongs to logical Node-0 */
diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c
index 5cb73bf74a8b..c36358758969 100644
--- a/arch/mips/mm/init.c
+++ b/arch/mips/mm/init.c
@@ -467,7 +467,6 @@ void __init mem_init(void)
 	memblock_free_all();
 	setup_zero_pages();	/* Setup zeroed pages.  */
 	mem_init_free_highmem();
-	mem_init_print_info(NULL);
 
 #ifdef CONFIG_64BIT
 	if ((unsigned long) &_text > (unsigned long) CKSEG0)
diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c
index 87bb6945ec25..6173684b5aaa 100644
--- a/arch/mips/sgi-ip27/ip27-memory.c
+++ b/arch/mips/sgi-ip27/ip27-memory.c
@@ -420,5 +420,4 @@ void __init mem_init(void)
 	high_memory = (void *) __va(get_num_physpages() << PAGE_SHIFT);
 	memblock_free_all();
 	setup_zero_pages();	/* This comes from node 0 */
-	mem_init_print_info(NULL);
 }
diff --git a/arch/nds32/mm/init.c b/arch/nds32/mm/init.c
index fa86f7b2f416..f63f839738c4 100644
--- a/arch/nds32/mm/init.c
+++ b/arch/nds32/mm/init.c
@@ -191,7 +191,6 @@ void __init mem_init(void)
 
 	/* this will put all low memory onto the freelists */
 	memblock_free_all();
-	mem_init_print_info(NULL);
 
 	pr_info("virtual kernel memory layout:\n"
 		"    fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n"
diff --git a/arch/nios2/mm/init.c b/arch/nios2/mm/init.c
index 61862dbb0e32..613fcaa5988a 100644
--- a/arch/nios2/mm/init.c
+++ b/arch/nios2/mm/init.c
@@ -71,7 +71,6 @@ void __init mem_init(void)
 
 	/* this will put all memory onto the freelists */
 	memblock_free_all();
-	mem_init_print_info(NULL);
 }
 
 void __init mmu_init(void)
diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c
index bf9b2310fc93..d5641198b90c 100644
--- a/arch/openrisc/mm/init.c
+++ b/arch/openrisc/mm/init.c
@@ -211,8 +211,6 @@ void __init mem_init(void)
 	/* this will put all low memory onto the freelists */
 	memblock_free_all();
 
-	mem_init_print_info(NULL);
-
 	printk("mem_init_done ...........................................\n");
 	mem_init_done = 1;
 	return;
diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c
index 9ca4e4ff6895..591a4e939415 100644
--- a/arch/parisc/mm/init.c
+++ b/arch/parisc/mm/init.c
@@ -573,8 +573,6 @@ void __init mem_init(void)
 #endif
 		parisc_vmalloc_start = SET_MAP_OFFSET(MAP_START);
 
-	mem_init_print_info(NULL);
-
 #if 0
 	/*
 	 * Do not expose the virtual kernel memory layout to userspace.
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 4e8ce6d85232..7e11c4cb08b8 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -312,7 +312,6 @@ void __init mem_init(void)
 		(mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) - 1;
 #endif
 
-	mem_init_print_info(NULL);
 #ifdef CONFIG_PPC32
 	pr_info("Kernel virtual memory layout:\n");
 #ifdef CONFIG_KASAN
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 067583ab1bd7..92e39cfa5227 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -102,7 +102,6 @@ void __init mem_init(void)
 	high_memory = (void *)(__va(PFN_PHYS(max_low_pfn)));
 	memblock_free_all();
 
-	mem_init_print_info(NULL);
 	print_vm_layout();
 }
 
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 0e76b2127dc6..8ac710de1ab1 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -209,8 +209,6 @@ void __init mem_init(void)
 	setup_zero_pages();	/* Setup zeroed pages. */
 
 	cmma_init_nodat();
-
-	mem_init_print_info(NULL);
 }
 
 void free_initmem(void)
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index 0db6919af8d3..168d7d4dd735 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -359,7 +359,6 @@ void __init mem_init(void)
 
 	vsyscall_init();
 
-	mem_init_print_info(NULL);
 	pr_info("virtual kernel memory layout:\n"
 		"    fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n"
 		"    vmalloc : 0x%08lx - 0x%08lx   (%4ld MB)\n"
diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c
index 6139c5700ccc..1e9f577f084d 100644
--- a/arch/sparc/mm/init_32.c
+++ b/arch/sparc/mm/init_32.c
@@ -292,8 +292,6 @@ void __init mem_init(void)
 
 		map_high_region(start_pfn, end_pfn);
 	}
-
-	mem_init_print_info(NULL);
 }
 
 void sparc_flush_page_to_ram(struct page *page)
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 182bb7bdaa0a..e454f179cf5d 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2520,7 +2520,6 @@ void __init mem_init(void)
 	}
 	mark_page_reserved(mem_map_zero);
 
-	mem_init_print_info(NULL);
 
 	if (tlb_type == cheetah || tlb_type == cheetah_plus)
 		cheetah_ecache_flush_init();
diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c
index 9242dc91d751..9019ff5905b1 100644
--- a/arch/um/kernel/mem.c
+++ b/arch/um/kernel/mem.c
@@ -54,7 +54,6 @@ void __init mem_init(void)
 	memblock_free_all();
 	max_low_pfn = totalram_pages();
 	max_pfn = max_low_pfn;
-	mem_init_print_info(NULL);
 	kmalloc_ok = 1;
 }
 
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index da31c2635ee4..21ffb03f6c72 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -755,8 +755,6 @@ void __init mem_init(void)
 	after_bootmem = 1;
 	x86_init.hyper.init_after_bootmem();
 
-	mem_init_print_info(NULL);
-
 	/*
 	 * Check boundaries twice: Some fundamental inconsistencies can
 	 * be detected at build time already.
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 58ae2f746b3e..e527d829e1ed 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1351,8 +1351,6 @@ void __init mem_init(void)
 		kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, PAGE_SIZE, KCORE_USER);
 
 	preallocate_vmalloc_pages();
-
-	mem_init_print_info(NULL);
 }
 
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c
index 2daeba9e454e..6a32b2cf2718 100644
--- a/arch/xtensa/mm/init.c
+++ b/arch/xtensa/mm/init.c
@@ -119,7 +119,6 @@ void __init mem_init(void)
 
 	memblock_free_all();
 
-	mem_init_print_info(NULL);
 	pr_info("virtual kernel memory layout:\n"
 #ifdef CONFIG_KASAN
 		"    kasan   : 0x%08lx - 0x%08lx  (%5lu MB)\n"
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 64b2e3a0b94d..011f43605807 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2360,7 +2360,7 @@ extern unsigned long free_reserved_area(void *start, void *end,
 					int poison, const char *s);
 
 extern void adjust_managed_page_count(struct page *page, long count);
-extern void mem_init_print_info(const char *str);
+extern void mem_init_print_info(void);
 
 extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end);
 
diff --git a/init/main.c b/init/main.c
index ae96c79ad2d3..dd11bfd10ead 100644
--- a/init/main.c
+++ b/init/main.c
@@ -830,6 +830,7 @@ static void __init mm_init(void)
 	report_meminit();
 	stack_depot_init();
 	mem_init();
+	mem_init_print_info();
 	/* page_owner must be initialized after buddy is ready */
 	page_ext_init_flatmem_late();
 	kmem_cache_init();
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6314de5387f5..b55073d0e84a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7734,7 +7734,7 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char
 	return pages;
 }
 
-void __init mem_init_print_info(const char *str)
+void __init mem_init_print_info(void)
 {
 	unsigned long physpages, codesize, datasize, rosize, bss_size;
 	unsigned long init_code_size, init_data_size;
@@ -7773,17 +7773,17 @@ void __init mem_init_print_info(const char *str)
 #ifdef	CONFIG_HIGHMEM
 		", %luK highmem"
 #endif
-		"%s%s)\n",
+		")\n",
 		nr_free_pages() << (PAGE_SHIFT - 10),
 		physpages << (PAGE_SHIFT - 10),
 		codesize >> 10, datasize >> 10, rosize >> 10,
 		(init_data_size + init_code_size) >> 10, bss_size >> 10,
 		(physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10),
-		totalcma_pages << (PAGE_SHIFT - 10),
+		totalcma_pages << (PAGE_SHIFT - 10)
 #ifdef	CONFIG_HIGHMEM
-		totalhigh_pages() << (PAGE_SHIFT - 10),
+		, totalhigh_pages() << (PAGE_SHIFT - 10)
 #endif
-		str ? ", " : "", str ? str : "");
+		);
 }
 
 /**
-- 
cgit v1.2.3


From f73c6c8805ed0762d99122d5332fcf42b0c8fbb8 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Thu, 29 Apr 2021 23:01:04 -0700
Subject: include/linux/page-flags-layout.h: correctly determine
 LAST_CPUPID_WIDTH

The naming convention used in include/linux/page-flags-layout.h:
  *_SHIFT: the number of bits trying to allocate
  *_WIDTH: the number of bits successfully allocated

So when it comes to LAST_CPUPID_WIDTH, we need to check whether all
previous *_WIDTH and LAST_CPUPID_SHIFT can fit into page flags. This
means we need to use NODES_WIDTH, not NODES_SHIFT.

Link: https://lkml.kernel.org/r/20210303071609.797782-1-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page-flags-layout.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h
index 7d4ec26d8a3e..295c2c687d2c 100644
--- a/include/linux/page-flags-layout.h
+++ b/include/linux/page-flags-layout.h
@@ -83,7 +83,7 @@
 #define KASAN_TAG_WIDTH 0
 #endif
 
-#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_CPUPID_SHIFT+KASAN_TAG_WIDTH \
+#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_WIDTH+LAST_CPUPID_SHIFT+KASAN_TAG_WIDTH \
 	<= BITS_PER_LONG - NR_PAGEFLAGS
 #define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT
 #else
-- 
cgit v1.2.3


From 1587db62d8c0dbd943752f657b452213e1c4d8d4 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Thu, 29 Apr 2021 23:01:07 -0700
Subject: include/linux/page-flags-layout.h: cleanups

Tidy things up and delete comments stating the obvious with typos or
making no sense.

Link: https://lkml.kernel.org/r/20210303071609.797782-2-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page-flags-layout.h | 62 ++++++++++++++++++---------------------
 mm/mm_init.c                      |  4 ---
 2 files changed, 29 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h
index 295c2c687d2c..ef1e3e736e14 100644
--- a/include/linux/page-flags-layout.h
+++ b/include/linux/page-flags-layout.h
@@ -21,16 +21,17 @@
 #elif MAX_NR_ZONES <= 8
 #define ZONES_SHIFT 3
 #else
-#error ZONES_SHIFT -- too many zones configured adjust calculation
+#error ZONES_SHIFT "Too many zones configured"
 #endif
 
+#define ZONES_WIDTH		ZONES_SHIFT
+
 #ifdef CONFIG_SPARSEMEM
 #include <asm/sparsemem.h>
-
-/* SECTION_SHIFT	#bits space required to store a section # */
 #define SECTIONS_SHIFT	(MAX_PHYSMEM_BITS - SECTION_SIZE_BITS)
-
-#endif /* CONFIG_SPARSEMEM */
+#else
+#define SECTIONS_SHIFT	0
+#endif
 
 #ifndef BUILD_VDSO32_64
 /*
@@ -54,17 +55,28 @@
 #define SECTIONS_WIDTH		0
 #endif
 
-#define ZONES_WIDTH		ZONES_SHIFT
-
-#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
+#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
 #define NODES_WIDTH		NODES_SHIFT
-#else
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
+#elif defined(CONFIG_SPARSEMEM_VMEMMAP)
 #error "Vmemmap: No space for nodes field in page flags"
-#endif
+#else
 #define NODES_WIDTH		0
 #endif
 
+/*
+ * Note that this #define MUST have a value so that it can be tested with
+ * the IS_ENABLED() macro.
+ */
+#if NODES_SHIFT != 0 && NODES_WIDTH == 0
+#define NODE_NOT_IN_PAGE_FLAGS	1
+#endif
+
+#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
+#define KASAN_TAG_WIDTH 8
+#else
+#define KASAN_TAG_WIDTH 0
+#endif
+
 #ifdef CONFIG_NUMA_BALANCING
 #define LAST__PID_SHIFT 8
 #define LAST__PID_MASK  ((1 << LAST__PID_SHIFT)-1)
@@ -77,36 +89,20 @@
 #define LAST_CPUPID_SHIFT 0
 #endif
 
-#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
-#define KASAN_TAG_WIDTH 8
-#else
-#define KASAN_TAG_WIDTH 0
-#endif
-
-#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_WIDTH+LAST_CPUPID_SHIFT+KASAN_TAG_WIDTH \
+#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \
 	<= BITS_PER_LONG - NR_PAGEFLAGS
 #define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT
 #else
 #define LAST_CPUPID_WIDTH 0
 #endif
 
-#if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH+LAST_CPUPID_WIDTH+KASAN_TAG_WIDTH \
-	> BITS_PER_LONG - NR_PAGEFLAGS
-#error "Not enough bits in page flags"
-#endif
-
-/*
- * We are going to use the flags for the page to node mapping if its in
- * there.  This includes the case where there is no node, so it is implicit.
- * Note that this #define MUST have a value so that it can be tested with
- * the IS_ENABLED() macro.
- */
-#if !(NODES_WIDTH > 0 || NODES_SHIFT == 0)
-#define NODE_NOT_IN_PAGE_FLAGS 1
+#if LAST_CPUPID_SHIFT != 0 && LAST_CPUPID_WIDTH == 0
+#define LAST_CPUPID_NOT_IN_PAGE_FLAGS
 #endif
 
-#if defined(CONFIG_NUMA_BALANCING) && LAST_CPUPID_WIDTH == 0
-#define LAST_CPUPID_NOT_IN_PAGE_FLAGS
+#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \
+	> BITS_PER_LONG - NR_PAGEFLAGS
+#error "Not enough bits in page flags"
 #endif
 
 #endif
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 8e02e865cc65..9ddaf0e1b0ab 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -19,10 +19,6 @@
 #ifdef CONFIG_DEBUG_MEMORY_INIT
 int __meminitdata mminit_loglevel;
 
-#ifndef SECTIONS_SHIFT
-#define SECTIONS_SHIFT	0
-#endif
-
 /* The zonelists are simply reported, validation is manual. */
 void __init mminit_verify_zonelist(void)
 {
-- 
cgit v1.2.3


From 84172f4bb752424415756351a40f8da5714e1554 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 29 Apr 2021 23:01:15 -0700
Subject: mm/page_alloc: combine __alloc_pages and __alloc_pages_nodemask

There are only two callers of __alloc_pages() so prune the thicket of
alloc_page variants by combining the two functions together.  Current
callers of __alloc_pages() simply add an extra 'NULL' parameter and
current callers of __alloc_pages_nodemask() call __alloc_pages() instead.

Link: https://lkml.kernel.org/r/20210225150642.2582252-4-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/admin-guide/mm/transhuge.rst |  2 +-
 include/linux/gfp.h                        | 13 +++----------
 mm/hugetlb.c                               |  2 +-
 mm/internal.h                              |  4 ++--
 mm/mempolicy.c                             |  6 +++---
 mm/migrate.c                               |  2 +-
 mm/page_alloc.c                            |  5 ++---
 7 files changed, 13 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 3b8a336511a4..c9c37f16eef8 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -402,7 +402,7 @@ compact_fail
 	but failed.
 
 It is possible to establish how long the stalls were using the function
-tracer to record how long was spent in __alloc_pages_nodemask and
+tracer to record how long was spent in __alloc_pages() and
 using the mm_page_alloc tracepoint to identify which allocations were
 for huge pages.
 
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 8572a1474e16..f39de931bdf9 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -515,15 +515,8 @@ static inline int arch_make_page_accessible(struct page *page)
 }
 #endif
 
-struct page *
-__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
-							nodemask_t *nodemask);
-
-static inline struct page *
-__alloc_pages(gfp_t gfp_mask, unsigned int order, int preferred_nid)
-{
-	return __alloc_pages_nodemask(gfp_mask, order, preferred_nid, NULL);
-}
+struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
+		nodemask_t *nodemask);
 
 /*
  * Allocate pages, preferring the node given as nid. The node must be valid and
@@ -535,7 +528,7 @@ __alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
 	VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
 	VM_WARN_ON((gfp_mask & __GFP_THISNODE) && !node_online(nid));
 
-	return __alloc_pages(gfp_mask, order, nid);
+	return __alloc_pages(gfp_mask, order, nid, NULL);
 }
 
 /*
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a86a58ef132d..6c72433bec1e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1616,7 +1616,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
 		gfp_mask |= __GFP_RETRY_MAYFAIL;
 	if (nid == NUMA_NO_NODE)
 		nid = numa_mem_id();
-	page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
+	page = __alloc_pages(gfp_mask, order, nid, nmask);
 	if (page)
 		__count_vm_event(HTLB_BUDDY_PGALLOC);
 	else
diff --git a/mm/internal.h b/mm/internal.h
index 42e30e71554a..ef5f336f59bd 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -145,10 +145,10 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
  * family of functions.
  *
  * nodemask, migratetype and highest_zoneidx are initialized only once in
- * __alloc_pages_nodemask() and then never change.
+ * __alloc_pages() and then never change.
  *
  * zonelist, preferred_zone and highest_zoneidx are set first in
- * __alloc_pages_nodemask() for the fast path, and might be later changed
+ * __alloc_pages() for the fast path, and might be later changed
  * in __alloc_pages_slowpath(). All other functions pass the whole structure
  * by a const pointer.
  */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index ab51132547b8..5f0d20298736 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2140,7 +2140,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
 {
 	struct page *page;
 
-	page = __alloc_pages(gfp, order, nid);
+	page = __alloc_pages(gfp, order, nid, NULL);
 	/* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
 	if (!static_branch_likely(&vm_numa_stat_key))
 		return page;
@@ -2237,7 +2237,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
 
 	nmask = policy_nodemask(gfp, pol);
 	preferred_nid = policy_node(gfp, pol, node);
-	page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
+	page = __alloc_pages(gfp, order, preferred_nid, nmask);
 	mpol_cond_put(pol);
 out:
 	return page;
@@ -2274,7 +2274,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 	if (pol->mode == MPOL_INTERLEAVE)
 		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
 	else
-		page = __alloc_pages_nodemask(gfp, order,
+		page = __alloc_pages(gfp, order,
 				policy_node(gfp, pol, numa_node_id()),
 				policy_nodemask(gfp, pol));
 
diff --git a/mm/migrate.c b/mm/migrate.c
index 62b81d5257aa..47df0df8f21a 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1617,7 +1617,7 @@ struct page *alloc_migration_target(struct page *page, unsigned long private)
 	if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE)
 		gfp_mask |= __GFP_HIGHMEM;
 
-	new_page = __alloc_pages_nodemask(gfp_mask, order, nid, mtc->nmask);
+	new_page = __alloc_pages(gfp_mask, order, nid, mtc->nmask);
 
 	if (new_page && PageTransHuge(new_page))
 		prep_transhuge_page(new_page);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c565ebad02ee..fce4b9180bdb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5013,8 +5013,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
 /*
  * This is the 'heart' of the zoned buddy allocator.
  */
-struct page *
-__alloc_pages_nodemask(gfp_t gfp, unsigned int order, int preferred_nid,
+struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
 							nodemask_t *nodemask)
 {
 	struct page *page;
@@ -5076,7 +5075,7 @@ out:
 
 	return page;
 }
-EXPORT_SYMBOL(__alloc_pages_nodemask);
+EXPORT_SYMBOL(__alloc_pages);
 
 /*
  * Common helper functions. Never use with __GFP_HIGHMEM because the returned
-- 
cgit v1.2.3


From d7f946d0faf90014547ee5d090e9d05018278c7a Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 29 Apr 2021 23:01:18 -0700
Subject: mm/mempolicy: rename alloc_pages_current to alloc_pages

When CONFIG_NUMA is enabled, alloc_pages() is a wrapper around
alloc_pages_current().  This is pointless; just implement alloc_pages()
directly.

Link: https://lkml.kernel.org/r/20210225150642.2582252-5-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h | 8 +-------
 mm/mempolicy.c      | 6 +++---
 2 files changed, 4 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index f39de931bdf9..0a88f84b08f4 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -546,13 +546,7 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
 }
 
 #ifdef CONFIG_NUMA
-extern struct page *alloc_pages_current(gfp_t gfp_mask, unsigned order);
-
-static inline struct page *
-alloc_pages(gfp_t gfp_mask, unsigned int order)
-{
-	return alloc_pages_current(gfp_mask, order);
-}
+struct page *alloc_pages(gfp_t gfp, unsigned int order);
 extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
 			struct vm_area_struct *vma, unsigned long addr,
 			int node, bool hugepage);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 5f0d20298736..c71532b7e3f8 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2245,7 +2245,7 @@ out:
 EXPORT_SYMBOL(alloc_pages_vma);
 
 /**
- * 	alloc_pages_current - Allocate pages.
+ * 	alloc_pages - Allocate pages.
  *
  *	@gfp:
  *		%GFP_USER   user allocation,
@@ -2259,7 +2259,7 @@ EXPORT_SYMBOL(alloc_pages_vma);
  *	interrupt context and apply the current process NUMA policy.
  *	Returns NULL when no page can be allocated.
  */
-struct page *alloc_pages_current(gfp_t gfp, unsigned order)
+struct page *alloc_pages(gfp_t gfp, unsigned order)
 {
 	struct mempolicy *pol = &default_policy;
 	struct page *page;
@@ -2280,7 +2280,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 
 	return page;
 }
-EXPORT_SYMBOL(alloc_pages_current);
+EXPORT_SYMBOL(alloc_pages);
 
 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
 {
-- 
cgit v1.2.3


From 387ba26fb1cb9be9e35dc14a6d97188e916eda05 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 29 Apr 2021 23:01:45 -0700
Subject: mm/page_alloc: add a bulk page allocator

This patch adds a new page allocator interface via alloc_pages_bulk, and
__alloc_pages_bulk.  A caller requests a number of pages to be allocated
and added to a list.

The API is not guaranteed to return the requested number of pages and
may fail if the preferred allocation zone has limited free memory, the
cpuset changes during the allocation or page debugging decides to fail
an allocation.  It's up to the caller to request more pages in batch if
necessary.

Note that this implementation is not very efficient and could be
improved but it would require refactoring.  The intent is to make it
available early to determine what semantics are required by different
callers.  Once the full semantics are nailed down, it can be refactored.

[mgorman@techsingularity.net: fix alloc_pages_bulk() return type, per Matthew]
  Link: https://lkml.kernel.org/r/20210325123713.GQ3697@techsingularity.net
[mgorman@techsingularity.net: fix uninit var warning]
  Link: https://lkml.kernel.org/r/20210330114847.GX3697@techsingularity.net
[mgorman@techsingularity.net: fix comment, per Vlastimil]
  Link: https://lkml.kernel.org/r/20210412110255.GV3697@techsingularity.net

Link: https://lkml.kernel.org/r/20210325114228.27719-3-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Alexander Lobakin <alobakin@pm.me>
Tested-by: Colin Ian King <colin.king@canonical.com>
Cc: Alexander Duyck <alexander.duyck@gmail.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Chuck Lever <chuck.lever@oracle.com>
Cc: David Miller <davem@davemloft.net>
Cc: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h |  11 +++++
 mm/page_alloc.c     | 118 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 129 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 0a88f84b08f4..a2be8f4174a9 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -518,6 +518,17 @@ static inline int arch_make_page_accessible(struct page *page)
 struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
 		nodemask_t *nodemask);
 
+unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
+				nodemask_t *nodemask, int nr_pages,
+				struct list_head *list);
+
+/* Bulk allocate order-0 pages */
+static inline unsigned long
+alloc_pages_bulk(gfp_t gfp, unsigned long nr_pages, struct list_head *list)
+{
+	return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, list);
+}
+
 /*
  * Allocate pages, preferring the node given as nid. The node must be valid and
  * online. For more general interface, see alloc_pages_node().
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fd283d9b6a4d..d8209d48a543 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5006,6 +5006,124 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
 	return true;
 }
 
+/*
+ * __alloc_pages_bulk - Allocate a number of order-0 pages to a list
+ * @gfp: GFP flags for the allocation
+ * @preferred_nid: The preferred NUMA node ID to allocate from
+ * @nodemask: Set of nodes to allocate from, may be NULL
+ * @nr_pages: The number of pages desired on the list
+ * @page_list: List to store the allocated pages
+ *
+ * This is a batched version of the page allocator that attempts to
+ * allocate nr_pages quickly and add them to a list.
+ *
+ * Returns the number of pages on the list.
+ */
+unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
+			nodemask_t *nodemask, int nr_pages,
+			struct list_head *page_list)
+{
+	struct page *page;
+	unsigned long flags;
+	struct zone *zone;
+	struct zoneref *z;
+	struct per_cpu_pages *pcp;
+	struct list_head *pcp_list;
+	struct alloc_context ac;
+	gfp_t alloc_gfp;
+	unsigned int alloc_flags = ALLOC_WMARK_LOW;
+	int allocated = 0;
+
+	if (WARN_ON_ONCE(nr_pages <= 0))
+		return 0;
+
+	/* Use the single page allocator for one page. */
+	if (nr_pages == 1)
+		goto failed;
+
+	/* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */
+	gfp &= gfp_allowed_mask;
+	alloc_gfp = gfp;
+	if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags))
+		return 0;
+	gfp = alloc_gfp;
+
+	/* Find an allowed local zone that meets the low watermark. */
+	for_each_zone_zonelist_nodemask(zone, z, ac.zonelist, ac.highest_zoneidx, ac.nodemask) {
+		unsigned long mark;
+
+		if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) &&
+		    !__cpuset_zone_allowed(zone, gfp)) {
+			continue;
+		}
+
+		if (nr_online_nodes > 1 && zone != ac.preferred_zoneref->zone &&
+		    zone_to_nid(zone) != zone_to_nid(ac.preferred_zoneref->zone)) {
+			goto failed;
+		}
+
+		mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages;
+		if (zone_watermark_fast(zone, 0,  mark,
+				zonelist_zone_idx(ac.preferred_zoneref),
+				alloc_flags, gfp)) {
+			break;
+		}
+	}
+
+	/*
+	 * If there are no allowed local zones that meets the watermarks then
+	 * try to allocate a single page and reclaim if necessary.
+	 */
+	if (!zone)
+		goto failed;
+
+	/* Attempt the batch allocation */
+	local_irq_save(flags);
+	pcp = &this_cpu_ptr(zone->pageset)->pcp;
+	pcp_list = &pcp->lists[ac.migratetype];
+
+	while (allocated < nr_pages) {
+		page = __rmqueue_pcplist(zone, ac.migratetype, alloc_flags,
+								pcp, pcp_list);
+		if (!page) {
+			/* Try and get at least one page */
+			if (!allocated)
+				goto failed_irq;
+			break;
+		}
+
+		/*
+		 * Ideally this would be batched but the best way to do
+		 * that cheaply is to first convert zone_statistics to
+		 * be inaccurate per-cpu counter like vm_events to avoid
+		 * a RMW cycle then do the accounting with IRQs enabled.
+		 */
+		__count_zid_vm_events(PGALLOC, zone_idx(zone), 1);
+		zone_statistics(ac.preferred_zoneref->zone, zone);
+
+		prep_new_page(page, 0, gfp, 0);
+		list_add(&page->lru, page_list);
+		allocated++;
+	}
+
+	local_irq_restore(flags);
+
+	return allocated;
+
+failed_irq:
+	local_irq_restore(flags);
+
+failed:
+	page = __alloc_pages(gfp, 0, preferred_nid, nodemask);
+	if (page) {
+		list_add(&page->lru, page_list);
+		allocated = 1;
+	}
+
+	return allocated;
+}
+EXPORT_SYMBOL_GPL(__alloc_pages_bulk);
+
 /*
  * This is the 'heart' of the zoned buddy allocator.
  */
-- 
cgit v1.2.3


From 0f87d9d30f21390dd71114f30e63038980e6eb3f Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 29 Apr 2021 23:01:48 -0700
Subject: mm/page_alloc: add an array-based interface to the bulk page
 allocator

The proposed callers for the bulk allocator store pages from the bulk
allocator in an array.  This patch adds an array-based interface to the
API to avoid multiple list iterations.  The page list interface is
preserved to avoid requiring all users of the bulk API to allocate and
manage enough storage to store the pages.

[akpm@linux-foundation.org: remove now unused local `allocated']

Link: https://lkml.kernel.org/r/20210325114228.27719-4-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Reviewed-by: Alexander Lobakin <alobakin@pm.me>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Alexander Duyck <alexander.duyck@gmail.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Chuck Lever <chuck.lever@oracle.com>
Cc: David Miller <davem@davemloft.net>
Cc: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h | 13 +++++++++---
 mm/page_alloc.c     | 60 +++++++++++++++++++++++++++++++++++++++--------------
 2 files changed, 54 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index a2be8f4174a9..26f4d907254a 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -520,13 +520,20 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
 
 unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 				nodemask_t *nodemask, int nr_pages,
-				struct list_head *list);
+				struct list_head *page_list,
+				struct page **page_array);
 
 /* Bulk allocate order-0 pages */
 static inline unsigned long
-alloc_pages_bulk(gfp_t gfp, unsigned long nr_pages, struct list_head *list)
+alloc_pages_bulk_list(gfp_t gfp, unsigned long nr_pages, struct list_head *list)
 {
-	return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, list);
+	return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, list, NULL);
+}
+
+static inline unsigned long
+alloc_pages_bulk_array(gfp_t gfp, unsigned long nr_pages, struct page **page_array)
+{
+	return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, NULL, page_array);
 }
 
 /*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d8209d48a543..e240704b5d39 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5007,21 +5007,29 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
 }
 
 /*
- * __alloc_pages_bulk - Allocate a number of order-0 pages to a list
+ * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array
  * @gfp: GFP flags for the allocation
  * @preferred_nid: The preferred NUMA node ID to allocate from
  * @nodemask: Set of nodes to allocate from, may be NULL
- * @nr_pages: The number of pages desired on the list
- * @page_list: List to store the allocated pages
+ * @nr_pages: The number of pages desired on the list or array
+ * @page_list: Optional list to store the allocated pages
+ * @page_array: Optional array to store the pages
  *
  * This is a batched version of the page allocator that attempts to
- * allocate nr_pages quickly and add them to a list.
+ * allocate nr_pages quickly. Pages are added to page_list if page_list
+ * is not NULL, otherwise it is assumed that the page_array is valid.
  *
- * Returns the number of pages on the list.
+ * For lists, nr_pages is the number of pages that should be allocated.
+ *
+ * For arrays, only NULL elements are populated with pages and nr_pages
+ * is the maximum number of pages that will be stored in the array.
+ *
+ * Returns the number of pages on the list or array.
  */
 unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 			nodemask_t *nodemask, int nr_pages,
-			struct list_head *page_list)
+			struct list_head *page_list,
+			struct page **page_array)
 {
 	struct page *page;
 	unsigned long flags;
@@ -5032,13 +5040,20 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 	struct alloc_context ac;
 	gfp_t alloc_gfp;
 	unsigned int alloc_flags = ALLOC_WMARK_LOW;
-	int allocated = 0;
+	int nr_populated = 0;
 
 	if (WARN_ON_ONCE(nr_pages <= 0))
 		return 0;
 
+	/*
+	 * Skip populated array elements to determine if any pages need
+	 * to be allocated before disabling IRQs.
+	 */
+	while (page_array && page_array[nr_populated] && nr_populated < nr_pages)
+		nr_populated++;
+
 	/* Use the single page allocator for one page. */
-	if (nr_pages == 1)
+	if (nr_pages - nr_populated == 1)
 		goto failed;
 
 	/* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */
@@ -5082,12 +5097,19 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 	pcp = &this_cpu_ptr(zone->pageset)->pcp;
 	pcp_list = &pcp->lists[ac.migratetype];
 
-	while (allocated < nr_pages) {
+	while (nr_populated < nr_pages) {
+
+		/* Skip existing pages */
+		if (page_array && page_array[nr_populated]) {
+			nr_populated++;
+			continue;
+		}
+
 		page = __rmqueue_pcplist(zone, ac.migratetype, alloc_flags,
 								pcp, pcp_list);
 		if (!page) {
 			/* Try and get at least one page */
-			if (!allocated)
+			if (!nr_populated)
 				goto failed_irq;
 			break;
 		}
@@ -5102,13 +5124,16 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 		zone_statistics(ac.preferred_zoneref->zone, zone);
 
 		prep_new_page(page, 0, gfp, 0);
-		list_add(&page->lru, page_list);
-		allocated++;
+		if (page_list)
+			list_add(&page->lru, page_list);
+		else
+			page_array[nr_populated] = page;
+		nr_populated++;
 	}
 
 	local_irq_restore(flags);
 
-	return allocated;
+	return nr_populated;
 
 failed_irq:
 	local_irq_restore(flags);
@@ -5116,11 +5141,14 @@ failed_irq:
 failed:
 	page = __alloc_pages(gfp, 0, preferred_nid, nodemask);
 	if (page) {
-		list_add(&page->lru, page_list);
-		allocated = 1;
+		if (page_list)
+			list_add(&page->lru, page_list);
+		else
+			page_array[nr_populated] = page;
+		nr_populated++;
 	}
 
-	return allocated;
+	return nr_populated;
 }
 EXPORT_SYMBOL_GPL(__alloc_pages_bulk);
 
-- 
cgit v1.2.3


From 198fba4137a1803a9cb93992b56c2ecba1aa83a3 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Thu, 29 Apr 2021 23:02:16 -0700
Subject: mm/mmzone.h: fix existing kernel-doc comments and link them to
 core-api

There are a couple of kernel-doc comments in include/linux/mmzone.h but
they have minor formatting issues that would cause kernel-doc warnings.

Fix the formatting of those comments, add missing Return: descriptions and
link include/linux/mmzone.h to Documentation/core-api/mm-api.rst

Link: https://lkml.kernel.org/r/20210426141927.1314326-2-rppt@kernel.org
Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/core-api/mm-api.rst |  1 +
 include/linux/mmzone.h            | 43 ++++++++++++++++++++++-----------------
 2 files changed, 25 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/core-api/mm-api.rst b/Documentation/core-api/mm-api.rst
index 34f46df91a8b..a42f9baddfbf 100644
--- a/Documentation/core-api/mm-api.rst
+++ b/Documentation/core-api/mm-api.rst
@@ -97,3 +97,4 @@ More Memory Management Functions
    :internal:
 .. kernel-doc:: include/linux/mm.h
    :internal:
+.. kernel-doc:: include/linux/mmzone.h
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 47946cec7584..3b2205741048 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -993,7 +993,8 @@ static inline int is_highmem_idx(enum zone_type idx)
  * is_highmem - helper function to quickly check if a struct zone is a
  *              highmem zone or not.  This is an attempt to keep references
  *              to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum.
- * @zone - pointer to struct zone variable
+ * @zone: pointer to struct zone variable
+ * Return: 1 for a highmem zone, 0 otherwise
  */
 static inline int is_highmem(struct zone *zone)
 {
@@ -1044,7 +1045,7 @@ extern struct zone *next_zone(struct zone *zone);
 
 /**
  * for_each_online_pgdat - helper macro to iterate over all online nodes
- * @pgdat - pointer to a pg_data_t variable
+ * @pgdat: pointer to a pg_data_t variable
  */
 #define for_each_online_pgdat(pgdat)			\
 	for (pgdat = first_online_pgdat();		\
@@ -1052,7 +1053,7 @@ extern struct zone *next_zone(struct zone *zone);
 	     pgdat = next_online_pgdat(pgdat))
 /**
  * for_each_zone - helper macro to iterate over all memory zones
- * @zone - pointer to struct zone variable
+ * @zone: pointer to struct zone variable
  *
  * The user only needs to declare the zone variable, for_each_zone
  * fills it in.
@@ -1091,15 +1092,18 @@ struct zoneref *__next_zones_zonelist(struct zoneref *z,
 
 /**
  * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point
- * @z - The cursor used as a starting point for the search
- * @highest_zoneidx - The zone index of the highest zone to return
- * @nodes - An optional nodemask to filter the zonelist with
+ * @z: The cursor used as a starting point for the search
+ * @highest_zoneidx: The zone index of the highest zone to return
+ * @nodes: An optional nodemask to filter the zonelist with
  *
  * This function returns the next zone at or below a given zone index that is
  * within the allowed nodemask using a cursor as the starting point for the
  * search. The zoneref returned is a cursor that represents the current zone
  * being examined. It should be advanced by one before calling
  * next_zones_zonelist again.
+ *
+ * Return: the next zone at or below highest_zoneidx within the allowed
+ * nodemask using a cursor within a zonelist as a starting point
  */
 static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z,
 					enum zone_type highest_zoneidx,
@@ -1112,10 +1116,9 @@ static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z,
 
 /**
  * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist
- * @zonelist - The zonelist to search for a suitable zone
- * @highest_zoneidx - The zone index of the highest zone to return
- * @nodes - An optional nodemask to filter the zonelist with
- * @return - Zoneref pointer for the first suitable zone found (see below)
+ * @zonelist: The zonelist to search for a suitable zone
+ * @highest_zoneidx: The zone index of the highest zone to return
+ * @nodes: An optional nodemask to filter the zonelist with
  *
  * This function returns the first zone at or below a given zone index that is
  * within the allowed nodemask. The zoneref returned is a cursor that can be
@@ -1125,6 +1128,8 @@ static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z,
  * When no eligible zone is found, zoneref->zone is NULL (zoneref itself is
  * never NULL). This may happen either genuinely, or due to concurrent nodemask
  * update due to cpuset modification.
+ *
+ * Return: Zoneref pointer for the first suitable zone found
  */
 static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
 					enum zone_type highest_zoneidx,
@@ -1136,11 +1141,11 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
 
 /**
  * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask
- * @zone - The current zone in the iterator
- * @z - The current pointer within zonelist->_zonerefs being iterated
- * @zlist - The zonelist being iterated
- * @highidx - The zone index of the highest zone to return
- * @nodemask - Nodemask allowed by the allocator
+ * @zone: The current zone in the iterator
+ * @z: The current pointer within zonelist->_zonerefs being iterated
+ * @zlist: The zonelist being iterated
+ * @highidx: The zone index of the highest zone to return
+ * @nodemask: Nodemask allowed by the allocator
  *
  * This iterator iterates though all zones at or below a given zone index and
  * within a given nodemask
@@ -1160,10 +1165,10 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
 
 /**
  * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index
- * @zone - The current zone in the iterator
- * @z - The current pointer within zonelist->zones being iterated
- * @zlist - The zonelist being iterated
- * @highidx - The zone index of the highest zone to return
+ * @zone: The current zone in the iterator
+ * @z: The current pointer within zonelist->zones being iterated
+ * @zlist: The zonelist being iterated
+ * @highidx: The zone index of the highest zone to return
  *
  * This iterator iterates though all zones at or below a given zone index.
  */
-- 
cgit v1.2.3


From 384d0c68204a4a657f4bbc096c50d729ae7d9ef0 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Fri, 12 Feb 2021 11:02:47 +0100
Subject: PCI/VPD: Remove pci_set_vpd_size()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

24a1720a0841 ("cxgb4: collect serial config version from register") removed
the only usage of pci_set_vpd_size(). If a device needs to override the
auto-detected VPD size, then this can be done with a PCI quirk, as is done
for Chelsio devices. There's no need to allow drivers to change the VPD
size.

Remove pci_set_vpd_size().

[bhelgaas: squash in Arnd's fix for "'pci_vpd_set_size' defined but not
used" from
https://lore.kernel.org/r/20210421140334.3847155-1-arnd@kernel.org]
Link: https://lore.kernel.org/r/47d86e52-9bcf-7da7-1edb-0d988a7a82ab@gmail.com
Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Krzysztof Wilczyński <kw@linux.com>
---
 drivers/pci/vpd.c   | 58 ++++++++++++-----------------------------------------
 include/linux/pci.h |  1 -
 2 files changed, 13 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/vpd.c b/drivers/pci/vpd.c
index ab81c7a5185f..a1d31c5d1864 100644
--- a/drivers/pci/vpd.c
+++ b/drivers/pci/vpd.c
@@ -16,7 +16,6 @@
 struct pci_vpd_ops {
 	ssize_t (*read)(struct pci_dev *dev, loff_t pos, size_t count, void *buf);
 	ssize_t (*write)(struct pci_dev *dev, loff_t pos, size_t count, const void *buf);
-	int (*set_size)(struct pci_dev *dev, size_t len);
 };
 
 struct pci_vpd {
@@ -60,19 +59,6 @@ ssize_t pci_write_vpd(struct pci_dev *dev, loff_t pos, size_t count, const void
 }
 EXPORT_SYMBOL(pci_write_vpd);
 
-/**
- * pci_set_vpd_size - Set size of Vital Product Data space
- * @dev:	pci device struct
- * @len:	size of vpd space
- */
-int pci_set_vpd_size(struct pci_dev *dev, size_t len)
-{
-	if (!dev->vpd || !dev->vpd->ops)
-		return -ENODEV;
-	return dev->vpd->ops->set_size(dev, len);
-}
-EXPORT_SYMBOL(pci_set_vpd_size);
-
 #define PCI_VPD_MAX_SIZE (PCI_VPD_ADDR_MASK + 1)
 
 /**
@@ -297,23 +283,9 @@ out:
 	return ret ? ret : count;
 }
 
-static int pci_vpd_set_size(struct pci_dev *dev, size_t len)
-{
-	struct pci_vpd *vpd = dev->vpd;
-
-	if (len == 0 || len > PCI_VPD_MAX_SIZE)
-		return -EIO;
-
-	vpd->valid = 1;
-	vpd->len = len;
-
-	return 0;
-}
-
 static const struct pci_vpd_ops pci_vpd_ops = {
 	.read = pci_vpd_read,
 	.write = pci_vpd_write,
-	.set_size = pci_vpd_set_size,
 };
 
 static ssize_t pci_vpd_f0_read(struct pci_dev *dev, loff_t pos, size_t count,
@@ -346,24 +318,9 @@ static ssize_t pci_vpd_f0_write(struct pci_dev *dev, loff_t pos, size_t count,
 	return ret;
 }
 
-static int pci_vpd_f0_set_size(struct pci_dev *dev, size_t len)
-{
-	struct pci_dev *tdev = pci_get_slot(dev->bus,
-					    PCI_DEVFN(PCI_SLOT(dev->devfn), 0));
-	int ret;
-
-	if (!tdev)
-		return -ENODEV;
-
-	ret = pci_set_vpd_size(tdev, len);
-	pci_dev_put(tdev);
-	return ret;
-}
-
 static const struct pci_vpd_ops pci_vpd_f0_ops = {
 	.read = pci_vpd_f0_read,
 	.write = pci_vpd_f0_write,
-	.set_size = pci_vpd_f0_set_size,
 };
 
 int pci_vpd_init(struct pci_dev *dev)
@@ -564,6 +521,17 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_QLOGIC, 0x2261, quirk_blacklist_vpd);
 DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_AMAZON_ANNAPURNA_LABS, 0x0031,
 			      PCI_CLASS_BRIDGE_PCI, 8, quirk_blacklist_vpd);
 
+static void pci_vpd_set_size(struct pci_dev *dev, size_t len)
+{
+	struct pci_vpd *vpd = dev->vpd;
+
+	if (!vpd || len == 0 || len > PCI_VPD_MAX_SIZE)
+		return;
+
+	vpd->valid = 1;
+	vpd->len = len;
+}
+
 static void quirk_chelsio_extend_vpd(struct pci_dev *dev)
 {
 	int chip = (dev->device & 0xf000) >> 12;
@@ -582,9 +550,9 @@ static void quirk_chelsio_extend_vpd(struct pci_dev *dev)
 	 * limits.
 	 */
 	if (chip == 0x0 && prod >= 0x20)
-		pci_set_vpd_size(dev, 8192);
+		pci_vpd_set_size(dev, 8192);
 	else if (chip >= 0x4 && func < 0x8)
-		pci_set_vpd_size(dev, 2048);
+		pci_vpd_set_size(dev, 2048);
 }
 
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_CHELSIO, PCI_ANY_ID,
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 86c799c97b77..edadc62ae058 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1302,7 +1302,6 @@ void pci_unlock_rescan_remove(void);
 /* Vital Product Data routines */
 ssize_t pci_read_vpd(struct pci_dev *dev, loff_t pos, size_t count, void *buf);
 ssize_t pci_write_vpd(struct pci_dev *dev, loff_t pos, size_t count, const void *buf);
-int pci_set_vpd_size(struct pci_dev *dev, size_t len);
 
 /* Helper functions for low-level code (drivers/pci/setup-[bus,res].c) */
 resource_size_t pcibios_retrieve_fw_addr(struct pci_dev *dev, int idx);
-- 
cgit v1.2.3


From 4cf0abbce69bde3d07757dfa9be6420407fdbc45 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Thu, 1 Apr 2021 18:43:15 +0200
Subject: PCI/VPD: Remove pci_vpd_find_tag() 'offset' argument

All callers pass 0 as offset. Therefore remove the parameter and use a
fixed offset 0 in pci_vpd_find_tag().

Link: https://lore.kernel.org/r/f62e6e19-5423-2ead-b2bd-62844b23ef8f@gmail.com
Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/net/ethernet/broadcom/bnx2.c             | 2 +-
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 3 +--
 drivers/net/ethernet/broadcom/bnxt/bnxt.c        | 2 +-
 drivers/net/ethernet/broadcom/tg3.c              | 4 ++--
 drivers/net/ethernet/chelsio/cxgb4/t4_hw.c       | 2 +-
 drivers/net/ethernet/sfc/efx.c                   | 2 +-
 drivers/net/ethernet/sfc/falcon/efx.c            | 2 +-
 drivers/pci/vpd.c                                | 4 ++--
 drivers/scsi/cxlflash/main.c                     | 3 +--
 include/linux/pci.h                              | 3 +--
 10 files changed, 12 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/broadcom/bnx2.c b/drivers/net/ethernet/broadcom/bnx2.c
index 3e8a179f39db..c0986096c701 100644
--- a/drivers/net/ethernet/broadcom/bnx2.c
+++ b/drivers/net/ethernet/broadcom/bnx2.c
@@ -8057,7 +8057,7 @@ bnx2_read_vpd_fw_ver(struct bnx2 *bp)
 		data[i + 3] = data[i + BNX2_VPD_LEN];
 	}
 
-	i = pci_vpd_find_tag(data, 0, BNX2_VPD_LEN, PCI_VPD_LRDT_RO_DATA);
+	i = pci_vpd_find_tag(data, BNX2_VPD_LEN, PCI_VPD_LRDT_RO_DATA);
 	if (i < 0)
 		goto vpd_done;
 
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index b652ed72a621..d267e45a0518 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -12207,8 +12207,7 @@ static void bnx2x_read_fwinfo(struct bnx2x *bp)
 	/* VPD RO tag should be first tag after identifier string, hence
 	 * we should be able to find it in first BNX2X_VPD_LEN chars
 	 */
-	i = pci_vpd_find_tag(vpd_start, 0, BNX2X_VPD_LEN,
-			     PCI_VPD_LRDT_RO_DATA);
+	i = pci_vpd_find_tag(vpd_start, BNX2X_VPD_LEN, PCI_VPD_LRDT_RO_DATA);
 	if (i < 0)
 		goto out_not_found;
 
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index a680fd9c68ea..2bccdac28a24 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -12668,7 +12668,7 @@ static void bnxt_vpd_read_info(struct bnxt *bp)
 		goto exit;
 	}
 
-	i = pci_vpd_find_tag(vpd_data, 0, vpd_size, PCI_VPD_LRDT_RO_DATA);
+	i = pci_vpd_find_tag(vpd_data, vpd_size, PCI_VPD_LRDT_RO_DATA);
 	if (i < 0) {
 		netdev_err(bp->dev, "VPD READ-Only not found\n");
 		goto exit;
diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index d2381929931b..b0e49643f483 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -13016,7 +13016,7 @@ static int tg3_test_nvram(struct tg3 *tp)
 	if (!buf)
 		return -ENOMEM;
 
-	i = pci_vpd_find_tag((u8 *)buf, 0, len, PCI_VPD_LRDT_RO_DATA);
+	i = pci_vpd_find_tag((u8 *)buf, len, PCI_VPD_LRDT_RO_DATA);
 	if (i > 0) {
 		j = pci_vpd_lrdt_size(&((u8 *)buf)[i]);
 		if (j < 0)
@@ -15629,7 +15629,7 @@ static void tg3_read_vpd(struct tg3 *tp)
 	if (!vpd_data)
 		goto out_no_vpd;
 
-	i = pci_vpd_find_tag(vpd_data, 0, vpdlen, PCI_VPD_LRDT_RO_DATA);
+	i = pci_vpd_find_tag(vpd_data, vpdlen, PCI_VPD_LRDT_RO_DATA);
 	if (i < 0)
 		goto out_not_found;
 
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
index 98829e482bfa..ef5d10e1cce6 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
@@ -2774,7 +2774,7 @@ int t4_get_raw_vpd_params(struct adapter *adapter, struct vpd_params *p)
 	if (id_len > ID_LEN)
 		id_len = ID_LEN;
 
-	i = pci_vpd_find_tag(vpd, 0, VPD_LEN, PCI_VPD_LRDT_RO_DATA);
+	i = pci_vpd_find_tag(vpd, VPD_LEN, PCI_VPD_LRDT_RO_DATA);
 	if (i < 0) {
 		dev_err(adapter->pdev_dev, "missing VPD-R section\n");
 		ret = -EINVAL;
diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 36c8625a6fd7..c746ca7235f1 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -920,7 +920,7 @@ static void efx_probe_vpd_strings(struct efx_nic *efx)
 	}
 
 	/* Get the Read only section */
-	ro_start = pci_vpd_find_tag(vpd_data, 0, vpd_size, PCI_VPD_LRDT_RO_DATA);
+	ro_start = pci_vpd_find_tag(vpd_data, vpd_size, PCI_VPD_LRDT_RO_DATA);
 	if (ro_start < 0) {
 		netif_err(efx, drv, efx->net_dev, "VPD Read-only not found\n");
 		return;
diff --git a/drivers/net/ethernet/sfc/falcon/efx.c b/drivers/net/ethernet/sfc/falcon/efx.c
index f8979991970e..5e7a57b680ca 100644
--- a/drivers/net/ethernet/sfc/falcon/efx.c
+++ b/drivers/net/ethernet/sfc/falcon/efx.c
@@ -2800,7 +2800,7 @@ static void ef4_probe_vpd_strings(struct ef4_nic *efx)
 	}
 
 	/* Get the Read only section */
-	ro_start = pci_vpd_find_tag(vpd_data, 0, vpd_size, PCI_VPD_LRDT_RO_DATA);
+	ro_start = pci_vpd_find_tag(vpd_data, vpd_size, PCI_VPD_LRDT_RO_DATA);
 	if (ro_start < 0) {
 		netif_err(efx, drv, efx->net_dev, "VPD Read-only not found\n");
 		return;
diff --git a/drivers/pci/vpd.c b/drivers/pci/vpd.c
index c1c4f7d80a04..79d4313c91a3 100644
--- a/drivers/pci/vpd.c
+++ b/drivers/pci/vpd.c
@@ -410,11 +410,11 @@ void pcie_vpd_remove_sysfs_dev_files(struct pci_dev *dev)
 	}
 }
 
-int pci_vpd_find_tag(const u8 *buf, unsigned int off, unsigned int len, u8 rdt)
+int pci_vpd_find_tag(const u8 *buf, unsigned int len, u8 rdt)
 {
 	int i;
 
-	for (i = off; i < len; ) {
+	for (i = 0; i < len; ) {
 		u8 val = buf[i];
 
 		if (val & PCI_VPD_LRDT) {
diff --git a/drivers/scsi/cxlflash/main.c b/drivers/scsi/cxlflash/main.c
index e72440d919d2..ae0963cec222 100644
--- a/drivers/scsi/cxlflash/main.c
+++ b/drivers/scsi/cxlflash/main.c
@@ -1649,8 +1649,7 @@ static int read_vpd(struct cxlflash_cfg *cfg, u64 wwpn[])
 	}
 
 	/* Get the read only section offset */
-	ro_start = pci_vpd_find_tag(vpd_data, 0, vpd_size,
-				    PCI_VPD_LRDT_RO_DATA);
+	ro_start = pci_vpd_find_tag(vpd_data, vpd_size, PCI_VPD_LRDT_RO_DATA);
 	if (unlikely(ro_start < 0)) {
 		dev_err(dev, "%s: VPD Read-only data not found\n", __func__);
 		rc = -ENODEV;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index edadc62ae058..1eb35c09674e 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -2310,14 +2310,13 @@ static inline u8 pci_vpd_info_field_size(const u8 *info_field)
 /**
  * pci_vpd_find_tag - Locates the Resource Data Type tag provided
  * @buf: Pointer to buffered vpd data
- * @off: The offset into the buffer at which to begin the search
  * @len: The length of the vpd buffer
  * @rdt: The Resource Data Type to search for
  *
  * Returns the index where the Resource Data Type was found or
  * -ENOENT otherwise.
  */
-int pci_vpd_find_tag(const u8 *buf, unsigned int off, unsigned int len, u8 rdt);
+int pci_vpd_find_tag(const u8 *buf, unsigned int len, u8 rdt);
 
 /**
  * pci_vpd_find_info_keyword - Locates an information field keyword in the VPD
-- 
cgit v1.2.3


From d0f9164eb294aeb884cbe36ddbbae34fa0124aa1 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Tue, 6 Apr 2021 20:04:44 +0300
Subject: vdpa: Follow kdoc comment style

Follow the comment style described in the Writing kernel-doc document [1].

The following warnings are fixed.
$ scripts/kernel-doc -v -none include/linux/vdpa.h
include/linux/vdpa.h:11: warning: missing initial short description on line:
 * vDPA callback definition.
include/linux/vdpa.h:11: info: Scanning doc for vDPA
include/linux/vdpa.h:15: warning: cannot understand function prototype: 'struct vdpa_callback '
include/linux/vdpa.h:21: warning: missing initial short description on line:
 * vDPA notification area
include/linux/vdpa.h:21: info: Scanning doc for vDPA
include/linux/vdpa.h:25: warning: cannot understand function prototype: 'struct vdpa_notification_area '
include/linux/vdpa.h:31: warning: missing initial short description on line:
 * vDPA vq_state definition
include/linux/vdpa.h:31: info: Scanning doc for vDPA
include/linux/vdpa.h:34: warning: cannot understand function prototype: 'struct vdpa_vq_state '
include/linux/vdpa.h:41: info: Scanning doc for vDPA device
include/linux/vdpa.h:51: warning: cannot understand function prototype: 'struct vdpa_device '
include/linux/vdpa.h:62: info: Scanning doc for vDPA IOVA range
include/linux/vdpa.h:66: warning: cannot understand function prototype: 'struct vdpa_iova_range '
include/linux/vdpa.h:72: info: Scanning doc for vDPA_config_ops
include/linux/vdpa.h:203: warning: cannot understand function prototype: 'struct vdpa_config_ops '
include/linux/vdpa.h:270: info: Scanning doc for vdpa_driver
include/linux/vdpa.h:275: warning: cannot understand function prototype: 'struct vdpa_driver '
include/linux/vdpa.h:347: info: Scanning doc for vdpa_mgmtdev_ops
include/linux/vdpa.h:360: warning: cannot understand function prototype: 'struct vdpa_mgmtdev_ops '

After this fix:

scripts/kernel-doc -v -none include/linux/vdpa.h
include/linux/vdpa.h:11: info: Scanning doc for struct vdpa_calllback
include/linux/vdpa.h:21: info: Scanning doc for struct vdpa_notification_area
include/linux/vdpa.h:31: info: Scanning doc for struct vdpa_vq_state
include/linux/vdpa.h:41: info: Scanning doc for struct vdpa_device
include/linux/vdpa.h:62: info: Scanning doc for struct vdpa_iova_range
include/linux/vdpa.h:72: info: Scanning doc for struct vdpa_config_ops
include/linux/vdpa.h:270: info: Scanning doc for struct vdpa_driver
include/linux/vdpa.h:347: info: Scanning doc for struct vdpa_mgmtdev_ops

[1] https://www.kernel.org/doc/html/latest/doc-guide/kernel-doc.html

Signed-off-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Eli Cohen <elic@nvidia.com>
Link: https://lore.kernel.org/r/20210406170457.98481-2-parav@nvidia.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
 include/linux/vdpa.h | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 15fa085fab05..37b65ca940cf 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -8,7 +8,7 @@
 #include <linux/vhost_iotlb.h>
 
 /**
- * vDPA callback definition.
+ * struct vdpa_calllback - vDPA callback definition.
  * @callback: interrupt callback function
  * @private: the data passed to the callback function
  */
@@ -18,7 +18,7 @@ struct vdpa_callback {
 };
 
 /**
- * vDPA notification area
+ * struct vdpa_notification_area - vDPA notification area
  * @addr: base address of the notification area
  * @size: size of the notification area
  */
@@ -28,7 +28,7 @@ struct vdpa_notification_area {
 };
 
 /**
- * vDPA vq_state definition
+ * struct vdpa_vq_state - vDPA vq_state definition
  * @avail_index: available index
  */
 struct vdpa_vq_state {
@@ -38,7 +38,7 @@ struct vdpa_vq_state {
 struct vdpa_mgmt_dev;
 
 /**
- * vDPA device - representation of a vDPA device
+ * struct vdpa_device - representation of a vDPA device
  * @dev: underlying device
  * @dma_dev: the actual device that is performing DMA
  * @config: the configuration ops for this device.
@@ -59,7 +59,7 @@ struct vdpa_device {
 };
 
 /**
- * vDPA IOVA range - the IOVA range support by the device
+ * struct vdpa_iova_range - the IOVA range support by the device
  * @first: start of the IOVA range
  * @last: end of the IOVA range
  */
@@ -69,7 +69,7 @@ struct vdpa_iova_range {
 };
 
 /**
- * vDPA_config_ops - operations for configuring a vDPA device.
+ * struct vdpa_config_ops - operations for configuring a vDPA device.
  * Note: vDPA device drivers are required to implement all of the
  * operations unless it is mentioned to be optional in the following
  * list.
@@ -267,7 +267,7 @@ int _vdpa_register_device(struct vdpa_device *vdev, int nvqs);
 void _vdpa_unregister_device(struct vdpa_device *vdev);
 
 /**
- * vdpa_driver - operations for a vDPA driver
+ * struct vdpa_driver - operations for a vDPA driver
  * @driver: underlying device driver
  * @probe: the function to call when a device is found.  Returns 0 or -errno.
  * @remove: the function to call when a device is removed.
@@ -344,18 +344,18 @@ static inline void vdpa_get_config(struct vdpa_device *vdev, unsigned offset,
 }
 
 /**
- * vdpa_mgmtdev_ops - vdpa device ops
- * @dev_add:	Add a vdpa device using alloc and register
- *		@mdev: parent device to use for device addition
- *		@name: name of the new vdpa device
- *		Driver need to add a new device using _vdpa_register_device()
- *		after fully initializing the vdpa device. Driver must return 0
- *		on success or appropriate error code.
- * @dev_del:	Remove a vdpa device using unregister
- *		@mdev: parent device to use for device removal
- *		@dev: vdpa device to remove
- *		Driver need to remove the specified device by calling
- *		_vdpa_unregister_device().
+ * struct vdpa_mgmtdev_ops - vdpa device ops
+ * @dev_add: Add a vdpa device using alloc and register
+ *	     @mdev: parent device to use for device addition
+ *	     @name: name of the new vdpa device
+ *	     Driver need to add a new device using _vdpa_register_device()
+ *	     after fully initializing the vdpa device. Driver must return 0
+ *	     on success or appropriate error code.
+ * @dev_del: Remove a vdpa device using unregister
+ *	     @mdev: parent device to use for device removal
+ *	     @dev: vdpa device to remove
+ *	     Driver need to remove the specified device by calling
+ *	     _vdpa_unregister_device().
  */
 struct vdpa_mgmtdev_ops {
 	int (*dev_add)(struct vdpa_mgmt_dev *mdev, const char *name);
-- 
cgit v1.2.3


From 9e3bb9b79a7131a088cfffbdcc30e747dad9d090 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Thu, 15 Apr 2021 03:31:41 -0400
Subject: virtio_pci_modern: introduce helper to map vq notify area

This patch factors out the logic of vq notify area mapping. Following
patches will switch to using this common helper for both the virtio_pci
library and the virtio-pci vDPA driver.

Signed-off-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/20210415073147.19331-2-jasowang@redhat.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Eli Cohen <elic@nvidia.com>
---
 drivers/virtio/virtio_pci_modern_dev.c | 35 ++++++++++++++++++++++++++++++++++
 include/linux/virtio_pci_modern.h      |  2 ++
 2 files changed, 37 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/virtio/virtio_pci_modern_dev.c b/drivers/virtio/virtio_pci_modern_dev.c
index cbd667496bb1..28cb5847fafa 100644
--- a/drivers/virtio/virtio_pci_modern_dev.c
+++ b/drivers/virtio/virtio_pci_modern_dev.c
@@ -593,6 +593,41 @@ u16 vp_modern_get_queue_notify_off(struct virtio_pci_modern_device *mdev,
 }
 EXPORT_SYMBOL_GPL(vp_modern_get_queue_notify_off);
 
+/*
+ * vp_modern_map_vq_notify - map notification area for a
+ * specific virtqueue
+ * @mdev: the modern virtio-pci device
+ * @index: the queue index
+ *
+ * Returns the address of the notification area
+ */
+void *vp_modern_map_vq_notify(struct virtio_pci_modern_device *mdev,
+			      u16 index)
+{
+	u16 off = vp_modern_get_queue_notify_off(mdev, index);
+
+	if (mdev->notify_base) {
+		/* offset should not wrap */
+		if ((u64)off * mdev->notify_offset_multiplier + 2
+			> mdev->notify_len) {
+			dev_warn(&mdev->pci_dev->dev,
+				 "bad notification offset %u (x %u) "
+				 "for queue %u > %zd",
+				 off, mdev->notify_offset_multiplier,
+				 index, mdev->notify_len);
+			return NULL;
+		}
+		return (void __force *)mdev->notify_base +
+			off * mdev->notify_offset_multiplier;
+	} else {
+		return (void __force *)vp_modern_map_capability(mdev,
+				       mdev->notify_map_cap, 2, 2,
+				       off * mdev->notify_offset_multiplier, 2,
+				       NULL);
+	}
+}
+EXPORT_SYMBOL_GPL(vp_modern_map_vq_notify);
+
 MODULE_VERSION("0.1");
 MODULE_DESCRIPTION("Modern Virtio PCI Device");
 MODULE_AUTHOR("Jason Wang <jasowang@redhat.com>");
diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h
index f26acbeec965..1b95d39b00fc 100644
--- a/include/linux/virtio_pci_modern.h
+++ b/include/linux/virtio_pci_modern.h
@@ -106,6 +106,8 @@ void __iomem *vp_modern_map_capability(struct virtio_pci_modern_device *mdev, in
 				       u32 align,
 				       u32 start, u32 size,
 				       size_t *len);
+void *vp_modern_map_vq_notify(struct virtio_pci_modern_device *mdev,
+                              u16 index);
 int vp_modern_probe(struct virtio_pci_modern_device *mdev);
 void vp_modern_remove(struct virtio_pci_modern_device *mdev);
 #endif
-- 
cgit v1.2.3


From a5f7a24f49d81fab9f59611814a8817cc8a876a2 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Thu, 15 Apr 2021 03:31:44 -0400
Subject: virtio_pci_modern: hide vp_modern_get_queue_notify_off()

All users (both the virtio-pci library and the vp_vdpa driver) have been
switched to use vp_modern_map_vq_notify(), so there's no need to
export the low-level helper vp_modern_get_queue_notify_off().

Signed-off-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/20210415073147.19331-5-jasowang@redhat.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Eli Cohen <elic@nvidia.com>
---
 drivers/virtio/virtio_pci_modern_dev.c | 5 ++---
 include/linux/virtio_pci_modern.h      | 2 --
 2 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/virtio/virtio_pci_modern_dev.c b/drivers/virtio/virtio_pci_modern_dev.c
index 28cb5847fafa..5a657e56b46d 100644
--- a/drivers/virtio/virtio_pci_modern_dev.c
+++ b/drivers/virtio/virtio_pci_modern_dev.c
@@ -584,14 +584,13 @@ EXPORT_SYMBOL_GPL(vp_modern_get_num_queues);
  *
  * Returns the notification offset for a virtqueue
  */
-u16 vp_modern_get_queue_notify_off(struct virtio_pci_modern_device *mdev,
-				   u16 index)
+static u16 vp_modern_get_queue_notify_off(struct virtio_pci_modern_device *mdev,
+					  u16 index)
 {
 	vp_iowrite16(index, &mdev->common->queue_select);
 
 	return vp_ioread16(&mdev->common->queue_notify_off);
 }
-EXPORT_SYMBOL_GPL(vp_modern_get_queue_notify_off);
 
 /*
  * vp_modern_map_vq_notify - map notification area for a
diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h
index 1b95d39b00fc..179a2fb4bf37 100644
--- a/include/linux/virtio_pci_modern.h
+++ b/include/linux/virtio_pci_modern.h
@@ -99,8 +99,6 @@ void vp_modern_set_queue_size(struct virtio_pci_modern_device *mdev,
 u16 vp_modern_get_queue_size(struct virtio_pci_modern_device *mdev,
 			     u16 idx);
 u16 vp_modern_get_num_queues(struct virtio_pci_modern_device *mdev);
-u16 vp_modern_get_queue_notify_off(struct virtio_pci_modern_device *mdev,
-				   u16 idx);
 void __iomem *vp_modern_map_capability(struct virtio_pci_modern_device *mdev, int off,
 				       size_t minlen,
 				       u32 align,
-- 
cgit v1.2.3


From fd466b36940b22a506265edf12714bd0cf9ed836 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Thu, 15 Apr 2021 03:31:45 -0400
Subject: virtio_pci library: hide vp_modern_map_capability()

There is no user now, and the capability should not be set up
externally. Instead, every access to the capability should be done via
virtio_pci_modern_device.

Signed-off-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/20210415073147.19331-6-jasowang@redhat.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Eli Cohen <elic@nvidia.com>
---
 drivers/virtio/virtio_pci_modern_dev.c | 10 ++++------
 include/linux/virtio_pci_modern.h      |  5 -----
 2 files changed, 4 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/virtio/virtio_pci_modern_dev.c b/drivers/virtio/virtio_pci_modern_dev.c
index 5a657e56b46d..9c241c9bd920 100644
--- a/drivers/virtio/virtio_pci_modern_dev.c
+++ b/drivers/virtio/virtio_pci_modern_dev.c
@@ -16,11 +16,10 @@
  *
  * Returns the io address of for the part of the capability
  */
-void __iomem *vp_modern_map_capability(struct virtio_pci_modern_device *mdev, int off,
-				       size_t minlen,
-				       u32 align,
-				       u32 start, u32 size,
-				       size_t *len)
+static void __iomem *
+vp_modern_map_capability(struct virtio_pci_modern_device *mdev, int off,
+			 size_t minlen, u32 align, u32 start, u32 size,
+			 size_t *len)
 {
 	struct pci_dev *dev = mdev->pci_dev;
 	u8 bar;
@@ -90,7 +89,6 @@ void __iomem *vp_modern_map_capability(struct virtio_pci_modern_device *mdev, in
 			length, offset, bar);
 	return p;
 }
-EXPORT_SYMBOL_GPL(vp_modern_map_capability);
 
 /**
  * virtio_pci_find_capability - walk capabilities to find device info.
diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h
index 179a2fb4bf37..e6e7072413c1 100644
--- a/include/linux/virtio_pci_modern.h
+++ b/include/linux/virtio_pci_modern.h
@@ -99,11 +99,6 @@ void vp_modern_set_queue_size(struct virtio_pci_modern_device *mdev,
 u16 vp_modern_get_queue_size(struct virtio_pci_modern_device *mdev,
 			     u16 idx);
 u16 vp_modern_get_num_queues(struct virtio_pci_modern_device *mdev);
-void __iomem *vp_modern_map_capability(struct virtio_pci_modern_device *mdev, int off,
-				       size_t minlen,
-				       u32 align,
-				       u32 start, u32 size,
-				       size_t *len);
 void *vp_modern_map_vq_notify(struct virtio_pci_modern_device *mdev,
                               u16 index);
 int vp_modern_probe(struct virtio_pci_modern_device *mdev);
-- 
cgit v1.2.3


From 9e311bcad73dc14bd0a736db6ad3d382227e11fe Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Thu, 15 Apr 2021 03:31:46 -0400
Subject: virtio-pci library: report resource address

Sometimes it might be useful to report the physical address of a
capability. One example is reporting the physical address of the doorbell
so that it can be mapped by userspace.

Signed-off-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/20210415073147.19331-7-jasowang@redhat.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vdpa/virtio_pci/vp_vdpa.c      |  3 ++-
 drivers/virtio/virtio_pci_modern.c     |  2 +-
 drivers/virtio/virtio_pci_modern_dev.c | 24 +++++++++++++++++-------
 include/linux/virtio_pci_modern.h      |  4 +++-
 4 files changed, 23 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c b/drivers/vdpa/virtio_pci/vp_vdpa.c
index 2afc90645660..98205e54d089 100644
--- a/drivers/vdpa/virtio_pci/vp_vdpa.c
+++ b/drivers/vdpa/virtio_pci/vp_vdpa.c
@@ -415,7 +415,8 @@ static int vp_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 
 	for (i = 0; i < vp_vdpa->queues; i++) {
 		vp_vdpa->vring[i].irq = VIRTIO_MSI_NO_VECTOR;
-		vp_vdpa->vring[i].notify = vp_modern_map_vq_notify(mdev, i);
+		vp_vdpa->vring[i].notify =
+			vp_modern_map_vq_notify(mdev, i, NULL);
 		if (!vp_vdpa->vring[i].notify) {
 			dev_warn(&pdev->dev, "Fail to map vq notify %d\n", i);
 			goto err;
diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c
index 29607d9bd95c..722ea44e7579 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -224,7 +224,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
 				virtqueue_get_avail_addr(vq),
 				virtqueue_get_used_addr(vq));
 
-	vq->priv = vp_modern_map_vq_notify(mdev, index);
+	vq->priv = vp_modern_map_vq_notify(mdev, index, NULL);
 	if (!vq->priv) {
 		err = -ENOMEM;
 		goto err_map_notify;
diff --git a/drivers/virtio/virtio_pci_modern_dev.c b/drivers/virtio/virtio_pci_modern_dev.c
index 9c241c9bd920..ae87b3fa8858 100644
--- a/drivers/virtio/virtio_pci_modern_dev.c
+++ b/drivers/virtio/virtio_pci_modern_dev.c
@@ -13,13 +13,14 @@
  * @start: start from the capability
  * @size: map size
  * @len: the length that is actually mapped
+ * @pa: physical address of the capability
  *
  * Returns the io address of for the part of the capability
  */
 static void __iomem *
 vp_modern_map_capability(struct virtio_pci_modern_device *mdev, int off,
 			 size_t minlen, u32 align, u32 start, u32 size,
-			 size_t *len)
+			 size_t *len, resource_size_t *pa)
 {
 	struct pci_dev *dev = mdev->pci_dev;
 	u8 bar;
@@ -87,6 +88,9 @@ vp_modern_map_capability(struct virtio_pci_modern_device *mdev, int off,
 		dev_err(&dev->dev,
 			"virtio_pci: unable to map virtio %u@%u on bar %i\n",
 			length, offset, bar);
+	else if (pa)
+		*pa = pci_resource_start(dev, bar) + offset;
+
 	return p;
 }
 
@@ -273,12 +277,12 @@ int vp_modern_probe(struct virtio_pci_modern_device *mdev)
 	mdev->common = vp_modern_map_capability(mdev, common,
 				      sizeof(struct virtio_pci_common_cfg), 4,
 				      0, sizeof(struct virtio_pci_common_cfg),
-				      NULL);
+				      NULL, NULL);
 	if (!mdev->common)
 		goto err_map_common;
 	mdev->isr = vp_modern_map_capability(mdev, isr, sizeof(u8), 1,
 					     0, 1,
-					     NULL);
+					     NULL, NULL);
 	if (!mdev->isr)
 		goto err_map_isr;
 
@@ -306,7 +310,8 @@ int vp_modern_probe(struct virtio_pci_modern_device *mdev)
 		mdev->notify_base = vp_modern_map_capability(mdev, notify,
 							     2, 2,
 							     0, notify_length,
-							     &mdev->notify_len);
+							     &mdev->notify_len,
+							     &mdev->notify_pa);
 		if (!mdev->notify_base)
 			goto err_map_notify;
 	} else {
@@ -319,7 +324,8 @@ int vp_modern_probe(struct virtio_pci_modern_device *mdev)
 	if (device) {
 		mdev->device = vp_modern_map_capability(mdev, device, 0, 4,
 							0, PAGE_SIZE,
-							&mdev->device_len);
+							&mdev->device_len,
+							NULL);
 		if (!mdev->device)
 			goto err_map_device;
 	}
@@ -595,11 +601,12 @@ static u16 vp_modern_get_queue_notify_off(struct virtio_pci_modern_device *mdev,
  * specific virtqueue
  * @mdev: the modern virtio-pci device
  * @index: the queue index
+ * @pa: the pointer to the physical address of the nofity area
  *
  * Returns the address of the notification area
  */
 void *vp_modern_map_vq_notify(struct virtio_pci_modern_device *mdev,
-			      u16 index)
+			      u16 index, resource_size_t *pa)
 {
 	u16 off = vp_modern_get_queue_notify_off(mdev, index);
 
@@ -614,13 +621,16 @@ void *vp_modern_map_vq_notify(struct virtio_pci_modern_device *mdev,
 				 index, mdev->notify_len);
 			return NULL;
 		}
+		if (pa)
+			*pa = mdev->notify_pa +
+			      off * mdev->notify_offset_multiplier;
 		return (void __force *)mdev->notify_base +
 			off * mdev->notify_offset_multiplier;
 	} else {
 		return (void __force *)vp_modern_map_capability(mdev,
 				       mdev->notify_map_cap, 2, 2,
 				       off * mdev->notify_offset_multiplier, 2,
-				       NULL);
+				       NULL, pa);
 	}
 }
 EXPORT_SYMBOL_GPL(vp_modern_map_vq_notify);
diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h
index e6e7072413c1..cdfabbefacdf 100644
--- a/include/linux/virtio_pci_modern.h
+++ b/include/linux/virtio_pci_modern.h
@@ -13,6 +13,8 @@ struct virtio_pci_modern_device {
 	void __iomem *device;
 	/* Base of vq notifications (non-legacy mode). */
 	void __iomem *notify_base;
+	/* Physical base of vq notifications */
+	resource_size_t notify_pa;
 	/* Where to read and clear interrupt */
 	u8 __iomem *isr;
 
@@ -100,7 +102,7 @@ u16 vp_modern_get_queue_size(struct virtio_pci_modern_device *mdev,
 			     u16 idx);
 u16 vp_modern_get_num_queues(struct virtio_pci_modern_device *mdev);
 void *vp_modern_map_vq_notify(struct virtio_pci_modern_device *mdev,
-                              u16 index);
+                              u16 index, resource_size_t *pa);
 int vp_modern_probe(struct virtio_pci_modern_device *mdev);
 void vp_modern_remove(struct virtio_pci_modern_device *mdev);
 #endif
-- 
cgit v1.2.3


From f53d9910d009bc015b42d88114e2d86a93b0e6b7 Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare@redhat.com>
Date: Mon, 15 Mar 2021 17:34:38 +0100
Subject: vringh: add 'iotlb_lock' to synchronize iotlb accesses

Usually iotlb accesses are synchronized with a spinlock.
Let's request it as a new parameter in vringh_set_iotlb() and
hold it when we navigate the iotlb in iotlb_translate() to avoid
race conditions with any new additions/deletions of ranges from
the iotlb.

Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Link: https://lore.kernel.org/r/20210315163450.254396-3-sgarzare@redhat.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vdpa/vdpa_sim/vdpa_sim.c | 3 ++-
 drivers/vhost/vringh.c           | 9 ++++++++-
 include/linux/vringh.h           | 6 +++++-
 3 files changed, 15 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index fc2ec9599753..a92c08880098 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -284,7 +284,8 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr)
 		goto err_iommu;
 
 	for (i = 0; i < dev_attr->nvqs; i++)
-		vringh_set_iotlb(&vdpasim->vqs[i].vring, vdpasim->iommu);
+		vringh_set_iotlb(&vdpasim->vqs[i].vring, vdpasim->iommu,
+				 &vdpasim->iommu_lock);
 
 	ret = iova_cache_get();
 	if (ret)
diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c
index 85d85faba058..f68122705719 100644
--- a/drivers/vhost/vringh.c
+++ b/drivers/vhost/vringh.c
@@ -1074,6 +1074,8 @@ static int iotlb_translate(const struct vringh *vrh,
 	int ret = 0;
 	u64 s = 0;
 
+	spin_lock(vrh->iotlb_lock);
+
 	while (len > s) {
 		u64 size, pa, pfn;
 
@@ -1103,6 +1105,8 @@ static int iotlb_translate(const struct vringh *vrh,
 		++ret;
 	}
 
+	spin_unlock(vrh->iotlb_lock);
+
 	return ret;
 }
 
@@ -1262,10 +1266,13 @@ EXPORT_SYMBOL(vringh_init_iotlb);
  * vringh_set_iotlb - initialize a vringh for a ring with IOTLB.
  * @vrh: the vring
  * @iotlb: iotlb associated with this vring
+ * @iotlb_lock: spinlock to synchronize the iotlb accesses
  */
-void vringh_set_iotlb(struct vringh *vrh, struct vhost_iotlb *iotlb)
+void vringh_set_iotlb(struct vringh *vrh, struct vhost_iotlb *iotlb,
+		      spinlock_t *iotlb_lock)
 {
 	vrh->iotlb = iotlb;
+	vrh->iotlb_lock = iotlb_lock;
 }
 EXPORT_SYMBOL(vringh_set_iotlb);
 
diff --git a/include/linux/vringh.h b/include/linux/vringh.h
index 59bd50f99291..9c077863c8f6 100644
--- a/include/linux/vringh.h
+++ b/include/linux/vringh.h
@@ -46,6 +46,9 @@ struct vringh {
 	/* IOTLB for this vring */
 	struct vhost_iotlb *iotlb;
 
+	/* spinlock to synchronize IOTLB accesses */
+	spinlock_t *iotlb_lock;
+
 	/* The function to call to notify the guest about added buffers */
 	void (*notify)(struct vringh *);
 };
@@ -258,7 +261,8 @@ static inline __virtio64 cpu_to_vringh64(const struct vringh *vrh, u64 val)
 
 #if IS_REACHABLE(CONFIG_VHOST_IOTLB)
 
-void vringh_set_iotlb(struct vringh *vrh, struct vhost_iotlb *iotlb);
+void vringh_set_iotlb(struct vringh *vrh, struct vhost_iotlb *iotlb,
+		      spinlock_t *iotlb_lock);
 
 int vringh_init_iotlb(struct vringh *vrh, u64 features,
 		      unsigned int num, bool weak_barriers,
-- 
cgit v1.2.3


From b8c06ad4d67db56ed6bdfb685c134da74e92a2c7 Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare@redhat.com>
Date: Mon, 15 Mar 2021 17:34:41 +0100
Subject: vringh: implement vringh_kiov_advance()

In some cases, it may be useful to provide a way to skip a number
of bytes in a vringh_kiov.

Let's implement vringh_kiov_advance() for this purpose, reusing the
code from vringh_iov_xfer().
We replace that code with a call to the new vringh_kiov_advance().

Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Link: https://lore.kernel.org/r/20210315163450.254396-6-sgarzare@redhat.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vhost/vringh.c | 41 +++++++++++++++++++++++++++++------------
 include/linux/vringh.h |  2 ++
 2 files changed, 31 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c
index 2a88e087afd8..4af8fa259d65 100644
--- a/drivers/vhost/vringh.c
+++ b/drivers/vhost/vringh.c
@@ -75,6 +75,34 @@ static inline int __vringh_get_head(const struct vringh *vrh,
 	return head;
 }
 
+/**
+ * vringh_kiov_advance - skip bytes from vring_kiov
+ * @iov: an iov passed to vringh_getdesc_*() (updated as we consume)
+ * @len: the maximum length to advance
+ */
+void vringh_kiov_advance(struct vringh_kiov *iov, size_t len)
+{
+	while (len && iov->i < iov->used) {
+		size_t partlen = min(iov->iov[iov->i].iov_len, len);
+
+		iov->consumed += partlen;
+		iov->iov[iov->i].iov_len -= partlen;
+		iov->iov[iov->i].iov_base += partlen;
+
+		if (!iov->iov[iov->i].iov_len) {
+			/* Fix up old iov element then increment. */
+			iov->iov[iov->i].iov_len = iov->consumed;
+			iov->iov[iov->i].iov_base -= iov->consumed;
+
+			iov->consumed = 0;
+			iov->i++;
+		}
+
+		len -= partlen;
+	}
+}
+EXPORT_SYMBOL(vringh_kiov_advance);
+
 /* Copy some bytes to/from the iovec.  Returns num copied. */
 static inline ssize_t vringh_iov_xfer(struct vringh *vrh,
 				      struct vringh_kiov *iov,
@@ -95,19 +123,8 @@ static inline ssize_t vringh_iov_xfer(struct vringh *vrh,
 		done += partlen;
 		len -= partlen;
 		ptr += partlen;
-		iov->consumed += partlen;
-		iov->iov[iov->i].iov_len -= partlen;
-		iov->iov[iov->i].iov_base += partlen;
 
-		if (!iov->iov[iov->i].iov_len) {
-			/* Fix up old iov element then increment. */
-			iov->iov[iov->i].iov_len = iov->consumed;
-			iov->iov[iov->i].iov_base -= iov->consumed;
-
-			
-			iov->consumed = 0;
-			iov->i++;
-		}
+		vringh_kiov_advance(iov, partlen);
 	}
 	return done;
 }
diff --git a/include/linux/vringh.h b/include/linux/vringh.h
index 9c077863c8f6..755211ebd195 100644
--- a/include/linux/vringh.h
+++ b/include/linux/vringh.h
@@ -199,6 +199,8 @@ static inline void vringh_kiov_cleanup(struct vringh_kiov *kiov)
 	kiov->iov = NULL;
 }
 
+void vringh_kiov_advance(struct vringh_kiov *kiov, size_t len);
+
 int vringh_getdesc_kern(struct vringh *vrh,
 			struct vringh_kiov *riov,
 			struct vringh_kiov *wiov,
-- 
cgit v1.2.3


From 14c9ac05ce09c8c6a89ffcca6ffb68707cba36c2 Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare@redhat.com>
Date: Mon, 15 Mar 2021 17:34:42 +0100
Subject: vringh: add vringh_kiov_length() helper

This new helper returns the total number of bytes covered by
a vringh_kiov.

Suggested-by: Jason Wang <jasowang@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Link: https://lore.kernel.org/r/20210315163450.254396-7-sgarzare@redhat.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/vringh.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/vringh.h b/include/linux/vringh.h
index 755211ebd195..84db7b8f912f 100644
--- a/include/linux/vringh.h
+++ b/include/linux/vringh.h
@@ -199,6 +199,17 @@ static inline void vringh_kiov_cleanup(struct vringh_kiov *kiov)
 	kiov->iov = NULL;
 }
 
+static inline size_t vringh_kiov_length(struct vringh_kiov *kiov)
+{
+	size_t len = 0;
+	int i;
+
+	for (i = kiov->i; i < kiov->used; i++)
+		len += kiov->iov[i].iov_len;
+
+	return len;
+}
+
 void vringh_kiov_advance(struct vringh_kiov *kiov, size_t len);
 
 int vringh_getdesc_kern(struct vringh *vrh,
-- 
cgit v1.2.3


From 442706f9f94d28fe3c9f188ae4ebbd6b40addffe Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare@redhat.com>
Date: Mon, 15 Mar 2021 17:34:44 +0100
Subject: vdpa: add get_config_size callback in vdpa_config_ops

This new callback is used to get the size of the configuration space
of vDPA devices.

Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Link: https://lore.kernel.org/r/20210315163450.254396-9-sgarzare@redhat.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vdpa/ifcvf/ifcvf_main.c   | 6 ++++++
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 6 ++++++
 drivers/vdpa/vdpa_sim/vdpa_sim.c  | 9 +++++++++
 drivers/vdpa/virtio_pci/vp_vdpa.c | 8 ++++++++
 include/linux/vdpa.h              | 4 ++++
 5 files changed, 33 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c
index a16753f1de5b..44d7586019da 100644
--- a/drivers/vdpa/ifcvf/ifcvf_main.c
+++ b/drivers/vdpa/ifcvf/ifcvf_main.c
@@ -351,6 +351,11 @@ static u32 ifcvf_vdpa_get_vq_align(struct vdpa_device *vdpa_dev)
 	return IFCVF_QUEUE_ALIGNMENT;
 }
 
+static size_t ifcvf_vdpa_get_config_size(struct vdpa_device *vdpa_dev)
+{
+	return sizeof(struct virtio_net_config);
+}
+
 static void ifcvf_vdpa_get_config(struct vdpa_device *vdpa_dev,
 				  unsigned int offset,
 				  void *buf, unsigned int len)
@@ -411,6 +416,7 @@ static const struct vdpa_config_ops ifc_vdpa_ops = {
 	.get_device_id	= ifcvf_vdpa_get_device_id,
 	.get_vendor_id	= ifcvf_vdpa_get_vendor_id,
 	.get_vq_align	= ifcvf_vdpa_get_vq_align,
+	.get_config_size	= ifcvf_vdpa_get_config_size,
 	.get_config	= ifcvf_vdpa_get_config,
 	.set_config	= ifcvf_vdpa_set_config,
 	.set_config_cb  = ifcvf_vdpa_set_config_cb,
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 25533db01f5f..189e4385df40 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1809,6 +1809,11 @@ err_setup:
 	ndev->mvdev.status |= VIRTIO_CONFIG_S_FAILED;
 }
 
+static size_t mlx5_vdpa_get_config_size(struct vdpa_device *vdev)
+{
+	return sizeof(struct virtio_net_config);
+}
+
 static void mlx5_vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, void *buf,
 				 unsigned int len)
 {
@@ -1895,6 +1900,7 @@ static const struct vdpa_config_ops mlx5_vdpa_ops = {
 	.get_vendor_id = mlx5_vdpa_get_vendor_id,
 	.get_status = mlx5_vdpa_get_status,
 	.set_status = mlx5_vdpa_set_status,
+	.get_config_size = mlx5_vdpa_get_config_size,
 	.get_config = mlx5_vdpa_get_config,
 	.set_config = mlx5_vdpa_set_config,
 	.get_generation = mlx5_vdpa_get_generation,
diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index 14dc2d3d983e..98f793bc9376 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -462,6 +462,13 @@ static void vdpasim_set_status(struct vdpa_device *vdpa, u8 status)
 	spin_unlock(&vdpasim->lock);
 }
 
+static size_t vdpasim_get_config_size(struct vdpa_device *vdpa)
+{
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+
+	return vdpasim->dev_attr.config_size;
+}
+
 static void vdpasim_get_config(struct vdpa_device *vdpa, unsigned int offset,
 			     void *buf, unsigned int len)
 {
@@ -598,6 +605,7 @@ static const struct vdpa_config_ops vdpasim_config_ops = {
 	.get_vendor_id          = vdpasim_get_vendor_id,
 	.get_status             = vdpasim_get_status,
 	.set_status             = vdpasim_set_status,
+	.get_config_size        = vdpasim_get_config_size,
 	.get_config             = vdpasim_get_config,
 	.set_config             = vdpasim_set_config,
 	.get_generation         = vdpasim_get_generation,
@@ -625,6 +633,7 @@ static const struct vdpa_config_ops vdpasim_batch_config_ops = {
 	.get_vendor_id          = vdpasim_get_vendor_id,
 	.get_status             = vdpasim_get_status,
 	.set_status             = vdpasim_set_status,
+	.get_config_size        = vdpasim_get_config_size,
 	.get_config             = vdpasim_get_config,
 	.set_config             = vdpasim_set_config,
 	.get_generation         = vdpasim_get_generation,
diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c b/drivers/vdpa/virtio_pci/vp_vdpa.c
index 002b928d0ca1..c76ebb531212 100644
--- a/drivers/vdpa/virtio_pci/vp_vdpa.c
+++ b/drivers/vdpa/virtio_pci/vp_vdpa.c
@@ -296,6 +296,13 @@ static u32 vp_vdpa_get_vq_align(struct vdpa_device *vdpa)
 	return PAGE_SIZE;
 }
 
+static size_t vp_vdpa_get_config_size(struct vdpa_device *vdpa)
+{
+	struct virtio_pci_modern_device *mdev = vdpa_to_mdev(vdpa);
+
+	return mdev->device_len;
+}
+
 static void vp_vdpa_get_config(struct vdpa_device *vdpa,
 			       unsigned int offset,
 			       void *buf, unsigned int len)
@@ -369,6 +376,7 @@ static const struct vdpa_config_ops vp_vdpa_ops = {
 	.get_device_id	= vp_vdpa_get_device_id,
 	.get_vendor_id	= vp_vdpa_get_vendor_id,
 	.get_vq_align	= vp_vdpa_get_vq_align,
+	.get_config_size = vp_vdpa_get_config_size,
 	.get_config	= vp_vdpa_get_config,
 	.set_config	= vp_vdpa_set_config,
 	.set_config_cb  = vp_vdpa_set_config_cb,
diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 37b65ca940cf..f311d227aa1b 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -150,6 +150,9 @@ struct vdpa_iova_range {
  * @set_status:			Set the device status
  *				@vdev: vdpa device
  *				@status: virtio device status
+ * @get_config_size:		Get the size of the configuration space
+ *				@vdev: vdpa device
+ *				Returns size_t: configuration size
  * @get_config:			Read from device specific configuration space
  *				@vdev: vdpa device
  *				@offset: offset from the beginning of
@@ -231,6 +234,7 @@ struct vdpa_config_ops {
 	u32 (*get_vendor_id)(struct vdpa_device *vdev);
 	u8 (*get_status)(struct vdpa_device *vdev);
 	void (*set_status)(struct vdpa_device *vdev, u8 status);
+	size_t (*get_config_size)(struct vdpa_device *vdev);
 	void (*get_config)(struct vdpa_device *vdev, unsigned int offset,
 			   void *buf, unsigned int len);
 	void (*set_config)(struct vdpa_device *vdev, unsigned int offset,
-- 
cgit v1.2.3


From 801c6058d14a82179a7ee17a4b532cac6fad067f Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 29 Apr 2021 15:19:37 +0000
Subject: bpf: Fix leakage of uninitialized bpf stack under speculation

The current implemented mechanisms to mitigate data disclosure under
speculation mainly address stack and map value oob access from the
speculative domain. However, Piotr discovered that uninitialized BPF
stack is not protected yet, and thus old data from the kernel stack,
potentially including addresses of kernel structures, could still be
extracted from that 512 bytes large window. The BPF stack is special
compared to map values since it's not zero initialized for every
program invocation, whereas map values /are/ zero initialized upon
their initial allocation and thus cannot leak any prior data in either
domain. In the non-speculative domain, the verifier ensures that every
stack slot read must have a prior stack slot write by the BPF program
to avoid such data leaking issue.

However, this is not enough: for example, when the pointer arithmetic
operation moves the stack pointer from the last valid stack offset to
the first valid offset, the sanitation logic allows for any intermediate
offsets during speculative execution, which could then be used to
extract any restricted stack content via side-channel.

Given for unprivileged stack pointer arithmetic the use of unknown
but bounded scalars is generally forbidden, we can simply turn the
register-based arithmetic operation into an immediate-based arithmetic
operation without the need for masking. This also gives the benefit
of reducing the needed instructions for the operation. Given after
the work in 7fedb63a8307 ("bpf: Tighten speculative pointer arithmetic
mask"), the aux->alu_limit already holds the final immediate value for
the offset register with the known scalar. Thus, a simple mov of the
immediate to AX register with using AX as the source for the original
instruction is sufficient and possible now in this case.

Reported-by: Piotr Krysiuk <piotras@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: Piotr Krysiuk <piotras@gmail.com>
Reviewed-by: Piotr Krysiuk <piotras@gmail.com>
Reviewed-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  5 +++--
 kernel/bpf/verifier.c        | 27 +++++++++++++++++----------
 2 files changed, 20 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 6023a1367853..06841517ab1e 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -302,10 +302,11 @@ struct bpf_verifier_state_list {
 };
 
 /* Possible states for alu_state member. */
-#define BPF_ALU_SANITIZE_SRC		1U
-#define BPF_ALU_SANITIZE_DST		2U
+#define BPF_ALU_SANITIZE_SRC		(1U << 0)
+#define BPF_ALU_SANITIZE_DST		(1U << 1)
 #define BPF_ALU_NEG_VALUE		(1U << 2)
 #define BPF_ALU_NON_POINTER		(1U << 3)
+#define BPF_ALU_IMMEDIATE		(1U << 4)
 #define BPF_ALU_SANITIZE		(BPF_ALU_SANITIZE_SRC | \
 					 BPF_ALU_SANITIZE_DST)
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index db347fb125e3..757476c91c98 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6496,6 +6496,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
 {
 	struct bpf_insn_aux_data *aux = commit_window ? cur_aux(env) : tmp_aux;
 	struct bpf_verifier_state *vstate = env->cur_state;
+	bool off_is_imm = tnum_is_const(off_reg->var_off);
 	bool off_is_neg = off_reg->smin_value < 0;
 	bool ptr_is_dst_reg = ptr_reg == dst_reg;
 	u8 opcode = BPF_OP(insn->code);
@@ -6526,6 +6527,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
 		alu_limit = abs(tmp_aux->alu_limit - alu_limit);
 	} else {
 		alu_state  = off_is_neg ? BPF_ALU_NEG_VALUE : 0;
+		alu_state |= off_is_imm ? BPF_ALU_IMMEDIATE : 0;
 		alu_state |= ptr_is_dst_reg ?
 			     BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST;
 	}
@@ -12371,7 +12373,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X;
 			const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X;
 			struct bpf_insn *patch = &insn_buf[0];
-			bool issrc, isneg;
+			bool issrc, isneg, isimm;
 			u32 off_reg;
 
 			aux = &env->insn_aux_data[i + delta];
@@ -12382,16 +12384,21 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			isneg = aux->alu_state & BPF_ALU_NEG_VALUE;
 			issrc = (aux->alu_state & BPF_ALU_SANITIZE) ==
 				BPF_ALU_SANITIZE_SRC;
+			isimm = aux->alu_state & BPF_ALU_IMMEDIATE;
 
 			off_reg = issrc ? insn->src_reg : insn->dst_reg;
-			if (isneg)
-				*patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
-			*patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit);
-			*patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg);
-			*patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg);
-			*patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0);
-			*patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63);
-			*patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX, off_reg);
+			if (isimm) {
+				*patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit);
+			} else {
+				if (isneg)
+					*patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
+				*patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit);
+				*patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg);
+				*patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg);
+				*patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0);
+				*patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63);
+				*patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX, off_reg);
+			}
 			if (!issrc)
 				*patch++ = BPF_MOV64_REG(insn->dst_reg, insn->src_reg);
 			insn->src_reg = BPF_REG_AX;
@@ -12399,7 +12406,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 				insn->code = insn->code == code_add ?
 					     code_sub : code_add;
 			*patch++ = *insn;
-			if (issrc && isneg)
+			if (issrc && isneg && !isimm)
 				*patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
 			cnt = patch - insn_buf;
 
-- 
cgit v1.2.3


From cd2c7545ae1beac3b6aae033c7f31193b3255946 Mon Sep 17 00:00:00 2001
From: Changheun Lee <nanich.lee@samsung.com>
Date: Mon, 3 May 2021 18:52:03 +0900
Subject: bio: limit bio max size

bio size can grow up to 4GB when multi-page bvec is enabled.
but sometimes it would lead to inefficient behaviors.
in case of large chunk direct I/O, - 32MB chunk read in user space -
all pages for 32MB would be merged to a bio structure if the pages
physical addresses are contiguous. it makes some delay to submit
until merge complete. bio max size should be limited to a proper size.

When 32MB chunk read with direct I/O option is coming from userspace,
kernel behavior is below now in do_direct_IO() loop. it's timeline.

 | bio merge for 32MB. total 8,192 pages are merged.
 | total elapsed time is over 2ms.
 |------------------ ... ----------------------->|
                                                 | 8,192 pages merged a bio.
                                                 | at this time, first bio submit is done.
                                                 | 1 bio is split to 32 read request and issue.
                                                 |--------------->
                                                  |--------------->
                                                   |--------------->
                                                              ......
                                                                   |--------------->
                                                                    |--------------->|
                          total 19ms elapsed to complete 32MB read done from device. |

If bio max size is limited with 1MB, behavior is changed below.

 | bio merge for 1MB. 256 pages are merged for each bio.
 | total 32 bio will be made.
 | total elapsed time is over 2ms. it's same.
 | but, first bio submit timing is fast. about 100us.
 |--->|--->|--->|---> ... -->|--->|--->|--->|--->|
      | 256 pages merged a bio.
      | at this time, first bio submit is done.
      | and 1 read request is issued for 1 bio.
      |--------------->
           |--------------->
                |--------------->
                                      ......
                                                 |--------------->
                                                  |--------------->|
        total 17ms elapsed to complete 32MB read done from device. |

As a result, read requests are issued sooner if the bio max size is limited.
With the current kernel behavior and multipage bvec, a super large bio can be
created, which delays the issue of the first I/O request.

Signed-off-by: Changheun Lee <nanich.lee@samsung.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Link: https://lore.kernel.org/r/20210503095203.29076-1-nanich.lee@samsung.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c            | 13 +++++++++++--
 block/blk-settings.c   |  5 +++++
 include/linux/bio.h    |  4 +++-
 include/linux/blkdev.h |  2 ++
 4 files changed, 21 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/block/bio.c b/block/bio.c
index 44205dfb6b60..221dc56ba22f 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -255,6 +255,13 @@ void bio_init(struct bio *bio, struct bio_vec *table,
 }
 EXPORT_SYMBOL(bio_init);
 
+unsigned int bio_max_size(struct bio *bio)
+{
+	struct block_device *bdev = bio->bi_bdev;
+
+	return bdev ? bdev->bd_disk->queue->limits.bio_max_bytes : UINT_MAX;
+}
+
 /**
  * bio_reset - reinitialize a bio
  * @bio:	bio to reset
@@ -866,7 +873,7 @@ bool __bio_try_merge_page(struct bio *bio, struct page *page,
 		struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
 
 		if (page_is_mergeable(bv, page, len, off, same_page)) {
-			if (bio->bi_iter.bi_size > UINT_MAX - len) {
+			if (bio->bi_iter.bi_size > bio_max_size(bio) - len) {
 				*same_page = false;
 				return false;
 			}
@@ -995,6 +1002,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 {
 	unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
 	unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
+	unsigned int bytes_left = bio_max_size(bio) - bio->bi_iter.bi_size;
 	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
 	struct page **pages = (struct page **)bv;
 	bool same_page = false;
@@ -1010,7 +1018,8 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
 	pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
 
-	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
+	size = iov_iter_get_pages(iter, pages, bytes_left, nr_pages,
+				  &offset);
 	if (unlikely(size <= 0))
 		return size ? size : -EFAULT;
 
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 9c009090c4b5..c646503e55d2 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -31,6 +31,7 @@ EXPORT_SYMBOL_GPL(blk_queue_rq_timeout);
  */
 void blk_set_default_limits(struct queue_limits *lim)
 {
+	lim->bio_max_bytes = UINT_MAX;
 	lim->max_segments = BLK_MAX_SEGMENTS;
 	lim->max_discard_segments = 1;
 	lim->max_integrity_segments = 0;
@@ -139,6 +140,10 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto
 				 limits->logical_block_size >> SECTOR_SHIFT);
 	limits->max_sectors = max_sectors;
 
+	if (check_shl_overflow(max_sectors, SECTOR_SHIFT,
+				&limits->bio_max_bytes))
+		limits->bio_max_bytes = UINT_MAX;
+
 	q->backing_dev_info->io_pages = max_sectors >> (PAGE_SHIFT - 9);
 }
 EXPORT_SYMBOL(blk_queue_max_hw_sectors);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index a0b4cfdf62a4..f1a99f0a240c 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -106,6 +106,8 @@ static inline void *bio_data(struct bio *bio)
 	return NULL;
 }
 
+extern unsigned int bio_max_size(struct bio *bio);
+
 /**
  * bio_full - check if the bio is full
  * @bio:	bio to check
@@ -119,7 +121,7 @@ static inline bool bio_full(struct bio *bio, unsigned len)
 	if (bio->bi_vcnt >= bio->bi_max_vecs)
 		return true;
 
-	if (bio->bi_iter.bi_size > UINT_MAX - len)
+	if (bio->bi_iter.bi_size > bio_max_size(bio) - len)
 		return true;
 
 	return false;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b91ba6207365..40c7c4d87aa1 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -327,6 +327,8 @@ enum blk_bounce {
 };
 
 struct queue_limits {
+	unsigned int		bio_max_bytes;
+
 	enum blk_bounce		bounce;
 	unsigned long		seg_boundary_mask;
 	unsigned long		virt_boundary_mask;
-- 
cgit v1.2.3


From 48582b2e3b87b794a9845d488af2c76ce055502b Mon Sep 17 00:00:00 2001
From: Jim Quinlan <jim2101024@gmail.com>
Date: Fri, 30 Apr 2021 11:21:54 -0400
Subject: reset: add missing empty function reset_control_rearm()

All other functions are defined for when CONFIG_RESET_CONTROLLER
is not set.

Fixes: 557acb3d2cd9 ("reset: make shared pulsed reset controls re-triggerable")
Link: https://lore.kernel.org/r/20210430152156.21162-2-jim2101024@gmail.com
Signed-off-by: Jim Quinlan <jim2101024@gmail.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Cc: stable@vger.kernel.org # v5.11+
---
 include/linux/reset.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/reset.h b/include/linux/reset.h
index b9109efa2a5c..9700124affa3 100644
--- a/include/linux/reset.h
+++ b/include/linux/reset.h
@@ -47,6 +47,11 @@ static inline int reset_control_reset(struct reset_control *rstc)
 	return 0;
 }
 
+static inline int reset_control_rearm(struct reset_control *rstc)
+{
+	return 0;
+}
+
 static inline int reset_control_assert(struct reset_control *rstc)
 {
 	return 0;
-- 
cgit v1.2.3


From 43016d02cf6e46edfc4696452251d34bba0c0435 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 3 May 2021 13:51:15 +0200
Subject: netfilter: arptables: use pernet ops struct during unregister

Like with iptables and ebtables, hook unregistration has to use the
pernet ops struct, not the template.

This triggered following splat:
  hook not found, pf 3 num 0
  WARNING: CPU: 0 PID: 224 at net/netfilter/core.c:480 __nf_unregister_net_hook+0x1eb/0x610 net/netfilter/core.c:480
[..]
 nf_unregister_net_hook net/netfilter/core.c:502 [inline]
 nf_unregister_net_hooks+0x117/0x160 net/netfilter/core.c:576
 arpt_unregister_table_pre_exit+0x67/0x80 net/ipv4/netfilter/arp_tables.c:1565

Fixes: f9006acc8dfe5 ("netfilter: arp_tables: pass table pointer via nf_hook_ops")
Reported-by: syzbot+dcccba8a1e41a38cb9df@syzkaller.appspotmail.com
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_arp/arp_tables.h | 3 +--
 net/ipv4/netfilter/arp_tables.c          | 5 ++---
 net/ipv4/netfilter/arptable_filter.c     | 2 +-
 3 files changed, 4 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_arp/arp_tables.h b/include/linux/netfilter_arp/arp_tables.h
index 2aab9612f6ab..4f9a4b3c5892 100644
--- a/include/linux/netfilter_arp/arp_tables.h
+++ b/include/linux/netfilter_arp/arp_tables.h
@@ -53,8 +53,7 @@ int arpt_register_table(struct net *net, const struct xt_table *table,
 			const struct arpt_replace *repl,
 			const struct nf_hook_ops *ops);
 void arpt_unregister_table(struct net *net, const char *name);
-void arpt_unregister_table_pre_exit(struct net *net, const char *name,
-				    const struct nf_hook_ops *ops);
+void arpt_unregister_table_pre_exit(struct net *net, const char *name);
 extern unsigned int arpt_do_table(struct sk_buff *skb,
 				  const struct nf_hook_state *state,
 				  struct xt_table *table);
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index cf20316094d0..c53f14b94356 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -1556,13 +1556,12 @@ out_free:
 	return ret;
 }
 
-void arpt_unregister_table_pre_exit(struct net *net, const char *name,
-				    const struct nf_hook_ops *ops)
+void arpt_unregister_table_pre_exit(struct net *net, const char *name)
 {
 	struct xt_table *table = xt_find_table(net, NFPROTO_ARP, name);
 
 	if (table)
-		nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
+		nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks));
 }
 EXPORT_SYMBOL(arpt_unregister_table_pre_exit);
 
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index b8f45e9bbec8..6922612df456 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -54,7 +54,7 @@ static int __net_init arptable_filter_table_init(struct net *net)
 
 static void __net_exit arptable_filter_net_pre_exit(struct net *net)
 {
-	arpt_unregister_table_pre_exit(net, "filter", arpfilter_ops);
+	arpt_unregister_table_pre_exit(net, "filter");
 }
 
 static void __net_exit arptable_filter_net_exit(struct net *net)
-- 
cgit v1.2.3


From d7bce85aa7b92b5de8f69b3bcedfe51d7b1aabe1 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Tue, 4 May 2021 04:17:20 -0400
Subject: virtio_pci_modern: correct sparse tags for notify

When switching virtio_pci_modern to use a helper for mappings we lost an
__iomem tag. Restore it.

Reported-by: kernel test robot <lkp@intel.com>
Fixes: 9e3bb9b79a71 ("virtio_pci_modern: introduce helper to map vq notify area")
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/virtio/virtio_pci_modern_dev.c | 9 ++++-----
 include/linux/virtio_pci_modern.h      | 4 ++--
 2 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/virtio/virtio_pci_modern_dev.c b/drivers/virtio/virtio_pci_modern_dev.c
index ae87b3fa8858..54f297028586 100644
--- a/drivers/virtio/virtio_pci_modern_dev.c
+++ b/drivers/virtio/virtio_pci_modern_dev.c
@@ -605,8 +605,8 @@ static u16 vp_modern_get_queue_notify_off(struct virtio_pci_modern_device *mdev,
  *
  * Returns the address of the notification area
  */
-void *vp_modern_map_vq_notify(struct virtio_pci_modern_device *mdev,
-			      u16 index, resource_size_t *pa)
+void __iomem *vp_modern_map_vq_notify(struct virtio_pci_modern_device *mdev,
+				      u16 index, resource_size_t *pa)
 {
 	u16 off = vp_modern_get_queue_notify_off(mdev, index);
 
@@ -624,10 +624,9 @@ void *vp_modern_map_vq_notify(struct virtio_pci_modern_device *mdev,
 		if (pa)
 			*pa = mdev->notify_pa +
 			      off * mdev->notify_offset_multiplier;
-		return (void __force *)mdev->notify_base +
-			off * mdev->notify_offset_multiplier;
+		return mdev->notify_base + off * mdev->notify_offset_multiplier;
 	} else {
-		return (void __force *)vp_modern_map_capability(mdev,
+		return vp_modern_map_capability(mdev,
 				       mdev->notify_map_cap, 2, 2,
 				       off * mdev->notify_offset_multiplier, 2,
 				       NULL, pa);
diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h
index cdfabbefacdf..6a95b58fd0f4 100644
--- a/include/linux/virtio_pci_modern.h
+++ b/include/linux/virtio_pci_modern.h
@@ -101,8 +101,8 @@ void vp_modern_set_queue_size(struct virtio_pci_modern_device *mdev,
 u16 vp_modern_get_queue_size(struct virtio_pci_modern_device *mdev,
 			     u16 idx);
 u16 vp_modern_get_num_queues(struct virtio_pci_modern_device *mdev);
-void *vp_modern_map_vq_notify(struct virtio_pci_modern_device *mdev,
-                              u16 index, resource_size_t *pa);
+void __iomem * vp_modern_map_vq_notify(struct virtio_pci_modern_device *mdev,
+				       u16 index, resource_size_t *pa);
 int vp_modern_probe(struct virtio_pci_modern_device *mdev);
 void vp_modern_remove(struct virtio_pci_modern_device *mdev);
 #endif
-- 
cgit v1.2.3


From 6e552494fb90acae005d74ce6a2ee102d965184b Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Tue, 4 May 2021 08:54:29 -0700
Subject: iomap: remove unused private field from ioend

The only remaining user of ->io_private is the generic ioend merging
infrastructure. The only user of that is XFS, which no longer sets
->io_private or passes an associated merge callback. Remove the
unused parameter and the ->io_private field.

CC: linux-fsdevel@vger.kernel.org
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/iomap/buffered-io.c | 7 +------
 fs/xfs/xfs_aops.c      | 2 +-
 include/linux/iomap.h  | 5 +----
 3 files changed, 3 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 0129e6bab985..f2cd2034a87b 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1134,9 +1134,7 @@ iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
 }
 
 void
-iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends,
-		void (*merge_private)(struct iomap_ioend *ioend,
-				struct iomap_ioend *next))
+iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends)
 {
 	struct iomap_ioend *next;
 
@@ -1148,8 +1146,6 @@ iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends,
 			break;
 		list_move_tail(&next->io_list, &ioend->io_list);
 		ioend->io_size += next->io_size;
-		if (next->io_private && merge_private)
-			merge_private(ioend, next);
 	}
 }
 EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);
@@ -1236,7 +1232,6 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
 	ioend->io_inode = inode;
 	ioend->io_size = 0;
 	ioend->io_offset = offset;
-	ioend->io_private = NULL;
 	ioend->io_bio = bio;
 	return ioend;
 }
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 9b08db45ce85..826caa6b4a5a 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -146,7 +146,7 @@ xfs_end_io(
 	while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend,
 			io_list))) {
 		list_del_init(&ioend->io_list);
-		iomap_ioend_try_merge(ioend, &tmp, NULL);
+		iomap_ioend_try_merge(ioend, &tmp);
 		xfs_end_ioend(ioend);
 	}
 }
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index d202fd2d0f91..c87d0cb0de6d 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -198,7 +198,6 @@ struct iomap_ioend {
 	struct inode		*io_inode;	/* file being written to */
 	size_t			io_size;	/* size of the extent */
 	loff_t			io_offset;	/* offset in the file */
-	void			*io_private;	/* file system private data */
 	struct bio		*io_bio;	/* bio being built */
 	struct bio		io_inline_bio;	/* MUST BE LAST! */
 };
@@ -234,9 +233,7 @@ struct iomap_writepage_ctx {
 
 void iomap_finish_ioends(struct iomap_ioend *ioend, int error);
 void iomap_ioend_try_merge(struct iomap_ioend *ioend,
-		struct list_head *more_ioends,
-		void (*merge_private)(struct iomap_ioend *ioend,
-				struct iomap_ioend *next));
+		struct list_head *more_ioends);
 void iomap_sort_ioends(struct list_head *ioend_list);
 int iomap_writepage(struct page *page, struct writeback_control *wbc,
 		struct iomap_writepage_ctx *wpc,
-- 
cgit v1.2.3


From 98635b29a73f1a49ab6882ae58d56c9cd5ecb902 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bgolaszewski@baylibre.com>
Date: Mon, 15 Mar 2021 10:13:54 +0100
Subject: lib: bitmap: remove the 'extern' keyword from function declarations

The 'extern' keyword doesn't have any benefits for functions in header
files. Remove it.

Signed-off-by: Bartosz Golaszewski <bgolaszewski@baylibre.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 include/linux/bitmap.h | 115 ++++++++++++++++++++++++-------------------------
 1 file changed, 57 insertions(+), 58 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 70a932470b2d..6939a8983026 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -118,54 +118,53 @@
  * Allocation and deallocation of bitmap.
  * Provided in lib/bitmap.c to avoid circular dependency.
  */
-extern unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags);
-extern unsigned long *bitmap_zalloc(unsigned int nbits, gfp_t flags);
-extern void bitmap_free(const unsigned long *bitmap);
+unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags);
+unsigned long *bitmap_zalloc(unsigned int nbits, gfp_t flags);
+void bitmap_free(const unsigned long *bitmap);
 
 /*
  * lib/bitmap.c provides these functions:
  */
 
-extern int __bitmap_equal(const unsigned long *bitmap1,
-			  const unsigned long *bitmap2, unsigned int nbits);
-extern bool __pure __bitmap_or_equal(const unsigned long *src1,
-				     const unsigned long *src2,
-				     const unsigned long *src3,
-				     unsigned int nbits);
-extern void __bitmap_complement(unsigned long *dst, const unsigned long *src,
-			unsigned int nbits);
-extern void __bitmap_shift_right(unsigned long *dst, const unsigned long *src,
-				unsigned int shift, unsigned int nbits);
-extern void __bitmap_shift_left(unsigned long *dst, const unsigned long *src,
-				unsigned int shift, unsigned int nbits);
-extern void bitmap_cut(unsigned long *dst, const unsigned long *src,
-		       unsigned int first, unsigned int cut,
-		       unsigned int nbits);
-extern int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
+int __bitmap_equal(const unsigned long *bitmap1,
+		   const unsigned long *bitmap2, unsigned int nbits);
+bool __pure __bitmap_or_equal(const unsigned long *src1,
+			      const unsigned long *src2,
+			      const unsigned long *src3,
+			      unsigned int nbits);
+void __bitmap_complement(unsigned long *dst, const unsigned long *src,
+			 unsigned int nbits);
+void __bitmap_shift_right(unsigned long *dst, const unsigned long *src,
+			  unsigned int shift, unsigned int nbits);
+void __bitmap_shift_left(unsigned long *dst, const unsigned long *src,
+			 unsigned int shift, unsigned int nbits);
+void bitmap_cut(unsigned long *dst, const unsigned long *src,
+		unsigned int first, unsigned int cut, unsigned int nbits);
+int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
+		 const unsigned long *bitmap2, unsigned int nbits);
+void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
+		 const unsigned long *bitmap2, unsigned int nbits);
+void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1,
+		  const unsigned long *bitmap2, unsigned int nbits);
+int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
+		    const unsigned long *bitmap2, unsigned int nbits);
+void __bitmap_replace(unsigned long *dst,
+		      const unsigned long *old, const unsigned long *new,
+		      const unsigned long *mask, unsigned int nbits);
+int __bitmap_intersects(const unsigned long *bitmap1,
 			const unsigned long *bitmap2, unsigned int nbits);
-extern void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
-			const unsigned long *bitmap2, unsigned int nbits);
-extern void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1,
-			const unsigned long *bitmap2, unsigned int nbits);
-extern int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
-			const unsigned long *bitmap2, unsigned int nbits);
-extern void __bitmap_replace(unsigned long *dst,
-			const unsigned long *old, const unsigned long *new,
-			const unsigned long *mask, unsigned int nbits);
-extern int __bitmap_intersects(const unsigned long *bitmap1,
-			const unsigned long *bitmap2, unsigned int nbits);
-extern int __bitmap_subset(const unsigned long *bitmap1,
-			const unsigned long *bitmap2, unsigned int nbits);
-extern int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits);
-extern void __bitmap_set(unsigned long *map, unsigned int start, int len);
-extern void __bitmap_clear(unsigned long *map, unsigned int start, int len);
-
-extern unsigned long bitmap_find_next_zero_area_off(unsigned long *map,
-						    unsigned long size,
-						    unsigned long start,
-						    unsigned int nr,
-						    unsigned long align_mask,
-						    unsigned long align_offset);
+int __bitmap_subset(const unsigned long *bitmap1,
+		    const unsigned long *bitmap2, unsigned int nbits);
+int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits);
+void __bitmap_set(unsigned long *map, unsigned int start, int len);
+void __bitmap_clear(unsigned long *map, unsigned int start, int len);
+
+unsigned long bitmap_find_next_zero_area_off(unsigned long *map,
+					     unsigned long size,
+					     unsigned long start,
+					     unsigned int nr,
+					     unsigned long align_mask,
+					     unsigned long align_offset);
 
 /**
  * bitmap_find_next_zero_area - find a contiguous aligned zero area
@@ -190,33 +189,33 @@ bitmap_find_next_zero_area(unsigned long *map,
 					      align_mask, 0);
 }
 
-extern int bitmap_parse(const char *buf, unsigned int buflen,
+int bitmap_parse(const char *buf, unsigned int buflen,
 			unsigned long *dst, int nbits);
-extern int bitmap_parse_user(const char __user *ubuf, unsigned int ulen,
+int bitmap_parse_user(const char __user *ubuf, unsigned int ulen,
 			unsigned long *dst, int nbits);
-extern int bitmap_parselist(const char *buf, unsigned long *maskp,
+int bitmap_parselist(const char *buf, unsigned long *maskp,
 			int nmaskbits);
-extern int bitmap_parselist_user(const char __user *ubuf, unsigned int ulen,
+int bitmap_parselist_user(const char __user *ubuf, unsigned int ulen,
 			unsigned long *dst, int nbits);
-extern void bitmap_remap(unsigned long *dst, const unsigned long *src,
+void bitmap_remap(unsigned long *dst, const unsigned long *src,
 		const unsigned long *old, const unsigned long *new, unsigned int nbits);
-extern int bitmap_bitremap(int oldbit,
+int bitmap_bitremap(int oldbit,
 		const unsigned long *old, const unsigned long *new, int bits);
-extern void bitmap_onto(unsigned long *dst, const unsigned long *orig,
+void bitmap_onto(unsigned long *dst, const unsigned long *orig,
 		const unsigned long *relmap, unsigned int bits);
-extern void bitmap_fold(unsigned long *dst, const unsigned long *orig,
+void bitmap_fold(unsigned long *dst, const unsigned long *orig,
 		unsigned int sz, unsigned int nbits);
-extern int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order);
-extern void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order);
-extern int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order);
+int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order);
+void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order);
+int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order);
 
 #ifdef __BIG_ENDIAN
-extern void bitmap_copy_le(unsigned long *dst, const unsigned long *src, unsigned int nbits);
+void bitmap_copy_le(unsigned long *dst, const unsigned long *src, unsigned int nbits);
 #else
 #define bitmap_copy_le bitmap_copy
 #endif
-extern unsigned int bitmap_ord_to_pos(const unsigned long *bitmap, unsigned int ord, unsigned int nbits);
-extern int bitmap_print_to_pagebuf(bool list, char *buf,
+unsigned int bitmap_ord_to_pos(const unsigned long *bitmap, unsigned int ord, unsigned int nbits);
+int bitmap_print_to_pagebuf(bool list, char *buf,
 				   const unsigned long *maskp, int nmaskbits);
 
 #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1)))
@@ -265,9 +264,9 @@ static inline void bitmap_copy_clear_tail(unsigned long *dst,
  * therefore conversion is not needed when copying data from/to arrays of u32.
  */
 #if BITS_PER_LONG == 64
-extern void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf,
+void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf,
 							unsigned int nbits);
-extern void bitmap_to_arr32(u32 *buf, const unsigned long *bitmap,
+void bitmap_to_arr32(u32 *buf, const unsigned long *bitmap,
 							unsigned int nbits);
 #else
 #define bitmap_from_arr32(bitmap, buf, nbits)			\
-- 
cgit v1.2.3


From c13656b904b6173aad723d9680a81c60de2f5edc Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bgolaszewski@baylibre.com>
Date: Mon, 15 Mar 2021 10:13:55 +0100
Subject: lib: bitmap: order includes alphabetically

For better readability and maintenance: order the includes in bitmap
source files alphabetically.

Signed-off-by: Bartosz Golaszewski <bgolaszewski@baylibre.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 include/linux/bitmap.h | 4 ++--
 lib/bitmap.c           | 9 +++++----
 2 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 6939a8983026..3282db97e06c 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -4,10 +4,10 @@
 
 #ifndef __ASSEMBLY__
 
-#include <linux/types.h>
 #include <linux/bitops.h>
-#include <linux/string.h>
 #include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/types.h>
 
 /*
  * bitmaps provide bit arrays that consume one or more unsigned
diff --git a/lib/bitmap.c b/lib/bitmap.c
index 75006c4036e9..78f70d9007ad 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -3,17 +3,18 @@
  * lib/bitmap.c
  * Helper functions for bitmap.h.
  */
-#include <linux/export.h>
-#include <linux/thread_info.h>
-#include <linux/ctype.h>
-#include <linux/errno.h>
+
 #include <linux/bitmap.h>
 #include <linux/bitops.h>
 #include <linux/bug.h>
+#include <linux/ctype.h>
+#include <linux/errno.h>
+#include <linux/export.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/string.h>
+#include <linux/thread_info.h>
 #include <linux/uaccess.h>
 
 #include <asm/page.h>
-- 
cgit v1.2.3


From e829c2e4744850bab4d8f8ffebd00df10b4c6c2b Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bgolaszewski@baylibre.com>
Date: Mon, 15 Mar 2021 10:13:56 +0100
Subject: lib: bitmap: provide devm_bitmap_alloc() and devm_bitmap_zalloc()

Provide managed variants of bitmap_alloc() and bitmap_zalloc().

Signed-off-by: Bartosz Golaszewski <bgolaszewski@baylibre.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 include/linux/bitmap.h |  8 ++++++++
 lib/bitmap.c           | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 3282db97e06c..73d039476fa4 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -9,6 +9,8 @@
 #include <linux/string.h>
 #include <linux/types.h>
 
+struct device;
+
 /*
  * bitmaps provide bit arrays that consume one or more unsigned
  * longs.  The bitmap interface and available operations are listed
@@ -122,6 +124,12 @@ unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags);
 unsigned long *bitmap_zalloc(unsigned int nbits, gfp_t flags);
 void bitmap_free(const unsigned long *bitmap);
 
+/* Managed variants of the above. */
+unsigned long *devm_bitmap_alloc(struct device *dev,
+				 unsigned int nbits, gfp_t flags);
+unsigned long *devm_bitmap_zalloc(struct device *dev,
+				  unsigned int nbits, gfp_t flags);
+
 /*
  * lib/bitmap.c provides these functions:
  */
diff --git a/lib/bitmap.c b/lib/bitmap.c
index 78f70d9007ad..27e08c0e547e 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -8,6 +8,7 @@
 #include <linux/bitops.h>
 #include <linux/bug.h>
 #include <linux/ctype.h>
+#include <linux/device.h>
 #include <linux/errno.h>
 #include <linux/export.h>
 #include <linux/kernel.h>
@@ -1263,6 +1264,38 @@ void bitmap_free(const unsigned long *bitmap)
 }
 EXPORT_SYMBOL(bitmap_free);
 
+static void devm_bitmap_free(void *data)
+{
+	unsigned long *bitmap = data;
+
+	bitmap_free(bitmap);
+}
+
+unsigned long *devm_bitmap_alloc(struct device *dev,
+				 unsigned int nbits, gfp_t flags)
+{
+	unsigned long *bitmap;
+	int ret;
+
+	bitmap = bitmap_alloc(nbits, flags);
+	if (!bitmap)
+		return NULL;
+
+	ret = devm_add_action_or_reset(dev, devm_bitmap_free, bitmap);
+	if (ret)
+		return NULL;
+
+	return bitmap;
+}
+EXPORT_SYMBOL_GPL(devm_bitmap_alloc);
+
+unsigned long *devm_bitmap_zalloc(struct device *dev,
+				  unsigned int nbits, gfp_t flags)
+{
+	return devm_bitmap_alloc(dev, nbits, flags | __GFP_ZERO);
+}
+EXPORT_SYMBOL_GPL(devm_bitmap_zalloc);
+
 #if BITS_PER_LONG == 64
 /**
  * bitmap_from_arr32 - copy the contents of u32 array of bits to bitmap
-- 
cgit v1.2.3


From 7716506adac4664793a9d6d3dfa31ffddfa98714 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 4 May 2021 18:32:45 -0700
Subject: mm: introduce and use mapping_empty()

Patch series "Remove nrexceptional tracking", v2.

We actually use nrexceptional for very little these days.  It's a minor
pain to keep in sync with nrpages, but the pain becomes much bigger with
the THP patches because we don't know how many indices a shadow entry
occupies.  It's easier to just remove it than keep it accurate.

Also, we save 8 bytes per inode which is nothing to sneeze at; on my
laptop, it would improve shmem_inode_cache from 22 to 23 objects per
16kB, and inode_cache from 26 to 27 objects.  Combined, that saves
a megabyte of memory from a combined usage of 25MB for both caches.
Unfortunately, ext4 doesn't cross a magic boundary, so it doesn't save
any memory for ext4.

This patch (of 4):

Instead of checking the two counters (nrpages and nrexceptional), we can
just check whether i_pages is empty.

Link: https://lkml.kernel.org/r/20201026151849.24232-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20201026151849.24232-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Tested-by: Vishal Verma <vishal.l.verma@intel.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/block_dev.c          |  2 +-
 fs/dax.c                |  2 +-
 fs/gfs2/glock.c         |  3 +--
 include/linux/pagemap.h |  5 +++++
 mm/truncate.c           | 18 +++---------------
 5 files changed, 11 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index a5244e08b6c8..9114e0a0e7b4 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -79,7 +79,7 @@ static void kill_bdev(struct block_device *bdev)
 {
 	struct address_space *mapping = bdev->bd_inode->i_mapping;
 
-	if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
+	if (mapping_empty(mapping))
 		return;
 
 	invalidate_bh_lrus();
diff --git a/fs/dax.c b/fs/dax.c
index b3d27fdc6775..999f3f22aea3 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -965,7 +965,7 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
 		return -EIO;
 
-	if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
+	if (mapping_empty(mapping) || wbc->sync_mode != WB_SYNC_ALL)
 		return 0;
 
 	trace_dax_writeback_range(inode, xas.xa_index, end_index);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 84c38103aa06..ea7fc5c641c7 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -273,8 +273,7 @@ static void __gfs2_glock_put(struct gfs2_glock *gl)
 	if (mapping) {
 		truncate_inode_pages_final(mapping);
 		if (!gfs2_withdrawn(sdp))
-			GLOCK_BUG_ON(gl, mapping->nrpages ||
-				     mapping->nrexceptional);
+			GLOCK_BUG_ON(gl, !mapping_empty(mapping));
 	}
 	trace_gfs2_glock_put(gl);
 	sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 469fa7ffcf96..a4bd41128bf3 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -18,6 +18,11 @@
 
 struct pagevec;
 
+static inline bool mapping_empty(struct address_space *mapping)
+{
+	return xa_empty(&mapping->i_pages);
+}
+
 /*
  * Bits in mapping->flags.
  */
diff --git a/mm/truncate.c b/mm/truncate.c
index 455944264663..adb8d4107988 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -295,7 +295,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 	pgoff_t		index;
 	int		i;
 
-	if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
+	if (mapping_empty(mapping))
 		goto out;
 
 	/* Offsets within partial pages */
@@ -440,9 +440,6 @@ EXPORT_SYMBOL(truncate_inode_pages);
  */
 void truncate_inode_pages_final(struct address_space *mapping)
 {
-	unsigned long nrexceptional;
-	unsigned long nrpages;
-
 	/*
 	 * Page reclaim can not participate in regular inode lifetime
 	 * management (can't call iput()) and thus can race with the
@@ -452,16 +449,7 @@ void truncate_inode_pages_final(struct address_space *mapping)
 	 */
 	mapping_set_exiting(mapping);
 
-	/*
-	 * When reclaim installs eviction entries, it increases
-	 * nrexceptional first, then decreases nrpages.  Make sure we see
-	 * this in the right order or we might miss an entry.
-	 */
-	nrpages = mapping->nrpages;
-	smp_rmb();
-	nrexceptional = mapping->nrexceptional;
-
-	if (nrpages || nrexceptional) {
+	if (!mapping_empty(mapping)) {
 		/*
 		 * As truncation uses a lockless tree lookup, cycle
 		 * the tree lock to make sure any ongoing tree
@@ -633,7 +621,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 	int ret2 = 0;
 	int did_range_unmap = 0;
 
-	if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
+	if (mapping_empty(mapping))
 		goto out;
 
 	pagevec_init(&pvec);
-- 
cgit v1.2.3


From 8bc3c481b3d0dcef2cf8e1b7c6b780af6725f7e3 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 4 May 2021 18:32:54 -0700
Subject: mm: remove nrexceptional from inode

We no longer track anything in nrexceptional, so remove it, saving 8 bytes
per inode.

Link: https://lkml.kernel.org/r/20201026151849.24232-5-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Tested-by: Vishal Verma <vishal.l.verma@intel.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/inode.c         | 2 +-
 include/linux/fs.h | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index 9e192bea0630..af48d1b722f0 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -529,7 +529,7 @@ void clear_inode(struct inode *inode)
 	 */
 	xa_lock_irq(&inode->i_data.i_pages);
 	BUG_ON(inode->i_data.nrpages);
-	BUG_ON(inode->i_data.nrexceptional);
+	BUG_ON(!mapping_empty(&inode->i_data));
 	xa_unlock_irq(&inode->i_data.i_pages);
 	BUG_ON(!list_empty(&inode->i_data.private_list));
 	BUG_ON(!(inode->i_state & I_FREEING));
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 12766edee81f..acef282b97c6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -442,7 +442,6 @@ int pagecache_write_end(struct file *, struct address_space *mapping,
  * @i_mmap: Tree of private and shared mappings.
  * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable.
  * @nrpages: Number of page entries, protected by the i_pages lock.
- * @nrexceptional: Shadow or DAX entries, protected by the i_pages lock.
  * @writeback_index: Writeback starts here.
  * @a_ops: Methods.
  * @flags: Error bits and flags (AS_*).
@@ -463,7 +462,6 @@ struct address_space {
 	struct rb_root_cached	i_mmap;
 	struct rw_semaphore	i_mmap_rwsem;
 	unsigned long		nrpages;
-	unsigned long		nrexceptional;
 	pgoff_t			writeback_index;
 	const struct address_space_operations *a_ops;
 	unsigned long		flags;
-- 
cgit v1.2.3


From aec44e0f0213e36d4f0868a80cdc5097a510f79d Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Tue, 4 May 2021 18:33:00 -0700
Subject: hugetlb: pass vma into huge_pte_alloc() and huge_pmd_share()

Patch series "hugetlb: Disable huge pmd unshare for uffd-wp", v4.

This series tries to disable huge pmd unshare of hugetlbfs backed memory
for uffd-wp.  Although uffd-wp of hugetlbfs is still at the RFC stage,
the idea of this series may be needed for multiple tasks (Axel's uffd
minor fault series, and Mike's soft dirty series), so I picked it out
from the larger series.

This patch (of 4):

This is preparatory work so that the per-architecture huge_pte_alloc()
can behave differently according to different VMA attributes.

Pass it deeper into huge_pmd_share() so that we can avoid the find_vma() call.

[peterx@redhat.com: build fix]
  Link: https://lkml.kernel.org/r/20210304164653.GB397383@xz-x1
Link: https://lkml.kernel.org/r/20210218230633.15028-1-peterx@redhat.com

Link: https://lkml.kernel.org/r/20210218230633.15028-2-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Suggested-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Adam Ruprecht <ruprecht@google.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Cannon Matthews <cannonmatthews@google.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chinwen Chang <chinwen.chang@mediatek.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Dr . David Alan Gilbert" <dgilbert@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Lokesh Gidra <lokeshgidra@google.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michal Koutný" <mkoutny@suse.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Mina Almasry <almasrymina@google.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oliver Upton <oupton@google.com>
Cc: Shaohua Li <shli@fb.com>
Cc: Shawn Anastasio <shawn@anastas.io>
Cc: Steven Price <steven.price@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm64/mm/hugetlbpage.c   |  4 ++--
 arch/ia64/mm/hugetlbpage.c    |  3 ++-
 arch/mips/mm/hugetlbpage.c    |  4 ++--
 arch/parisc/mm/hugetlbpage.c  |  2 +-
 arch/powerpc/mm/hugetlbpage.c |  3 ++-
 arch/s390/mm/hugetlbpage.c    |  2 +-
 arch/sh/mm/hugetlbpage.c      |  2 +-
 arch/sparc/mm/hugetlbpage.c   |  2 +-
 include/linux/hugetlb.h       |  5 +++--
 mm/hugetlb.c                  | 15 ++++++++-------
 mm/userfaultfd.c              |  2 +-
 11 files changed, 24 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 55ecf6de9ff7..6e3bcffe2837 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -252,7 +252,7 @@ void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
 		set_pte(ptep, pte);
 }
 
-pte_t *huge_pte_alloc(struct mm_struct *mm,
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
 		      unsigned long addr, unsigned long sz)
 {
 	pgd_t *pgdp;
@@ -286,7 +286,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
 	} else if (sz == PMD_SIZE) {
 		if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) &&
 		    pud_none(READ_ONCE(*pudp)))
-			ptep = huge_pmd_share(mm, addr, pudp);
+			ptep = huge_pmd_share(mm, vma, addr, pudp);
 		else
 			ptep = (pte_t *)pmd_alloc(mm, pudp, addr);
 	} else if (sz == (CONT_PMD_SIZE)) {
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index b331f94d20ac..f993cb36c062 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -25,7 +25,8 @@ unsigned int hpage_shift = HPAGE_SHIFT_DEFAULT;
 EXPORT_SYMBOL(hpage_shift);
 
 pte_t *
-huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
+huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
+	       unsigned long addr, unsigned long sz)
 {
 	unsigned long taddr = htlbpage_to_page(addr);
 	pgd_t *pgd;
diff --git a/arch/mips/mm/hugetlbpage.c b/arch/mips/mm/hugetlbpage.c
index b9f76f433617..7eaff5b07873 100644
--- a/arch/mips/mm/hugetlbpage.c
+++ b/arch/mips/mm/hugetlbpage.c
@@ -21,8 +21,8 @@
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 
-pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr,
-		      unsigned long sz)
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
+		      unsigned long addr, unsigned long sz)
 {
 	pgd_t *pgd;
 	p4d_t *p4d;
diff --git a/arch/parisc/mm/hugetlbpage.c b/arch/parisc/mm/hugetlbpage.c
index 43652de5f139..d1d3990b83f6 100644
--- a/arch/parisc/mm/hugetlbpage.c
+++ b/arch/parisc/mm/hugetlbpage.c
@@ -44,7 +44,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 }
 
 
-pte_t *huge_pte_alloc(struct mm_struct *mm,
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long addr, unsigned long sz)
 {
 	pgd_t *pgd;
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index d142b76d507d..9a75ba078e1b 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -106,7 +106,8 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
  * At this point we do the placement change only for BOOK3S 64. This would
  * possibly work on other subarchs.
  */
-pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
+		      unsigned long addr, unsigned long sz)
 {
 	pgd_t *pg;
 	p4d_t *p4;
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index 3b5a4d25ca9b..da36d13ffc16 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -189,7 +189,7 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
 	return pte;
 }
 
-pte_t *huge_pte_alloc(struct mm_struct *mm,
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long addr, unsigned long sz)
 {
 	pgd_t *pgdp;
diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c
index 220d7bc43d2b..999ab5916e69 100644
--- a/arch/sh/mm/hugetlbpage.c
+++ b/arch/sh/mm/hugetlbpage.c
@@ -21,7 +21,7 @@
 #include <asm/tlbflush.h>
 #include <asm/cacheflush.h>
 
-pte_t *huge_pte_alloc(struct mm_struct *mm,
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long addr, unsigned long sz)
 {
 	pgd_t *pgd;
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c
index ad4b42f04988..04d8790f6c32 100644
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -279,7 +279,7 @@ unsigned long pud_leaf_size(pud_t pud) { return 1UL << tte_to_shift(*(pte_t *)&p
 unsigned long pmd_leaf_size(pmd_t pmd) { return 1UL << tte_to_shift(*(pte_t *)&pmd); }
 unsigned long pte_leaf_size(pte_t pte) { return 1UL << tte_to_shift(pte); }
 
-pte_t *huge_pte_alloc(struct mm_struct *mm,
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long addr, unsigned long sz)
 {
 	pgd_t *pgd;
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index cccd1aab69dd..653ef322fac9 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -152,7 +152,8 @@ void hugetlb_fix_reserve_counts(struct inode *inode);
 extern struct mutex *hugetlb_fault_mutex_table;
 u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx);
 
-pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud);
+pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
+		      unsigned long addr, pud_t *pud);
 
 struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage);
 
@@ -161,7 +162,7 @@ extern struct list_head huge_boot_pages;
 
 /* arch callbacks */
 
-pte_t *huge_pte_alloc(struct mm_struct *mm,
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long addr, unsigned long sz);
 pte_t *huge_pte_offset(struct mm_struct *mm,
 		       unsigned long addr, unsigned long sz);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6c72433bec1e..a02a651088d3 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3795,7 +3795,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 		src_pte = huge_pte_offset(src, addr, sz);
 		if (!src_pte)
 			continue;
-		dst_pte = huge_pte_alloc(dst, addr, sz);
+		dst_pte = huge_pte_alloc(dst, vma, addr, sz);
 		if (!dst_pte) {
 			ret = -ENOMEM;
 			break;
@@ -4563,7 +4563,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	 */
 	mapping = vma->vm_file->f_mapping;
 	i_mmap_lock_read(mapping);
-	ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
+	ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
 	if (!ptep) {
 		i_mmap_unlock_read(mapping);
 		return VM_FAULT_OOM;
@@ -5370,9 +5370,9 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
  * if !vma_shareable check at the beginning of the routine. i_mmap_rwsem is
  * only required for subsequent processing.
  */
-pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
+		      unsigned long addr, pud_t *pud)
 {
-	struct vm_area_struct *vma = find_vma(mm, addr);
 	struct address_space *mapping = vma->vm_file->f_mapping;
 	pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
 			vma->vm_pgoff;
@@ -5450,7 +5450,8 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
 }
 #define want_pmd_share()	(1)
 #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
-pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
+		      unsigned long addr, pud_t *pud)
 {
 	return NULL;
 }
@@ -5469,7 +5470,7 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
 #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
 
 #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
-pte_t *huge_pte_alloc(struct mm_struct *mm,
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long addr, unsigned long sz)
 {
 	pgd_t *pgd;
@@ -5488,7 +5489,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
 		} else {
 			BUG_ON(sz != PMD_SIZE);
 			if (want_pmd_share() && pud_none(*pud))
-				pte = huge_pmd_share(mm, addr, pud);
+				pte = huge_pmd_share(mm, vma, addr, pud);
 			else
 				pte = (pte_t *)pmd_alloc(mm, pud, addr);
 		}
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 9a3d451402d7..063cbb17e8d8 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -290,7 +290,7 @@ retry:
 		mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
 		err = -ENOMEM;
-		dst_pte = huge_pte_alloc(dst_mm, dst_addr, vma_hpagesize);
+		dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize);
 		if (!dst_pte) {
 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 			i_mmap_unlock_read(mapping);
-- 
cgit v1.2.3


From c1991e0705d143be773c984b006f2078aa9f2853 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Tue, 4 May 2021 18:33:04 -0700
Subject: hugetlb/userfaultfd: forbid huge pmd sharing when uffd enabled

Huge pmd sharing can cause problems for userfaultfd.  Userfaultfd runs
its logic based on special bits in page table entries, but huge pmd
sharing can potentially share page table entries between different
address ranges.  That can cause issues in either of the following
cases:

 - When sharing huge pmd page tables for an uffd write protected range,
   the newly mapped huge pmd range will also be write protected
   unexpectedly, or,

 - When we try to write protect a range of huge pmd shared range, we'll
   first do huge_pmd_unshare() in hugetlb_change_protection(), however
   that also means the UFFDIO_WRITEPROTECT could be silently skipped for
   the shared region, which could lead to data loss.

While at it, a few other things are done altogether:

 - Move want_pmd_share() from mm/hugetlb.c into linux/hugetlb.h, because
   that's definitely something that arch code would like to use too

 - ARM64 currently checks directly against
   CONFIG_ARCH_WANT_HUGE_PMD_SHARE when trying to share huge pmd. Switch
   to the want_pmd_share() helper.

 - Move vma_shareable() from huge_pmd_share() into want_pmd_share().

[peterx@redhat.com: fix build with !ARCH_WANT_HUGE_PMD_SHARE]
  Link: https://lkml.kernel.org/r/20210310185359.88297-1-peterx@redhat.com

Link: https://lkml.kernel.org/r/20210218231202.15426-1-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Axel Rasmussen <axelrasmussen@google.com>
Tested-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Cc: Adam Ruprecht <ruprecht@google.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Cannon Matthews <cannonmatthews@google.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chinwen Chang <chinwen.chang@mediatek.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Dr . David Alan Gilbert" <dgilbert@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Lokesh Gidra <lokeshgidra@google.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michal Koutný" <mkoutny@suse.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Mina Almasry <almasrymina@google.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oliver Upton <oupton@google.com>
Cc: Shaohua Li <shli@fb.com>
Cc: Shawn Anastasio <shawn@anastas.io>
Cc: Steven Price <steven.price@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm64/mm/hugetlbpage.c   |  3 +--
 include/linux/hugetlb.h       |  2 ++
 include/linux/userfaultfd_k.h |  9 +++++++++
 mm/hugetlb.c                  | 22 ++++++++++++++++------
 4 files changed, 28 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 6e3bcffe2837..58987a98e179 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -284,8 +284,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
 		 */
 		ptep = pte_alloc_map(mm, pmdp, addr);
 	} else if (sz == PMD_SIZE) {
-		if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) &&
-		    pud_none(READ_ONCE(*pudp)))
+		if (want_pmd_share(vma, addr) && pud_none(READ_ONCE(*pudp)))
 			ptep = huge_pmd_share(mm, vma, addr, pudp);
 		else
 			ptep = (pte_t *)pmd_alloc(mm, pudp, addr);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 653ef322fac9..88e93809a455 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -1040,4 +1040,6 @@ static inline __init void hugetlb_cma_check(void)
 }
 #endif
 
+bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr);
+
 #endif /* _LINUX_HUGETLB_H */
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index a8e5f3ea9bb2..c63ccdae3eab 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -52,6 +52,15 @@ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
 	return vma->vm_userfaultfd_ctx.ctx == vm_ctx.ctx;
 }
 
+/*
+ * Never enable huge pmd sharing on uffd-wp registered vmas, because uffd-wp
+ * protect information is per pgtable entry.
+ */
+static inline bool uffd_disable_huge_pmd_share(struct vm_area_struct *vma)
+{
+	return vma->vm_flags & VM_UFFD_WP;
+}
+
 static inline bool userfaultfd_missing(struct vm_area_struct *vma)
 {
 	return vma->vm_flags & VM_UFFD_MISSING;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a02a651088d3..91647e824015 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5326,6 +5326,15 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
 	return false;
 }
 
+bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
+{
+#ifdef CONFIG_USERFAULTFD
+	if (uffd_disable_huge_pmd_share(vma))
+		return false;
+#endif
+	return vma_shareable(vma, addr);
+}
+
 /*
  * Determine if start,end range within vma could be mapped by shared pmd.
  * If yes, adjust start and end to cover range associated with possible
@@ -5382,9 +5391,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_t *pte;
 	spinlock_t *ptl;
 
-	if (!vma_shareable(vma, addr))
-		return (pte_t *)pmd_alloc(mm, pud, addr);
-
 	i_mmap_assert_locked(mapping);
 	vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
 		if (svma == vma)
@@ -5448,7 +5454,7 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
 	*addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
 	return 1;
 }
-#define want_pmd_share()	(1)
+
 #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
 pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
 		      unsigned long addr, pud_t *pud)
@@ -5466,7 +5472,11 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
 				unsigned long *start, unsigned long *end)
 {
 }
-#define want_pmd_share()	(0)
+
+bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
+{
+	return false;
+}
 #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
 
 #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
@@ -5488,7 +5498,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
 			pte = (pte_t *)pud;
 		} else {
 			BUG_ON(sz != PMD_SIZE);
-			if (want_pmd_share() && pud_none(*pud))
+			if (want_pmd_share(vma, addr) && pud_none(*pud))
 				pte = huge_pmd_share(mm, vma, addr, pud);
 			else
 				pte = (pte_t *)pmd_alloc(mm, pud, addr);
-- 
cgit v1.2.3


From 537cf30bba241ae88d5f4b0b6a5e66271b394852 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Tue, 4 May 2021 18:33:08 -0700
Subject: mm/hugetlb: move flush_hugetlb_tlb_range() into hugetlb.h

Prepare for it to be called outside of mm/hugetlb.c.

Link: https://lkml.kernel.org/r/20210218231204.15474-1-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Axel Rasmussen <axelrasmussen@google.com>
Cc: Adam Ruprecht <ruprecht@google.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Cannon Matthews <cannonmatthews@google.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chinwen Chang <chinwen.chang@mediatek.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Dr . David Alan Gilbert" <dgilbert@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Lokesh Gidra <lokeshgidra@google.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michal Koutný" <mkoutny@suse.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Mina Almasry <almasrymina@google.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oliver Upton <oupton@google.com>
Cc: Shaohua Li <shli@fb.com>
Cc: Shawn Anastasio <shawn@anastas.io>
Cc: Steven Price <steven.price@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h | 8 ++++++++
 mm/hugetlb.c            | 8 --------
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 88e93809a455..e43668144664 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -1042,4 +1042,12 @@ static inline __init void hugetlb_cma_check(void)
 
 bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr);
 
+#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
+/*
+ * ARCHes with special requirements for evicting HUGETLB backing TLB entries can
+ * implement this.
+ */
+#define flush_hugetlb_tlb_range(vma, addr, end)	flush_tlb_range(vma, addr, end)
+#endif
+
 #endif /* _LINUX_HUGETLB_H */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 91647e824015..3868f3126534 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4996,14 +4996,6 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	return i ? i : err;
 }
 
-#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
-/*
- * ARCHes with special requirements for evicting HUGETLB backing TLB entries can
- * implement this.
- */
-#define flush_hugetlb_tlb_range(vma, addr, end)	flush_tlb_range(vma, addr, end)
-#endif
-
 unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 		unsigned long address, unsigned long end, pgprot_t newprot)
 {
-- 
cgit v1.2.3


From 6dfeaff93be1a4cab4fb48dad7df326d05059a99 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Tue, 4 May 2021 18:33:13 -0700
Subject: hugetlb/userfaultfd: unshare all pmds for hugetlbfs when register wp

Huge pmd sharing for hugetlbfs is racy with userfaultfd-wp because
userfaultfd-wp is always based on pgtable entries, so they cannot be
shared.

Walk the hugetlb range and unshare all such mappings, if there are any,
right before UFFDIO_REGISTER succeeds and returns to userspace.

This will pair with want_pmd_share() in hugetlb code so that huge pmd
sharing is completely disabled for userfaultfd-wp registered range.

Link: https://lkml.kernel.org/r/20210218231206.15524-1-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Adam Ruprecht <ruprecht@google.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Cannon Matthews <cannonmatthews@google.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chinwen Chang <chinwen.chang@mediatek.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Dr . David Alan Gilbert" <dgilbert@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Lokesh Gidra <lokeshgidra@google.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michal Koutný" <mkoutny@suse.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Mina Almasry <almasrymina@google.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oliver Upton <oupton@google.com>
Cc: Shaohua Li <shli@fb.com>
Cc: Shawn Anastasio <shawn@anastas.io>
Cc: Steven Price <steven.price@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/userfaultfd.c        |  4 ++++
 include/linux/hugetlb.h |  3 +++
 mm/hugetlb.c            | 51 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 58 insertions(+)

(limited to 'include/linux')

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 0be8cdd4425a..e5ce3b4e6c3d 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -15,6 +15,7 @@
 #include <linux/sched/signal.h>
 #include <linux/sched/mm.h>
 #include <linux/mm.h>
+#include <linux/mmu_notifier.h>
 #include <linux/poll.h>
 #include <linux/slab.h>
 #include <linux/seq_file.h>
@@ -1449,6 +1450,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 		vma->vm_flags = new_flags;
 		vma->vm_userfaultfd_ctx.ctx = ctx;
 
+		if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
+			hugetlb_unshare_all_pmds(vma);
+
 	skip:
 		prev = vma;
 		start = vma->vm_end;
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index e43668144664..0f5813522224 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -188,6 +188,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 		unsigned long address, unsigned long end, pgprot_t newprot);
 
 bool is_hugetlb_entry_migration(pte_t pte);
+void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
 
 #else /* !CONFIG_HUGETLB_PAGE */
 
@@ -369,6 +370,8 @@ static inline vm_fault_t hugetlb_fault(struct mm_struct *mm,
 	return 0;
 }
 
+static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { }
+
 #endif /* !CONFIG_HUGETLB_PAGE */
 /*
  * hugepages at page global directory. If arch support
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3868f3126534..e86d3abcc300 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5691,6 +5691,57 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
 	}
 }
 
+/*
+ * This function will unconditionally remove all the shared pmd pgtable entries
+ * within the specific vma for a hugetlbfs memory range.
+ */
+void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
+{
+	struct hstate *h = hstate_vma(vma);
+	unsigned long sz = huge_page_size(h);
+	struct mm_struct *mm = vma->vm_mm;
+	struct mmu_notifier_range range;
+	unsigned long address, start, end;
+	spinlock_t *ptl;
+	pte_t *ptep;
+
+	if (!(vma->vm_flags & VM_MAYSHARE))
+		return;
+
+	start = ALIGN(vma->vm_start, PUD_SIZE);
+	end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
+
+	if (start >= end)
+		return;
+
+	/*
+	 * No need to call adjust_range_if_pmd_sharing_possible(), because
+	 * we have already done the PUD_SIZE alignment.
+	 */
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
+				start, end);
+	mmu_notifier_invalidate_range_start(&range);
+	i_mmap_lock_write(vma->vm_file->f_mapping);
+	for (address = start; address < end; address += PUD_SIZE) {
+		unsigned long tmp = address;
+
+		ptep = huge_pte_offset(mm, address, sz);
+		if (!ptep)
+			continue;
+		ptl = huge_pte_lock(h, mm, ptep);
+		/* We don't want 'address' to be changed */
+		huge_pmd_unshare(mm, vma, &tmp, ptep);
+		spin_unlock(ptl);
+	}
+	flush_hugetlb_tlb_range(vma, start, end);
+	i_mmap_unlock_write(vma->vm_file->f_mapping);
+	/*
+	 * No need to call mmu_notifier_invalidate_range(), see
+	 * Documentation/vm/mmu_notifier.rst.
+	 */
+	mmu_notifier_invalidate_range_end(&range);
+}
+
 #ifdef CONFIG_CMA
 static bool cma_reserve_called __initdata;
 
-- 
cgit v1.2.3


From d4afd60c24f87b6275b12ec3d67d8c2ad78cb075 Mon Sep 17 00:00:00 2001
From: Miaohe Lin <linmiaohe@huawei.com>
Date: Tue, 4 May 2021 18:34:05 -0700
Subject: mm/huge_memory.c: remove unused macro
 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG

Commit 4958e4d86ecb ("mm: thp: remove debug_cow switch") forgot to
remove TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG macro.  Remove it here.

Link: https://lkml.kernel.org/r/20210318122722.13135-6-linmiaohe@huawei.com
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michel Lespinasse <walken@google.com>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Thomas Hellström (Intel) <thomas_os@shipmail.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Yang <richard.weiyang@linux.alibaba.com>
Cc: William Kucharski <william.kucharski@oracle.com>
Cc: Yang Shi <yang.shi@linux.alibaba.com>
Cc: yuleixzhang <yulei.kernel@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/huge_mm.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index ba973efcd369..9626fda5efce 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -87,9 +87,6 @@ enum transparent_hugepage_flag {
 	TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
 	TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG,
 	TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG,
-#ifdef CONFIG_DEBUG_VM
-	TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG,
-#endif
 };
 
 struct kobject;
-- 
cgit v1.2.3


From 2938396771c8fd0870b5284319f9e78b4b552a79 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Tue, 4 May 2021 18:34:52 -0700
Subject: hugetlb: add per-hstate mutex to synchronize user adjustments

The helper routine hstate_next_node_to_alloc accesses and modifies the
hstate variable next_nid_to_alloc.  The helper is used by the routines
alloc_pool_huge_page and adjust_pool_surplus.  adjust_pool_surplus is
called with hugetlb_lock held.  However, alloc_pool_huge_page can not be
called with the hugetlb lock held as it will call the page allocator.
Two instances of alloc_pool_huge_page could be run in parallel or
alloc_pool_huge_page could run in parallel with adjust_pool_surplus
which may result in the variable next_nid_to_alloc becoming invalid for
the caller and pages being allocated on the wrong node.

Both alloc_pool_huge_page and adjust_pool_surplus are only called from
the routine set_max_huge_pages after boot.  set_max_huge_pages is only
called as the result of a user writing to the proc/sysfs nr_hugepages,
or nr_hugepages_mempolicy file to adjust the number of hugetlb pages.

It makes little sense to allow multiple adjustments to the number of
hugetlb pages in parallel.  Add a mutex to the hstate and use it to only
allow one hugetlb page adjustment at a time.  This will synchronize
modifications to the next_nid_to_alloc variable.

Link: https://lkml.kernel.org/r/20210409205254.242291-4-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Cc: "Aneesh Kumar K . V" <aneesh.kumar@linux.ibm.com>
Cc: Barry Song <song.bao.hua@hisilicon.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: HORIGUCHI NAOYA <naoya.horiguchi@nec.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mina Almasry <almasrymina@google.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Waiman Long <longman@redhat.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h | 1 +
 mm/hugetlb.c            | 8 ++++++++
 2 files changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 0f5813522224..628639422c5d 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -559,6 +559,7 @@ HPAGEFLAG(Freed, freed)
 #define HSTATE_NAME_LEN 32
 /* Defines one hugetlb page size */
 struct hstate {
+	struct mutex resize_lock;
 	int next_nid_to_alloc;
 	int next_nid_to_free;
 	unsigned int order;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a5636e1593c5..067fd29a9d51 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2621,6 +2621,11 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
 	else
 		return -ENOMEM;
 
+	/*
+	 * resize_lock mutex prevents concurrent adjustments to number of
+	 * pages in hstate via the proc/sysfs interfaces.
+	 */
+	mutex_lock(&h->resize_lock);
 	spin_lock(&hugetlb_lock);
 
 	/*
@@ -2653,6 +2658,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
 	if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
 		if (count > persistent_huge_pages(h)) {
 			spin_unlock(&hugetlb_lock);
+			mutex_unlock(&h->resize_lock);
 			NODEMASK_FREE(node_alloc_noretry);
 			return -EINVAL;
 		}
@@ -2727,6 +2733,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
 out:
 	h->max_huge_pages = persistent_huge_pages(h);
 	spin_unlock(&hugetlb_lock);
+	mutex_unlock(&h->resize_lock);
 
 	NODEMASK_FREE(node_alloc_noretry);
 
@@ -3214,6 +3221,7 @@ void __init hugetlb_add_hstate(unsigned int order)
 	BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
 	BUG_ON(order == 0);
 	h = &hstates[hugetlb_max_hstate++];
+	mutex_init(&h->resize_lock);
 	h->order = order;
 	h->mask = ~(huge_page_size(h) - 1);
 	for (i = 0; i < MAX_NUMNODES; ++i)
-- 
cgit v1.2.3


From 369fa227c21949b22fd7374506c4992a0d7bb580 Mon Sep 17 00:00:00 2001
From: Oscar Salvador <osalvador@suse.de>
Date: Tue, 4 May 2021 18:35:26 -0700
Subject: mm: make alloc_contig_range handle free hugetlb pages

alloc_contig_range will fail if it ever sees a HugeTLB page within the
range we are trying to allocate, even when that page is free and can be
easily reallocated.

This has proved to be problematic for some users of alloc_contig_range,
e.g. CMA and virtio-mem, where the call would fail even when those
pages lie in ZONE_MOVABLE and are free.

We can do better by trying to replace such page.

Free hugepages are tricky to handle: so that no userspace application
notices any disruption, we need to replace the current free hugepage
with a new one.

In order to do that, a new function called alloc_and_dissolve_huge_page is
introduced.  This function will first try to get a new fresh hugepage, and
if it succeeds, it will replace the old one in the free hugepage pool.

The free page replacement is done under hugetlb_lock, so no external users
of hugetlb will notice the change.  To allocate the new huge page, we use
alloc_buddy_huge_page(), so we do not have to deal with any counters, and
prep_new_huge_page() is not called.  This is valuable because in case we
need to free the new page, we only need to call __free_pages().

Once we know that the page to be replaced is a genuine 0-refcounted huge
page, we remove the old page from the freelist by remove_hugetlb_page().
Then, we can call __prep_new_huge_page() and
__prep_account_new_huge_page() for the new huge page to properly
initialize it and increment the hstate->nr_huge_pages counter (previously
decremented by remove_hugetlb_page()).  Once done, the page is enqueued by
enqueue_huge_page() and it is ready to be used.

There is one tricky case when page's refcount is 0 because it is in the
process of being released.  A missing PageHugeFreed bit will tell us that
freeing is in flight so we retry after dropping the hugetlb_lock.  The
race window should be small and the next retry should make forward
progress.

E.g:

CPU0				CPU1
free_huge_page()		isolate_or_dissolve_huge_page
				  PageHuge() == T
				  alloc_and_dissolve_huge_page
				    alloc_buddy_huge_page()
				    spin_lock_irq(hugetlb_lock)
				    // PageHuge() && !PageHugeFreed &&
				    // !PageCount()
				    spin_unlock_irq(hugetlb_lock)
  spin_lock_irq(hugetlb_lock)
  1) update_and_free_page
       PageHuge() == F
       __free_pages()
  2) enqueue_huge_page
       SetPageHugeFreed()
  spin_unlock_irq(&hugetlb_lock)
				  spin_lock_irq(hugetlb_lock)
                                   1) PageHuge() == F (freed by case#1 from CPU0)
				   2) PageHuge() == T
                                       PageHugeFreed() == T
                                       - proceed with replacing the page

In the case above we retry as the race window is quite small and we have
a high chance of succeeding next time.

With regard to the allocation, we restrict it to the node the page belongs
to with __GFP_THISNODE, meaning we do not fallback on other node's zones.

Note that gigantic hugetlb pages are fenced off since there is a cyclic
dependency between them and alloc_contig_range.

Link: https://lkml.kernel.org/r/20210419075413.1064-6-osalvador@suse.de
Signed-off-by: Oscar Salvador <osalvador@suse.de>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h |   6 +++
 mm/compaction.c         |  33 ++++++++++++--
 mm/hugetlb.c            | 116 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 152 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 628639422c5d..ec6a10b8860a 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -588,6 +588,7 @@ struct huge_bootmem_page {
 	struct hstate *hstate;
 };
 
+int isolate_or_dissolve_huge_page(struct page *page);
 struct page *alloc_huge_page(struct vm_area_struct *vma,
 				unsigned long addr, int avoid_reserve);
 struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
@@ -870,6 +871,11 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
 #else	/* CONFIG_HUGETLB_PAGE */
 struct hstate {};
 
+static inline int isolate_or_dissolve_huge_page(struct page *page)
+{
+	return -ENOMEM;
+}
+
 static inline struct page *alloc_huge_page(struct vm_area_struct *vma,
 					   unsigned long addr,
 					   int avoid_reserve)
diff --git a/mm/compaction.c b/mm/compaction.c
index c4d8007221b7..b77e1382307f 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -788,7 +788,7 @@ static bool too_many_isolated(pg_data_t *pgdat)
  * Isolate all pages that can be migrated from the range specified by
  * [low_pfn, end_pfn). The range is expected to be within same pageblock.
  * Returns errno, like -EAGAIN or -EINTR in case e.g signal pending or congestion,
- * or 0.
+ * -ENOMEM in case we could not allocate a page, or 0.
  * cc->migrate_pfn will contain the next pfn to scan.
  *
  * The pages are isolated on cc->migratepages list (not required to be empty),
@@ -906,6 +906,29 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 			valid_page = page;
 		}
 
+		if (PageHuge(page) && cc->alloc_contig) {
+			ret = isolate_or_dissolve_huge_page(page);
+
+			/*
+			 * Fail isolation in case isolate_or_dissolve_huge_page()
+			 * reports an error. In case of -ENOMEM, abort right away.
+			 */
+			if (ret < 0) {
+				 /* Do not report -EBUSY down the chain */
+				if (ret == -EBUSY)
+					ret = 0;
+				low_pfn += (1UL << compound_order(page)) - 1;
+				goto isolate_fail;
+			}
+
+			/*
+			 * Ok, the hugepage was dissolved. Now these pages are
+			 * Buddy and cannot be re-allocated because they are
+			 * isolated. Fall-through as the check below handles
+			 * Buddy pages.
+			 */
+		}
+
 		/*
 		 * Skip if free. We read page order here without zone lock
 		 * which is generally unsafe, but the race window is small and
@@ -1065,7 +1088,7 @@ isolate_fail_put:
 		put_page(page);
 
 isolate_fail:
-		if (!skip_on_failure)
+		if (!skip_on_failure && ret != -ENOMEM)
 			continue;
 
 		/*
@@ -1091,6 +1114,9 @@ isolate_fail:
 			 */
 			next_skip_pfn += 1UL << cc->order;
 		}
+
+		if (ret == -ENOMEM)
+			break;
 	}
 
 	/*
@@ -1143,7 +1169,8 @@ fatal_pending:
  * @start_pfn: The first PFN to start isolating.
  * @end_pfn:   The one-past-last PFN.
  *
- * Returns -EAGAIN when contented, -EINTR in case of a signal pending or 0.
+ * Returns -EAGAIN when contented, -EINTR in case of a signal pending, -ENOMEM
+ * in case we could not allocate a page, or 0.
  */
 int
 isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 63760be2688e..92f3cd08946f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2267,6 +2267,122 @@ static void restore_reserve_on_error(struct hstate *h,
 	}
 }
 
+/*
+ * alloc_and_dissolve_huge_page - Allocate a new page and dissolve the old one
+ * @h: struct hstate old page belongs to
+ * @old_page: Old page to dissolve
+ * Returns 0 on success, otherwise negated error.
+ */
+static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page)
+{
+	gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
+	int nid = page_to_nid(old_page);
+	struct page *new_page;
+	int ret = 0;
+
+	/*
+	 * Before dissolving the page, we need to allocate a new one for the
+	 * pool to remain stable. Using alloc_buddy_huge_page() allows us to
+	 * not having to deal with prep_new_huge_page() and avoids dealing of any
+	 * counters. This simplifies and let us do the whole thing under the
+	 * lock.
+	 */
+	new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL);
+	if (!new_page)
+		return -ENOMEM;
+
+retry:
+	spin_lock_irq(&hugetlb_lock);
+	if (!PageHuge(old_page)) {
+		/*
+		 * Freed from under us. Drop new_page too.
+		 */
+		goto free_new;
+	} else if (page_count(old_page)) {
+		/*
+		 * Someone has grabbed the page, fail for now.
+		 */
+		ret = -EBUSY;
+		goto free_new;
+	} else if (!HPageFreed(old_page)) {
+		/*
+		 * Page's refcount is 0 but it has not been enqueued in the
+		 * freelist yet. Race window is small, so we can succeed here if
+		 * we retry.
+		 */
+		spin_unlock_irq(&hugetlb_lock);
+		cond_resched();
+		goto retry;
+	} else {
+		/*
+		 * Ok, old_page is still a genuine free hugepage. Remove it from
+		 * the freelist and decrease the counters. These will be
+		 * incremented again when calling __prep_account_new_huge_page()
+		 * and enqueue_huge_page() for new_page. The counters will remain
+		 * stable since this happens under the lock.
+		 */
+		remove_hugetlb_page(h, old_page, false);
+
+		/*
+		 * new_page needs to be initialized with the standard hugetlb
+		 * state. This is normally done by prep_new_huge_page() but
+		 * that takes hugetlb_lock which is already held so we need to
+		 * open code it here.
+		 * Reference count trick is needed because allocator gives us
+		 * referenced page but the pool requires pages with 0 refcount.
+		 */
+		__prep_new_huge_page(new_page);
+		__prep_account_new_huge_page(h, nid);
+		page_ref_dec(new_page);
+		enqueue_huge_page(h, new_page);
+
+		/*
+		 * Pages have been replaced, we can safely free the old one.
+		 */
+		spin_unlock_irq(&hugetlb_lock);
+		update_and_free_page(h, old_page);
+	}
+
+	return ret;
+
+free_new:
+	spin_unlock_irq(&hugetlb_lock);
+	__free_pages(new_page, huge_page_order(h));
+
+	return ret;
+}
+
+int isolate_or_dissolve_huge_page(struct page *page)
+{
+	struct hstate *h;
+	struct page *head;
+
+	/*
+	 * The page might have been dissolved from under our feet, so make sure
+	 * to carefully check the state under the lock.
+	 * Return success when racing as if we dissolved the page ourselves.
+	 */
+	spin_lock_irq(&hugetlb_lock);
+	if (PageHuge(page)) {
+		head = compound_head(page);
+		h = page_hstate(head);
+	} else {
+		spin_unlock_irq(&hugetlb_lock);
+		return 0;
+	}
+	spin_unlock_irq(&hugetlb_lock);
+
+	/*
+	 * Fence off gigantic pages as there is a cyclic dependency between
+	 * alloc_contig_range and them. Return -ENOMEM as this has the effect
+	 * of bailing out right away without further retrying.
+	 */
+	if (hstate_is_gigantic(h))
+		return -ENOMEM;
+
+	return alloc_and_dissolve_huge_page(h, head);
+}
+
 struct page *alloc_huge_page(struct vm_area_struct *vma,
 				    unsigned long addr, int avoid_reserve)
 {
-- 
cgit v1.2.3


From ae37c7ff79f1f030e28ec76c46ee032f8fd07607 Mon Sep 17 00:00:00 2001
From: Oscar Salvador <osalvador@suse.de>
Date: Tue, 4 May 2021 18:35:29 -0700
Subject: mm: make alloc_contig_range handle in-use hugetlb pages

alloc_contig_range() will fail if it finds a HugeTLB page within the
range, without a chance to handle it.  Since HugeTLB pages can be
migrated like any LRU or Movable page, it does not make sense to bail out
without trying.  Enable the interface to recognize in-use HugeTLB pages so
we can migrate them, and have a much better chance of the call succeeding.

Link: https://lkml.kernel.org/r/20210419075413.1064-7-osalvador@suse.de
Signed-off-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h |  5 +++--
 mm/compaction.c         | 12 +++++++++++-
 mm/hugetlb.c            | 22 +++++++++++++++++-----
 mm/vmscan.c             |  5 +++--
 4 files changed, 34 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index ec6a10b8860a..d0f310ae3f82 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -588,7 +588,7 @@ struct huge_bootmem_page {
 	struct hstate *hstate;
 };
 
-int isolate_or_dissolve_huge_page(struct page *page);
+int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list);
 struct page *alloc_huge_page(struct vm_area_struct *vma,
 				unsigned long addr, int avoid_reserve);
 struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
@@ -871,7 +871,8 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
 #else	/* CONFIG_HUGETLB_PAGE */
 struct hstate {};
 
-static inline int isolate_or_dissolve_huge_page(struct page *page)
+static inline int isolate_or_dissolve_huge_page(struct page *page,
+						struct list_head *list)
 {
 	return -ENOMEM;
 }
diff --git a/mm/compaction.c b/mm/compaction.c
index b77e1382307f..335862f1661c 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -907,7 +907,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 		}
 
 		if (PageHuge(page) && cc->alloc_contig) {
-			ret = isolate_or_dissolve_huge_page(page);
+			ret = isolate_or_dissolve_huge_page(page, &cc->migratepages);
 
 			/*
 			 * Fail isolation in case isolate_or_dissolve_huge_page()
@@ -921,6 +921,15 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 				goto isolate_fail;
 			}
 
+			if (PageHuge(page)) {
+				/*
+				 * Hugepage was successfully isolated and placed
+				 * on the cc->migratepages list.
+				 */
+				low_pfn += compound_nr(page) - 1;
+				goto isolate_success_no_list;
+			}
+
 			/*
 			 * Ok, the hugepage was dissolved. Now these pages are
 			 * Buddy and cannot be re-allocated because they are
@@ -1062,6 +1071,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 
 isolate_success:
 		list_add(&page->lru, &cc->migratepages);
+isolate_success_no_list:
 		cc->nr_migratepages += compound_nr(page);
 		nr_isolated += compound_nr(page);
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 92f3cd08946f..b5977d9709ad 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2271,9 +2271,11 @@ static void restore_reserve_on_error(struct hstate *h,
  * alloc_and_dissolve_huge_page - Allocate a new page and dissolve the old one
  * @h: struct hstate old page belongs to
  * @old_page: Old page to dissolve
+ * @list: List to isolate the page in case we need to
  * Returns 0 on success, otherwise negated error.
  */
-static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page)
+static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
+					struct list_head *list)
 {
 	gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
 	int nid = page_to_nid(old_page);
@@ -2300,9 +2302,13 @@ retry:
 		goto free_new;
 	} else if (page_count(old_page)) {
 		/*
-		 * Someone has grabbed the page, fail for now.
+		 * Someone has grabbed the page, try to isolate it here.
+		 * Fail with -EBUSY if not possible.
 		 */
-		ret = -EBUSY;
+		spin_unlock_irq(&hugetlb_lock);
+		if (!isolate_huge_page(old_page, list))
+			ret = -EBUSY;
+		spin_lock_irq(&hugetlb_lock);
 		goto free_new;
 	} else if (!HPageFreed(old_page)) {
 		/*
@@ -2352,10 +2358,11 @@ free_new:
 	return ret;
 }
 
-int isolate_or_dissolve_huge_page(struct page *page)
+int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
 {
 	struct hstate *h;
 	struct page *head;
+	int ret = -EBUSY;
 
 	/*
 	 * The page might have been dissolved from under our feet, so make sure
@@ -2380,7 +2387,12 @@ int isolate_or_dissolve_huge_page(struct page *page)
 	if (hstate_is_gigantic(h))
 		return -ENOMEM;
 
-	return alloc_and_dissolve_huge_page(h, head);
+	if (page_count(head) && isolate_huge_page(head, list))
+		ret = 0;
+	else if (!page_count(head))
+		ret = alloc_and_dissolve_huge_page(h, head, list);
+
+	return ret;
 }
 
 struct page *alloc_huge_page(struct vm_area_struct *vma,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 562e87cbd7a1..42aaef30633e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1507,8 +1507,9 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
 	LIST_HEAD(clean_pages);
 
 	list_for_each_entry_safe(page, next, page_list, lru) {
-		if (page_is_file_lru(page) && !PageDirty(page) &&
-		    !__PageMovable(page) && !PageUnevictable(page)) {
+		if (!PageHuge(page) && page_is_file_lru(page) &&
+		    !PageDirty(page) && !__PageMovable(page) &&
+		    !PageUnevictable(page)) {
 			ClearPageActive(page);
 			list_move(&page->lru, &clean_pages);
 		}
-- 
cgit v1.2.3


From 7677f7fd8be76659cd2d0db8ff4093bbb51c20e5 Mon Sep 17 00:00:00 2001
From: Axel Rasmussen <axelrasmussen@google.com>
Date: Tue, 4 May 2021 18:35:36 -0700
Subject: userfaultfd: add minor fault registration mode

Patch series "userfaultfd: add minor fault handling", v9.

Overview
========

This series adds a new userfaultfd feature, UFFD_FEATURE_MINOR_HUGETLBFS.
When enabled (via the UFFDIO_API ioctl), this feature means that any
hugetlbfs VMAs registered with UFFDIO_REGISTER_MODE_MISSING will *also*
get events for "minor" faults.  By "minor" fault, I mean the following
situation:

Let there exist two mappings (i.e., VMAs) to the same page(s) (shared
memory).  One of the mappings is registered with userfaultfd (in minor
mode), and the other is not.  Via the non-UFFD mapping, the underlying
pages have already been allocated & filled with some contents.  The UFFD
mapping has not yet been faulted in; when it is touched for the first
time, this results in what I'm calling a "minor" fault.  As a concrete
example, when working with hugetlbfs, we have huge_pte_none(), but
find_lock_page() finds an existing page.

We also add a new ioctl to resolve such faults: UFFDIO_CONTINUE.  The idea
is, userspace resolves the fault by either a) doing nothing if the
contents are already correct, or b) updating the underlying contents using
the second, non-UFFD mapping (via memcpy/memset or similar, or something
fancier like RDMA, etc.).  In either case, userspace issues
UFFDIO_CONTINUE to tell the kernel "I have ensured the page contents are
correct, carry on setting up the mapping".

Use Case
========

Consider the use case of VM live migration (e.g. under QEMU/KVM):

1. While a VM is still running, we copy the contents of its memory to a
   target machine. The pages are populated on the target by writing to the
   non-UFFD mapping, using the setup described above. The VM is still running
   (and therefore its memory is likely changing), so this may be repeated
   several times, until we decide the target is "up to date enough".

2. We pause the VM on the source, and start executing on the target machine.
   During this gap, the VM's user(s) will *see* a pause, so it is desirable to
   minimize this window.

3. Between the last time any page was copied from the source to the target, and
   when the VM was paused, the contents of that page may have changed - and
   therefore the copy we have on the target machine is out of date. Although we
   can keep track of which pages are out of date, for VMs with large amounts of
   memory, it is "slow" to transfer this information to the target machine. We
   want to resume execution before such a transfer would complete.

4. So, the guest begins executing on the target machine. The first time it
   touches its memory (via the UFFD-registered mapping), userspace wants to
   intercept this fault. Userspace checks whether or not the page is up to date,
   and if not, copies the updated page from the source machine, via the non-UFFD
   mapping. Finally, whether a copy was performed or not, userspace issues a
   UFFDIO_CONTINUE ioctl to tell the kernel "I have ensured the page contents
   are correct, carry on setting up the mapping".

We don't have to do all of the final updates on-demand. The userfaultfd manager
can, in the background, also copy over updated pages once it receives the map of
which pages are up-to-date or not.

Interaction with Existing APIs
==============================

Because this is a feature, a registered VMA could potentially receive both
missing and minor faults.  I spent some time thinking through how the
existing API interacts with the new feature:

UFFDIO_CONTINUE cannot be used to resolve non-minor faults, as it does not
allocate a new page.  If UFFDIO_CONTINUE is used on a non-minor fault:

- For non-shared memory or shmem, -EINVAL is returned.
- For hugetlb, -EFAULT is returned.

UFFDIO_COPY and UFFDIO_ZEROPAGE cannot be used to resolve minor faults.
Without modifications, the existing codepath assumes a new page needs to
be allocated.  This is okay, since userspace must have a second
non-UFFD-registered mapping anyway, thus there isn't much reason to want
to use these in any case (just memcpy or memset or similar).

- If UFFDIO_COPY is used on a minor fault, -EEXIST is returned.
- If UFFDIO_ZEROPAGE is used on a minor fault, -EEXIST is returned (or -EINVAL
  in the case of hugetlb, as UFFDIO_ZEROPAGE is unsupported in any case).
- UFFDIO_WRITEPROTECT simply doesn't work with shared memory, and returns
  -ENOENT in that case (regardless of the kind of fault).

Future Work
===========

This series only supports hugetlbfs.  I have a second series in flight to
support shmem as well, extending the functionality.  This series is more
mature than the shmem support at this point, and the functionality works
fully on hugetlbfs, so this series can be merged first and then shmem
support will follow.

This patch (of 6):

This feature allows userspace to intercept "minor" faults.  By "minor"
faults, I mean the following situation:

Let there exist two mappings (i.e., VMAs) to the same page(s).  One of the
mappings is registered with userfaultfd (in minor mode), and the other is
not.  Via the non-UFFD mapping, the underlying pages have already been
allocated & filled with some contents.  The UFFD mapping has not yet been
faulted in; when it is touched for the first time, this results in what
I'm calling a "minor" fault.  As a concrete example, when working with
hugetlbfs, we have huge_pte_none(), but find_lock_page() finds an existing
page.

This commit adds the new registration mode, and sets the relevant flag on
the VMAs being registered.  In the hugetlb fault path, if we find that we
have huge_pte_none(), but find_lock_page() does indeed find an existing
page, then we have a "minor" fault, and if the VMA has the userfaultfd
registration flag, we call into userfaultfd to handle it.

This is implemented as a new registration mode, instead of an API feature.
This is because the alternative implementation has significant drawbacks
[1].

However, doing it this way requires we allocate a VM_* flag for the new
registration mode.  On 32-bit systems, there are no unused bits, so this
feature is only supported on architectures with
CONFIG_ARCH_USES_HIGH_VMA_FLAGS.  When attempting to register a VMA in
MINOR mode on 32-bit architectures, we return -EINVAL.

[1] https://lore.kernel.org/patchwork/patch/1380226/

[peterx@redhat.com: fix minor fault page leak]
  Link: https://lkml.kernel.org/r/20210322175132.36659-1-peterx@redhat.com

Link: https://lkml.kernel.org/r/20210301222728.176417-1-axelrasmussen@google.com
Link: https://lkml.kernel.org/r/20210301222728.176417-2-axelrasmussen@google.com
Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chinwen Chang <chinwen.chang@mediatek.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Lokesh Gidra <lokeshgidra@google.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michal Koutný" <mkoutny@suse.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Shaohua Li <shli@fb.com>
Cc: Shawn Anastasio <shawn@anastas.io>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Steven Price <steven.price@arm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Adam Ruprecht <ruprecht@google.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Cannon Matthews <cannonmatthews@google.com>
Cc: "Dr . David Alan Gilbert" <dgilbert@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Mina Almasry <almasrymina@google.com>
Cc: Oliver Upton <oupton@google.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm64/Kconfig               |  1 +
 arch/x86/Kconfig                 |  1 +
 fs/proc/task_mmu.c               |  3 ++
 fs/userfaultfd.c                 | 78 +++++++++++++++++++++++----------------
 include/linux/mm.h               |  7 ++++
 include/linux/userfaultfd_k.h    | 15 +++++++-
 include/trace/events/mmflags.h   |  7 ++++
 include/uapi/linux/userfaultfd.h | 15 +++++++-
 init/Kconfig                     |  5 +++
 mm/hugetlb.c                     | 80 ++++++++++++++++++++++++++--------------
 10 files changed, 150 insertions(+), 62 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 7f2a80091337..04c69f606537 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -213,6 +213,7 @@ config ARM64
 	select SWIOTLB
 	select SYSCTL_EXCEPTION_TRACE
 	select THREAD_INFO_IN_TASK
+	select HAVE_ARCH_USERFAULTFD_MINOR if USERFAULTFD
 	help
 	  ARM 64-bit (AArch64) Linux support.
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index dac15f646f79..1c350e8782ed 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -165,6 +165,7 @@ config X86
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64
 	select HAVE_ARCH_USERFAULTFD_WP         if X86_64 && USERFAULTFD
+	select HAVE_ARCH_USERFAULTFD_MINOR	if X86_64 && USERFAULTFD
 	select HAVE_ARCH_VMAP_STACK		if X86_64
 	select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
 	select HAVE_ARCH_WITHIN_STACK_FRAMES
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index e862cab69583..fc9784544b24 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -661,6 +661,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
 		[ilog2(VM_PKEY_BIT4)]	= "",
 #endif
 #endif /* CONFIG_ARCH_HAS_PKEYS */
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
+		[ilog2(VM_UFFD_MINOR)]	= "ui",
+#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
 	};
 	size_t i;
 
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index e5ce3b4e6c3d..ba35cafa8b0d 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -197,24 +197,21 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
 	msg_init(&msg);
 	msg.event = UFFD_EVENT_PAGEFAULT;
 	msg.arg.pagefault.address = address;
+	/*
+	 * These flags indicate why the userfault occurred:
+	 * - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault.
+	 * - UFFD_PAGEFAULT_FLAG_MINOR indicates a minor fault.
+	 * - Neither of these flags being set indicates a MISSING fault.
+	 *
+	 * Separately, UFFD_PAGEFAULT_FLAG_WRITE indicates it was a write
+	 * fault. Otherwise, it was a read fault.
+	 */
 	if (flags & FAULT_FLAG_WRITE)
-		/*
-		 * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the
-		 * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WRITE
-		 * was not set in a UFFD_EVENT_PAGEFAULT, it means it
-		 * was a read fault, otherwise if set it means it's
-		 * a write fault.
-		 */
 		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
 	if (reason & VM_UFFD_WP)
-		/*
-		 * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the
-		 * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WP was
-		 * not set in a UFFD_EVENT_PAGEFAULT, it means it was
-		 * a missing fault, otherwise if set it means it's a
-		 * write protect fault.
-		 */
 		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
+	if (reason & VM_UFFD_MINOR)
+		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_MINOR;
 	if (features & UFFD_FEATURE_THREAD_ID)
 		msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
 	return msg;
@@ -401,8 +398,10 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
 
 	BUG_ON(ctx->mm != mm);
 
-	VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP));
-	VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP));
+	/* Any unrecognized flag is a bug. */
+	VM_BUG_ON(reason & ~__VM_UFFD_FLAGS);
+	/* 0 or > 1 flags set is a bug; we expect exactly 1. */
+	VM_BUG_ON(!reason || (reason & (reason - 1)));
 
 	if (ctx->features & UFFD_FEATURE_SIGBUS)
 		goto out;
@@ -612,7 +611,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
 		for (vma = mm->mmap; vma; vma = vma->vm_next)
 			if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
 				vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
-				vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
+				vma->vm_flags &= ~__VM_UFFD_FLAGS;
 			}
 		mmap_write_unlock(mm);
 
@@ -644,7 +643,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
 	octx = vma->vm_userfaultfd_ctx.ctx;
 	if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
 		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
-		vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
+		vma->vm_flags &= ~__VM_UFFD_FLAGS;
 		return 0;
 	}
 
@@ -726,7 +725,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
 	} else {
 		/* Drop uffd context if remap feature not enabled */
 		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
-		vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
+		vma->vm_flags &= ~__VM_UFFD_FLAGS;
 	}
 }
 
@@ -867,12 +866,12 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		cond_resched();
 		BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
-		       !!(vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
+		       !!(vma->vm_flags & __VM_UFFD_FLAGS));
 		if (vma->vm_userfaultfd_ctx.ctx != ctx) {
 			prev = vma;
 			continue;
 		}
-		new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
+		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
 		prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
 				 new_flags, vma->anon_vma,
 				 vma->vm_file, vma->vm_pgoff,
@@ -1262,9 +1261,19 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
 				     unsigned long vm_flags)
 {
 	/* FIXME: add WP support to hugetlbfs and shmem */
-	return vma_is_anonymous(vma) ||
-		((is_vm_hugetlb_page(vma) || vma_is_shmem(vma)) &&
-		 !(vm_flags & VM_UFFD_WP));
+	if (vm_flags & VM_UFFD_WP) {
+		if (is_vm_hugetlb_page(vma) || vma_is_shmem(vma))
+			return false;
+	}
+
+	if (vm_flags & VM_UFFD_MINOR) {
+		/* FIXME: Add minor fault interception for shmem. */
+		if (!is_vm_hugetlb_page(vma))
+			return false;
+	}
+
+	return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
+	       vma_is_shmem(vma);
 }
 
 static int userfaultfd_register(struct userfaultfd_ctx *ctx,
@@ -1290,14 +1299,19 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 	ret = -EINVAL;
 	if (!uffdio_register.mode)
 		goto out;
-	if (uffdio_register.mode & ~(UFFDIO_REGISTER_MODE_MISSING|
-				     UFFDIO_REGISTER_MODE_WP))
+	if (uffdio_register.mode & ~UFFD_API_REGISTER_MODES)
 		goto out;
 	vm_flags = 0;
 	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
 		vm_flags |= VM_UFFD_MISSING;
 	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP)
 		vm_flags |= VM_UFFD_WP;
+	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) {
+#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
+		goto out;
+#endif
+		vm_flags |= VM_UFFD_MINOR;
+	}
 
 	ret = validate_range(mm, &uffdio_register.range.start,
 			     uffdio_register.range.len);
@@ -1341,7 +1355,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 		cond_resched();
 
 		BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
-		       !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
+		       !!(cur->vm_flags & __VM_UFFD_FLAGS));
 
 		/* check not compatible vmas */
 		ret = -EINVAL;
@@ -1421,8 +1435,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 			start = vma->vm_start;
 		vma_end = min(end, vma->vm_end);
 
-		new_flags = (vma->vm_flags &
-			     ~(VM_UFFD_MISSING|VM_UFFD_WP)) | vm_flags;
+		new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
 		prev = vma_merge(mm, prev, start, vma_end, new_flags,
 				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
 				 vma_policy(vma),
@@ -1544,7 +1557,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 		cond_resched();
 
 		BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
-		       !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
+		       !!(cur->vm_flags & __VM_UFFD_FLAGS));
 
 		/*
 		 * Check not compatible vmas, not strictly required
@@ -1595,7 +1608,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 			wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
 		}
 
-		new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
+		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
 		prev = vma_merge(mm, prev, start, vma_end, new_flags,
 				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
 				 vma_policy(vma),
@@ -1863,6 +1876,9 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
 		goto err_out;
 	/* report all available features and ioctls to userland */
 	uffdio_api.features = UFFD_API_FEATURES;
+#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
+	uffdio_api.features &= ~UFFD_FEATURE_MINOR_HUGETLBFS;
+#endif
 	uffdio_api.ioctls = UFFD_API_IOCTLS;
 	ret = -EFAULT;
 	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 011f43605807..1dbb53c44243 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -372,6 +372,13 @@ extern unsigned int kobjsize(const void *objp);
 # define VM_GROWSUP	VM_NONE
 #endif
 
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
+# define VM_UFFD_MINOR_BIT	37
+# define VM_UFFD_MINOR		BIT(VM_UFFD_MINOR_BIT)	/* UFFD minor faults */
+#else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
+# define VM_UFFD_MINOR		VM_NONE
+#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
+
 /* Bits set in the VMA until the stack is in its final location */
 #define VM_STACK_INCOMPLETE_SETUP	(VM_RAND_READ | VM_SEQ_READ)
 
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index c63ccdae3eab..0390e5ac63b3 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -17,6 +17,9 @@
 #include <linux/mm.h>
 #include <asm-generic/pgtable_uffd.h>
 
+/* The set of all possible UFFD-related VM flags. */
+#define __VM_UFFD_FLAGS (VM_UFFD_MISSING | VM_UFFD_WP | VM_UFFD_MINOR)
+
 /*
  * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining
  * new flags, since they might collide with O_* ones. We want
@@ -71,6 +74,11 @@ static inline bool userfaultfd_wp(struct vm_area_struct *vma)
 	return vma->vm_flags & VM_UFFD_WP;
 }
 
+static inline bool userfaultfd_minor(struct vm_area_struct *vma)
+{
+	return vma->vm_flags & VM_UFFD_MINOR;
+}
+
 static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma,
 				      pte_t pte)
 {
@@ -85,7 +93,7 @@ static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma,
 
 static inline bool userfaultfd_armed(struct vm_area_struct *vma)
 {
-	return vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP);
+	return vma->vm_flags & __VM_UFFD_FLAGS;
 }
 
 extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *);
@@ -132,6 +140,11 @@ static inline bool userfaultfd_wp(struct vm_area_struct *vma)
 	return false;
 }
 
+static inline bool userfaultfd_minor(struct vm_area_struct *vma)
+{
+	return false;
+}
+
 static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma,
 				      pte_t pte)
 {
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 67018d367b9f..629c7a0eaff2 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -137,6 +137,12 @@ IF_HAVE_PG_ARCH_2(PG_arch_2,		"arch_2"	)
 #define IF_HAVE_VM_SOFTDIRTY(flag,name)
 #endif
 
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
+# define IF_HAVE_UFFD_MINOR(flag, name) {flag, name},
+#else
+# define IF_HAVE_UFFD_MINOR(flag, name)
+#endif
+
 #define __def_vmaflag_names						\
 	{VM_READ,			"read"		},		\
 	{VM_WRITE,			"write"		},		\
@@ -148,6 +154,7 @@ IF_HAVE_PG_ARCH_2(PG_arch_2,		"arch_2"	)
 	{VM_MAYSHARE,			"mayshare"	},		\
 	{VM_GROWSDOWN,			"growsdown"	},		\
 	{VM_UFFD_MISSING,		"uffd_missing"	},		\
+IF_HAVE_UFFD_MINOR(VM_UFFD_MINOR,	"uffd_minor"	)		\
 	{VM_PFNMAP,			"pfnmap"	},		\
 	{VM_DENYWRITE,			"denywrite"	},		\
 	{VM_UFFD_WP,			"uffd_wp"	},		\
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 5f2d88212f7c..f24dd4fcbad9 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -19,15 +19,19 @@
  * means the userland is reading).
  */
 #define UFFD_API ((__u64)0xAA)
+#define UFFD_API_REGISTER_MODES (UFFDIO_REGISTER_MODE_MISSING |	\
+				 UFFDIO_REGISTER_MODE_WP |	\
+				 UFFDIO_REGISTER_MODE_MINOR)
 #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP |	\
 			   UFFD_FEATURE_EVENT_FORK |		\
 			   UFFD_FEATURE_EVENT_REMAP |		\
-			   UFFD_FEATURE_EVENT_REMOVE |	\
+			   UFFD_FEATURE_EVENT_REMOVE |		\
 			   UFFD_FEATURE_EVENT_UNMAP |		\
 			   UFFD_FEATURE_MISSING_HUGETLBFS |	\
 			   UFFD_FEATURE_MISSING_SHMEM |		\
 			   UFFD_FEATURE_SIGBUS |		\
-			   UFFD_FEATURE_THREAD_ID)
+			   UFFD_FEATURE_THREAD_ID |		\
+			   UFFD_FEATURE_MINOR_HUGETLBFS)
 #define UFFD_API_IOCTLS				\
 	((__u64)1 << _UFFDIO_REGISTER |		\
 	 (__u64)1 << _UFFDIO_UNREGISTER |	\
@@ -127,6 +131,7 @@ struct uffd_msg {
 /* flags for UFFD_EVENT_PAGEFAULT */
 #define UFFD_PAGEFAULT_FLAG_WRITE	(1<<0)	/* If this was a write fault */
 #define UFFD_PAGEFAULT_FLAG_WP		(1<<1)	/* If reason is VM_UFFD_WP */
+#define UFFD_PAGEFAULT_FLAG_MINOR	(1<<2)	/* If reason is VM_UFFD_MINOR */
 
 struct uffdio_api {
 	/* userland asks for an API number and the features to enable */
@@ -171,6 +176,10 @@ struct uffdio_api {
 	 *
 	 * UFFD_FEATURE_THREAD_ID pid of the page faulted task_struct will
 	 * be returned, if feature is not requested 0 will be returned.
+	 *
+	 * UFFD_FEATURE_MINOR_HUGETLBFS indicates that minor faults
+	 * can be intercepted (via REGISTER_MODE_MINOR) for
+	 * hugetlbfs-backed pages.
 	 */
 #define UFFD_FEATURE_PAGEFAULT_FLAG_WP		(1<<0)
 #define UFFD_FEATURE_EVENT_FORK			(1<<1)
@@ -181,6 +190,7 @@ struct uffdio_api {
 #define UFFD_FEATURE_EVENT_UNMAP		(1<<6)
 #define UFFD_FEATURE_SIGBUS			(1<<7)
 #define UFFD_FEATURE_THREAD_ID			(1<<8)
+#define UFFD_FEATURE_MINOR_HUGETLBFS		(1<<9)
 	__u64 features;
 
 	__u64 ioctls;
@@ -195,6 +205,7 @@ struct uffdio_register {
 	struct uffdio_range range;
 #define UFFDIO_REGISTER_MODE_MISSING	((__u64)1<<0)
 #define UFFDIO_REGISTER_MODE_WP		((__u64)1<<1)
+#define UFFDIO_REGISTER_MODE_MINOR	((__u64)1<<2)
 	__u64 mode;
 
 	/*
diff --git a/init/Kconfig b/init/Kconfig
index 9acb7762e971..1413413fcb9f 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1644,6 +1644,11 @@ config HAVE_ARCH_USERFAULTFD_WP
 	help
 	  Arch has userfaultfd write protection support
 
+config HAVE_ARCH_USERFAULTFD_MINOR
+	bool
+	help
+	  Arch has userfaultfd minor fault support
+
 config MEMBARRIER
 	bool "Enable membarrier() system call" if EXPERT
 	default y
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b5977d9709ad..84530876b2ae 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4469,6 +4469,44 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
 	return 0;
 }
 
+static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
+						  struct address_space *mapping,
+						  pgoff_t idx,
+						  unsigned int flags,
+						  unsigned long haddr,
+						  unsigned long reason)
+{
+	vm_fault_t ret;
+	u32 hash;
+	struct vm_fault vmf = {
+		.vma = vma,
+		.address = haddr,
+		.flags = flags,
+
+		/*
+		 * Hard to debug if it ends up being
+		 * used by a callee that assumes
+		 * something about the other
+		 * uninitialized fields... same as in
+		 * memory.c
+		 */
+	};
+
+	/*
+	 * hugetlb_fault_mutex and i_mmap_rwsem must be
+	 * dropped before handling userfault.  Reacquire
+	 * after handling fault to make calling code simpler.
+	 */
+	hash = hugetlb_fault_mutex_hash(mapping, idx);
+	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+	i_mmap_unlock_read(mapping);
+	ret = handle_userfault(&vmf, reason);
+	i_mmap_lock_read(mapping);
+	mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
+	return ret;
+}
+
 static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 			struct vm_area_struct *vma,
 			struct address_space *mapping, pgoff_t idx,
@@ -4507,35 +4545,11 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 retry:
 	page = find_lock_page(mapping, idx);
 	if (!page) {
-		/*
-		 * Check for page in userfault range
-		 */
+		/* Check for page in userfault range */
 		if (userfaultfd_missing(vma)) {
-			u32 hash;
-			struct vm_fault vmf = {
-				.vma = vma,
-				.address = haddr,
-				.flags = flags,
-				/*
-				 * Hard to debug if it ends up being
-				 * used by a callee that assumes
-				 * something about the other
-				 * uninitialized fields... same as in
-				 * memory.c
-				 */
-			};
-
-			/*
-			 * hugetlb_fault_mutex and i_mmap_rwsem must be
-			 * dropped before handling userfault.  Reacquire
-			 * after handling fault to make calling code simpler.
-			 */
-			hash = hugetlb_fault_mutex_hash(mapping, idx);
-			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-			i_mmap_unlock_read(mapping);
-			ret = handle_userfault(&vmf, VM_UFFD_MISSING);
-			i_mmap_lock_read(mapping);
-			mutex_lock(&hugetlb_fault_mutex_table[hash]);
+			ret = hugetlb_handle_userfault(vma, mapping, idx,
+						       flags, haddr,
+						       VM_UFFD_MISSING);
 			goto out;
 		}
 
@@ -4591,6 +4605,16 @@ retry:
 				VM_FAULT_SET_HINDEX(hstate_index(h));
 			goto backout_unlocked;
 		}
+
+		/* Check for page in userfault range. */
+		if (userfaultfd_minor(vma)) {
+			unlock_page(page);
+			put_page(page);
+			ret = hugetlb_handle_userfault(vma, mapping, idx,
+						       flags, haddr,
+						       VM_UFFD_MINOR);
+			goto out;
+		}
 	}
 
 	/*
-- 
cgit v1.2.3


From 0d9cadabd193c6008d256533f544de8206fd3a80 Mon Sep 17 00:00:00 2001
From: Axel Rasmussen <axelrasmussen@google.com>
Date: Tue, 4 May 2021 18:35:40 -0700
Subject: userfaultfd: disable huge PMD sharing for MINOR registered VMAs

As the comment says: for the MINOR fault use case, although the page
might be present and populated in the other (non-UFFD-registered) half
of the mapping, it may be out of date, and we explicitly want userspace
to get a minor fault so it can check and potentially update the page's
contents.

Huge PMD sharing would prevent these faults from occurring for suitably
aligned areas, so disable it upon UFFD registration.

Link: https://lkml.kernel.org/r/20210301222728.176417-3-axelrasmussen@google.com
Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Adam Ruprecht <ruprecht@google.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Cannon Matthews <cannonmatthews@google.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chinwen Chang <chinwen.chang@mediatek.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Dr . David Alan Gilbert" <dgilbert@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Lokesh Gidra <lokeshgidra@google.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michal Koutný" <mkoutny@suse.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Mina Almasry <almasrymina@google.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oliver Upton <oupton@google.com>
Cc: Shaohua Li <shli@fb.com>
Cc: Shawn Anastasio <shawn@anastas.io>
Cc: Steven Price <steven.price@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/userfaultfd_k.h | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 0390e5ac63b3..e060d5f77cc5 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -56,12 +56,19 @@ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
 }
 
 /*
- * Never enable huge pmd sharing on uffd-wp registered vmas, because uffd-wp
- * protect information is per pgtable entry.
+ * Never enable huge pmd sharing on some uffd registered vmas:
+ *
+ * - VM_UFFD_WP VMAs, because write protect information is per pgtable entry.
+ *
+ * - VM_UFFD_MINOR VMAs, because otherwise we would never get minor faults for
+ *   VMAs which share huge pmds. (If you have two mappings to the same
+ *   underlying pages, and fault in the non-UFFD-registered one with a write,
+ *   with huge pmd sharing this would *also* setup the second UFFD-registered
+ *   mapping, and we'd not get minor faults.)
  */
 static inline bool uffd_disable_huge_pmd_share(struct vm_area_struct *vma)
 {
-	return vma->vm_flags & VM_UFFD_WP;
+	return vma->vm_flags & (VM_UFFD_WP | VM_UFFD_MINOR);
 }
 
 static inline bool userfaultfd_missing(struct vm_area_struct *vma)
-- 
cgit v1.2.3


From 714c189108244f1df579689061db1d785d92e7e2 Mon Sep 17 00:00:00 2001
From: Axel Rasmussen <axelrasmussen@google.com>
Date: Tue, 4 May 2021 18:35:45 -0700
Subject: userfaultfd: hugetlbfs: only compile UFFD helpers if config enabled

For background, mm/userfaultfd.c provides a general mcopy_atomic
implementation.  But some types of memory (i.e., hugetlb and shmem) need
a slightly different implementation, so they provide their own helpers
for this.  In other words, userfaultfd is the only caller of these
functions.

This patch achieves two things:

1. Don't spend time compiling code which will end up never being
   referenced anyway (a small build time optimization).

2. In patches later in this series, we extend the signature of these
   helpers with UFFD-specific state (a mode enumeration).  Once this
   happens, we *have to* either not compile the helpers, or
   unconditionally define the UFFD-only state (which seems messier to me).
   This includes the declarations in the headers, as otherwise they'd
   yield warnings about implicitly defining the type of those arguments.

Link: https://lkml.kernel.org/r/20210301222728.176417-4-axelrasmussen@google.com
Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Cc: Adam Ruprecht <ruprecht@google.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Cannon Matthews <cannonmatthews@google.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chinwen Chang <chinwen.chang@mediatek.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Dr . David Alan Gilbert" <dgilbert@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Lokesh Gidra <lokeshgidra@google.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michal Koutný" <mkoutny@suse.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Mina Almasry <almasrymina@google.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oliver Upton <oupton@google.com>
Cc: Shaohua Li <shli@fb.com>
Cc: Shawn Anastasio <shawn@anastas.io>
Cc: Steven Price <steven.price@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h | 4 ++++
 mm/hugetlb.c            | 2 ++
 2 files changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index d0f310ae3f82..a1dbe4568707 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -134,11 +134,13 @@ void hugetlb_show_meminfo(void);
 unsigned long hugetlb_total_pages(void);
 vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, unsigned int flags);
+#ifdef CONFIG_USERFAULTFD
 int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
 				struct vm_area_struct *dst_vma,
 				unsigned long dst_addr,
 				unsigned long src_addr,
 				struct page **pagep);
+#endif /* CONFIG_USERFAULTFD */
 bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
 						struct vm_area_struct *vma,
 						vm_flags_t vm_flags);
@@ -310,6 +312,7 @@ static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 	BUG();
 }
 
+#ifdef CONFIG_USERFAULTFD
 static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 						pte_t *dst_pte,
 						struct vm_area_struct *dst_vma,
@@ -320,6 +323,7 @@ static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	BUG();
 	return 0;
 }
+#endif /* CONFIG_USERFAULTFD */
 
 static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr,
 					unsigned long sz)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 84530876b2ae..b105a455124d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4855,6 +4855,7 @@ out_mutex:
 	return ret;
 }
 
+#ifdef CONFIG_USERFAULTFD
 /*
  * Used by userfaultfd UFFDIO_COPY.  Based on mcopy_atomic_pte with
  * modifications for huge pages.
@@ -4985,6 +4986,7 @@ out_release_nounlock:
 	put_page(page);
 	goto out;
 }
+#endif /* CONFIG_USERFAULTFD */
 
 static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
 				 int refs, struct page **pages,
-- 
cgit v1.2.3


From f619147104c8ea71e120e4936d2b68ec11a1e527 Mon Sep 17 00:00:00 2001
From: Axel Rasmussen <axelrasmussen@google.com>
Date: Tue, 4 May 2021 18:35:49 -0700
Subject: userfaultfd: add UFFDIO_CONTINUE ioctl

This ioctl is how userspace ought to resolve "minor" userfaults.  The
idea is, userspace is notified that a minor fault has occurred.  It
might change the contents of the page using its second non-UFFD mapping,
or not.  Then, it calls UFFDIO_CONTINUE to tell the kernel "I have
ensured the page contents are correct, carry on setting up the mapping".

Note that it doesn't make much sense to use UFFDIO_{COPY,ZEROPAGE} for
MINOR registered VMAs.  ZEROPAGE maps the VMA to the zero page; but in
the minor fault case, we already have some pre-existing underlying page.
Likewise, UFFDIO_COPY isn't useful if we have a second non-UFFD mapping.
We'd just use memcpy() or similar instead.

It turns out hugetlb_mcopy_atomic_pte() already does very close to what
we want, if an existing page is provided via `struct page **pagep`.  We
already special-case the behavior a bit for the UFFDIO_ZEROPAGE case, so
just extend that design: add an enum for the three modes of operation,
and make the small adjustments needed for the MCOPY_ATOMIC_CONTINUE
case.  (Basically, look up the existing page, and avoid adding the
existing page to the page cache or calling set_page_huge_active() on
it.)

Link: https://lkml.kernel.org/r/20210301222728.176417-5-axelrasmussen@google.com
Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Cc: Adam Ruprecht <ruprecht@google.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Cannon Matthews <cannonmatthews@google.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chinwen Chang <chinwen.chang@mediatek.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Dr . David Alan Gilbert" <dgilbert@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Lokesh Gidra <lokeshgidra@google.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michal Koutný" <mkoutny@suse.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Mina Almasry <almasrymina@google.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oliver Upton <oupton@google.com>
Cc: Shaohua Li <shli@fb.com>
Cc: Shawn Anastasio <shawn@anastas.io>
Cc: Steven Price <steven.price@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/userfaultfd.c                 | 67 ++++++++++++++++++++++++++++++++++++++++
 include/linux/hugetlb.h          |  3 ++
 include/linux/userfaultfd_k.h    | 18 +++++++++++
 include/uapi/linux/userfaultfd.h | 21 +++++++++++--
 mm/hugetlb.c                     | 40 +++++++++++++++---------
 mm/userfaultfd.c                 | 37 +++++++++++++---------
 6 files changed, 156 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index ba35cafa8b0d..14f92285d04f 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1487,6 +1487,10 @@ out_unlock:
 		if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP))
 			ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT);
 
+		/* CONTINUE ioctl is only supported for MINOR ranges. */
+		if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
+			ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE);
+
 		/*
 		 * Now that we scanned all vmas we can already tell
 		 * userland which ioctls methods are guaranteed to
@@ -1840,6 +1844,66 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
 	return ret;
 }
 
+static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
+{
+	__s64 ret;
+	struct uffdio_continue uffdio_continue;
+	struct uffdio_continue __user *user_uffdio_continue;
+	struct userfaultfd_wake_range range;
+
+	user_uffdio_continue = (struct uffdio_continue __user *)arg;
+
+	ret = -EAGAIN;
+	if (READ_ONCE(ctx->mmap_changing))
+		goto out;
+
+	ret = -EFAULT;
+	if (copy_from_user(&uffdio_continue, user_uffdio_continue,
+			   /* don't copy the output fields */
+			   sizeof(uffdio_continue) - (sizeof(__s64))))
+		goto out;
+
+	ret = validate_range(ctx->mm, &uffdio_continue.range.start,
+			     uffdio_continue.range.len);
+	if (ret)
+		goto out;
+
+	ret = -EINVAL;
+	/* double check for wraparound just in case. */
+	if (uffdio_continue.range.start + uffdio_continue.range.len <=
+	    uffdio_continue.range.start) {
+		goto out;
+	}
+	if (uffdio_continue.mode & ~UFFDIO_CONTINUE_MODE_DONTWAKE)
+		goto out;
+
+	if (mmget_not_zero(ctx->mm)) {
+		ret = mcopy_continue(ctx->mm, uffdio_continue.range.start,
+				     uffdio_continue.range.len,
+				     &ctx->mmap_changing);
+		mmput(ctx->mm);
+	} else {
+		return -ESRCH;
+	}
+
+	if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
+		return -EFAULT;
+	if (ret < 0)
+		goto out;
+
+	/* len == 0 would wake all */
+	BUG_ON(!ret);
+	range.len = ret;
+	if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) {
+		range.start = uffdio_continue.range.start;
+		wake_userfault(ctx, &range);
+	}
+	ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN;
+
+out:
+	return ret;
+}
+
 static inline unsigned int uffd_ctx_features(__u64 user_features)
 {
 	/*
@@ -1927,6 +1991,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
 	case UFFDIO_WRITEPROTECT:
 		ret = userfaultfd_writeprotect(ctx, arg);
 		break;
+	case UFFDIO_CONTINUE:
+		ret = userfaultfd_continue(ctx, arg);
+		break;
 	}
 	return ret;
 }
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index a1dbe4568707..b92f25ccef58 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -11,6 +11,7 @@
 #include <linux/kref.h>
 #include <linux/pgtable.h>
 #include <linux/gfp.h>
+#include <linux/userfaultfd_k.h>
 
 struct ctl_table;
 struct user_struct;
@@ -139,6 +140,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
 				struct vm_area_struct *dst_vma,
 				unsigned long dst_addr,
 				unsigned long src_addr,
+				enum mcopy_atomic_mode mode,
 				struct page **pagep);
 #endif /* CONFIG_USERFAULTFD */
 bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
@@ -318,6 +320,7 @@ static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 						struct vm_area_struct *dst_vma,
 						unsigned long dst_addr,
 						unsigned long src_addr,
+						enum mcopy_atomic_mode mode,
 						struct page **pagep)
 {
 	BUG();
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index e060d5f77cc5..794d1538b8ba 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -37,6 +37,22 @@ extern int sysctl_unprivileged_userfaultfd;
 
 extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);
 
+/*
+ * The mode of operation for __mcopy_atomic and its helpers.
+ *
+ * This is almost an implementation detail (mcopy_atomic below doesn't take this
+ * as a parameter), but it's exposed here because memory-kind-specific
+ * implementations (e.g. hugetlbfs) need to know the mode of operation.
+ */
+enum mcopy_atomic_mode {
+	/* A normal copy_from_user into the destination range. */
+	MCOPY_ATOMIC_NORMAL,
+	/* Don't copy; map the destination range to the zero page. */
+	MCOPY_ATOMIC_ZEROPAGE,
+	/* Just install pte(s) with the existing page(s) in the page cache. */
+	MCOPY_ATOMIC_CONTINUE,
+};
+
 extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
 			    unsigned long src_start, unsigned long len,
 			    bool *mmap_changing, __u64 mode);
@@ -44,6 +60,8 @@ extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
 			      unsigned long dst_start,
 			      unsigned long len,
 			      bool *mmap_changing);
+extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start,
+			      unsigned long len, bool *mmap_changing);
 extern int mwriteprotect_range(struct mm_struct *dst_mm,
 			       unsigned long start, unsigned long len,
 			       bool enable_wp, bool *mmap_changing);
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index f24dd4fcbad9..bafbeb1a2624 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -40,10 +40,12 @@
 	((__u64)1 << _UFFDIO_WAKE |		\
 	 (__u64)1 << _UFFDIO_COPY |		\
 	 (__u64)1 << _UFFDIO_ZEROPAGE |		\
-	 (__u64)1 << _UFFDIO_WRITEPROTECT)
+	 (__u64)1 << _UFFDIO_WRITEPROTECT |	\
+	 (__u64)1 << _UFFDIO_CONTINUE)
 #define UFFD_API_RANGE_IOCTLS_BASIC		\
 	((__u64)1 << _UFFDIO_WAKE |		\
-	 (__u64)1 << _UFFDIO_COPY)
+	 (__u64)1 << _UFFDIO_COPY |		\
+	 (__u64)1 << _UFFDIO_CONTINUE)
 
 /*
  * Valid ioctl command number range with this API is from 0x00 to
@@ -59,6 +61,7 @@
 #define _UFFDIO_COPY			(0x03)
 #define _UFFDIO_ZEROPAGE		(0x04)
 #define _UFFDIO_WRITEPROTECT		(0x06)
+#define _UFFDIO_CONTINUE		(0x07)
 #define _UFFDIO_API			(0x3F)
 
 /* userfaultfd ioctl ids */
@@ -77,6 +80,8 @@
 				      struct uffdio_zeropage)
 #define UFFDIO_WRITEPROTECT	_IOWR(UFFDIO, _UFFDIO_WRITEPROTECT, \
 				      struct uffdio_writeprotect)
+#define UFFDIO_CONTINUE		_IOR(UFFDIO, _UFFDIO_CONTINUE,	\
+				     struct uffdio_continue)
 
 /* read() structure */
 struct uffd_msg {
@@ -268,6 +273,18 @@ struct uffdio_writeprotect {
 	__u64 mode;
 };
 
+struct uffdio_continue {
+	struct uffdio_range range;
+#define UFFDIO_CONTINUE_MODE_DONTWAKE		((__u64)1<<0)
+	__u64 mode;
+
+	/*
+	 * Fields below here are written by the ioctl and must be at the end:
+	 * the copy_from_user will not read past here.
+	 */
+	__s64 mapped;
+};
+
 /*
  * Flags for the userfaultfd(2) system call itself.
  */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b105a455124d..533e5a26e437 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -39,7 +39,6 @@
 #include <linux/hugetlb.h>
 #include <linux/hugetlb_cgroup.h>
 #include <linux/node.h>
-#include <linux/userfaultfd_k.h>
 #include <linux/page_owner.h>
 #include "internal.h"
 
@@ -4865,8 +4864,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 			    struct vm_area_struct *dst_vma,
 			    unsigned long dst_addr,
 			    unsigned long src_addr,
+			    enum mcopy_atomic_mode mode,
 			    struct page **pagep)
 {
+	bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
 	struct address_space *mapping;
 	pgoff_t idx;
 	unsigned long size;
@@ -4876,8 +4877,17 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	spinlock_t *ptl;
 	int ret;
 	struct page *page;
+	int writable;
 
-	if (!*pagep) {
+	mapping = dst_vma->vm_file->f_mapping;
+	idx = vma_hugecache_offset(h, dst_vma, dst_addr);
+
+	if (is_continue) {
+		ret = -EFAULT;
+		page = find_lock_page(mapping, idx);
+		if (!page)
+			goto out;
+	} else if (!*pagep) {
 		ret = -ENOMEM;
 		page = alloc_huge_page(dst_vma, dst_addr, 0);
 		if (IS_ERR(page))
@@ -4906,13 +4916,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	 */
 	__SetPageUptodate(page);
 
-	mapping = dst_vma->vm_file->f_mapping;
-	idx = vma_hugecache_offset(h, dst_vma, dst_addr);
-
-	/*
-	 * If shared, add to page cache
-	 */
-	if (vm_shared) {
+	/* Add shared, newly allocated pages to the page cache. */
+	if (vm_shared && !is_continue) {
 		size = i_size_read(mapping->host) >> huge_page_shift(h);
 		ret = -EFAULT;
 		if (idx >= size)
@@ -4957,8 +4962,14 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 		hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
 	}
 
-	_dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE);
-	if (dst_vma->vm_flags & VM_WRITE)
+	/* For CONTINUE on a non-shared VMA, don't set VM_WRITE for CoW. */
+	if (is_continue && !vm_shared)
+		writable = 0;
+	else
+		writable = dst_vma->vm_flags & VM_WRITE;
+
+	_dst_pte = make_huge_pte(dst_vma, page, writable);
+	if (writable)
 		_dst_pte = huge_pte_mkdirty(_dst_pte);
 	_dst_pte = pte_mkyoung(_dst_pte);
 
@@ -4972,15 +4983,16 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	update_mmu_cache(dst_vma, dst_addr, dst_pte);
 
 	spin_unlock(ptl);
-	SetHPageMigratable(page);
-	if (vm_shared)
+	if (!is_continue)
+		SetHPageMigratable(page);
+	if (vm_shared || is_continue)
 		unlock_page(page);
 	ret = 0;
 out:
 	return ret;
 out_release_unlock:
 	spin_unlock(ptl);
-	if (vm_shared)
+	if (vm_shared || is_continue)
 		unlock_page(page);
 out_release_nounlock:
 	put_page(page);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 063cbb17e8d8..e14b3820c6a8 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -207,7 +207,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 					      unsigned long dst_start,
 					      unsigned long src_start,
 					      unsigned long len,
-					      bool zeropage)
+					      enum mcopy_atomic_mode mode)
 {
 	int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
 	int vm_shared = dst_vma->vm_flags & VM_SHARED;
@@ -227,7 +227,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 	 * by THP.  Since we can not reliably insert a zero page, this
 	 * feature is not supported.
 	 */
-	if (zeropage) {
+	if (mode == MCOPY_ATOMIC_ZEROPAGE) {
 		mmap_read_unlock(dst_mm);
 		return -EINVAL;
 	}
@@ -273,8 +273,6 @@ retry:
 	}
 
 	while (src_addr < src_start + len) {
-		pte_t dst_pteval;
-
 		BUG_ON(dst_addr >= dst_start + len);
 
 		/*
@@ -297,16 +295,16 @@ retry:
 			goto out_unlock;
 		}
 
-		err = -EEXIST;
-		dst_pteval = huge_ptep_get(dst_pte);
-		if (!huge_pte_none(dst_pteval)) {
+		if (mode != MCOPY_ATOMIC_CONTINUE &&
+		    !huge_pte_none(huge_ptep_get(dst_pte))) {
+			err = -EEXIST;
 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 			i_mmap_unlock_read(mapping);
 			goto out_unlock;
 		}
 
 		err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
-						dst_addr, src_addr, &page);
+					       dst_addr, src_addr, mode, &page);
 
 		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 		i_mmap_unlock_read(mapping);
@@ -408,7 +406,7 @@ extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 				      unsigned long dst_start,
 				      unsigned long src_start,
 				      unsigned long len,
-				      bool zeropage);
+				      enum mcopy_atomic_mode mode);
 #endif /* CONFIG_HUGETLB_PAGE */
 
 static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
@@ -458,7 +456,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
 					      unsigned long dst_start,
 					      unsigned long src_start,
 					      unsigned long len,
-					      bool zeropage,
+					      enum mcopy_atomic_mode mcopy_mode,
 					      bool *mmap_changing,
 					      __u64 mode)
 {
@@ -469,6 +467,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
 	long copied;
 	struct page *page;
 	bool wp_copy;
+	bool zeropage = (mcopy_mode == MCOPY_ATOMIC_ZEROPAGE);
 
 	/*
 	 * Sanitize the command parameters:
@@ -527,10 +526,12 @@ retry:
 	 */
 	if (is_vm_hugetlb_page(dst_vma))
 		return  __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
-						src_start, len, zeropage);
+						src_start, len, mcopy_mode);
 
 	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
 		goto out_unlock;
+	if (mcopy_mode == MCOPY_ATOMIC_CONTINUE)
+		goto out_unlock;
 
 	/*
 	 * Ensure the dst_vma has a anon_vma or this page
@@ -626,14 +627,22 @@ ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
 		     unsigned long src_start, unsigned long len,
 		     bool *mmap_changing, __u64 mode)
 {
-	return __mcopy_atomic(dst_mm, dst_start, src_start, len, false,
-			      mmap_changing, mode);
+	return __mcopy_atomic(dst_mm, dst_start, src_start, len,
+			      MCOPY_ATOMIC_NORMAL, mmap_changing, mode);
 }
 
 ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
 		       unsigned long len, bool *mmap_changing)
 {
-	return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing, 0);
+	return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE,
+			      mmap_changing, 0);
+}
+
+ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start,
+		       unsigned long len, bool *mmap_changing)
+{
+	return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE,
+			      mmap_changing, 0);
 }
 
 int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
-- 
cgit v1.2.3


From 202e35db5e719ee8af6028183403f475e243f82d Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Tue, 4 May 2021 18:36:04 -0700
Subject: mm/vmscan: replace implicit RECLAIM_ZONE checks with explicit checks

RECLAIM_ZONE was assumed to be unused because it was never explicitly
used in the kernel.  However, there were a number of places where it was
checked implicitly by checking 'node_reclaim_mode' for a zero value.

These zero checks are not great because it is not obvious what a zero
mode *means* in the code.  Replace them with a helper which makes it
more obvious: node_reclaim_enabled().

This helper also provides a handy place to explicitly check the
RECLAIM_ZONE bit itself.  Check it explicitly there to make it more
obvious where the bit can affect behavior.

This should have no functional impact.

Link: https://lkml.kernel.org/r/20210219172559.BF589C44@viggo.jf.intel.com
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Ben Widawsky <ben.widawsky@intel.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Acked-by: Christoph Lameter <cl@linux.com>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Alex Shi <alex.shi@linux.alibaba.com>
Cc: "Tobin C. Harding" <tobin@kernel.org>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Qian Cai <cai@lca.pw>
Cc: Daniel Wagner <dwagner@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h | 7 +++++++
 mm/khugepaged.c      | 2 +-
 mm/page_alloc.c      | 2 +-
 3 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 4cc6ec3bf0ab..42191da1bdc9 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -12,6 +12,7 @@
 #include <linux/fs.h>
 #include <linux/atomic.h>
 #include <linux/page-flags.h>
+#include <uapi/linux/mempolicy.h>
 #include <asm/page.h>
 
 struct notifier_block;
@@ -378,6 +379,12 @@ extern int sysctl_min_slab_ratio;
 #define node_reclaim_mode 0
 #endif
 
+static inline bool node_reclaim_enabled(void)
+{
+	/* Is any node_reclaim_mode bit set? */
+	return node_reclaim_mode & (RECLAIM_ZONE|RECLAIM_WRITE|RECLAIM_UNMAP);
+}
+
 extern void check_move_unevictable_pages(struct pagevec *pvec);
 
 extern int kswapd_run(int nid);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index a03569eda183..ea74da3232ab 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -809,7 +809,7 @@ static bool khugepaged_scan_abort(int nid)
 	 * If node_reclaim_mode is disabled, then no extra effort is made to
 	 * allocate memory locally.
 	 */
-	if (!node_reclaim_mode)
+	if (!node_reclaim_enabled())
 		return false;
 
 	/* If there is a count for this node already, it must be acceptable */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 80fa6e0f9ed9..19cdd8a829dc 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3968,7 +3968,7 @@ retry:
 			if (alloc_flags & ALLOC_NO_WATERMARKS)
 				goto try_this_zone;
 
-			if (node_reclaim_mode == 0 ||
+			if (!node_reclaim_enabled() ||
 			    !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
 				continue;
 
-- 
cgit v1.2.3


From 2bfd36374edd9ed7f2ebf66cacebedf7273901cb Mon Sep 17 00:00:00 2001
From: Yang Shi <shy828301@gmail.com>
Date: Tue, 4 May 2021 18:36:11 -0700
Subject: mm: vmscan: consolidate shrinker_maps handling code

The shrinker map management is not purely memcg specific; it is at the
intersection between memory cgroups and shrinkers.  It is the allocation
and assignment of a structure, and the only memcg bit is that the map is
stored in a memcg structure.  So move the shrinker_maps handling code
into vmscan.c for tighter integration with shrinker code, and remove the
"memcg_" prefix.  There is no functional change.

Link: https://lkml.kernel.org/r/20210311190845.9708-3-shy828301@gmail.com
Signed-off-by: Yang Shi <shy828301@gmail.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h |  11 ++--
 mm/huge_memory.c           |   4 +-
 mm/list_lru.c              |   6 +--
 mm/memcontrol.c            | 130 ++------------------------------------------
 mm/vmscan.c                | 132 +++++++++++++++++++++++++++++++++++++++++++--
 5 files changed, 142 insertions(+), 141 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 5904716f29ba..7dbd2c9bad32 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1610,10 +1610,9 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
 	return false;
 }
 
-extern int memcg_expand_shrinker_maps(int new_id);
-
-extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
-				   int nid, int shrinker_id);
+int alloc_shrinker_maps(struct mem_cgroup *memcg);
+void free_shrinker_maps(struct mem_cgroup *memcg);
+void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id);
 #else
 #define mem_cgroup_sockets_enabled 0
 static inline void mem_cgroup_sk_alloc(struct sock *sk) { };
@@ -1623,8 +1622,8 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
 	return false;
 }
 
-static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
-					  int nid, int shrinker_id)
+static inline void set_shrinker_bit(struct mem_cgroup *memcg,
+				    int nid, int shrinker_id)
 {
 }
 #endif
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b3788683f7d3..98456017744d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2830,8 +2830,8 @@ void deferred_split_huge_page(struct page *page)
 		ds_queue->split_queue_len++;
 #ifdef CONFIG_MEMCG
 		if (memcg)
-			memcg_set_shrinker_bit(memcg, page_to_nid(page),
-					       deferred_split_shrinker.id);
+			set_shrinker_bit(memcg, page_to_nid(page),
+					 deferred_split_shrinker.id);
 #endif
 	}
 	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 6f067b6b935f..cd58790d0fb3 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -125,8 +125,8 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item)
 		list_add_tail(item, &l->list);
 		/* Set shrinker bit if the first element was added */
 		if (!l->nr_items++)
-			memcg_set_shrinker_bit(memcg, nid,
-					       lru_shrinker_id(lru));
+			set_shrinker_bit(memcg, nid,
+					 lru_shrinker_id(lru));
 		nlru->nr_items++;
 		spin_unlock(&nlru->lock);
 		return true;
@@ -540,7 +540,7 @@ static void memcg_drain_list_lru_node(struct list_lru *lru, int nid,
 
 	if (src->nr_items) {
 		dst->nr_items += src->nr_items;
-		memcg_set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru));
+		set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru));
 		src->nr_items = 0;
 	}
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c100265dc393..09fd17ba6de2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -400,130 +400,6 @@ DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
 EXPORT_SYMBOL(memcg_kmem_enabled_key);
 #endif
 
-static int memcg_shrinker_map_size;
-static DEFINE_MUTEX(memcg_shrinker_map_mutex);
-
-static void memcg_free_shrinker_map_rcu(struct rcu_head *head)
-{
-	kvfree(container_of(head, struct memcg_shrinker_map, rcu));
-}
-
-static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
-					 int size, int old_size)
-{
-	struct memcg_shrinker_map *new, *old;
-	struct mem_cgroup_per_node *pn;
-	int nid;
-
-	lockdep_assert_held(&memcg_shrinker_map_mutex);
-
-	for_each_node(nid) {
-		pn = memcg->nodeinfo[nid];
-		old = rcu_dereference_protected(pn->shrinker_map, true);
-		/* Not yet online memcg */
-		if (!old)
-			return 0;
-
-		new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
-		if (!new)
-			return -ENOMEM;
-
-		/* Set all old bits, clear all new bits */
-		memset(new->map, (int)0xff, old_size);
-		memset((void *)new->map + old_size, 0, size - old_size);
-
-		rcu_assign_pointer(pn->shrinker_map, new);
-		call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
-	}
-
-	return 0;
-}
-
-static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
-{
-	struct mem_cgroup_per_node *pn;
-	struct memcg_shrinker_map *map;
-	int nid;
-
-	if (mem_cgroup_is_root(memcg))
-		return;
-
-	for_each_node(nid) {
-		pn = memcg->nodeinfo[nid];
-		map = rcu_dereference_protected(pn->shrinker_map, true);
-		kvfree(map);
-		rcu_assign_pointer(pn->shrinker_map, NULL);
-	}
-}
-
-static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
-{
-	struct memcg_shrinker_map *map;
-	int nid, size, ret = 0;
-
-	if (mem_cgroup_is_root(memcg))
-		return 0;
-
-	mutex_lock(&memcg_shrinker_map_mutex);
-	size = memcg_shrinker_map_size;
-	for_each_node(nid) {
-		map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid);
-		if (!map) {
-			memcg_free_shrinker_maps(memcg);
-			ret = -ENOMEM;
-			break;
-		}
-		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map);
-	}
-	mutex_unlock(&memcg_shrinker_map_mutex);
-
-	return ret;
-}
-
-int memcg_expand_shrinker_maps(int new_id)
-{
-	int size, old_size, ret = 0;
-	struct mem_cgroup *memcg;
-
-	size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long);
-	old_size = memcg_shrinker_map_size;
-	if (size <= old_size)
-		return 0;
-
-	mutex_lock(&memcg_shrinker_map_mutex);
-	if (!root_mem_cgroup)
-		goto unlock;
-
-	for_each_mem_cgroup(memcg) {
-		if (mem_cgroup_is_root(memcg))
-			continue;
-		ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
-		if (ret) {
-			mem_cgroup_iter_break(NULL, memcg);
-			goto unlock;
-		}
-	}
-unlock:
-	if (!ret)
-		memcg_shrinker_map_size = size;
-	mutex_unlock(&memcg_shrinker_map_mutex);
-	return ret;
-}
-
-void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
-{
-	if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
-		struct memcg_shrinker_map *map;
-
-		rcu_read_lock();
-		map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map);
-		/* Pairs with smp mb in shrink_slab() */
-		smp_mb__before_atomic();
-		set_bit(shrinker_id, map->map);
-		rcu_read_unlock();
-	}
-}
-
 /**
  * mem_cgroup_css_from_page - css of the memcg associated with a page
  * @page: page of interest
@@ -5242,11 +5118,11 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
 
 	/*
-	 * A memcg must be visible for memcg_expand_shrinker_maps()
+	 * A memcg must be visible for expand_shrinker_maps()
 	 * by the time the maps are allocated. So, we allocate maps
 	 * here, when for_each_mem_cgroup() can't skip it.
 	 */
-	if (memcg_alloc_shrinker_maps(memcg)) {
+	if (alloc_shrinker_maps(memcg)) {
 		mem_cgroup_id_remove(memcg);
 		return -ENOMEM;
 	}
@@ -5310,7 +5186,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
 	vmpressure_cleanup(&memcg->vmpressure);
 	cancel_work_sync(&memcg->high_work);
 	mem_cgroup_remove_from_trees(memcg);
-	memcg_free_shrinker_maps(memcg);
+	free_shrinker_maps(memcg);
 	memcg_free_kmem(memcg);
 	mem_cgroup_free(memcg);
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d231af7cae06..bead5ae1e7e2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -185,6 +185,132 @@ static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
 
 #ifdef CONFIG_MEMCG
+
+static int memcg_shrinker_map_size;
+static DEFINE_MUTEX(memcg_shrinker_map_mutex);
+
+static void free_shrinker_map_rcu(struct rcu_head *head)
+{
+	kvfree(container_of(head, struct memcg_shrinker_map, rcu));
+}
+
+static int expand_one_shrinker_map(struct mem_cgroup *memcg,
+				   int size, int old_size)
+{
+	struct memcg_shrinker_map *new, *old;
+	struct mem_cgroup_per_node *pn;
+	int nid;
+
+	lockdep_assert_held(&memcg_shrinker_map_mutex);
+
+	for_each_node(nid) {
+		pn = memcg->nodeinfo[nid];
+		old = rcu_dereference_protected(pn->shrinker_map, true);
+		/* Not yet online memcg */
+		if (!old)
+			return 0;
+
+		new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
+		if (!new)
+			return -ENOMEM;
+
+		/* Set all old bits, clear all new bits */
+		memset(new->map, (int)0xff, old_size);
+		memset((void *)new->map + old_size, 0, size - old_size);
+
+		rcu_assign_pointer(pn->shrinker_map, new);
+		call_rcu(&old->rcu, free_shrinker_map_rcu);
+	}
+
+	return 0;
+}
+
+void free_shrinker_maps(struct mem_cgroup *memcg)
+{
+	struct mem_cgroup_per_node *pn;
+	struct memcg_shrinker_map *map;
+	int nid;
+
+	if (mem_cgroup_is_root(memcg))
+		return;
+
+	for_each_node(nid) {
+		pn = memcg->nodeinfo[nid];
+		map = rcu_dereference_protected(pn->shrinker_map, true);
+		kvfree(map);
+		rcu_assign_pointer(pn->shrinker_map, NULL);
+	}
+}
+
+int alloc_shrinker_maps(struct mem_cgroup *memcg)
+{
+	struct memcg_shrinker_map *map;
+	int nid, size, ret = 0;
+
+	if (mem_cgroup_is_root(memcg))
+		return 0;
+
+	mutex_lock(&memcg_shrinker_map_mutex);
+	size = memcg_shrinker_map_size;
+	for_each_node(nid) {
+		map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid);
+		if (!map) {
+			free_shrinker_maps(memcg);
+			ret = -ENOMEM;
+			break;
+		}
+		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map);
+	}
+	mutex_unlock(&memcg_shrinker_map_mutex);
+
+	return ret;
+}
+
+static int expand_shrinker_maps(int new_id)
+{
+	int size, old_size, ret = 0;
+	struct mem_cgroup *memcg;
+
+	size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long);
+	old_size = memcg_shrinker_map_size;
+	if (size <= old_size)
+		return 0;
+
+	mutex_lock(&memcg_shrinker_map_mutex);
+	if (!root_mem_cgroup)
+		goto unlock;
+
+	memcg = mem_cgroup_iter(NULL, NULL, NULL);
+	do {
+		if (mem_cgroup_is_root(memcg))
+			continue;
+		ret = expand_one_shrinker_map(memcg, size, old_size);
+		if (ret) {
+			mem_cgroup_iter_break(NULL, memcg);
+			goto unlock;
+		}
+	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
+unlock:
+	if (!ret)
+		memcg_shrinker_map_size = size;
+	mutex_unlock(&memcg_shrinker_map_mutex);
+	return ret;
+}
+
+void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
+{
+	if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
+		struct memcg_shrinker_map *map;
+
+		rcu_read_lock();
+		map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map);
+		/* Pairs with smp mb in shrink_slab() */
+		smp_mb__before_atomic();
+		set_bit(shrinker_id, map->map);
+		rcu_read_unlock();
+	}
+}
+
 /*
  * We allow subsystems to populate their shrinker-related
  * LRU lists before register_shrinker_prepared() is called
@@ -212,7 +338,7 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker)
 		goto unlock;
 
 	if (id >= shrinker_nr_max) {
-		if (memcg_expand_shrinker_maps(id)) {
+		if (expand_shrinker_maps(id)) {
 			idr_remove(&shrinker_idr, id);
 			goto unlock;
 		}
@@ -590,7 +716,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 			 * case, we invoke the shrinker one more time and reset
 			 * the bit if it reports that it is not empty anymore.
 			 * The memory barrier here pairs with the barrier in
-			 * memcg_set_shrinker_bit():
+			 * set_shrinker_bit():
 			 *
 			 * list_lru_add()     shrink_slab_memcg()
 			 *   list_add_tail()    clear_bit()
@@ -602,7 +728,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 			if (ret == SHRINK_EMPTY)
 				ret = 0;
 			else
-				memcg_set_shrinker_bit(memcg, nid, i);
+				set_shrinker_bit(memcg, nid, i);
 		}
 		freed += ret;
 
-- 
cgit v1.2.3


From e4262c4f51d6373447c9d89093f49ff6b1e607be Mon Sep 17 00:00:00 2001
From: Yang Shi <shy828301@gmail.com>
Date: Tue, 4 May 2021 18:36:23 -0700
Subject: mm: memcontrol: rename shrinker_map to shrinker_info

The following patch is going to add nr_deferred to shrinker_map, so the
structure will no longer hold only the map; rename it to
"memcg_shrinker_info".  This should make the patch adding nr_deferred
cleaner and more readable, and make review easier.  Also remove the
"memcg_" prefix.

Link: https://lkml.kernel.org/r/20210311190845.9708-7-shy828301@gmail.com
Signed-off-by: Yang Shi <shy828301@gmail.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h |  8 +++----
 mm/memcontrol.c            |  6 ++---
 mm/vmscan.c                | 58 +++++++++++++++++++++++-----------------------
 3 files changed, 36 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 7dbd2c9bad32..6cd800fe9a67 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -117,7 +117,7 @@ struct batched_lruvec_stat {
  * Bitmap of shrinker::id corresponding to memcg-aware shrinkers,
  * which have elements charged to this memcg.
  */
-struct memcg_shrinker_map {
+struct shrinker_info {
 	struct rcu_head rcu;
 	unsigned long map[];
 };
@@ -145,7 +145,7 @@ struct mem_cgroup_per_node {
 
 	struct mem_cgroup_reclaim_iter	iter;
 
-	struct memcg_shrinker_map __rcu	*shrinker_map;
+	struct shrinker_info __rcu	*shrinker_info;
 
 	struct rb_node		tree_node;	/* RB tree node */
 	unsigned long		usage_in_excess;/* Set to the value by which */
@@ -1610,8 +1610,8 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
 	return false;
 }
 
-int alloc_shrinker_maps(struct mem_cgroup *memcg);
-void free_shrinker_maps(struct mem_cgroup *memcg);
+int alloc_shrinker_info(struct mem_cgroup *memcg);
+void free_shrinker_info(struct mem_cgroup *memcg);
 void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id);
 #else
 #define mem_cgroup_sockets_enabled 0
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 09fd17ba6de2..36f31d611dea 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5118,11 +5118,11 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
 
 	/*
-	 * A memcg must be visible for expand_shrinker_maps()
+	 * A memcg must be visible for expand_shrinker_info()
 	 * by the time the maps are allocated. So, we allocate maps
 	 * here, when for_each_mem_cgroup() can't skip it.
 	 */
-	if (alloc_shrinker_maps(memcg)) {
+	if (alloc_shrinker_info(memcg)) {
 		mem_cgroup_id_remove(memcg);
 		return -ENOMEM;
 	}
@@ -5186,7 +5186,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
 	vmpressure_cleanup(&memcg->vmpressure);
 	cancel_work_sync(&memcg->high_work);
 	mem_cgroup_remove_from_trees(memcg);
-	free_shrinker_maps(memcg);
+	free_shrinker_info(memcg);
 	memcg_free_kmem(memcg);
 	mem_cgroup_free(memcg);
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index aa99a835cf89..518084ce8757 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -192,16 +192,16 @@ static inline int shrinker_map_size(int nr_items)
 	return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long));
 }
 
-static int expand_one_shrinker_map(struct mem_cgroup *memcg,
-				   int size, int old_size)
+static int expand_one_shrinker_info(struct mem_cgroup *memcg,
+				    int size, int old_size)
 {
-	struct memcg_shrinker_map *new, *old;
+	struct shrinker_info *new, *old;
 	struct mem_cgroup_per_node *pn;
 	int nid;
 
 	for_each_node(nid) {
 		pn = memcg->nodeinfo[nid];
-		old = rcu_dereference_protected(pn->shrinker_map, true);
+		old = rcu_dereference_protected(pn->shrinker_info, true);
 		/* Not yet online memcg */
 		if (!old)
 			return 0;
@@ -214,17 +214,17 @@ static int expand_one_shrinker_map(struct mem_cgroup *memcg,
 		memset(new->map, (int)0xff, old_size);
 		memset((void *)new->map + old_size, 0, size - old_size);
 
-		rcu_assign_pointer(pn->shrinker_map, new);
+		rcu_assign_pointer(pn->shrinker_info, new);
 		kvfree_rcu(old, rcu);
 	}
 
 	return 0;
 }
 
-void free_shrinker_maps(struct mem_cgroup *memcg)
+void free_shrinker_info(struct mem_cgroup *memcg)
 {
 	struct mem_cgroup_per_node *pn;
-	struct memcg_shrinker_map *map;
+	struct shrinker_info *info;
 	int nid;
 
 	if (mem_cgroup_is_root(memcg))
@@ -232,15 +232,15 @@ void free_shrinker_maps(struct mem_cgroup *memcg)
 
 	for_each_node(nid) {
 		pn = memcg->nodeinfo[nid];
-		map = rcu_dereference_protected(pn->shrinker_map, true);
-		kvfree(map);
-		rcu_assign_pointer(pn->shrinker_map, NULL);
+		info = rcu_dereference_protected(pn->shrinker_info, true);
+		kvfree(info);
+		rcu_assign_pointer(pn->shrinker_info, NULL);
 	}
 }
 
-int alloc_shrinker_maps(struct mem_cgroup *memcg)
+int alloc_shrinker_info(struct mem_cgroup *memcg)
 {
-	struct memcg_shrinker_map *map;
+	struct shrinker_info *info;
 	int nid, size, ret = 0;
 
 	if (mem_cgroup_is_root(memcg))
@@ -249,20 +249,20 @@ int alloc_shrinker_maps(struct mem_cgroup *memcg)
 	down_write(&shrinker_rwsem);
 	size = shrinker_map_size(shrinker_nr_max);
 	for_each_node(nid) {
-		map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid);
-		if (!map) {
-			free_shrinker_maps(memcg);
+		info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid);
+		if (!info) {
+			free_shrinker_info(memcg);
 			ret = -ENOMEM;
 			break;
 		}
-		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map);
+		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
 	}
 	up_write(&shrinker_rwsem);
 
 	return ret;
 }
 
-static int expand_shrinker_maps(int new_id)
+static int expand_shrinker_info(int new_id)
 {
 	int size, old_size, ret = 0;
 	int new_nr_max = new_id + 1;
@@ -282,7 +282,7 @@ static int expand_shrinker_maps(int new_id)
 	do {
 		if (mem_cgroup_is_root(memcg))
 			continue;
-		ret = expand_one_shrinker_map(memcg, size, old_size);
+		ret = expand_one_shrinker_info(memcg, size, old_size);
 		if (ret) {
 			mem_cgroup_iter_break(NULL, memcg);
 			goto out;
@@ -298,13 +298,13 @@ out:
 void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
 {
 	if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
-		struct memcg_shrinker_map *map;
+		struct shrinker_info *info;
 
 		rcu_read_lock();
-		map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map);
+		info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
 		/* Pairs with smp mb in shrink_slab() */
 		smp_mb__before_atomic();
-		set_bit(shrinker_id, map->map);
+		set_bit(shrinker_id, info->map);
 		rcu_read_unlock();
 	}
 }
@@ -335,7 +335,7 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker)
 		goto unlock;
 
 	if (id >= shrinker_nr_max) {
-		if (expand_shrinker_maps(id)) {
+		if (expand_shrinker_info(id)) {
 			idr_remove(&shrinker_idr, id);
 			goto unlock;
 		}
@@ -665,7 +665,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
 static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 			struct mem_cgroup *memcg, int priority)
 {
-	struct memcg_shrinker_map *map;
+	struct shrinker_info *info;
 	unsigned long ret, freed = 0;
 	int i;
 
@@ -675,12 +675,12 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 	if (!down_read_trylock(&shrinker_rwsem))
 		return 0;
 
-	map = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_map,
-					true);
-	if (unlikely(!map))
+	info = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
+					 true);
+	if (unlikely(!info))
 		goto unlock;
 
-	for_each_set_bit(i, map->map, shrinker_nr_max) {
+	for_each_set_bit(i, info->map, shrinker_nr_max) {
 		struct shrink_control sc = {
 			.gfp_mask = gfp_mask,
 			.nid = nid,
@@ -691,7 +691,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 		shrinker = idr_find(&shrinker_idr, i);
 		if (unlikely(!shrinker || shrinker == SHRINKER_REGISTERING)) {
 			if (!shrinker)
-				clear_bit(i, map->map);
+				clear_bit(i, info->map);
 			continue;
 		}
 
@@ -702,7 +702,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 
 		ret = do_shrink_slab(&sc, shrinker, priority);
 		if (ret == SHRINK_EMPTY) {
-			clear_bit(i, map->map);
+			clear_bit(i, info->map);
 			/*
 			 * After the shrinker reported that it had no objects to
 			 * free, but before we cleared the corresponding bit in
-- 
cgit v1.2.3


From 41ca668a71e7b03743369a2c6d8b8edc1e943dc8 Mon Sep 17 00:00:00 2001
From: Yang Shi <shy828301@gmail.com>
Date: Tue, 4 May 2021 18:36:29 -0700
Subject: mm: vmscan: use a new flag to indicate shrinker is registered

Currently registered shrinker is indicated by non-NULL
shrinker->nr_deferred.  This approach is fine with nr_deferred at the
shrinker level, but the following patches will move MEMCG_AWARE
shrinkers' nr_deferred to memcg level, so their shrinker->nr_deferred
would always be NULL.  This would prevent the shrinkers from
unregistering correctly.

Remove SHRINKER_REGISTERING since we could check if shrinker is
registered successfully by the new flag.

Link: https://lkml.kernel.org/r/20210311190845.9708-9-shy828301@gmail.com
Signed-off-by: Yang Shi <shy828301@gmail.com>
Acked-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/shrinker.h |  7 ++++---
 mm/vmscan.c              | 40 +++++++++++++++-------------------------
 2 files changed, 19 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 0f80123650e2..1eac79ce57d4 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -79,13 +79,14 @@ struct shrinker {
 #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
 
 /* Flags */
-#define SHRINKER_NUMA_AWARE	(1 << 0)
-#define SHRINKER_MEMCG_AWARE	(1 << 1)
+#define SHRINKER_REGISTERED	(1 << 0)
+#define SHRINKER_NUMA_AWARE	(1 << 1)
+#define SHRINKER_MEMCG_AWARE	(1 << 2)
 /*
  * It just makes sense when the shrinker is also MEMCG_AWARE for now,
  * non-MEMCG_AWARE shrinker should not have this flag set.
  */
-#define SHRINKER_NONSLAB	(1 << 2)
+#define SHRINKER_NONSLAB	(1 << 3)
 
 extern int prealloc_shrinker(struct shrinker *shrinker);
 extern void register_shrinker_prepared(struct shrinker *shrinker);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 400f4a657b27..d1601163d895 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -316,19 +316,6 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
 	}
 }
 
-/*
- * We allow subsystems to populate their shrinker-related
- * LRU lists before register_shrinker_prepared() is called
- * for the shrinker, since we don't want to impose
- * restrictions on their internal registration order.
- * In this case shrink_slab_memcg() may find corresponding
- * bit is set in the shrinkers map.
- *
- * This value is used by the function to detect registering
- * shrinkers and to skip do_shrink_slab() calls for them.
- */
-#define SHRINKER_REGISTERING ((struct shrinker *)~0UL)
-
 static DEFINE_IDR(shrinker_idr);
 
 static int prealloc_memcg_shrinker(struct shrinker *shrinker)
@@ -337,7 +324,7 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker)
 
 	down_write(&shrinker_rwsem);
 	/* This may call shrinker, so it must use down_read_trylock() */
-	id = idr_alloc(&shrinker_idr, SHRINKER_REGISTERING, 0, 0, GFP_KERNEL);
+	id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
 	if (id < 0)
 		goto unlock;
 
@@ -360,9 +347,9 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker)
 
 	BUG_ON(id < 0);
 
-	down_write(&shrinker_rwsem);
+	lockdep_assert_held(&shrinker_rwsem);
+
 	idr_remove(&shrinker_idr, id);
-	up_write(&shrinker_rwsem);
 }
 
 static bool cgroup_reclaim(struct scan_control *sc)
@@ -490,8 +477,11 @@ void free_prealloced_shrinker(struct shrinker *shrinker)
 	if (!shrinker->nr_deferred)
 		return;
 
-	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
+	if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
+		down_write(&shrinker_rwsem);
 		unregister_memcg_shrinker(shrinker);
+		up_write(&shrinker_rwsem);
+	}
 
 	kfree(shrinker->nr_deferred);
 	shrinker->nr_deferred = NULL;
@@ -501,10 +491,7 @@ void register_shrinker_prepared(struct shrinker *shrinker)
 {
 	down_write(&shrinker_rwsem);
 	list_add_tail(&shrinker->list, &shrinker_list);
-#ifdef CONFIG_MEMCG
-	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
-		idr_replace(&shrinker_idr, shrinker, shrinker->id);
-#endif
+	shrinker->flags |= SHRINKER_REGISTERED;
 	up_write(&shrinker_rwsem);
 }
 
@@ -524,13 +511,16 @@ EXPORT_SYMBOL(register_shrinker);
  */
 void unregister_shrinker(struct shrinker *shrinker)
 {
-	if (!shrinker->nr_deferred)
+	if (!(shrinker->flags & SHRINKER_REGISTERED))
 		return;
-	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
-		unregister_memcg_shrinker(shrinker);
+
 	down_write(&shrinker_rwsem);
 	list_del(&shrinker->list);
+	shrinker->flags &= ~SHRINKER_REGISTERED;
+	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
+		unregister_memcg_shrinker(shrinker);
 	up_write(&shrinker_rwsem);
+
 	kfree(shrinker->nr_deferred);
 	shrinker->nr_deferred = NULL;
 }
@@ -695,7 +685,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 		struct shrinker *shrinker;
 
 		shrinker = idr_find(&shrinker_idr, i);
-		if (unlikely(!shrinker || shrinker == SHRINKER_REGISTERING)) {
+		if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) {
 			if (!shrinker)
 				clear_bit(i, info->map);
 			continue;
-- 
cgit v1.2.3


From 3c6f17e6c5d048c8029578c475dd037dd5db58af Mon Sep 17 00:00:00 2001
From: Yang Shi <shy828301@gmail.com>
Date: Tue, 4 May 2021 18:36:33 -0700
Subject: mm: vmscan: add per memcg shrinker nr_deferred

Currently the number of deferred objects is tracked per shrinker, but
some slabs, for example the vfs inode/dentry caches, are per memcg.
This results in poor isolation among memcgs.

The deferred objects typically are generated by __GFP_NOFS allocations,
one memcg with excessive __GFP_NOFS allocations may blow up deferred
objects, then other innocent memcgs may suffer from over shrink,
excessive reclaim latency, etc.

For example, two workloads run in memcgA and memcgB respectively, and
the workload in B is vfs heavy.  If the workload in A generates
excessive deferred objects, then B's vfs cache might be hit heavily
(dropping half of the caches) by B's limit reclaim or global reclaim.

We observed this hit in our production environment, which was running a
vfs heavy workload, as shown in the tracing log below:

  <...>-409454 [016] .... 28286961.747146: mm_shrink_slab_start: super_cache_scan+0x0/0x1a0 ffff9a83046f3458:
  nid: 1 objects to shrink 3641681686040 gfp_flags GFP_HIGHUSER_MOVABLE|__GFP_ZERO pgs_scanned 1 lru_pgs 15721
  cache items 246404277 delta 31345 total_scan 123202138
  <...>-409454 [022] .... 28287105.928018: mm_shrink_slab_end: super_cache_scan+0x0/0x1a0 ffff9a83046f3458:
  nid: 1 unused scan count 3641681686040 new scan count 3641798379189 total_scan 602
  last shrinker return val 123186855

The vfs cache to page cache ratio was 10:1 on this machine, and half of
the caches were dropped.  This also resulted in a significant amount of
page cache being dropped due to inode eviction.

Making nr_deferred per memcg for memcg aware shrinkers would solve the
unfairness and bring better isolation.

The following patch will add nr_deferred to the parent memcg when a
memcg goes offline.  To preserve nr_deferred when reparenting memcgs to
root, the root memcg needs shrinker_info allocated too.

When memcg is not enabled (!CONFIG_MEMCG or memcg disabled), the
shrinker's nr_deferred would be used.  And non memcg aware shrinkers use
shrinker's nr_deferred all the time.

Link: https://lkml.kernel.org/r/20210311190845.9708-10-shy828301@gmail.com
Signed-off-by: Yang Shi <shy828301@gmail.com>
Acked-by: Roman Gushchin <guro@fb.com>
Acked-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h |  7 +++---
 mm/vmscan.c                | 60 ++++++++++++++++++++++++++++++++--------------
 2 files changed, 46 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6cd800fe9a67..32bd62047238 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -114,12 +114,13 @@ struct batched_lruvec_stat {
 };
 
 /*
- * Bitmap of shrinker::id corresponding to memcg-aware shrinkers,
- * which have elements charged to this memcg.
+ * Bitmap and deferred work of shrinker::id corresponding to memcg-aware
+ * shrinkers, which have elements charged to this memcg.
  */
 struct shrinker_info {
 	struct rcu_head rcu;
-	unsigned long map[];
+	atomic_long_t *nr_deferred;
+	unsigned long *map;
 };
 
 /*
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d1601163d895..db668c4c78f4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -187,11 +187,17 @@ static DECLARE_RWSEM(shrinker_rwsem);
 #ifdef CONFIG_MEMCG
 static int shrinker_nr_max;
 
+/* The shrinker_info is expanded in a batch of BITS_PER_LONG */
 static inline int shrinker_map_size(int nr_items)
 {
 	return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long));
 }
 
+static inline int shrinker_defer_size(int nr_items)
+{
+	return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t));
+}
+
 static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
 						     int nid)
 {
@@ -200,11 +206,13 @@ static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
 }
 
 static int expand_one_shrinker_info(struct mem_cgroup *memcg,
-				    int size, int old_size)
+				    int map_size, int defer_size,
+				    int old_map_size, int old_defer_size)
 {
 	struct shrinker_info *new, *old;
 	struct mem_cgroup_per_node *pn;
 	int nid;
+	int size = map_size + defer_size;
 
 	for_each_node(nid) {
 		pn = memcg->nodeinfo[nid];
@@ -217,9 +225,16 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg,
 		if (!new)
 			return -ENOMEM;
 
-		/* Set all old bits, clear all new bits */
-		memset(new->map, (int)0xff, old_size);
-		memset((void *)new->map + old_size, 0, size - old_size);
+		new->nr_deferred = (atomic_long_t *)(new + 1);
+		new->map = (void *)new->nr_deferred + defer_size;
+
+		/* map: set all old bits, clear all new bits */
+		memset(new->map, (int)0xff, old_map_size);
+		memset((void *)new->map + old_map_size, 0, map_size - old_map_size);
+		/* nr_deferred: copy old values, clear all new values */
+		memcpy(new->nr_deferred, old->nr_deferred, old_defer_size);
+		memset((void *)new->nr_deferred + old_defer_size, 0,
+		       defer_size - old_defer_size);
 
 		rcu_assign_pointer(pn->shrinker_info, new);
 		kvfree_rcu(old, rcu);
@@ -234,9 +249,6 @@ void free_shrinker_info(struct mem_cgroup *memcg)
 	struct shrinker_info *info;
 	int nid;
 
-	if (mem_cgroup_is_root(memcg))
-		return;
-
 	for_each_node(nid) {
 		pn = memcg->nodeinfo[nid];
 		info = rcu_dereference_protected(pn->shrinker_info, true);
@@ -249,12 +261,12 @@ int alloc_shrinker_info(struct mem_cgroup *memcg)
 {
 	struct shrinker_info *info;
 	int nid, size, ret = 0;
-
-	if (mem_cgroup_is_root(memcg))
-		return 0;
+	int map_size, defer_size = 0;
 
 	down_write(&shrinker_rwsem);
-	size = shrinker_map_size(shrinker_nr_max);
+	map_size = shrinker_map_size(shrinker_nr_max);
+	defer_size = shrinker_defer_size(shrinker_nr_max);
+	size = map_size + defer_size;
 	for_each_node(nid) {
 		info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid);
 		if (!info) {
@@ -262,6 +274,8 @@ int alloc_shrinker_info(struct mem_cgroup *memcg)
 			ret = -ENOMEM;
 			break;
 		}
+		info->nr_deferred = (atomic_long_t *)(info + 1);
+		info->map = (void *)info->nr_deferred + defer_size;
 		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
 	}
 	up_write(&shrinker_rwsem);
@@ -269,15 +283,21 @@ int alloc_shrinker_info(struct mem_cgroup *memcg)
 	return ret;
 }
 
+static inline bool need_expand(int nr_max)
+{
+	return round_up(nr_max, BITS_PER_LONG) >
+	       round_up(shrinker_nr_max, BITS_PER_LONG);
+}
+
 static int expand_shrinker_info(int new_id)
 {
-	int size, old_size, ret = 0;
+	int ret = 0;
 	int new_nr_max = new_id + 1;
+	int map_size, defer_size = 0;
+	int old_map_size, old_defer_size = 0;
 	struct mem_cgroup *memcg;
 
-	size = shrinker_map_size(new_nr_max);
-	old_size = shrinker_map_size(shrinker_nr_max);
-	if (size <= old_size)
+	if (!need_expand(new_nr_max))
 		goto out;
 
 	if (!root_mem_cgroup)
@@ -285,11 +305,15 @@ static int expand_shrinker_info(int new_id)
 
 	lockdep_assert_held(&shrinker_rwsem);
 
+	map_size = shrinker_map_size(new_nr_max);
+	defer_size = shrinker_defer_size(new_nr_max);
+	old_map_size = shrinker_map_size(shrinker_nr_max);
+	old_defer_size = shrinker_defer_size(shrinker_nr_max);
+
 	memcg = mem_cgroup_iter(NULL, NULL, NULL);
 	do {
-		if (mem_cgroup_is_root(memcg))
-			continue;
-		ret = expand_one_shrinker_info(memcg, size, old_size);
+		ret = expand_one_shrinker_info(memcg, map_size, defer_size,
+					       old_map_size, old_defer_size);
 		if (ret) {
 			mem_cgroup_iter_break(NULL, memcg);
 			goto out;
-- 
cgit v1.2.3


From a178015cde69981cdcd8f109c5abc98703fead62 Mon Sep 17 00:00:00 2001
From: Yang Shi <shy828301@gmail.com>
Date: Tue, 4 May 2021 18:36:42 -0700
Subject: mm: memcontrol: reparent nr_deferred when memcg offline

Now that the shrinker's nr_deferred is per-memcg for memcg-aware
shrinkers, add it to the parent's corresponding nr_deferred when a memcg
goes offline.

Link: https://lkml.kernel.org/r/20210311190845.9708-13-shy828301@gmail.com
Signed-off-by: Yang Shi <shy828301@gmail.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h |  1 +
 mm/memcontrol.c            |  1 +
 mm/vmscan.c                | 24 ++++++++++++++++++++++++
 3 files changed, 26 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 32bd62047238..c193be760709 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1614,6 +1614,7 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
 int alloc_shrinker_info(struct mem_cgroup *memcg);
 void free_shrinker_info(struct mem_cgroup *memcg);
 void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id);
+void reparent_shrinker_deferred(struct mem_cgroup *memcg);
 #else
 #define mem_cgroup_sockets_enabled 0
 static inline void mem_cgroup_sk_alloc(struct sock *sk) { };
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 36f31d611dea..3004afb6d090 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5154,6 +5154,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 	page_counter_set_low(&memcg->memory, 0);
 
 	memcg_offline_kmem(memcg);
+	reparent_shrinker_deferred(memcg);
 	wb_memcg_offline(memcg);
 
 	drain_all_stock(memcg);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9617c5ff3e98..8c2d2003acbe 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -397,6 +397,30 @@ static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
 	return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
 }
 
+void reparent_shrinker_deferred(struct mem_cgroup *memcg)
+{
+	int i, nid;
+	long nr;
+	struct mem_cgroup *parent;
+	struct shrinker_info *child_info, *parent_info;
+
+	parent = parent_mem_cgroup(memcg);
+	if (!parent)
+		parent = root_mem_cgroup;
+
+	/* Prevent from concurrent shrinker_info expand */
+	down_read(&shrinker_rwsem);
+	for_each_node(nid) {
+		child_info = shrinker_info_protected(memcg, nid);
+		parent_info = shrinker_info_protected(parent, nid);
+		for (i = 0; i < shrinker_nr_max; i++) {
+			nr = atomic_long_read(&child_info->nr_deferred[i]);
+			atomic_long_add(nr, &parent_info->nr_deferred[i]);
+		}
+	}
+	up_read(&shrinker_rwsem);
+}
+
 static bool cgroup_reclaim(struct scan_control *sc)
 {
 	return sc->target_mem_cgroup;
-- 
cgit v1.2.3


From ef4984384172e93cc95e0e8cd102536d67e8a787 Mon Sep 17 00:00:00 2001
From: Pintu Kumar <pintu@codeaurora.org>
Date: Tue, 4 May 2021 18:36:48 -0700
Subject: mm/compaction: remove unused variable sysctl_compact_memory

The sysctl_compact_memory is mostly unused in mm/compaction.c.  It just
acts as a placeholder for sysctl to store .data.

But the .data itself is not needed here.

So we can get rid of this variable completely and make .data NULL.
This will also eliminate the extern declaration from the header file.  No
functionality is broken or changed this way.

Link: https://lkml.kernel.org/r/1614852224-14671-1-git-send-email-pintu@codeaurora.org
Signed-off-by: Pintu Kumar <pintu@codeaurora.org>
Signed-off-by: Pintu Agarwal <pintu.ping@gmail.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compaction.h | 1 -
 kernel/sysctl.c            | 2 +-
 mm/compaction.c            | 3 ---
 3 files changed, 1 insertion(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index ed4070ed41ef..4221888bdcd6 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -81,7 +81,6 @@ static inline unsigned long compact_gap(unsigned int order)
 }
 
 #ifdef CONFIG_COMPACTION
-extern int sysctl_compact_memory;
 extern unsigned int sysctl_compaction_proactiveness;
 extern int sysctl_compaction_handler(struct ctl_table *table, int write,
 			void *buffer, size_t *length, loff_t *ppos);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f91d327273c1..14edf84cc571 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2830,7 +2830,7 @@ static struct ctl_table vm_table[] = {
 #ifdef CONFIG_COMPACTION
 	{
 		.procname	= "compact_memory",
-		.data		= &sysctl_compact_memory,
+		.data		= NULL,
 		.maxlen		= sizeof(int),
 		.mode		= 0200,
 		.proc_handler	= sysctl_compaction_handler,
diff --git a/mm/compaction.c b/mm/compaction.c
index 335862f1661c..027eb794e747 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -2692,9 +2692,6 @@ static void compact_nodes(void)
 		compact_node(nid);
 }
 
-/* The written value is actually unused, all memory is compacted */
-int sysctl_compact_memory;
-
 /*
  * Tunable for proactive compaction. It determines how
  * aggressively the kernel should compact memory in the
-- 
cgit v1.2.3


From d479960e44f27e0e52ba31b21740b703c538027c Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Tue, 4 May 2021 18:36:54 -0700
Subject: mm: disable LRU pagevec during the migration temporarily

LRU pagevec holds refcount of pages until the pagevec is drained.  It
could prevent migration since the refcount of the page is greater than
the expectation in migration logic.  To mitigate the issue, callers of
migrate_pages drain the LRU pagevec via migrate_prep or lru_add_drain_all
before the migrate_pages call.

However, it's not enough because pages coming into pagevec after the
draining call still could stay at the pagevec so it could keep
preventing page migration.  Since some callers of migrate_pages have
retrial logic with LRU draining, the page would migrate at next trial
but it is still fragile in that it doesn't close the fundamental race
between upcoming LRU pages into pagevec and migration so the migration
failure could cause contiguous memory allocation failure in the end.

To close the race, this patch disables lru caches (i.e., pagevec) during
ongoing migration until migration is done.

Since it's really hard to reproduce, I measured how many times
migrate_pages retried with force mode(it is about a fallback to a sync
migration) with below debug code.

int migrate_pages(struct list_head *from, new_page_t get_new_page,
			..
			..

  if (rc && reason == MR_CONTIG_RANGE && pass > 2) {
         printk(KERN_ERR "pfn 0x%lx reason %d", page_to_pfn(page), rc);
         dump_page(page, "fail to migrate");
  }

The test was repeating android apps launching with cma allocation in
background every five seconds.  Total cma allocation count was about 500
during the testing.  With this patch, the dump_page count was reduced
from 400 to 30.

The new interface is also useful for memory hotplug which currently
drains lru pcp caches after each migration failure.  This is rather
suboptimal as it has to disrupt others running during the operation.
With the new interface the operation happens only once.  This is also in
line with pcp allocator cache which are disabled for the offlining as
well.

Link: https://lkml.kernel.org/r/20210319175127.886124-1-minchan@kernel.org
Signed-off-by: Minchan Kim <minchan@kernel.org>
Reviewed-by: Chris Goldsworthy <cgoldswo@codeaurora.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: John Dias <joaodias@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Oliver Sang <oliver.sang@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/migrate.h |  2 ++
 include/linux/swap.h    | 14 +++++++++++
 mm/memory_hotplug.c     |  3 ++-
 mm/mempolicy.c          |  4 ++++
 mm/migrate.c            | 11 +++++----
 mm/page_alloc.c         |  2 ++
 mm/swap.c               | 64 ++++++++++++++++++++++++++++++++++++++++++-------
 7 files changed, 86 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 3a389633b68f..9e4a2dc8622c 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -46,6 +46,7 @@ extern int isolate_movable_page(struct page *page, isolate_mode_t mode);
 extern void putback_movable_page(struct page *page);
 
 extern void migrate_prep(void);
+extern void migrate_finish(void);
 extern void migrate_prep_local(void);
 extern void migrate_page_states(struct page *newpage, struct page *page);
 extern void migrate_page_copy(struct page *newpage, struct page *page);
@@ -67,6 +68,7 @@ static inline int isolate_movable_page(struct page *page, isolate_mode_t mode)
 	{ return -EBUSY; }
 
 static inline int migrate_prep(void) { return -ENOSYS; }
+static inline int migrate_finish(void) { return -ENOSYS; }
 static inline int migrate_prep_local(void) { return -ENOSYS; }
 
 static inline void migrate_page_states(struct page *newpage, struct page *page)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 42191da1bdc9..f69e0f67651d 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -340,6 +340,20 @@ extern void lru_note_cost(struct lruvec *lruvec, bool file,
 extern void lru_note_cost_page(struct page *);
 extern void lru_cache_add(struct page *);
 extern void mark_page_accessed(struct page *);
+
+extern atomic_t lru_disable_count;
+
+static inline bool lru_cache_disabled(void)
+{
+	return atomic_read(&lru_disable_count);
+}
+
+static inline void lru_cache_enable(void)
+{
+	atomic_dec(&lru_disable_count);
+}
+
+extern void lru_cache_disable(void);
 extern void lru_add_drain(void);
 extern void lru_add_drain_cpu(int cpu);
 extern void lru_add_drain_cpu_zone(struct zone *zone);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0cdbbfbc5757..729fba144c71 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1611,6 +1611,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
 	 * in a way that pages from isolated pageblock are left on pcplists.
 	 */
 	zone_pcp_disable(zone);
+	lru_cache_disable();
 
 	/* set above range as isolated */
 	ret = start_isolate_page_range(start_pfn, end_pfn,
@@ -1642,7 +1643,6 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
 			}
 
 			cond_resched();
-			lru_add_drain_all();
 
 			ret = scan_movable_pages(pfn, end_pfn, &pfn);
 			if (!ret) {
@@ -1687,6 +1687,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
 	zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages;
 	spin_unlock_irqrestore(&zone->lock, flags);
 
+	lru_cache_enable();
 	zone_pcp_enable(zone);
 
 	/* removal success */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index cd0295567a04..3b95e169e97d 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1208,6 +1208,8 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 			break;
 	}
 	mmap_read_unlock(mm);
+
+	migrate_finish();
 	if (err < 0)
 		return err;
 	return busy;
@@ -1371,6 +1373,8 @@ up_out:
 	mmap_write_unlock(mm);
 mpol_out:
 	mpol_put(new);
+	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+		migrate_finish();
 	return err;
 }
 
diff --git a/mm/migrate.c b/mm/migrate.c
index 47df0df8f21a..5b09567dc293 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -66,11 +66,13 @@ void migrate_prep(void)
 {
 	/*
 	 * Clear the LRU lists so pages can be isolated.
-	 * Note that pages may be moved off the LRU after we have
-	 * drained them. Those pages will fail to migrate like other
-	 * pages that may be busy.
 	 */
-	lru_add_drain_all();
+	lru_cache_disable();
+}
+
+void migrate_finish(void)
+{
+	lru_cache_enable();
 }
 
 /* Do the necessary work of migrate_prep but not if it involves other CPUs */
@@ -1838,6 +1840,7 @@ out_flush:
 	if (err >= 0)
 		err = err1;
 out:
+	migrate_finish();
 	return err;
 }
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 64d4aae2a78a..2cefb634e0d6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -8715,6 +8715,8 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
 		if (ret == -ENOMEM)
 			break;
 	}
+
+	migrate_finish();
 	if (ret < 0) {
 		alloc_contig_dump_pages(&cc->migratepages);
 		putback_movable_pages(&cc->migratepages);
diff --git a/mm/swap.c b/mm/swap.c
index 31b844d4ed94..c94f55e7b649 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -235,6 +235,18 @@ static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec)
 	}
 }
 
+/* return true if pagevec needs to drain */
+static bool pagevec_add_and_need_flush(struct pagevec *pvec, struct page *page)
+{
+	bool ret = false;
+
+	if (!pagevec_add(pvec, page) || PageCompound(page) ||
+			lru_cache_disabled())
+		ret = true;
+
+	return ret;
+}
+
 /*
  * Writeback is about to end against a page which has been marked for immediate
  * reclaim.  If it still appears to be reclaimable, move it to the tail of the
@@ -252,7 +264,7 @@ void rotate_reclaimable_page(struct page *page)
 		get_page(page);
 		local_lock_irqsave(&lru_rotate.lock, flags);
 		pvec = this_cpu_ptr(&lru_rotate.pvec);
-		if (!pagevec_add(pvec, page) || PageCompound(page))
+		if (pagevec_add_and_need_flush(pvec, page))
 			pagevec_lru_move_fn(pvec, pagevec_move_tail_fn);
 		local_unlock_irqrestore(&lru_rotate.lock, flags);
 	}
@@ -343,7 +355,7 @@ static void activate_page(struct page *page)
 		local_lock(&lru_pvecs.lock);
 		pvec = this_cpu_ptr(&lru_pvecs.activate_page);
 		get_page(page);
-		if (!pagevec_add(pvec, page) || PageCompound(page))
+		if (pagevec_add_and_need_flush(pvec, page))
 			pagevec_lru_move_fn(pvec, __activate_page);
 		local_unlock(&lru_pvecs.lock);
 	}
@@ -458,7 +470,7 @@ void lru_cache_add(struct page *page)
 	get_page(page);
 	local_lock(&lru_pvecs.lock);
 	pvec = this_cpu_ptr(&lru_pvecs.lru_add);
-	if (!pagevec_add(pvec, page) || PageCompound(page))
+	if (pagevec_add_and_need_flush(pvec, page))
 		__pagevec_lru_add(pvec);
 	local_unlock(&lru_pvecs.lock);
 }
@@ -654,7 +666,7 @@ void deactivate_file_page(struct page *page)
 		local_lock(&lru_pvecs.lock);
 		pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file);
 
-		if (!pagevec_add(pvec, page) || PageCompound(page))
+		if (pagevec_add_and_need_flush(pvec, page))
 			pagevec_lru_move_fn(pvec, lru_deactivate_file_fn);
 		local_unlock(&lru_pvecs.lock);
 	}
@@ -676,7 +688,7 @@ void deactivate_page(struct page *page)
 		local_lock(&lru_pvecs.lock);
 		pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate);
 		get_page(page);
-		if (!pagevec_add(pvec, page) || PageCompound(page))
+		if (pagevec_add_and_need_flush(pvec, page))
 			pagevec_lru_move_fn(pvec, lru_deactivate_fn);
 		local_unlock(&lru_pvecs.lock);
 	}
@@ -698,7 +710,7 @@ void mark_page_lazyfree(struct page *page)
 		local_lock(&lru_pvecs.lock);
 		pvec = this_cpu_ptr(&lru_pvecs.lru_lazyfree);
 		get_page(page);
-		if (!pagevec_add(pvec, page) || PageCompound(page))
+		if (pagevec_add_and_need_flush(pvec, page))
 			pagevec_lru_move_fn(pvec, lru_lazyfree_fn);
 		local_unlock(&lru_pvecs.lock);
 	}
@@ -735,7 +747,7 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
  * Calling this function with cpu hotplug locks held can actually lead
  * to obscure indirect dependencies via WQ context.
  */
-void lru_add_drain_all(void)
+inline void __lru_add_drain_all(bool force_all_cpus)
 {
 	/*
 	 * lru_drain_gen - Global pages generation number
@@ -780,7 +792,7 @@ void lru_add_drain_all(void)
 	 * (C) Exit the draining operation if a newer generation, from another
 	 * lru_add_drain_all(), was already scheduled for draining. Check (A).
 	 */
-	if (unlikely(this_gen != lru_drain_gen))
+	if (unlikely(this_gen != lru_drain_gen && !force_all_cpus))
 		goto done;
 
 	/*
@@ -810,7 +822,8 @@ void lru_add_drain_all(void)
 	for_each_online_cpu(cpu) {
 		struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
 
-		if (pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) ||
+		if (force_all_cpus ||
+		    pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) ||
 		    data_race(pagevec_count(&per_cpu(lru_rotate.pvec, cpu))) ||
 		    pagevec_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) ||
 		    pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) ||
@@ -828,6 +841,11 @@ void lru_add_drain_all(void)
 done:
 	mutex_unlock(&lock);
 }
+
+void lru_add_drain_all(void)
+{
+	__lru_add_drain_all(false);
+}
 #else
 void lru_add_drain_all(void)
 {
@@ -835,6 +853,34 @@ void lru_add_drain_all(void)
 }
 #endif /* CONFIG_SMP */
 
+atomic_t lru_disable_count = ATOMIC_INIT(0);
+
+/*
+ * lru_cache_disable() needs to be called before we start compiling
+ * a list of pages to be migrated using isolate_lru_page().
+ * It drains pages on LRU cache and then disable on all cpus until
+ * lru_cache_enable is called.
+ *
+ * Must be paired with a call to lru_cache_enable().
+ */
+void lru_cache_disable(void)
+{
+	atomic_inc(&lru_disable_count);
+#ifdef CONFIG_SMP
+	/*
+	 * lru_add_drain_all in the force mode will schedule draining on
+	 * all online CPUs so any calls of lru_cache_disabled wrapped by
+	 * local_lock or preemption disabled would be ordered by that.
+	 * The atomic operation doesn't need to have stronger ordering
+	 * requirements because that is enforeced by the scheduling
+	 * guarantees.
+	 */
+	__lru_add_drain_all(true);
+#else
+	lru_add_drain();
+#endif
+}
+
 /**
  * release_pages - batched put_page()
  * @pages: array of pages to release
-- 
cgit v1.2.3


From 361a2a229fa31ab7f2b236b5946e434964d00762 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Tue, 4 May 2021 18:36:57 -0700
Subject: mm: replace migrate_[prep|finish] with lru_cache_[disable|enable]

Currently, migrate_[prep|finish] is merely a wrapper of
lru_cache_[disable|enable].  There is not much to gain from having
additional abstraction.

Use lru_cache_[disable|enable] instead of migrate_[prep|finish], which
would be more descriptive.

Note: migrate_prep_local in compaction.c is changed into lru_add_drain,
which keeps the old behavior and avoids the cost of scheduling work on
many other CPUs.

Link: https://lkml.kernel.org/r/20210319175127.886124-2-minchan@kernel.org
Signed-off-by: Minchan Kim <minchan@kernel.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Cc: Chris Goldsworthy <cgoldswo@codeaurora.org>
Cc: John Dias <joaodias@google.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Oliver Sang <oliver.sang@intel.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/migrate.h |  7 -------
 mm/compaction.c         |  3 ++-
 mm/mempolicy.c          |  8 ++++----
 mm/migrate.c            | 28 ++--------------------------
 mm/page_alloc.c         |  4 ++--
 5 files changed, 10 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 9e4a2dc8622c..6155d97ec76c 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -45,9 +45,6 @@ extern struct page *alloc_migration_target(struct page *page, unsigned long priv
 extern int isolate_movable_page(struct page *page, isolate_mode_t mode);
 extern void putback_movable_page(struct page *page);
 
-extern void migrate_prep(void);
-extern void migrate_finish(void);
-extern void migrate_prep_local(void);
 extern void migrate_page_states(struct page *newpage, struct page *page);
 extern void migrate_page_copy(struct page *newpage, struct page *page);
 extern int migrate_huge_page_move_mapping(struct address_space *mapping,
@@ -67,10 +64,6 @@ static inline struct page *alloc_migration_target(struct page *page,
 static inline int isolate_movable_page(struct page *page, isolate_mode_t mode)
 	{ return -EBUSY; }
 
-static inline int migrate_prep(void) { return -ENOSYS; }
-static inline int migrate_finish(void) { return -ENOSYS; }
-static inline int migrate_prep_local(void) { return -ENOSYS; }
-
 static inline void migrate_page_states(struct page *newpage, struct page *page)
 {
 }
diff --git a/mm/compaction.c b/mm/compaction.c
index 1be7928af62b..598dffbd5c8e 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -2354,7 +2354,8 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
 	trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
 				cc->free_pfn, end_pfn, sync);
 
-	migrate_prep_local();
+	/* lru_add_drain_all could be expensive with involving other CPUs */
+	lru_add_drain();
 
 	while ((ret = compact_finished(cc)) == COMPACT_CONTINUE) {
 		int err;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3b95e169e97d..c0343c742bed 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1124,7 +1124,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 	int err = 0;
 	nodemask_t tmp;
 
-	migrate_prep();
+	lru_cache_disable();
 
 	mmap_read_lock(mm);
 
@@ -1209,7 +1209,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 	}
 	mmap_read_unlock(mm);
 
-	migrate_finish();
+	lru_cache_enable();
 	if (err < 0)
 		return err;
 	return busy;
@@ -1325,7 +1325,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 
 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 
-		migrate_prep();
+		lru_cache_disable();
 	}
 	{
 		NODEMASK_SCRATCH(scratch);
@@ -1374,7 +1374,7 @@ up_out:
 mpol_out:
 	mpol_put(new);
 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
-		migrate_finish();
+		lru_cache_enable();
 	return err;
 }
 
diff --git a/mm/migrate.c b/mm/migrate.c
index 5b09567dc293..b5fbeb4bf49a 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -57,30 +57,6 @@
 
 #include "internal.h"
 
-/*
- * migrate_prep() needs to be called before we start compiling a list of pages
- * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
- * undesirable, use migrate_prep_local()
- */
-void migrate_prep(void)
-{
-	/*
-	 * Clear the LRU lists so pages can be isolated.
-	 */
-	lru_cache_disable();
-}
-
-void migrate_finish(void)
-{
-	lru_cache_enable();
-}
-
-/* Do the necessary work of migrate_prep but not if it involves other CPUs */
-void migrate_prep_local(void)
-{
-	lru_add_drain();
-}
-
 int isolate_movable_page(struct page *page, isolate_mode_t mode)
 {
 	struct address_space *mapping;
@@ -1771,7 +1747,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
 	int start, i;
 	int err = 0, err1;
 
-	migrate_prep();
+	lru_cache_disable();
 
 	for (i = start = 0; i < nr_pages; i++) {
 		const void __user *p;
@@ -1840,7 +1816,7 @@ out_flush:
 	if (err >= 0)
 		err = err1;
 out:
-	migrate_finish();
+	lru_cache_enable();
 	return err;
 }
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2cefb634e0d6..80754fc1f1ff 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -8681,7 +8681,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
 		.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
 	};
 
-	migrate_prep();
+	lru_cache_disable();
 
 	while (pfn < end || !list_empty(&cc->migratepages)) {
 		if (fatal_signal_pending(current)) {
@@ -8716,7 +8716,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
 			break;
 	}
 
-	migrate_finish();
+	lru_cache_enable();
 	if (ret < 0) {
 		alloc_contig_dump_pages(&cc->migratepages);
 		putback_movable_pages(&cc->migratepages);
-- 
cgit v1.2.3


From 8cc621d2f45ddd3dc664024a647ee7adf48d79a5 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Tue, 4 May 2021 18:37:00 -0700
Subject: mm: fs: invalidate BH LRU during page migration

Pages containing buffer_heads that are in one of the per-CPU buffer_head
LRU caches will be pinned and thus cannot be migrated.  This can prevent
CMA allocations from succeeding, which are often used on platforms with
co-processors (such as a DSP) that can only use physically contiguous
memory.  It can also prevent memory hot-unplugging from succeeding,
which involves migrating at least MIN_MEMORY_BLOCK_SIZE bytes of memory,
which ranges from 8 MiB to 1 GiB based on the architecture in use.

Correspondingly, invalidate the BH LRU caches before a migration starts
and stop any buffer_head from being cached in the LRU caches, until
migration has finished.

Link: https://lkml.kernel.org/r/20210319175127.886124-3-minchan@kernel.org
Signed-off-by: Minchan Kim <minchan@kernel.org>
Reported-by: Chris Goldsworthy <cgoldswo@codeaurora.org>
Reported-by: Laura Abbott <labbott@kernel.org>
Tested-by: Oliver Sang <oliver.sang@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: John Dias <joaodias@google.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/buffer.c                 | 36 ++++++++++++++++++++++++++++++------
 include/linux/buffer_head.h |  4 ++++
 mm/swap.c                   |  5 ++++-
 3 files changed, 38 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/fs/buffer.c b/fs/buffer.c
index 0cb7ffd4977c..e9872d0dcbf1 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1264,6 +1264,15 @@ static void bh_lru_install(struct buffer_head *bh)
 	int i;
 
 	check_irqs_on();
+	/*
+	 * the refcount of buffer_head in bh_lru prevents dropping the
+	 * attached page(i.e., try_to_free_buffers) so it could cause
+	 * failing page migration.
+	 * Skip putting upcoming bh into bh_lru until migration is done.
+	 */
+	if (lru_cache_disabled())
+		return;
+
 	bh_lru_lock();
 
 	b = this_cpu_ptr(&bh_lrus);
@@ -1404,6 +1413,15 @@ __bread_gfp(struct block_device *bdev, sector_t block,
 }
 EXPORT_SYMBOL(__bread_gfp);
 
+static void __invalidate_bh_lrus(struct bh_lru *b)
+{
+	int i;
+
+	for (i = 0; i < BH_LRU_SIZE; i++) {
+		brelse(b->bhs[i]);
+		b->bhs[i] = NULL;
+	}
+}
 /*
  * invalidate_bh_lrus() is called rarely - but not only at unmount.
  * This doesn't race because it runs in each cpu either in irq
@@ -1412,16 +1430,12 @@ EXPORT_SYMBOL(__bread_gfp);
 static void invalidate_bh_lru(void *arg)
 {
 	struct bh_lru *b = &get_cpu_var(bh_lrus);
-	int i;
 
-	for (i = 0; i < BH_LRU_SIZE; i++) {
-		brelse(b->bhs[i]);
-		b->bhs[i] = NULL;
-	}
+	__invalidate_bh_lrus(b);
 	put_cpu_var(bh_lrus);
 }
 
-static bool has_bh_in_lru(int cpu, void *dummy)
+bool has_bh_in_lru(int cpu, void *dummy)
 {
 	struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
 	int i;
@@ -1440,6 +1454,16 @@ void invalidate_bh_lrus(void)
 }
 EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
 
+void invalidate_bh_lrus_cpu(int cpu)
+{
+	struct bh_lru *b;
+
+	bh_lru_lock();
+	b = per_cpu_ptr(&bh_lrus, cpu);
+	__invalidate_bh_lrus(b);
+	bh_lru_unlock();
+}
+
 void set_bh_page(struct buffer_head *bh,
 		struct page *page, unsigned long offset)
 {
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 6b47f94378c5..e7e99da31349 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -194,6 +194,8 @@ void __breadahead_gfp(struct block_device *, sector_t block, unsigned int size,
 struct buffer_head *__bread_gfp(struct block_device *,
 				sector_t block, unsigned size, gfp_t gfp);
 void invalidate_bh_lrus(void);
+void invalidate_bh_lrus_cpu(int cpu);
+bool has_bh_in_lru(int cpu, void *dummy);
 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags);
 void free_buffer_head(struct buffer_head * bh);
 void unlock_buffer(struct buffer_head *bh);
@@ -406,6 +408,8 @@ static inline int inode_has_buffers(struct inode *inode) { return 0; }
 static inline void invalidate_inode_buffers(struct inode *inode) {}
 static inline int remove_inode_buffers(struct inode *inode) { return 1; }
 static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; }
+static inline void invalidate_bh_lrus_cpu(int cpu) {}
+static inline bool has_bh_in_lru(int cpu, void *dummy) { return 0; }
 #define buffer_heads_over_limit 0
 
 #endif /* CONFIG_BLOCK */
diff --git a/mm/swap.c b/mm/swap.c
index c94f55e7b649..a75a8265302b 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -36,6 +36,7 @@
 #include <linux/hugetlb.h>
 #include <linux/page_idle.h>
 #include <linux/local_lock.h>
+#include <linux/buffer_head.h>
 
 #include "internal.h"
 
@@ -641,6 +642,7 @@ void lru_add_drain_cpu(int cpu)
 		pagevec_lru_move_fn(pvec, lru_lazyfree_fn);
 
 	activate_page_drain(cpu);
+	invalidate_bh_lrus_cpu(cpu);
 }
 
 /**
@@ -828,7 +830,8 @@ inline void __lru_add_drain_all(bool force_all_cpus)
 		    pagevec_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) ||
 		    pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) ||
 		    pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) ||
-		    need_activate_page_drain(cpu)) {
+		    need_activate_page_drain(cpu) ||
+		    has_bh_in_lru(cpu, NULL)) {
 			INIT_WORK(work, lru_add_drain_per_cpu);
 			queue_work_on(cpu, mm_percpu_wq, work);
 			__cpumask_set_cpu(cpu, &has_work);
-- 
cgit v1.2.3


From 606a6f71a25accfc960a5063c23717ff07aa43a3 Mon Sep 17 00:00:00 2001
From: Miaohe Lin <linmiaohe@huawei.com>
Date: Tue, 4 May 2021 18:37:04 -0700
Subject: mm/migrate.c: make putback_movable_page() static

Patch series "Cleanup and fixup for mm/migrate.c", v3.

This series contains cleanups that remove unnecessary VM_BUG_ON_PAGE() and
rc != MIGRATEPAGE_SUCCESS checks, and uses a helper function to remove some
duplicated code.  It also fixes a potential deadlock in the NUMA balancing
shared exec THP case, among other things.  More details can be found in
the respective changelogs.

This patch (of 5):

putback_movable_page() is only called by putback_movable_pages(), and we
know the page is locked and that both PageMovable() and PageIsolated() are
checked right before calling putback_movable_page().  So make it static
and remove all three VM_BUG_ON_PAGE() checks.

Link: https://lkml.kernel.org/r/20210325131524.48181-1-linmiaohe@huawei.com
Link: https://lkml.kernel.org/r/20210325131524.48181-2-linmiaohe@huawei.com
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Yang Shi <shy828301@gmail.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Rafael Aquini <aquini@redhat.com>
Cc: Alistair Popple <apopple@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/migrate.h | 1 -
 mm/migrate.c            | 7 +------
 2 files changed, 1 insertion(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 6155d97ec76c..175ef15ae9e8 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -43,7 +43,6 @@ extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
 		unsigned long private, enum migrate_mode mode, int reason);
 extern struct page *alloc_migration_target(struct page *page, unsigned long private);
 extern int isolate_movable_page(struct page *page, isolate_mode_t mode);
-extern void putback_movable_page(struct page *page);
 
 extern void migrate_page_states(struct page *newpage, struct page *page);
 extern void migrate_page_copy(struct page *newpage, struct page *page);
diff --git a/mm/migrate.c b/mm/migrate.c
index b5fbeb4bf49a..1e1f8324cefe 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -118,15 +118,10 @@ out:
 	return -EBUSY;
 }
 
-/* It should be called on page which is PG_movable */
-void putback_movable_page(struct page *page)
+static void putback_movable_page(struct page *page)
 {
 	struct address_space *mapping;
 
-	VM_BUG_ON_PAGE(!PageLocked(page), page);
-	VM_BUG_ON_PAGE(!PageMovable(page), page);
-	VM_BUG_ON_PAGE(!PageIsolated(page), page);
-
 	mapping = page_mapping(page);
 	mapping->a_ops->putback_page(page);
 	__ClearPageIsolated(page);
-- 
cgit v1.2.3


From bbb269206f3c914d4f23e023de4ec020abea6d1b Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Tue, 4 May 2021 18:37:19 -0700
Subject: mm: vmstat: add cma statistics

Since CMA is used more widely, it is worth having CMA allocation
statistics in vmstat.  With them, we can see how aggressively the system
uses CMA allocation and how often it fails.

Link: https://lkml.kernel.org/r/20210302183346.3707237-1-minchan@kernel.org
Signed-off-by: Minchan Kim <minchan@kernel.org>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Cc: John Dias <joaodias@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/vm_event_item.h |  4 ++++
 mm/cma.c                      | 12 +++++++++---
 mm/vmstat.c                   |  4 ++++
 3 files changed, 17 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 18e75974d4e3..21d7c7f72f1c 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -70,6 +70,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 #endif
 #ifdef CONFIG_HUGETLB_PAGE
 		HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL,
+#endif
+#ifdef CONFIG_CMA
+		CMA_ALLOC_SUCCESS,
+		CMA_ALLOC_FAIL,
 #endif
 		UNEVICTABLE_PGCULLED,	/* culled to noreclaim list */
 		UNEVICTABLE_PGSCANNED,	/* scanned for reclaimability */
diff --git a/mm/cma.c b/mm/cma.c
index acd6991f77a0..b44a71eb3174 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -435,13 +435,13 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
 	int ret = -ENOMEM;
 
 	if (!cma || !cma->count || !cma->bitmap)
-		return NULL;
+		goto out;
 
 	pr_debug("%s(cma %p, count %zu, align %d)\n", __func__, (void *)cma,
 		 count, align);
 
 	if (!count)
-		return NULL;
+		goto out;
 
 	mask = cma_bitmap_aligned_mask(cma, align);
 	offset = cma_bitmap_aligned_offset(cma, align);
@@ -449,7 +449,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
 	bitmap_count = cma_bitmap_pages_to_bits(cma, count);
 
 	if (bitmap_count > bitmap_maxno)
-		return NULL;
+		goto out;
 
 	for (;;) {
 		spin_lock_irq(&cma->lock);
@@ -506,6 +506,12 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
 	}
 
 	pr_debug("%s(): returned %p\n", __func__, page);
+out:
+	if (page)
+		count_vm_event(CMA_ALLOC_SUCCESS);
+	else
+		count_vm_event(CMA_ALLOC_FAIL);
+
 	return page;
 }
 
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 74b2c374b86c..49a8456ec079 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1312,6 +1312,10 @@ const char * const vmstat_text[] = {
 #ifdef CONFIG_HUGETLB_PAGE
 	"htlb_buddy_alloc_success",
 	"htlb_buddy_alloc_fail",
+#endif
+#ifdef CONFIG_CMA
+	"cma_alloc_success",
+	"cma_alloc_fail",
 #endif
 	"unevictable_pgs_culled",
 	"unevictable_pgs_scanned",
-- 
cgit v1.2.3


From 78fa51503fdbe463c96eef4c3cf69ca54032647a Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Tue, 4 May 2021 18:37:34 -0700
Subject: mm: use proper type for cma_[alloc|release]

size_t in cma_alloc is confusing since it makes people think it's a byte
count, not a page count.  Change it to unsigned long[1].

The unsigned int in cma_release is also not right so change it.  Since we
have unsigned long in cma_release, free_contig_range should also respect
it.

[1] 67a2e213e7e9, mm: cma: fix incorrect type conversion for size during dma allocation

Link: https://lore.kernel.org/linux-mm/20210324043434.GP1719932@casper.infradead.org/
Link: https://lkml.kernel.org/r/20210331164018.710560-1-minchan@kernel.org
Signed-off-by: Minchan Kim <minchan@kernel.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cma.h        |  4 ++--
 include/linux/gfp.h        |  2 +-
 include/trace/events/cma.h | 22 +++++++++++-----------
 mm/cma.c                   | 17 +++++++++--------
 mm/page_alloc.c            |  6 +++---
 5 files changed, 26 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cma.h b/include/linux/cma.h
index 217999c8a762..53fd8c3cdbd0 100644
--- a/include/linux/cma.h
+++ b/include/linux/cma.h
@@ -44,9 +44,9 @@ extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
 					unsigned int order_per_bit,
 					const char *name,
 					struct cma **res_cma);
-extern struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
+extern struct page *cma_alloc(struct cma *cma, unsigned long count, unsigned int align,
 			      bool no_warn);
-extern bool cma_release(struct cma *cma, const struct page *pages, unsigned int count);
+extern bool cma_release(struct cma *cma, const struct page *pages, unsigned long count);
 
 extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data);
 #endif
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 26f4d907254a..8a5f6c3d7dba 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -657,7 +657,7 @@ extern int alloc_contig_range(unsigned long start, unsigned long end,
 extern struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
 				       int nid, nodemask_t *nodemask);
 #endif
-void free_contig_range(unsigned long pfn, unsigned int nr_pages);
+void free_contig_range(unsigned long pfn, unsigned long nr_pages);
 
 #ifdef CONFIG_CMA
 /* CMA stuff */
diff --git a/include/trace/events/cma.h b/include/trace/events/cma.h
index 5cf385ae7c08..c3d354702cb0 100644
--- a/include/trace/events/cma.h
+++ b/include/trace/events/cma.h
@@ -11,7 +11,7 @@
 DECLARE_EVENT_CLASS(cma_alloc_class,
 
 	TP_PROTO(const char *name, unsigned long pfn, const struct page *page,
-		 unsigned int count, unsigned int align),
+		 unsigned long count, unsigned int align),
 
 	TP_ARGS(name, pfn, page, count, align),
 
@@ -19,7 +19,7 @@ DECLARE_EVENT_CLASS(cma_alloc_class,
 		__string(name, name)
 		__field(unsigned long, pfn)
 		__field(const struct page *, page)
-		__field(unsigned int, count)
+		__field(unsigned long, count)
 		__field(unsigned int, align)
 	),
 
@@ -31,7 +31,7 @@ DECLARE_EVENT_CLASS(cma_alloc_class,
 		__entry->align = align;
 	),
 
-	TP_printk("name=%s pfn=%lx page=%p count=%u align=%u",
+	TP_printk("name=%s pfn=%lx page=%p count=%lu align=%u",
 		  __get_str(name),
 		  __entry->pfn,
 		  __entry->page,
@@ -42,7 +42,7 @@ DECLARE_EVENT_CLASS(cma_alloc_class,
 TRACE_EVENT(cma_release,
 
 	TP_PROTO(const char *name, unsigned long pfn, const struct page *page,
-		 unsigned int count),
+		 unsigned long count),
 
 	TP_ARGS(name, pfn, page, count),
 
@@ -50,7 +50,7 @@ TRACE_EVENT(cma_release,
 		__string(name, name)
 		__field(unsigned long, pfn)
 		__field(const struct page *, page)
-		__field(unsigned int, count)
+		__field(unsigned long, count)
 	),
 
 	TP_fast_assign(
@@ -60,7 +60,7 @@ TRACE_EVENT(cma_release,
 		__entry->count = count;
 	),
 
-	TP_printk("name=%s pfn=%lx page=%p count=%u",
+	TP_printk("name=%s pfn=%lx page=%p count=%lu",
 		  __get_str(name),
 		  __entry->pfn,
 		  __entry->page,
@@ -69,13 +69,13 @@ TRACE_EVENT(cma_release,
 
 TRACE_EVENT(cma_alloc_start,
 
-	TP_PROTO(const char *name, unsigned int count, unsigned int align),
+	TP_PROTO(const char *name, unsigned long count, unsigned int align),
 
 	TP_ARGS(name, count, align),
 
 	TP_STRUCT__entry(
 		__string(name, name)
-		__field(unsigned int, count)
+		__field(unsigned long, count)
 		__field(unsigned int, align)
 	),
 
@@ -85,7 +85,7 @@ TRACE_EVENT(cma_alloc_start,
 		__entry->align = align;
 	),
 
-	TP_printk("name=%s count=%u align=%u",
+	TP_printk("name=%s count=%lu align=%u",
 		  __get_str(name),
 		  __entry->count,
 		  __entry->align)
@@ -94,7 +94,7 @@ TRACE_EVENT(cma_alloc_start,
 DEFINE_EVENT(cma_alloc_class, cma_alloc_finish,
 
 	TP_PROTO(const char *name, unsigned long pfn, const struct page *page,
-		 unsigned int count, unsigned int align),
+		 unsigned long count, unsigned int align),
 
 	TP_ARGS(name, pfn, page, count, align)
 );
@@ -102,7 +102,7 @@ DEFINE_EVENT(cma_alloc_class, cma_alloc_finish,
 DEFINE_EVENT(cma_alloc_class, cma_alloc_busy_retry,
 
 	TP_PROTO(const char *name, unsigned long pfn, const struct page *page,
-		 unsigned int count, unsigned int align),
+		 unsigned long count, unsigned int align),
 
 	TP_ARGS(name, pfn, page, count, align)
 );
diff --git a/mm/cma.c b/mm/cma.c
index cdad8c4de921..995e15480937 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -79,7 +79,7 @@ static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma,
 }
 
 static void cma_clear_bitmap(struct cma *cma, unsigned long pfn,
-			     unsigned int count)
+			     unsigned long count)
 {
 	unsigned long bitmap_no, bitmap_count;
 	unsigned long flags;
@@ -423,21 +423,21 @@ static inline void cma_debug_show_areas(struct cma *cma) { }
  * This function allocates part of contiguous memory on specific
  * contiguous memory area.
  */
-struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
-		       bool no_warn)
+struct page *cma_alloc(struct cma *cma, unsigned long count,
+		       unsigned int align, bool no_warn)
 {
 	unsigned long mask, offset;
 	unsigned long pfn = -1;
 	unsigned long start = 0;
 	unsigned long bitmap_maxno, bitmap_no, bitmap_count;
-	size_t i;
+	unsigned long i;
 	struct page *page = NULL;
 	int ret = -ENOMEM;
 
 	if (!cma || !cma->count || !cma->bitmap)
 		goto out;
 
-	pr_debug("%s(cma %p, count %zu, align %d)\n", __func__, (void *)cma,
+	pr_debug("%s(cma %p, count %lu, align %d)\n", __func__, (void *)cma,
 		 count, align);
 
 	if (!count)
@@ -505,7 +505,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
 	}
 
 	if (ret && !no_warn) {
-		pr_err_ratelimited("%s: %s: alloc failed, req-size: %zu pages, ret: %d\n",
+		pr_err_ratelimited("%s: %s: alloc failed, req-size: %lu pages, ret: %d\n",
 				   __func__, cma->name, count, ret);
 		cma_debug_show_areas(cma);
 	}
@@ -534,14 +534,15 @@ out:
  * It returns false when provided pages do not belong to contiguous area and
  * true otherwise.
  */
-bool cma_release(struct cma *cma, const struct page *pages, unsigned int count)
+bool cma_release(struct cma *cma, const struct page *pages,
+		 unsigned long count)
 {
 	unsigned long pfn;
 
 	if (!cma || !pages)
 		return false;
 
-	pr_debug("%s(page %p, count %u)\n", __func__, (void *)pages, count);
+	pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count);
 
 	pfn = page_to_pfn(pages);
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 80754fc1f1ff..d12299c08b95 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -8973,9 +8973,9 @@ struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
 }
 #endif /* CONFIG_CONTIG_ALLOC */
 
-void free_contig_range(unsigned long pfn, unsigned int nr_pages)
+void free_contig_range(unsigned long pfn, unsigned long nr_pages)
 {
-	unsigned int count = 0;
+	unsigned long count = 0;
 
 	for (; nr_pages--; pfn++) {
 		struct page *page = pfn_to_page(pfn);
@@ -8983,7 +8983,7 @@ void free_contig_range(unsigned long pfn, unsigned int nr_pages)
 		count += page_count(page) != 1;
 		__free_page(page);
 	}
-	WARN(count != 0, "%d pages are still in use!\n", count);
+	WARN(count != 0, "%lu pages are still in use!\n", count);
 }
 EXPORT_SYMBOL(free_contig_range);
 
-- 
cgit v1.2.3


From 575299ea18a8c0575d4c2ef6ad3fa4d41d529d1c Mon Sep 17 00:00:00 2001
From: Saravanan D <saravanand@fb.com>
Date: Tue, 4 May 2021 18:38:03 -0700
Subject: x86/mm: track linear mapping split events

To help debug sluggishness caused by TLB misses/reloads, introduce
monotonic counters of direct-mapped hugepage split events, counted while
the system state is SYSTEM_RUNNING, and display them as part of
/proc/vmstat on x86 servers.

The lifetime split event information will be displayed at the bottom of
/proc/vmstat
  ....
  swap_ra 0
  swap_ra_hit 0
  direct_map_level2_splits 94
  direct_map_level3_splits 4
  nr_unstable 0
  ....

One of the many lasting sources of direct hugepage splits is kernel
tracing (kprobes, tracepoints).

Note that the kernel's code segment [512 MB] points to the same physical
addresses that have been already mapped in the kernel's direct mapping
range.

Source : Documentation/x86/x86_64/mm.rst

When we enable kernel tracing, the kernel has to modify
attributes/permissions of the text segment hugepages that are direct
mapped causing them to split.

Kernel's direct mapped hugepages do not coalesce back after split and
remain in place for the remainder of the lifetime.

An instance of direct page splits when we turn on dynamic kernel tracing
....
cat /proc/vmstat | grep -i direct_map_level
direct_map_level2_splits 784
direct_map_level3_splits 12
bpftrace -e 'tracepoint:raw_syscalls:sys_enter { @ [pid, comm] = count(); }'
cat /proc/vmstat | grep -i direct_map_level
direct_map_level2_splits 789
direct_map_level3_splits 12
....

Link: https://lkml.kernel.org/r/20210218235744.1040634-1-saravanand@fb.com
Signed-off-by: Saravanan D <saravanand@fb.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/mm/pat/set_memory.c  | 8 ++++++++
 include/linux/vm_event_item.h | 4 ++++
 mm/vmstat.c                   | 4 ++++
 3 files changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 427980617557..156cd235659f 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -16,6 +16,8 @@
 #include <linux/pci.h>
 #include <linux/vmalloc.h>
 #include <linux/libnvdimm.h>
+#include <linux/vmstat.h>
+#include <linux/kernel.h>
 
 #include <asm/e820/api.h>
 #include <asm/processor.h>
@@ -91,6 +93,12 @@ static void split_page_count(int level)
 		return;
 
 	direct_pages_count[level]--;
+	if (system_state == SYSTEM_RUNNING) {
+		if (level == PG_LEVEL_2M)
+			count_vm_event(DIRECT_MAP_LEVEL2_SPLIT);
+		else if (level == PG_LEVEL_1G)
+			count_vm_event(DIRECT_MAP_LEVEL3_SPLIT);
+	}
 	direct_pages_count[level - 1] += PTRS_PER_PTE;
 }
 
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 21d7c7f72f1c..ae0dd1948c2b 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -124,6 +124,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 #ifdef CONFIG_SWAP
 		SWAP_RA,
 		SWAP_RA_HIT,
+#endif
+#ifdef CONFIG_X86
+		DIRECT_MAP_LEVEL2_SPLIT,
+		DIRECT_MAP_LEVEL3_SPLIT,
 #endif
 		NR_VM_EVENT_ITEMS
 };
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 06cd78dc914d..5ba118521ded 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1369,6 +1369,10 @@ const char * const vmstat_text[] = {
 	"swap_ra",
 	"swap_ra_hit",
 #endif
+#ifdef CONFIG_X86
+	"direct_map_level2_splits",
+	"direct_map_level3_splits",
+#endif
 #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
 };
 #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
-- 
cgit v1.2.3


From 1a08ae36cf8b5f26d0c64ebfe46f8eb07ea0b678 Mon Sep 17 00:00:00 2001
From: Pavel Tatashin <pasha.tatashin@soleen.com>
Date: Tue, 4 May 2021 18:38:53 -0700
Subject: mm cma: rename PF_MEMALLOC_NOCMA to PF_MEMALLOC_PIN

PF_MEMALLOC_NOCMA is used to guarantee that the allocator will not
return pages that might belong to a CMA region.  This is currently used
for long term gup to make sure that such pins are not going to be done
on any CMA pages.

When PF_MEMALLOC_NOCMA was introduced we did not realize that it
focuses on CMA pages too much and that there is a larger class of
pages that need the same treatment.  The MOVABLE zone cannot contain any
long term pins either, so it makes sense to reuse and redefine this flag
for that usecase as well.  Rename the flag to PF_MEMALLOC_PIN which
defines an allocation context which can only get pages suitable for
long-term pins.

Also rename: memalloc_nocma_save()/memalloc_nocma_restore to
memalloc_pin_save()/memalloc_pin_restore() and make the new functions
common.

[rppt@linux.ibm.com: fix renaming of PF_MEMALLOC_NOCMA to PF_MEMALLOC_PIN]
  Link: https://lkml.kernel.org/r/20210331163816.11517-1-rppt@kernel.org

Link: https://lkml.kernel.org/r/20210215161349.246722-6-pasha.tatashin@soleen.com
Signed-off-by: Pavel Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: James Morris <jmorris@namei.org>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sasha Levin <sashal@kernel.org>
Cc: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Tyler Hicks <tyhicks@linux.microsoft.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h    |  2 +-
 include/linux/sched/mm.h | 21 +++++----------------
 mm/gup.c                 |  4 ++--
 mm/hugetlb.c             |  4 ++--
 mm/page_alloc.c          |  4 ++--
 5 files changed, 12 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9c25c8e67030..d2c881384517 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1583,7 +1583,7 @@ extern struct pid *cad_pid;
 #define PF_SWAPWRITE		0x00800000	/* Allowed to write to swap */
 #define PF_NO_SETAFFINITY	0x04000000	/* Userland is not allowed to meddle with cpus_mask */
 #define PF_MCE_EARLY		0x08000000      /* Early kill for mce process policy */
-#define PF_MEMALLOC_NOCMA	0x10000000	/* All allocation request will have _GFP_MOVABLE cleared */
+#define PF_MEMALLOC_PIN		0x10000000	/* Allocation context constrained to zones which allow long term pinning. */
 #define PF_FREEZER_SKIP		0x40000000	/* Freezer should not count it as freezable */
 #define PF_SUSPEND_TASK		0x80000000      /* This thread called freeze_processes() and should not be frozen */
 
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 90b2a0bce11c..ae654819e8aa 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -271,29 +271,18 @@ static inline void memalloc_noreclaim_restore(unsigned int flags)
 	current->flags = (current->flags & ~PF_MEMALLOC) | flags;
 }
 
-#ifdef CONFIG_CMA
-static inline unsigned int memalloc_nocma_save(void)
+static inline unsigned int memalloc_pin_save(void)
 {
-	unsigned int flags = current->flags & PF_MEMALLOC_NOCMA;
+	unsigned int flags = current->flags & PF_MEMALLOC_PIN;
 
-	current->flags |= PF_MEMALLOC_NOCMA;
+	current->flags |= PF_MEMALLOC_PIN;
 	return flags;
 }
 
-static inline void memalloc_nocma_restore(unsigned int flags)
+static inline void memalloc_pin_restore(unsigned int flags)
 {
-	current->flags = (current->flags & ~PF_MEMALLOC_NOCMA) | flags;
+	current->flags = (current->flags & ~PF_MEMALLOC_PIN) | flags;
 }
-#else
-static inline unsigned int memalloc_nocma_save(void)
-{
-	return 0;
-}
-
-static inline void memalloc_nocma_restore(unsigned int flags)
-{
-}
-#endif
 
 #ifdef CONFIG_MEMCG
 DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg);
diff --git a/mm/gup.c b/mm/gup.c
index 3a00f6a8ffd6..a1eff7ad31da 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1722,7 +1722,7 @@ static long __gup_longterm_locked(struct mm_struct *mm,
 	long rc;
 
 	if (gup_flags & FOLL_LONGTERM)
-		flags = memalloc_nocma_save();
+		flags = memalloc_pin_save();
 
 	rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas, NULL,
 				     gup_flags);
@@ -1731,7 +1731,7 @@ static long __gup_longterm_locked(struct mm_struct *mm,
 		if (rc > 0)
 			rc = check_and_migrate_cma_pages(mm, start, rc, pages,
 							 vmas, gup_flags);
-		memalloc_nocma_restore(flags);
+		memalloc_pin_restore(flags);
 	}
 	return rc;
 }
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 533e5a26e437..60dc197a4417 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1079,11 +1079,11 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
 static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
 {
 	struct page *page;
-	bool nocma = !!(current->flags & PF_MEMALLOC_NOCMA);
+	bool pin = !!(current->flags & PF_MEMALLOC_PIN);
 
 	lockdep_assert_held(&hugetlb_lock);
 	list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
-		if (nocma && is_migrate_cma_page(page))
+		if (pin && is_migrate_cma_page(page))
 			continue;
 
 		if (PageHWPoison(page))
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d12299c08b95..c2fc6a64bef9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3865,8 +3865,8 @@ static inline unsigned int current_alloc_flags(gfp_t gfp_mask,
 #ifdef CONFIG_CMA
 	unsigned int pflags = current->flags;
 
-	if (!(pflags & PF_MEMALLOC_NOCMA) &&
-			gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+	if (!(pflags & PF_MEMALLOC_PIN) &&
+	    gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE)
 		alloc_flags |= ALLOC_CMA;
 
 #endif
-- 
cgit v1.2.3


From 8e3560d963d22ba41857f48e4114ce80373144ea Mon Sep 17 00:00:00 2001
From: Pavel Tatashin <pasha.tatashin@soleen.com>
Date: Tue, 4 May 2021 18:39:00 -0700
Subject: mm: honor PF_MEMALLOC_PIN for all movable pages

PF_MEMALLOC_PIN is only honored for CMA pages, extend this flag to work
for any allocations from ZONE_MOVABLE by removing __GFP_MOVABLE from
gfp_mask when this flag is passed in the current context.

Add is_pinnable_page(), which returns true if the page is pinnable.  A
pinnable page is not in ZONE_MOVABLE and not of MIGRATE_CMA type.

Link: https://lkml.kernel.org/r/20210215161349.246722-8-pasha.tatashin@soleen.com
Signed-off-by: Pavel Tatashin <pasha.tatashin@soleen.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: James Morris <jmorris@namei.org>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sasha Levin <sashal@kernel.org>
Cc: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Tyler Hicks <tyhicks@linux.microsoft.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h       | 18 ++++++++++++++++++
 include/linux/sched/mm.h |  6 +++++-
 mm/hugetlb.c             |  2 +-
 mm/page_alloc.c          | 20 +++++++++-----------
 4 files changed, 33 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1dbb53c44243..d0e628f511e4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1141,6 +1141,11 @@ static inline bool is_zone_device_page(const struct page *page)
 }
 #endif
 
+static inline bool is_zone_movable_page(const struct page *page)
+{
+	return page_zonenum(page) == ZONE_MOVABLE;
+}
+
 #ifdef CONFIG_DEV_PAGEMAP_OPS
 void free_devmap_managed_page(struct page *page);
 DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
@@ -1550,6 +1555,19 @@ static inline unsigned long page_to_section(const struct page *page)
 }
 #endif
 
+/* MIGRATE_CMA and ZONE_MOVABLE do not allow pin pages */
+#ifdef CONFIG_MIGRATION
+static inline bool is_pinnable_page(struct page *page)
+{
+	return !is_zone_movable_page(page) && !is_migrate_cma_page(page);
+}
+#else
+static inline bool is_pinnable_page(struct page *page)
+{
+	return true;
+}
+#endif
+
 static inline void set_page_zone(struct page *page, enum zone_type zone)
 {
 	page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index ae654819e8aa..e24b1fe348e3 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -151,12 +151,13 @@ static inline bool in_vfork(struct task_struct *tsk)
  * Applies per-task gfp context to the given allocation flags.
  * PF_MEMALLOC_NOIO implies GFP_NOIO
  * PF_MEMALLOC_NOFS implies GFP_NOFS
+ * PF_MEMALLOC_PIN  implies !GFP_MOVABLE
  */
 static inline gfp_t current_gfp_context(gfp_t flags)
 {
 	unsigned int pflags = READ_ONCE(current->flags);
 
-	if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS))) {
+	if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_PIN))) {
 		/*
 		 * NOIO implies both NOIO and NOFS and it is a weaker context
 		 * so always make sure it makes precedence
@@ -165,6 +166,9 @@ static inline gfp_t current_gfp_context(gfp_t flags)
 			flags &= ~(__GFP_IO | __GFP_FS);
 		else if (pflags & PF_MEMALLOC_NOFS)
 			flags &= ~__GFP_FS;
+
+		if (pflags & PF_MEMALLOC_PIN)
+			flags &= ~__GFP_MOVABLE;
 	}
 	return flags;
 }
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 60dc197a4417..629aa4c2259c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1083,7 +1083,7 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
 
 	lockdep_assert_held(&hugetlb_lock);
 	list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
-		if (pin && is_migrate_cma_page(page))
+		if (pin && !is_pinnable_page(page))
 			continue;
 
 		if (PageHWPoison(page))
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3c55eaafede1..81db38926266 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3859,16 +3859,13 @@ alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
 	return alloc_flags;
 }
 
-static inline unsigned int current_alloc_flags(gfp_t gfp_mask,
-					unsigned int alloc_flags)
+/* Must be called after current_gfp_context() which can change gfp_mask */
+static inline unsigned int gfp_to_alloc_flags_cma(gfp_t gfp_mask,
+						  unsigned int alloc_flags)
 {
 #ifdef CONFIG_CMA
-	unsigned int pflags = current->flags;
-
-	if (!(pflags & PF_MEMALLOC_PIN) &&
-	    gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+	if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE)
 		alloc_flags |= ALLOC_CMA;
-
 #endif
 	return alloc_flags;
 }
@@ -4526,7 +4523,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
 	} else if (unlikely(rt_task(current)) && !in_interrupt())
 		alloc_flags |= ALLOC_HARDER;
 
-	alloc_flags = current_alloc_flags(gfp_mask, alloc_flags);
+	alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags);
 
 	return alloc_flags;
 }
@@ -4828,7 +4825,7 @@ retry:
 
 	reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
 	if (reserve_flags)
-		alloc_flags = current_alloc_flags(gfp_mask, reserve_flags);
+		alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags);
 
 	/*
 	 * Reset the nodemask and zonelist iterators if memory policies can be
@@ -4997,7 +4994,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
 	if (should_fail_alloc_page(gfp_mask, order))
 		return false;
 
-	*alloc_flags = current_alloc_flags(gfp_mask, *alloc_flags);
+	*alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags);
 
 	/* Dirty zone balancing only done in the fast path */
 	ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
@@ -5184,7 +5181,8 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
 	 * Apply scoped allocation constraints. This is mainly about GFP_NOFS
 	 * resp. GFP_NOIO which has to be inherited for all allocation requests
 	 * from a particular context which has been marked by
-	 * memalloc_no{fs,io}_{save,restore}.
+	 * memalloc_no{fs,io}_{save,restore}. And PF_MEMALLOC_PIN which ensures
+	 * movable zones are not used during allocation.
 	 */
 	gfp = current_gfp_context(gfp);
 	alloc_gfp = gfp;
-- 
cgit v1.2.3


From 9afaf30f7a1aab2022961715a66f644275b8daec Mon Sep 17 00:00:00 2001
From: Pavel Tatashin <pasha.tatashin@soleen.com>
Date: Tue, 4 May 2021 18:39:04 -0700
Subject: mm/gup: do not migrate zero page

On some platforms ZERO_PAGE(0) might end-up in a movable zone.  Do not
migrate zero page in gup during longterm pinning as migration of zero page
is not allowed.

For example, in x86 QEMU with 16G of memory and kernelcore=5G parameter, I
see the following:

Boot#1: zero_pfn  0x48a8d zero_pfn zone: ZONE_DMA32
Boot#2: zero_pfn 0x20168d zero_pfn zone: ZONE_MOVABLE

On x86, empty_zero_page is declared in .bss and depending on the loader
may end up in different physical locations during boots.

Also, move the is_zero_pfn() and my_zero_pfn() functions under CONFIG_MMU,
because the zero_pfn that they use is declared in memory.c, which is
compiled with CONFIG_MMU.

Link: https://lkml.kernel.org/r/20210215161349.246722-9-pasha.tatashin@soleen.com
Signed-off-by: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: James Morris <jmorris@namei.org>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sasha Levin <sashal@kernel.org>
Cc: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Tyler Hicks <tyhicks@linux.microsoft.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h      |  3 ++-
 include/linux/mmzone.h  |  4 ++++
 include/linux/pgtable.h | 12 ++++++++++++
 3 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index d0e628f511e4..76e27ebb28a3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1559,7 +1559,8 @@ static inline unsigned long page_to_section(const struct page *page)
 #ifdef CONFIG_MIGRATION
 static inline bool is_pinnable_page(struct page *page)
 {
-	return !is_zone_movable_page(page) && !is_migrate_cma_page(page);
+	return !(is_zone_movable_page(page) || is_migrate_cma_page(page)) ||
+		is_zero_pfn(page_to_pfn(page));
 }
 #else
 static inline bool is_pinnable_page(struct page *page)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 3b2205741048..92b44149d5b9 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -427,6 +427,10 @@ enum zone_type {
 	 *    techniques might use alloc_contig_range() to hide previously
 	 *    exposed pages from the buddy again (e.g., to implement some sort
 	 *    of memory unplug in virtio-mem).
+	 * 6. ZERO_PAGE(0), kernelcore/movablecore setups might create
+	 *    situations where ZERO_PAGE(0) which is allocated differently
+	 *    on different platforms may end up in a movable zone. ZERO_PAGE(0)
+	 *    cannot be migrated.
 	 *
 	 * In general, no unmovable allocations that degrade memory offlining
 	 * should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range())
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 5e772392a379..2194a9cd885c 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1111,6 +1111,7 @@ extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
 extern void untrack_pfn_moved(struct vm_area_struct *vma);
 #endif
 
+#ifdef CONFIG_MMU
 #ifdef __HAVE_COLOR_ZERO_PAGE
 static inline int is_zero_pfn(unsigned long pfn)
 {
@@ -1134,6 +1135,17 @@ static inline unsigned long my_zero_pfn(unsigned long addr)
 	return zero_pfn;
 }
 #endif
+#else
+static inline int is_zero_pfn(unsigned long pfn)
+{
+	return 0;
+}
+
+static inline unsigned long my_zero_pfn(unsigned long addr)
+{
+	return 0;
+}
+#endif /* CONFIG_MMU */
 
 #ifdef CONFIG_MMU
 
-- 
cgit v1.2.3


From d1e153fea2a8940273174fc17733c44323d35cd5 Mon Sep 17 00:00:00 2001
From: Pavel Tatashin <pasha.tatashin@soleen.com>
Date: Tue, 4 May 2021 18:39:08 -0700
Subject: mm/gup: migrate pinned pages out of movable zone

We should not pin pages in ZONE_MOVABLE.  Currently, only movable CMA pages
are excluded from pinning.  Generalize the function that migrates CMA pages
to migrate all movable pages.  Use is_pinnable_page() to check which pages
need to be migrated.

Link: https://lkml.kernel.org/r/20210215161349.246722-10-pasha.tatashin@soleen.com
Signed-off-by: Pavel Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: James Morris <jmorris@namei.org>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sasha Levin <sashal@kernel.org>
Cc: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Tyler Hicks <tyhicks@linux.microsoft.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/migrate.h        |  1 +
 include/linux/mmzone.h         |  9 ++++--
 include/trace/events/migrate.h |  3 +-
 mm/gup.c                       | 67 +++++++++++++++++++++---------------------
 4 files changed, 44 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 175ef15ae9e8..4bb4e519e3f5 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -27,6 +27,7 @@ enum migrate_reason {
 	MR_MEMPOLICY_MBIND,
 	MR_NUMA_MISPLACED,
 	MR_CONTIG_RANGE,
+	MR_LONGTERM_PIN,
 	MR_TYPES
 };
 
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 92b44149d5b9..e8922a67d1a4 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -407,8 +407,13 @@ enum zone_type {
 	 * to increase the number of THP/huge pages. Notable special cases are:
 	 *
 	 * 1. Pinned pages: (long-term) pinning of movable pages might
-	 *    essentially turn such pages unmovable. Memory offlining might
-	 *    retry a long time.
+	 *    essentially turn such pages unmovable. Therefore, we do not allow
+	 *    pinning long-term pages in ZONE_MOVABLE. When pages are pinned and
+	 *    faulted, they come from the right zone right away. However, it is
+	 *    still possible that address space already has pages in
+	 *    ZONE_MOVABLE at the time when pages are pinned (i.e. user has
+	 *    touched that memory before pinning). In such case we migrate them
+	 *    to a different zone. When migration fails - pinning fails.
 	 * 2. memblock allocations: kernelcore/movablecore setups might create
 	 *    situations where ZONE_MOVABLE contains unmovable allocations
 	 *    after boot. Memory offlining and allocations fail early.
diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h
index f2c990603888..9fb2a3bbcdfb 100644
--- a/include/trace/events/migrate.h
+++ b/include/trace/events/migrate.h
@@ -20,7 +20,8 @@
 	EM( MR_SYSCALL,		"syscall_or_cpuset")		\
 	EM( MR_MEMPOLICY_MBIND,	"mempolicy_mbind")		\
 	EM( MR_NUMA_MISPLACED,	"numa_misplaced")		\
-	EMe(MR_CONTIG_RANGE,	"contig_range")
+	EM( MR_CONTIG_RANGE,	"contig_range")			\
+	EMe(MR_LONGTERM_PIN,	"longterm_pin")
 
 /*
  * First define the enums in the above macros to be exported to userspace
diff --git a/mm/gup.c b/mm/gup.c
index a1eff7ad31da..4bc57420f535 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -87,11 +87,12 @@ __maybe_unused struct page *try_grab_compound_head(struct page *page,
 		int orig_refs = refs;
 
 		/*
-		 * Can't do FOLL_LONGTERM + FOLL_PIN with CMA in the gup fast
-		 * path, so fail and let the caller fall back to the slow path.
+		 * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
+		 * right zone, so fail and let the caller fall back to the slow
+		 * path.
 		 */
-		if (unlikely(flags & FOLL_LONGTERM) &&
-				is_migrate_cma_page(page))
+		if (unlikely((flags & FOLL_LONGTERM) &&
+			     !is_pinnable_page(page)))
 			return NULL;
 
 		/*
@@ -1600,17 +1601,17 @@ struct page *get_dump_page(unsigned long addr)
 }
 #endif /* CONFIG_ELF_CORE */
 
-#ifdef CONFIG_CMA
-static long check_and_migrate_cma_pages(struct mm_struct *mm,
-					unsigned long start,
-					unsigned long nr_pages,
-					struct page **pages,
-					struct vm_area_struct **vmas,
-					unsigned int gup_flags)
+#ifdef CONFIG_MIGRATION
+static long check_and_migrate_movable_pages(struct mm_struct *mm,
+					    unsigned long start,
+					    unsigned long nr_pages,
+					    struct page **pages,
+					    struct vm_area_struct **vmas,
+					    unsigned int gup_flags)
 {
 	unsigned long i, isolation_error_count;
 	bool drain_allow;
-	LIST_HEAD(cma_page_list);
+	LIST_HEAD(movable_page_list);
 	long ret = nr_pages;
 	struct page *prev_head, *head;
 	struct migration_target_control mtc = {
@@ -1628,13 +1629,12 @@ check_again:
 			continue;
 		prev_head = head;
 		/*
-		 * If we get a page from the CMA zone, since we are going to
-		 * be pinning these entries, we might as well move them out
-		 * of the CMA zone if possible.
+		 * If we get a movable page, since we are going to be pinning
+		 * these entries, try to move them out if possible.
 		 */
-		if (is_migrate_cma_page(head)) {
+		if (!is_pinnable_page(head)) {
 			if (PageHuge(head)) {
-				if (!isolate_huge_page(head, &cma_page_list))
+				if (!isolate_huge_page(head, &movable_page_list))
 					isolation_error_count++;
 			} else {
 				if (!PageLRU(head) && drain_allow) {
@@ -1646,7 +1646,7 @@ check_again:
 					isolation_error_count++;
 					continue;
 				}
-				list_add_tail(&head->lru, &cma_page_list);
+				list_add_tail(&head->lru, &movable_page_list);
 				mod_node_page_state(page_pgdat(head),
 						    NR_ISOLATED_ANON +
 						    page_is_file_lru(head),
@@ -1659,10 +1659,10 @@ check_again:
 	 * If list is empty, and no isolation errors, means that all pages are
 	 * in the correct zone.
 	 */
-	if (list_empty(&cma_page_list) && !isolation_error_count)
+	if (list_empty(&movable_page_list) && !isolation_error_count)
 		return ret;
 
-	if (!list_empty(&cma_page_list)) {
+	if (!list_empty(&movable_page_list)) {
 		/*
 		 * drop the above get_user_pages reference.
 		 */
@@ -1672,12 +1672,12 @@ check_again:
 			for (i = 0; i < nr_pages; i++)
 				put_page(pages[i]);
 
-		ret = migrate_pages(&cma_page_list, alloc_migration_target,
+		ret = migrate_pages(&movable_page_list, alloc_migration_target,
 				    NULL, (unsigned long)&mtc, MIGRATE_SYNC,
-				    MR_CONTIG_RANGE);
+				    MR_LONGTERM_PIN);
 		if (ret) {
-			if (!list_empty(&cma_page_list))
-				putback_movable_pages(&cma_page_list);
+			if (!list_empty(&movable_page_list))
+				putback_movable_pages(&movable_page_list);
 			return ret > 0 ? -ENOMEM : ret;
 		}
 
@@ -1696,16 +1696,16 @@ check_again:
 	goto check_again;
 }
 #else
-static long check_and_migrate_cma_pages(struct mm_struct *mm,
-					unsigned long start,
-					unsigned long nr_pages,
-					struct page **pages,
-					struct vm_area_struct **vmas,
-					unsigned int gup_flags)
+static long check_and_migrate_movable_pages(struct mm_struct *mm,
+					    unsigned long start,
+					    unsigned long nr_pages,
+					    struct page **pages,
+					    struct vm_area_struct **vmas,
+					    unsigned int gup_flags)
 {
 	return nr_pages;
 }
-#endif /* CONFIG_CMA */
+#endif /* CONFIG_MIGRATION */
 
 /*
  * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
@@ -1729,8 +1729,9 @@ static long __gup_longterm_locked(struct mm_struct *mm,
 
 	if (gup_flags & FOLL_LONGTERM) {
 		if (rc > 0)
-			rc = check_and_migrate_cma_pages(mm, start, rc, pages,
-							 vmas, gup_flags);
+			rc = check_and_migrate_movable_pages(mm, start, rc,
+							     pages, vmas,
+							     gup_flags);
 		memalloc_pin_restore(flags);
 	}
 	return rc;
-- 
cgit v1.2.3


From a08a2ae3461383c2d50d0997dcc6cd1dd1fefb08 Mon Sep 17 00:00:00 2001
From: Oscar Salvador <osalvador@suse.de>
Date: Tue, 4 May 2021 18:39:42 -0700
Subject: mm,memory_hotplug: allocate memmap from the added memory range

Physical memory hotadd has to allocate a memmap (struct page array) for
the newly added memory section.  Currently, alloc_pages_node() is used
for those allocations.

This has some disadvantages:
 a) an existing memory is consumed for that purpose
    (eg: ~2MB per 128MB memory section on x86_64)
    This can even lead to extreme cases where system goes OOM because
    the physically hotplugged memory depletes the available memory before
    it is onlined.
 b) if the whole node is movable then we have off-node struct pages
    which has performance drawbacks.
 c) It might be there are no PMD_ALIGNED chunks so memmap array gets
    populated with base pages.

This can be improved when CONFIG_SPARSEMEM_VMEMMAP is enabled.

Vmemmap page tables can map arbitrary memory.  That means that we can
reserve a part of the physically hotadded memory to back vmemmap page
tables.  This implementation uses the beginning of the hotplugged memory
for that purpose.

There are some non-obvious things to consider though.

Vmemmap pages are allocated/freed during the memory hotplug events
(add_memory_resource(), try_remove_memory()) when the memory is
added/removed.  This means that the reserved physical range is not
online although it is used.  The most obvious side effect is that
pfn_to_online_page() returns NULL for those pfns.  The current design
expects that this should be OK as the hotplugged memory is considered
garbage until it is onlined.  For example hibernation wouldn't save the
content of those vmemmaps into the image so it wouldn't be restored on
resume but this should be OK as there is no real content to recover anyway
while metadata is reachable from other data structures (e.g.  vmemmap
page tables).

The reserved space is therefore (de)initialized during the {on,off}line
events (mhp_{de}init_memmap_on_memory).  That is done by extracting page
allocator independent initialization from the regular onlining path.
The primary reason to handle the reserved space outside of
{on,off}line_pages is to make each initialization specific to the
purpose rather than special case them in a single function.

As per above, the functions that are introduced are:

 - mhp_init_memmap_on_memory:
   Initializes vmemmap pages by calling move_pfn_range_to_zone(), calls
   kasan_add_zero_shadow(), and onlines as many sections as vmemmap pages
   fully span.

 - mhp_deinit_memmap_on_memory:
   Offlines as many sections as vmemmap pages fully span, removes the
   range from the zone by remove_pfn_range_from_zone(), and calls
   kasan_remove_zero_shadow() for the range.

The new function memory_block_online() calls mhp_init_memmap_on_memory()
before doing the actual online_pages().  Should online_pages() fail, we
clean up by calling mhp_deinit_memmap_on_memory().  Adjusting of
present_pages is done at the end once we know that online_pages()
succeeded.

On offline, memory_block_offline() needs to unaccount vmemmap pages from
present_pages() before calling offline_pages().  This is necessary because
offline_pages() tears down some structures based on the fact whether the
node or the zone become empty.  If offline_pages() fails, we account back
vmemmap pages.  If it succeeds, we call mhp_deinit_memmap_on_memory().

Hot-remove:

 We need to be careful when removing memory, as adding and
 removing memory needs to be done with the same granularity.
 To check that this assumption is not violated, we check the
 memory range we want to remove and if a) any memory block has
 vmemmap pages and b) the range spans more than a single memory
 block, we scream out loud and refuse to proceed.

 If all is good and the range was using memmap on memory (aka vmemmap pages),
 we construct an altmap structure so free_hugepage_table does the right
 thing and calls vmem_altmap_free instead of free_pagetable.

Link: https://lkml.kernel.org/r/20210421102701.25051-5-osalvador@suse.de
Signed-off-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: David Hildenbrand <david@redhat.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/memory.c          |  72 ++++++++++++++++--
 include/linux/memory.h         |   8 +-
 include/linux/memory_hotplug.h |  15 +++-
 include/linux/memremap.h       |   2 +-
 include/linux/mmzone.h         |   7 +-
 mm/Kconfig                     |   5 ++
 mm/memory_hotplug.c            | 161 ++++++++++++++++++++++++++++++++++++++---
 mm/sparse.c                    |   2 -
 8 files changed, 250 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index f209925a5d4e..b31b3af5c490 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -173,16 +173,73 @@ static int memory_block_online(struct memory_block *mem)
 {
 	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
 	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
+	unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
+	struct zone *zone;
+	int ret;
+
+	zone = zone_for_pfn_range(mem->online_type, mem->nid, start_pfn, nr_pages);
+
+	/*
+	 * Although vmemmap pages have a different lifecycle than the pages
+	 * they describe (they remain until the memory is unplugged), doing
+	 * their initialization and accounting at memory onlining/offlining
+	 * stage helps to keep accounting easier to follow - e.g vmemmaps
+	 * belong to the same zone as the memory they backed.
+	 */
+	if (nr_vmemmap_pages) {
+		ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone);
+		if (ret)
+			return ret;
+	}
+
+	ret = online_pages(start_pfn + nr_vmemmap_pages,
+			   nr_pages - nr_vmemmap_pages, zone);
+	if (ret) {
+		if (nr_vmemmap_pages)
+			mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
+		return ret;
+	}
+
+	/*
+	 * Account once onlining succeeded. If the zone was unpopulated, it is
+	 * now already properly populated.
+	 */
+	if (nr_vmemmap_pages)
+		adjust_present_page_count(zone, nr_vmemmap_pages);
 
-	return online_pages(start_pfn, nr_pages, mem->online_type, mem->nid);
+	return ret;
 }
 
 static int memory_block_offline(struct memory_block *mem)
 {
 	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
 	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
+	unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
+	struct zone *zone;
+	int ret;
+
+	zone = page_zone(pfn_to_page(start_pfn));
+
+	/*
+	 * Unaccount before offlining, such that unpopulated zone and kthreads
+	 * can properly be torn down in offline_pages().
+	 */
+	if (nr_vmemmap_pages)
+		adjust_present_page_count(zone, -nr_vmemmap_pages);
 
-	return offline_pages(start_pfn, nr_pages);
+	ret = offline_pages(start_pfn + nr_vmemmap_pages,
+			    nr_pages - nr_vmemmap_pages);
+	if (ret) {
+		/* offline_pages() failed. Account back. */
+		if (nr_vmemmap_pages)
+			adjust_present_page_count(zone, nr_vmemmap_pages);
+		return ret;
+	}
+
+	if (nr_vmemmap_pages)
+		mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
+
+	return ret;
 }
 
 /*
@@ -576,7 +633,8 @@ int register_memory(struct memory_block *memory)
 	return ret;
 }
 
-static int init_memory_block(unsigned long block_id, unsigned long state)
+static int init_memory_block(unsigned long block_id, unsigned long state,
+			     unsigned long nr_vmemmap_pages)
 {
 	struct memory_block *mem;
 	int ret = 0;
@@ -593,6 +651,7 @@ static int init_memory_block(unsigned long block_id, unsigned long state)
 	mem->start_section_nr = block_id * sections_per_block;
 	mem->state = state;
 	mem->nid = NUMA_NO_NODE;
+	mem->nr_vmemmap_pages = nr_vmemmap_pages;
 
 	ret = register_memory(mem);
 
@@ -612,7 +671,7 @@ static int add_memory_block(unsigned long base_section_nr)
 	if (section_count == 0)
 		return 0;
 	return init_memory_block(memory_block_id(base_section_nr),
-				 MEM_ONLINE);
+				 MEM_ONLINE, 0);
 }
 
 static void unregister_memory(struct memory_block *memory)
@@ -634,7 +693,8 @@ static void unregister_memory(struct memory_block *memory)
  *
  * Called under device_hotplug_lock.
  */
-int create_memory_block_devices(unsigned long start, unsigned long size)
+int create_memory_block_devices(unsigned long start, unsigned long size,
+				unsigned long vmemmap_pages)
 {
 	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
 	unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
@@ -647,7 +707,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size)
 		return -EINVAL;
 
 	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
-		ret = init_memory_block(block_id, MEM_OFFLINE);
+		ret = init_memory_block(block_id, MEM_OFFLINE, vmemmap_pages);
 		if (ret)
 			break;
 	}
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 4da95e684e20..97e92e8b556a 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -29,6 +29,11 @@ struct memory_block {
 	int online_type;		/* for passing data to online routine */
 	int nid;			/* NID for this memory block */
 	struct device dev;
+	/*
+	 * Number of vmemmap pages. These pages
+	 * lay at the beginning of the memory block.
+	 */
+	unsigned long nr_vmemmap_pages;
 };
 
 int arch_get_memory_phys_device(unsigned long start_pfn);
@@ -80,7 +85,8 @@ static inline int memory_notify(unsigned long val, void *v)
 #else
 extern int register_memory_notifier(struct notifier_block *nb);
 extern void unregister_memory_notifier(struct notifier_block *nb);
-int create_memory_block_devices(unsigned long start, unsigned long size);
+int create_memory_block_devices(unsigned long start, unsigned long size,
+				unsigned long vmemmap_pages);
 void remove_memory_block_devices(unsigned long start, unsigned long size);
 extern void memory_dev_init(void);
 extern int memory_notify(unsigned long val, void *v);
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 7288aa5ef73b..28f32fd00fe9 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -55,6 +55,14 @@ typedef int __bitwise mhp_t;
  */
 #define MHP_MERGE_RESOURCE	((__force mhp_t)BIT(0))
 
+/*
+ * We want memmap (struct page array) to be self contained.
+ * To do so, we will use the beginning of the hot-added range to build
+ * the page tables for the memmap array that describes the entire range.
+ * Only selected architectures support it with SPARSE_VMEMMAP.
+ */
+#define MHP_MEMMAP_ON_MEMORY   ((__force mhp_t)BIT(1))
+
 /*
  * Extended parameters for memory hotplug:
  * altmap: alternative allocator for memmap array (optional)
@@ -99,9 +107,13 @@ static inline void zone_seqlock_init(struct zone *zone)
 extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages);
 extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages);
 extern int add_one_highpage(struct page *page, int pfn, int bad_ppro);
+extern void adjust_present_page_count(struct zone *zone, long nr_pages);
 /* VM interface that may be used by firmware interface */
+extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
+				     struct zone *zone);
+extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages);
 extern int online_pages(unsigned long pfn, unsigned long nr_pages,
-			int online_type, int nid);
+			struct zone *zone);
 extern struct zone *test_pages_in_a_zone(unsigned long start_pfn,
 					 unsigned long end_pfn);
 extern void __offline_isolated_pages(unsigned long start_pfn,
@@ -359,6 +371,7 @@ extern struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_
 extern int arch_create_linear_mapping(int nid, u64 start, u64 size,
 				      struct mhp_params *params);
 void arch_remove_linear_mapping(u64 start, u64 size);
+extern bool mhp_supports_memmap_on_memory(unsigned long size);
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 #endif /* __LINUX_MEMORY_HOTPLUG_H */
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index f5b464daeeca..45a79da89c5f 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -17,7 +17,7 @@ struct device;
  * @alloc: track pages consumed, private to vmemmap_populate()
  */
 struct vmem_altmap {
-	const unsigned long base_pfn;
+	unsigned long base_pfn;
 	const unsigned long end_pfn;
 	const unsigned long reserve;
 	unsigned long free;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index e8922a67d1a4..917bd6c604d5 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -436,6 +436,11 @@ enum zone_type {
 	 *    situations where ZERO_PAGE(0) which is allocated differently
 	 *    on different platforms may end up in a movable zone. ZERO_PAGE(0)
 	 *    cannot be migrated.
+	 * 7. Memory-hotplug: when using memmap_on_memory and onlining the
+	 *    memory to the MOVABLE zone, the vmemmap pages are also placed in
+	 *    such zone. Such pages cannot be really moved around as they are
+	 *    self-stored in the range, but they are treated as movable when
+	 *    the range they describe is about to be offlined.
 	 *
 	 * In general, no unmovable allocations that degrade memory offlining
 	 * should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range())
@@ -1392,10 +1397,8 @@ static inline int online_section_nr(unsigned long nr)
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn);
-#ifdef CONFIG_MEMORY_HOTREMOVE
 void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn);
 #endif
-#endif
 
 static inline struct mem_section *__pfn_to_section(unsigned long pfn)
 {
diff --git a/mm/Kconfig b/mm/Kconfig
index fe4897c3c81b..02d44e3420f5 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -188,6 +188,11 @@ config MEMORY_HOTREMOVE
 	depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
 	depends on MIGRATION
 
+config MHP_MEMMAP_ON_MEMORY
+	def_bool y
+	depends on MEMORY_HOTPLUG && SPARSEMEM_VMEMMAP
+	depends on ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
+
 # Heavily threaded applications may benefit from splitting the mm-wide
 # page_table_lock, so that faults on different parts of the user address
 # space can be handled with less contention: split it at this NR_CPUS.
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 04f01fabc150..0b3157836814 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -42,6 +42,8 @@
 #include "internal.h"
 #include "shuffle.h"
 
+static bool memmap_on_memory;
+
 /*
  * online_page_callback contains pointer to current page onlining function.
  * Initially it is generic_online_page(). If it is required it could be
@@ -648,9 +650,16 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
 	 * decide to not expose all pages to the buddy (e.g., expose them
 	 * later). We account all pages as being online and belonging to this
 	 * zone ("present").
+	 * When using memmap_on_memory, the range might not be aligned to
+	 * MAX_ORDER_NR_PAGES - 1, but pageblock aligned. __ffs() will detect
+	 * this and the first chunk to online will be pageblock_nr_pages.
 	 */
-	for (pfn = start_pfn; pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES)
-		(*online_page_callback)(pfn_to_page(pfn), MAX_ORDER - 1);
+	for (pfn = start_pfn; pfn < end_pfn;) {
+		int order = min(MAX_ORDER - 1UL, __ffs(pfn));
+
+		(*online_page_callback)(pfn_to_page(pfn), order);
+		pfn += (1UL << order);
+	}
 
 	/* mark all involved sections as online */
 	online_mem_sections(start_pfn, end_pfn);
@@ -829,7 +838,11 @@ struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
 	return default_zone_for_pfn(nid, start_pfn, nr_pages);
 }
 
-static void adjust_present_page_count(struct zone *zone, long nr_pages)
+/*
+ * This function should only be called by memory_block_{online,offline},
+ * and {online,offline}_pages.
+ */
+void adjust_present_page_count(struct zone *zone, long nr_pages)
 {
 	unsigned long flags;
 
@@ -839,12 +852,54 @@ static void adjust_present_page_count(struct zone *zone, long nr_pages)
 	pgdat_resize_unlock(zone->zone_pgdat, &flags);
 }
 
-int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
-		       int online_type, int nid)
+int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
+			      struct zone *zone)
+{
+	unsigned long end_pfn = pfn + nr_pages;
+	int ret;
+
+	ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
+	if (ret)
+		return ret;
+
+	move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE);
+
+	/*
+	 * It might be that the vmemmap_pages fully span sections. If that is
+	 * the case, mark those sections online here as otherwise they will be
+	 * left offline.
+	 */
+	if (nr_pages >= PAGES_PER_SECTION)
+	        online_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION));
+
+	return ret;
+}
+
+void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages)
+{
+	unsigned long end_pfn = pfn + nr_pages;
+
+	/*
+	 * It might be that the vmemmap_pages fully span sections. If that is
+	 * the case, mark those sections offline here as otherwise they will be
+	 * left online.
+	 */
+	if (nr_pages >= PAGES_PER_SECTION)
+		offline_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION));
+
+        /*
+	 * The pages associated with this vmemmap have been offlined, so
+	 * we can reset its state here.
+	 */
+	remove_pfn_range_from_zone(page_zone(pfn_to_page(pfn)), pfn, nr_pages);
+	kasan_remove_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
+}
+
+int __ref online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone)
 {
 	unsigned long flags;
-	struct zone *zone;
 	int need_zonelists_rebuild = 0;
+	const int nid = zone_to_nid(zone);
 	int ret;
 	struct memory_notify arg;
 
@@ -863,7 +918,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
 	mem_hotplug_begin();
 
 	/* associate pfn range with the zone */
-	zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages);
 	move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE);
 
 	arg.start_pfn = pfn;
@@ -1077,6 +1131,45 @@ static int online_memory_block(struct memory_block *mem, void *arg)
 	return device_online(&mem->dev);
 }
 
+bool mhp_supports_memmap_on_memory(unsigned long size)
+{
+	unsigned long nr_vmemmap_pages = size / PAGE_SIZE;
+	unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
+	unsigned long remaining_size = size - vmemmap_size;
+
+	/*
+	 * Besides having arch support and the feature enabled at runtime, we
+	 * need a few more assumptions to hold true:
+	 *
+	 * a) We span a single memory block: memory onlining/offlining happens
+	 *    in memory block granularity. We don't want the vmemmap of online
+	 *    memory blocks to reside on offline memory blocks. In the future,
+	 *    we might want to support variable-sized memory blocks to make the
+	 *    feature more versatile.
+	 *
+	 * b) The vmemmap pages span complete PMDs: We don't want vmemmap code
+	 *    to populate memory from the altmap for unrelated parts (i.e.,
+	 *    other memory blocks)
+	 *
+	 * c) The vmemmap pages (and thereby the pages that will be exposed to
+	 *    the buddy) have to cover full pageblocks: memory onlining/offlining
+	 *    code requires applicable ranges to be page-aligned, for example, to
+	 *    set the migratetypes properly.
+	 *
+	 * TODO: Although we have a check here to make sure that vmemmap pages
+	 *       fully populate a PMD, it is not the right place to check for
+	 *       this. A much better solution involves improving vmemmap code
+	 *       to fallback to base pages when trying to populate vmemmap using
+	 *       altmap as an alternative source of memory, and we do not exactly
+	 *       populate a single PMD.
+	 */
+	return memmap_on_memory &&
+	       IS_ENABLED(CONFIG_MHP_MEMMAP_ON_MEMORY) &&
+	       size == memory_block_size_bytes() &&
+	       IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
+	       IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT));
+}
+
 /*
  * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
  * and online/offline operations (triggered e.g. by sysfs).
@@ -1086,6 +1179,7 @@ static int online_memory_block(struct memory_block *mem, void *arg)
 int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 {
 	struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
+	struct vmem_altmap mhp_altmap = {};
 	u64 start, size;
 	bool new_node = false;
 	int ret;
@@ -1112,13 +1206,26 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 		goto error;
 	new_node = ret;
 
+	/*
+	 * Self hosted memmap array
+	 */
+	if (mhp_flags & MHP_MEMMAP_ON_MEMORY) {
+		if (!mhp_supports_memmap_on_memory(size)) {
+			ret = -EINVAL;
+			goto error;
+		}
+		mhp_altmap.free = PHYS_PFN(size);
+		mhp_altmap.base_pfn = PHYS_PFN(start);
+		params.altmap = &mhp_altmap;
+	}
+
 	/* call arch's memory hotadd */
 	ret = arch_add_memory(nid, start, size, &params);
 	if (ret < 0)
 		goto error;
 
 	/* create memory block devices after memory was added */
-	ret = create_memory_block_devices(start, size);
+	ret = create_memory_block_devices(start, size, mhp_altmap.alloc);
 	if (ret) {
 		arch_remove_memory(nid, start, size, NULL);
 		goto error;
@@ -1767,6 +1874,14 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
 	return 0;
 }
 
+static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg)
+{
+	/*
+	 * If not set, continue with the next block.
+	 */
+	return mem->nr_vmemmap_pages;
+}
+
 static int check_cpu_on_node(pg_data_t *pgdat)
 {
 	int cpu;
@@ -1841,6 +1956,9 @@ EXPORT_SYMBOL(try_offline_node);
 static int __ref try_remove_memory(int nid, u64 start, u64 size)
 {
 	int rc = 0;
+	struct vmem_altmap mhp_altmap = {};
+	struct vmem_altmap *altmap = NULL;
+	unsigned long nr_vmemmap_pages;
 
 	BUG_ON(check_hotplug_memory_range(start, size));
 
@@ -1853,6 +1971,31 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
 	if (rc)
 		return rc;
 
+	/*
+	 * We only support removing memory added with MHP_MEMMAP_ON_MEMORY in
+	 * the same granularity it was added - a single memory block.
+	 */
+	if (memmap_on_memory) {
+		nr_vmemmap_pages = walk_memory_blocks(start, size, NULL,
+						      get_nr_vmemmap_pages_cb);
+		if (nr_vmemmap_pages) {
+			if (size != memory_block_size_bytes()) {
+				pr_warn("Refuse to remove %#llx - %#llx,"
+					"wrong granularity\n",
+					start, start + size);
+				return -EINVAL;
+			}
+
+			/*
+			 * Let remove_pmd_table->free_hugepage_table do the
+			 * right thing if we used vmem_altmap when hot-adding
+			 * the range.
+			 */
+			mhp_altmap.alloc = nr_vmemmap_pages;
+			altmap = &mhp_altmap;
+		}
+	}
+
 	/* remove memmap entry */
 	firmware_map_remove(start, start + size, "System RAM");
 
@@ -1864,7 +2007,7 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
 
 	mem_hotplug_begin();
 
-	arch_remove_memory(nid, start, size, NULL);
+	arch_remove_memory(nid, start, size, altmap);
 
 	if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
 		memblock_free(start, size);
diff --git a/mm/sparse.c b/mm/sparse.c
index 33406ea2ecc4..d3fbed26e64e 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -624,7 +624,6 @@ void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
 	}
 }
 
-#ifdef CONFIG_MEMORY_HOTREMOVE
 /* Mark all memory sections within the pfn range as offline */
 void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
 {
@@ -645,7 +644,6 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
 		ms->section_mem_map &= ~SECTION_IS_ONLINE;
 	}
 }
-#endif
 
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 static struct page * __meminit populate_section_memmap(unsigned long pfn,
-- 
cgit v1.2.3


From 28961998f858114e51d2ae862065b858afcfa2b2 Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Tue, 4 May 2021 18:40:03 -0700
Subject: iov_iter: lift memzero_page() to highmem.h

Patch series "btrfs: Convert kmap/memset/kunmap to memzero_user()".

Lifting memzero_user(), convert it to kmap_local_page() and then use it
in btrfs.

This patch (of 3):

memzero_page() can replace the kmap/memset/kunmap pattern in other
places in the code.  While zero_user() has the same interface it is not
the same call and its use should be limited and some of those calls may
be better converted from zero_user() to memzero_page().[1] But that is
not addressed in this series.

Lift memzero_page() to highmem.

[1] https://lore.kernel.org/lkml/CAHk-=wijdojzo56FzYqE5TOYw2Vws7ik3LEMGj9SPQaJJ+Z73Q@mail.gmail.com/

Link: https://lkml.kernel.org/r/20210309212137.2610186-1-ira.weiny@intel.com
Link: https://lkml.kernel.org/r/20210309212137.2610186-2-ira.weiny@intel.com
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: David Sterba <dsterba@suse.com>
Cc: Chris Mason <clm@fb.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/highmem.h | 7 +++++++
 lib/iov_iter.c          | 8 +-------
 2 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 44170f312ae7..832b49b50c7b 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -332,4 +332,11 @@ static inline void memcpy_to_page(struct page *page, size_t offset,
 	kunmap_local(to);
 }
 
+static inline void memzero_page(struct page *page, size_t offset, size_t len)
+{
+	char *addr = kmap_atomic(page);
+	memset(addr + offset, 0, len);
+	kunmap_atomic(addr);
+}
+
 #endif /* _LINUX_HIGHMEM_H */
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 61228a6c69f8..c701b7a187f2 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -5,6 +5,7 @@
 #include <linux/fault-inject-usercopy.h>
 #include <linux/uio.h>
 #include <linux/pagemap.h>
+#include <linux/highmem.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/splice.h>
@@ -507,13 +508,6 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction,
 }
 EXPORT_SYMBOL(iov_iter_init);
 
-static void memzero_page(struct page *page, size_t offset, size_t len)
-{
-	char *addr = kmap_atomic(page);
-	memset(addr + offset, 0, len);
-	kunmap_atomic(addr);
-}
-
 static inline bool allocated(struct pipe_buffer *buf)
 {
 	return buf->ops == &default_pipe_buf_ops;
-- 
cgit v1.2.3


From 866a6dadbb027b2955a7ae00bab9705d382def12 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpengli@tencent.com>
Date: Tue, 4 May 2021 17:27:28 -0700
Subject: context_tracking: Move guest exit context tracking to separate
 helpers

Provide separate context tracking helpers for guest exit, the standalone
helpers will be called separately by KVM x86 in later patches to fix
tick-based accounting.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20210505002735.1684165-2-seanjc@google.com
---
 include/linux/context_tracking.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index bceb06498521..b8c7313495a7 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -131,10 +131,15 @@ static __always_inline void guest_enter_irqoff(void)
 	}
 }
 
-static __always_inline void guest_exit_irqoff(void)
+static __always_inline void context_tracking_guest_exit(void)
 {
 	if (context_tracking_enabled())
 		__context_tracking_exit(CONTEXT_GUEST);
+}
+
+static __always_inline void guest_exit_irqoff(void)
+{
+	context_tracking_guest_exit();
 
 	instrumentation_begin();
 	if (vtime_accounting_enabled_this_cpu())
@@ -159,6 +164,8 @@ static __always_inline void guest_enter_irqoff(void)
 	instrumentation_end();
 }
 
+static __always_inline void context_tracking_guest_exit(void) { }
+
 static __always_inline void guest_exit_irqoff(void)
 {
 	instrumentation_begin();
-- 
cgit v1.2.3


From 88d8220bbf06dd8045b2ac4be1046290eaa7773a Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpengli@tencent.com>
Date: Tue, 4 May 2021 17:27:29 -0700
Subject: context_tracking: Move guest exit vtime accounting to separate
 helpers

Provide separate vtime accounting functions for guest exit instead of
open coding the logic within the context tracking code.  This will allow
KVM x86 to handle vtime accounting slightly differently when using
tick-based accounting.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Link: https://lore.kernel.org/r/20210505002735.1684165-3-seanjc@google.com
---
 include/linux/context_tracking.h | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index b8c7313495a7..4f4556232dcf 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -137,15 +137,20 @@ static __always_inline void context_tracking_guest_exit(void)
 		__context_tracking_exit(CONTEXT_GUEST);
 }
 
-static __always_inline void guest_exit_irqoff(void)
+static __always_inline void vtime_account_guest_exit(void)
 {
-	context_tracking_guest_exit();
-
-	instrumentation_begin();
 	if (vtime_accounting_enabled_this_cpu())
 		vtime_guest_exit(current);
 	else
 		current->flags &= ~PF_VCPU;
+}
+
+static __always_inline void guest_exit_irqoff(void)
+{
+	context_tracking_guest_exit();
+
+	instrumentation_begin();
+	vtime_account_guest_exit();
 	instrumentation_end();
 }
 
@@ -166,12 +171,17 @@ static __always_inline void guest_enter_irqoff(void)
 
 static __always_inline void context_tracking_guest_exit(void) { }
 
+static __always_inline void vtime_account_guest_exit(void)
+{
+	vtime_account_kernel(current);
+	current->flags &= ~PF_VCPU;
+}
+
 static __always_inline void guest_exit_irqoff(void)
 {
 	instrumentation_begin();
 	/* Flush the guest cputime we spent on the guest */
-	vtime_account_kernel(current);
-	current->flags &= ~PF_VCPU;
+	vtime_account_guest_exit();
 	instrumentation_end();
 }
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
-- 
cgit v1.2.3


From b41c723b203e19480c26f2ec8f04eedc03d34b34 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Tue, 4 May 2021 17:27:31 -0700
Subject: sched/vtime: Move vtime accounting external declarations above
 inlines

Move the blob of external declarations (and their stubs) above the set of
inline definitions (and their stubs) for vtime accounting.  This will
allow a future patch to bring in more inline definitions without also
having to shuffle large chunks of code.

No functional change intended.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Link: https://lore.kernel.org/r/20210505002735.1684165-5-seanjc@google.com
---
 include/linux/vtime.h | 74 +++++++++++++++++++++++++--------------------------
 1 file changed, 37 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vtime.h b/include/linux/vtime.h
index 041d6524d144..6a4317560539 100644
--- a/include/linux/vtime.h
+++ b/include/linux/vtime.h
@@ -10,6 +10,43 @@
 
 struct task_struct;
 
+/*
+ * Common vtime APIs
+ */
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+extern void vtime_account_kernel(struct task_struct *tsk);
+extern void vtime_account_idle(struct task_struct *tsk);
+#else /* !CONFIG_VIRT_CPU_ACCOUNTING */
+static inline void vtime_account_kernel(struct task_struct *tsk) { }
+#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+extern void arch_vtime_task_switch(struct task_struct *tsk);
+extern void vtime_user_enter(struct task_struct *tsk);
+extern void vtime_user_exit(struct task_struct *tsk);
+extern void vtime_guest_enter(struct task_struct *tsk);
+extern void vtime_guest_exit(struct task_struct *tsk);
+extern void vtime_init_idle(struct task_struct *tsk, int cpu);
+#else /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN  */
+static inline void vtime_user_enter(struct task_struct *tsk) { }
+static inline void vtime_user_exit(struct task_struct *tsk) { }
+static inline void vtime_guest_enter(struct task_struct *tsk) { }
+static inline void vtime_guest_exit(struct task_struct *tsk) { }
+static inline void vtime_init_idle(struct task_struct *tsk, int cpu) { }
+#endif
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+extern void vtime_account_irq(struct task_struct *tsk, unsigned int offset);
+extern void vtime_account_softirq(struct task_struct *tsk);
+extern void vtime_account_hardirq(struct task_struct *tsk);
+extern void vtime_flush(struct task_struct *tsk);
+#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
+static inline void vtime_account_irq(struct task_struct *tsk, unsigned int offset) { }
+static inline void vtime_account_softirq(struct task_struct *tsk) { }
+static inline void vtime_account_hardirq(struct task_struct *tsk) { }
+static inline void vtime_flush(struct task_struct *tsk) { }
+#endif
+
 /*
  * vtime_accounting_enabled_this_cpu() definitions/declarations
  */
@@ -57,43 +94,6 @@ static inline void vtime_task_switch(struct task_struct *prev) { }
 
 #endif
 
-/*
- * Common vtime APIs
- */
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-extern void vtime_account_kernel(struct task_struct *tsk);
-extern void vtime_account_idle(struct task_struct *tsk);
-#else /* !CONFIG_VIRT_CPU_ACCOUNTING */
-static inline void vtime_account_kernel(struct task_struct *tsk) { }
-#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
-
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-extern void arch_vtime_task_switch(struct task_struct *tsk);
-extern void vtime_user_enter(struct task_struct *tsk);
-extern void vtime_user_exit(struct task_struct *tsk);
-extern void vtime_guest_enter(struct task_struct *tsk);
-extern void vtime_guest_exit(struct task_struct *tsk);
-extern void vtime_init_idle(struct task_struct *tsk, int cpu);
-#else /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN  */
-static inline void vtime_user_enter(struct task_struct *tsk) { }
-static inline void vtime_user_exit(struct task_struct *tsk) { }
-static inline void vtime_guest_enter(struct task_struct *tsk) { }
-static inline void vtime_guest_exit(struct task_struct *tsk) { }
-static inline void vtime_init_idle(struct task_struct *tsk, int cpu) { }
-#endif
-
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-extern void vtime_account_irq(struct task_struct *tsk, unsigned int offset);
-extern void vtime_account_softirq(struct task_struct *tsk);
-extern void vtime_account_hardirq(struct task_struct *tsk);
-extern void vtime_flush(struct task_struct *tsk);
-#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
-static inline void vtime_account_irq(struct task_struct *tsk, unsigned int offset) { }
-static inline void vtime_account_softirq(struct task_struct *tsk) { }
-static inline void vtime_account_hardirq(struct task_struct *tsk) { }
-static inline void vtime_flush(struct task_struct *tsk) { }
-#endif
-
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 extern void irqtime_account_irq(struct task_struct *tsk, unsigned int offset);
-- 
cgit v1.2.3


From 6f922b89e5518143920b10e3643e556d9df58d94 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Tue, 4 May 2021 17:27:32 -0700
Subject: sched/vtime: Move guest enter/exit vtime accounting to vtime.h

Provide separate helpers for guest enter vtime accounting (in addition to
the existing guest exit helpers), and move all vtime accounting helpers
to vtime.h where the existing #ifdef infrastructure can be leveraged to
better delineate the different types of accounting.  This will also allow
future cleanups via deduplication of context tracking code.

Opportunistically delete the vtime_account_kernel() stub now that all
callers are wrapped with CONFIG_VIRT_CPU_ACCOUNTING_NATIVE=y.

No functional change intended.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20210505002735.1684165-6-seanjc@google.com
---
 include/linux/context_tracking.h | 17 +--------------
 include/linux/vtime.h            | 46 ++++++++++++++++++++++++++++++++++------
 2 files changed, 41 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index 4f4556232dcf..56c648bdbde8 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -137,14 +137,6 @@ static __always_inline void context_tracking_guest_exit(void)
 		__context_tracking_exit(CONTEXT_GUEST);
 }
 
-static __always_inline void vtime_account_guest_exit(void)
-{
-	if (vtime_accounting_enabled_this_cpu())
-		vtime_guest_exit(current);
-	else
-		current->flags &= ~PF_VCPU;
-}
-
 static __always_inline void guest_exit_irqoff(void)
 {
 	context_tracking_guest_exit();
@@ -163,20 +155,13 @@ static __always_inline void guest_enter_irqoff(void)
 	 * to flush.
 	 */
 	instrumentation_begin();
-	vtime_account_kernel(current);
-	current->flags |= PF_VCPU;
+	vtime_account_guest_enter();
 	rcu_virt_note_context_switch(smp_processor_id());
 	instrumentation_end();
 }
 
 static __always_inline void context_tracking_guest_exit(void) { }
 
-static __always_inline void vtime_account_guest_exit(void)
-{
-	vtime_account_kernel(current);
-	current->flags &= ~PF_VCPU;
-}
-
 static __always_inline void guest_exit_irqoff(void)
 {
 	instrumentation_begin();
diff --git a/include/linux/vtime.h b/include/linux/vtime.h
index 6a4317560539..3684487d01e1 100644
--- a/include/linux/vtime.h
+++ b/include/linux/vtime.h
@@ -3,21 +3,18 @@
 #define _LINUX_KERNEL_VTIME_H
 
 #include <linux/context_tracking_state.h>
+#include <linux/sched.h>
+
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 #include <asm/vtime.h>
 #endif
 
-
-struct task_struct;
-
 /*
  * Common vtime APIs
  */
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
 extern void vtime_account_kernel(struct task_struct *tsk);
 extern void vtime_account_idle(struct task_struct *tsk);
-#else /* !CONFIG_VIRT_CPU_ACCOUNTING */
-static inline void vtime_account_kernel(struct task_struct *tsk) { }
 #endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
@@ -55,6 +52,18 @@ static inline void vtime_flush(struct task_struct *tsk) { }
 static inline bool vtime_accounting_enabled_this_cpu(void) { return true; }
 extern void vtime_task_switch(struct task_struct *prev);
 
+static __always_inline void vtime_account_guest_enter(void)
+{
+	vtime_account_kernel(current);
+	current->flags |= PF_VCPU;
+}
+
+static __always_inline void vtime_account_guest_exit(void)
+{
+	vtime_account_kernel(current);
+	current->flags &= ~PF_VCPU;
+}
+
 #elif defined(CONFIG_VIRT_CPU_ACCOUNTING_GEN)
 
 /*
@@ -86,12 +95,37 @@ static inline void vtime_task_switch(struct task_struct *prev)
 		vtime_task_switch_generic(prev);
 }
 
+static __always_inline void vtime_account_guest_enter(void)
+{
+	if (vtime_accounting_enabled_this_cpu())
+		vtime_guest_enter(current);
+	else
+		current->flags |= PF_VCPU;
+}
+
+static __always_inline void vtime_account_guest_exit(void)
+{
+	if (vtime_accounting_enabled_this_cpu())
+		vtime_guest_exit(current);
+	else
+		current->flags &= ~PF_VCPU;
+}
+
 #else /* !CONFIG_VIRT_CPU_ACCOUNTING */
 
-static inline bool vtime_accounting_enabled_cpu(int cpu) {return false; }
 static inline bool vtime_accounting_enabled_this_cpu(void) { return false; }
 static inline void vtime_task_switch(struct task_struct *prev) { }
 
+static __always_inline void vtime_account_guest_enter(void)
+{
+	current->flags |= PF_VCPU;
+}
+
+static __always_inline void vtime_account_guest_exit(void)
+{
+	current->flags &= ~PF_VCPU;
+}
+
 #endif
 
 
-- 
cgit v1.2.3


From 14296e0c447885d6c7b326e059fb528eb00526ed Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Tue, 4 May 2021 17:27:33 -0700
Subject: context_tracking: Consolidate guest enter/exit wrappers

Consolidate the guest enter/exit wrappers, providing and tweaking stubs
as needed.  This will allow moving the wrappers under KVM without having
to bleed #ifdefs into the soon-to-be KVM code.

No functional change intended.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20210505002735.1684165-7-seanjc@google.com
---
 include/linux/context_tracking.h | 65 +++++++++++++++-------------------------
 1 file changed, 24 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index 56c648bdbde8..aa58c2ac67ca 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -71,6 +71,19 @@ static inline void exception_exit(enum ctx_state prev_ctx)
 	}
 }
 
+static __always_inline bool context_tracking_guest_enter(void)
+{
+	if (context_tracking_enabled())
+		__context_tracking_enter(CONTEXT_GUEST);
+
+	return context_tracking_enabled_this_cpu();
+}
+
+static __always_inline void context_tracking_guest_exit(void)
+{
+	if (context_tracking_enabled())
+		__context_tracking_exit(CONTEXT_GUEST);
+}
 
 /**
  * ct_state() - return the current context tracking state if known
@@ -92,6 +105,9 @@ static inline void user_exit_irqoff(void) { }
 static inline enum ctx_state exception_enter(void) { return 0; }
 static inline void exception_exit(enum ctx_state prev_ctx) { }
 static inline enum ctx_state ct_state(void) { return CONTEXT_DISABLED; }
+static inline bool context_tracking_guest_enter(void) { return false; }
+static inline void context_tracking_guest_exit(void) { }
+
 #endif /* !CONFIG_CONTEXT_TRACKING */
 
 #define CT_WARN_ON(cond) WARN_ON(context_tracking_enabled() && (cond))
@@ -102,74 +118,41 @@ extern void context_tracking_init(void);
 static inline void context_tracking_init(void) { }
 #endif /* CONFIG_CONTEXT_TRACKING_FORCE */
 
-
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 /* must be called with irqs disabled */
 static __always_inline void guest_enter_irqoff(void)
 {
+	/*
+	 * This is running in ioctl context so its safe to assume that it's the
+	 * stime pending cputime to flush.
+	 */
 	instrumentation_begin();
-	if (vtime_accounting_enabled_this_cpu())
-		vtime_guest_enter(current);
-	else
-		current->flags |= PF_VCPU;
+	vtime_account_guest_enter();
 	instrumentation_end();
 
-	if (context_tracking_enabled())
-		__context_tracking_enter(CONTEXT_GUEST);
-
-	/* KVM does not hold any references to rcu protected data when it
+	/*
+	 * KVM does not hold any references to rcu protected data when it
 	 * switches CPU into a guest mode. In fact switching to a guest mode
 	 * is very similar to exiting to userspace from rcu point of view. In
 	 * addition CPU may stay in a guest mode for quite a long time (up to
 	 * one time slice). Lets treat guest mode as quiescent state, just like
 	 * we do with user-mode execution.
 	 */
-	if (!context_tracking_enabled_this_cpu()) {
+	if (!context_tracking_guest_enter()) {
 		instrumentation_begin();
 		rcu_virt_note_context_switch(smp_processor_id());
 		instrumentation_end();
 	}
 }
 
-static __always_inline void context_tracking_guest_exit(void)
-{
-	if (context_tracking_enabled())
-		__context_tracking_exit(CONTEXT_GUEST);
-}
-
 static __always_inline void guest_exit_irqoff(void)
 {
 	context_tracking_guest_exit();
 
-	instrumentation_begin();
-	vtime_account_guest_exit();
-	instrumentation_end();
-}
-
-#else
-static __always_inline void guest_enter_irqoff(void)
-{
-	/*
-	 * This is running in ioctl context so its safe
-	 * to assume that it's the stime pending cputime
-	 * to flush.
-	 */
-	instrumentation_begin();
-	vtime_account_guest_enter();
-	rcu_virt_note_context_switch(smp_processor_id());
-	instrumentation_end();
-}
-
-static __always_inline void context_tracking_guest_exit(void) { }
-
-static __always_inline void guest_exit_irqoff(void)
-{
 	instrumentation_begin();
 	/* Flush the guest cputime we spent on the guest */
 	vtime_account_guest_exit();
 	instrumentation_end();
 }
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
 
 static inline void guest_exit(void)
 {
-- 
cgit v1.2.3


From 1ca0016c149be35fe19a6b75fce95c25807b7159 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Tue, 4 May 2021 17:27:34 -0700
Subject: context_tracking: KVM: Move guest enter/exit wrappers to KVM's domain

Move the guest enter/exit wrappers to kvm_host.h so that KVM can manage
its context tracking vs. vtime accounting without bleeding too many KVM
details into the context tracking code.

No functional change intended.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20210505002735.1684165-8-seanjc@google.com
---
 include/linux/context_tracking.h | 45 ----------------------------------------
 include/linux/kvm_host.h         | 45 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index aa58c2ac67ca..4d7fced3a39f 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -118,49 +118,4 @@ extern void context_tracking_init(void);
 static inline void context_tracking_init(void) { }
 #endif /* CONFIG_CONTEXT_TRACKING_FORCE */
 
-/* must be called with irqs disabled */
-static __always_inline void guest_enter_irqoff(void)
-{
-	/*
-	 * This is running in ioctl context so its safe to assume that it's the
-	 * stime pending cputime to flush.
-	 */
-	instrumentation_begin();
-	vtime_account_guest_enter();
-	instrumentation_end();
-
-	/*
-	 * KVM does not hold any references to rcu protected data when it
-	 * switches CPU into a guest mode. In fact switching to a guest mode
-	 * is very similar to exiting to userspace from rcu point of view. In
-	 * addition CPU may stay in a guest mode for quite a long time (up to
-	 * one time slice). Lets treat guest mode as quiescent state, just like
-	 * we do with user-mode execution.
-	 */
-	if (!context_tracking_guest_enter()) {
-		instrumentation_begin();
-		rcu_virt_note_context_switch(smp_processor_id());
-		instrumentation_end();
-	}
-}
-
-static __always_inline void guest_exit_irqoff(void)
-{
-	context_tracking_guest_exit();
-
-	instrumentation_begin();
-	/* Flush the guest cputime we spent on the guest */
-	vtime_account_guest_exit();
-	instrumentation_end();
-}
-
-static inline void guest_exit(void)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	guest_exit_irqoff();
-	local_irq_restore(flags);
-}
-
 #endif
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 8895b95b6a22..2f34487e21f2 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -338,6 +338,51 @@ struct kvm_vcpu {
 	struct kvm_dirty_ring dirty_ring;
 };
 
+/* must be called with irqs disabled */
+static __always_inline void guest_enter_irqoff(void)
+{
+	/*
+	 * This is running in ioctl context so its safe to assume that it's the
+	 * stime pending cputime to flush.
+	 */
+	instrumentation_begin();
+	vtime_account_guest_enter();
+	instrumentation_end();
+
+	/*
+	 * KVM does not hold any references to rcu protected data when it
+	 * switches CPU into a guest mode. In fact switching to a guest mode
+	 * is very similar to exiting to userspace from rcu point of view. In
+	 * addition CPU may stay in a guest mode for quite a long time (up to
+	 * one time slice). Lets treat guest mode as quiescent state, just like
+	 * we do with user-mode execution.
+	 */
+	if (!context_tracking_guest_enter()) {
+		instrumentation_begin();
+		rcu_virt_note_context_switch(smp_processor_id());
+		instrumentation_end();
+	}
+}
+
+static __always_inline void guest_exit_irqoff(void)
+{
+	context_tracking_guest_exit();
+
+	instrumentation_begin();
+	/* Flush the guest cputime we spent on the guest */
+	vtime_account_guest_exit();
+	instrumentation_end();
+}
+
+static inline void guest_exit(void)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	guest_exit_irqoff();
+	local_irq_restore(flags);
+}
+
 static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
 {
 	/*
-- 
cgit v1.2.3


From 1139aeb1c521eb4a050920ce6c64c36c4f2a3ab7 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 5 May 2021 23:12:42 +0200
Subject: smp: Fix smp_call_function_single_async prototype

As of commit 966a967116e6 ("smp: Avoid using two cache lines for struct
call_single_data"), the smp code prefers 32-byte aligned call_single_data
objects for performance reasons, but the block layer includes an instance
of this structure in the main 'struct request' that is more senstive
to size than to performance here, see 4ccafe032005 ("block: unalign
call_single_data in struct request").

The result is a violation of the calling conventions that clang correctly
points out:

block/blk-mq.c:630:39: warning: passing 8-byte aligned argument to 32-byte aligned parameter 2 of 'smp_call_function_single_async' may result in an unaligned pointer access [-Walign-mismatch]
                smp_call_function_single_async(cpu, &rq->csd);

It does seem that the usage of the call_single_data without cache line
alignment should still be allowed by the smp code, so just change the
function prototype so it accepts both, but leave the default alignment
unchanged for the other users. This seems better to me than adding
a local hack to shut up an otherwise correct warning in the caller.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Jens Axboe <axboe@kernel.dk>
Link: https://lkml.kernel.org/r/20210505211300.3174456-1-arnd@kernel.org
---
 include/linux/smp.h |  2 +-
 kernel/smp.c        | 26 +++++++++++++-------------
 kernel/up.c         |  2 +-
 3 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/smp.h b/include/linux/smp.h
index 84a0b4828f66..f0d3ef654207 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -53,7 +53,7 @@ int smp_call_function_single(int cpuid, smp_call_func_t func, void *info,
 void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
 			   void *info, bool wait, const struct cpumask *mask);
 
-int smp_call_function_single_async(int cpu, call_single_data_t *csd);
+int smp_call_function_single_async(int cpu, struct __call_single_data *csd);
 
 /*
  * Call a function on all processors
diff --git a/kernel/smp.c b/kernel/smp.c
index e21074900006..52bf159ec400 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -211,7 +211,7 @@ static u64 cfd_seq_inc(unsigned int src, unsigned int dst, unsigned int type)
 	} while (0)
 
 /* Record current CSD work for current CPU, NULL to erase. */
-static void __csd_lock_record(call_single_data_t *csd)
+static void __csd_lock_record(struct __call_single_data *csd)
 {
 	if (!csd) {
 		smp_mb(); /* NULL cur_csd after unlock. */
@@ -226,13 +226,13 @@ static void __csd_lock_record(call_single_data_t *csd)
 		  /* Or before unlock, as the case may be. */
 }
 
-static __always_inline void csd_lock_record(call_single_data_t *csd)
+static __always_inline void csd_lock_record(struct __call_single_data *csd)
 {
 	if (static_branch_unlikely(&csdlock_debug_enabled))
 		__csd_lock_record(csd);
 }
 
-static int csd_lock_wait_getcpu(call_single_data_t *csd)
+static int csd_lock_wait_getcpu(struct __call_single_data *csd)
 {
 	unsigned int csd_type;
 
@@ -282,7 +282,7 @@ static const char *csd_lock_get_type(unsigned int type)
 	return (type >= ARRAY_SIZE(seq_type)) ? "?" : seq_type[type];
 }
 
-static void csd_lock_print_extended(call_single_data_t *csd, int cpu)
+static void csd_lock_print_extended(struct __call_single_data *csd, int cpu)
 {
 	struct cfd_seq_local *seq = &per_cpu(cfd_seq_local, cpu);
 	unsigned int srccpu = csd->node.src;
@@ -321,7 +321,7 @@ static void csd_lock_print_extended(call_single_data_t *csd, int cpu)
  * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU,
  * so waiting on other types gets much less information.
  */
-static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id)
+static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *ts1, int *bug_id)
 {
 	int cpu = -1;
 	int cpux;
@@ -387,7 +387,7 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in
  * previous function call. For multi-cpu calls its even more interesting
  * as we'll have to ensure no other cpu is observing our csd.
  */
-static void __csd_lock_wait(call_single_data_t *csd)
+static void __csd_lock_wait(struct __call_single_data *csd)
 {
 	int bug_id = 0;
 	u64 ts0, ts1;
@@ -401,7 +401,7 @@ static void __csd_lock_wait(call_single_data_t *csd)
 	smp_acquire__after_ctrl_dep();
 }
 
-static __always_inline void csd_lock_wait(call_single_data_t *csd)
+static __always_inline void csd_lock_wait(struct __call_single_data *csd)
 {
 	if (static_branch_unlikely(&csdlock_debug_enabled)) {
 		__csd_lock_wait(csd);
@@ -431,17 +431,17 @@ static void __smp_call_single_queue_debug(int cpu, struct llist_node *node)
 #else
 #define cfd_seq_store(var, src, dst, type)
 
-static void csd_lock_record(call_single_data_t *csd)
+static void csd_lock_record(struct __call_single_data *csd)
 {
 }
 
-static __always_inline void csd_lock_wait(call_single_data_t *csd)
+static __always_inline void csd_lock_wait(struct __call_single_data *csd)
 {
 	smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK));
 }
 #endif
 
-static __always_inline void csd_lock(call_single_data_t *csd)
+static __always_inline void csd_lock(struct __call_single_data *csd)
 {
 	csd_lock_wait(csd);
 	csd->node.u_flags |= CSD_FLAG_LOCK;
@@ -454,7 +454,7 @@ static __always_inline void csd_lock(call_single_data_t *csd)
 	smp_wmb();
 }
 
-static __always_inline void csd_unlock(call_single_data_t *csd)
+static __always_inline void csd_unlock(struct __call_single_data *csd)
 {
 	WARN_ON(!(csd->node.u_flags & CSD_FLAG_LOCK));
 
@@ -501,7 +501,7 @@ void __smp_call_single_queue(int cpu, struct llist_node *node)
  * for execution on the given CPU. data must already have
  * ->func, ->info, and ->flags set.
  */
-static int generic_exec_single(int cpu, call_single_data_t *csd)
+static int generic_exec_single(int cpu, struct __call_single_data *csd)
 {
 	if (cpu == smp_processor_id()) {
 		smp_call_func_t func = csd->func;
@@ -784,7 +784,7 @@ EXPORT_SYMBOL(smp_call_function_single);
  * NOTE: Be careful, there is unfortunately no current debugging facility to
  * validate the correctness of this serialization.
  */
-int smp_call_function_single_async(int cpu, call_single_data_t *csd)
+int smp_call_function_single_async(int cpu, struct __call_single_data *csd)
 {
 	int err = 0;
 
diff --git a/kernel/up.c b/kernel/up.c
index bf20b4a9af60..c7321307923a 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -25,7 +25,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 }
 EXPORT_SYMBOL(smp_call_function_single);
 
-int smp_call_function_single_async(int cpu, call_single_data_t *csd)
+int smp_call_function_single_async(int cpu, struct __call_single_data *csd)
 {
 	unsigned long flags;
 
-- 
cgit v1.2.3


From d4455faccd6cbe11ddfdbe28723a2122453b4f4e Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 6 May 2021 18:02:16 -0700
Subject: proc: mandate ->proc_lseek in "struct proc_ops"

Now that proc_ops are separate from file_operations and other operations,
it is easy to check that all instances have a ->proc_lseek hook and remove
the check in the main code.

Note:
nonseekable_open() files naturally don't require ->proc_lseek.

Garbage collect pde_lseek() function.

[adobriyan@gmail.com: smoke test lseek()]
  Link: https://lkml.kernel.org/r/YG4OIhChOrVTPgdN@localhost.localdomain

Link: https://lkml.kernel.org/r/YFYX0Bzwxlc7aBa/@localhost.localdomain
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/isdn/capi/kcapi_proc.c                     |  1 +
 drivers/net/wireless/intersil/hostap/hostap_proc.c |  1 +
 drivers/scsi/esas2r/esas2r_main.c                  |  1 +
 fs/proc/inode.c                                    | 14 ++------------
 include/linux/proc_fs.h                            |  1 +
 tools/testing/selftests/proc/read.c                |  4 +++-
 6 files changed, 9 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/isdn/capi/kcapi_proc.c b/drivers/isdn/capi/kcapi_proc.c
index b5ed4ea145cb..77e951206809 100644
--- a/drivers/isdn/capi/kcapi_proc.c
+++ b/drivers/isdn/capi/kcapi_proc.c
@@ -201,6 +201,7 @@ static ssize_t empty_read(struct file *file, char __user *buf,
 
 static const struct proc_ops empty_proc_ops = {
 	.proc_read	= empty_read,
+	.proc_lseek	= default_llseek,
 };
 
 // ---------------------------------------------------------------------------
diff --git a/drivers/net/wireless/intersil/hostap/hostap_proc.c b/drivers/net/wireless/intersil/hostap/hostap_proc.c
index 97c270845fd1..51c847d98755 100644
--- a/drivers/net/wireless/intersil/hostap/hostap_proc.c
+++ b/drivers/net/wireless/intersil/hostap/hostap_proc.c
@@ -227,6 +227,7 @@ static ssize_t prism2_aux_dump_proc_no_read(struct file *file, char __user *buf,
 
 static const struct proc_ops prism2_aux_dump_proc_ops = {
 	.proc_read	= prism2_aux_dump_proc_no_read,
+	.proc_lseek	= default_llseek,
 };
 
 
diff --git a/drivers/scsi/esas2r/esas2r_main.c b/drivers/scsi/esas2r/esas2r_main.c
index 5d9eeac6717a..45ec9f16c085 100644
--- a/drivers/scsi/esas2r/esas2r_main.c
+++ b/drivers/scsi/esas2r/esas2r_main.c
@@ -616,6 +616,7 @@ static const struct file_operations esas2r_proc_fops = {
 };
 
 static const struct proc_ops esas2r_proc_ops = {
+	.proc_lseek		= default_llseek,
 	.proc_ioctl		= esas2r_proc_ioctl,
 #ifdef CONFIG_COMPAT
 	.proc_compat_ioctl	= compat_ptr_ioctl,
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index bde6b6f69852..2ea8aaa7206e 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -273,25 +273,15 @@ void proc_entry_rundown(struct proc_dir_entry *de)
 	spin_unlock(&de->pde_unload_lock);
 }
 
-static loff_t pde_lseek(struct proc_dir_entry *pde, struct file *file, loff_t offset, int whence)
-{
-	typeof_member(struct proc_ops, proc_lseek) lseek;
-
-	lseek = pde->proc_ops->proc_lseek;
-	if (!lseek)
-		lseek = default_llseek;
-	return lseek(file, offset, whence);
-}
-
 static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
 {
 	struct proc_dir_entry *pde = PDE(file_inode(file));
 	loff_t rv = -EINVAL;
 
 	if (pde_is_permanent(pde)) {
-		return pde_lseek(pde, file, offset, whence);
+		return pde->proc_ops->proc_lseek(file, offset, whence);
 	} else if (use_pde(pde)) {
-		rv = pde_lseek(pde, file, offset, whence);
+		rv = pde->proc_ops->proc_lseek(file, offset, whence);
 		unuse_pde(pde);
 	}
 	return rv;
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 000cc0533c33..069c7fd95396 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -32,6 +32,7 @@ struct proc_ops {
 	ssize_t	(*proc_read)(struct file *, char __user *, size_t, loff_t *);
 	ssize_t (*proc_read_iter)(struct kiocb *, struct iov_iter *);
 	ssize_t	(*proc_write)(struct file *, const char __user *, size_t, loff_t *);
+	/* mandatory unless nonseekable_open() or equivalent is used */
 	loff_t	(*proc_lseek)(struct file *, loff_t, int);
 	int	(*proc_release)(struct inode *, struct file *);
 	__poll_t (*proc_poll)(struct file *, struct poll_table_struct *);
diff --git a/tools/testing/selftests/proc/read.c b/tools/testing/selftests/proc/read.c
index b3ef9e14d6cc..35ee78dff144 100644
--- a/tools/testing/selftests/proc/read.c
+++ b/tools/testing/selftests/proc/read.c
@@ -14,7 +14,7 @@
  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  */
 // Test
-// 1) read of every file in /proc
+// 1) read and lseek on every file in /proc
 // 2) readlink of every symlink in /proc
 // 3) recursively (1) + (2) for every directory in /proc
 // 4) write to /proc/*/clear_refs and /proc/*/task/*/clear_refs
@@ -45,6 +45,8 @@ static void f_reg(DIR *d, const char *filename)
 	fd = openat(dirfd(d), filename, O_RDONLY|O_NONBLOCK);
 	if (fd == -1)
 		return;
+	/* struct proc_ops::proc_lseek is mandatory if file is seekable. */
+	(void)lseek(fd, 0, SEEK_SET);
 	rv = read(fd, buf, sizeof(buf));
 	assert((0 <= rv && rv <= sizeof(buf)) || rv == -1);
 	close(fd);
-- 
cgit v1.2.3


From 4ee60ec156d91c315d1f62dfc1bc5799dcc6b473 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 6 May 2021 18:02:27 -0700
Subject: include: remove pagemap.h from blkdev.h

My UEK-derived config has 1030 files depending on pagemap.h before this
change.  Afterwards, just 326 files need to be rebuilt when I touch
pagemap.h.  I think blkdev.h is probably included too widely, but
untangling that dependency is harder and this solves my problem.  x86
allmodconfig builds, but there may be implicit include problems on other
architectures.

Link: https://lkml.kernel.org/r/20210309195747.283796-1-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Dan Williams <dan.j.williams@intel.com>		[nvdimm]
Acked-by: Jens Axboe <axboe@kernel.dk>				[block]
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Coly Li <colyli@suse.de>				[bcache]
Acked-by: Martin K. Petersen <martin.petersen@oracle.com>	[scsi]
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 block/blk-settings.c      | 1 +
 drivers/block/brd.c       | 1 +
 drivers/block/loop.c      | 1 +
 drivers/md/bcache/super.c | 1 +
 drivers/nvdimm/btt.c      | 1 +
 drivers/nvdimm/pmem.c     | 1 +
 drivers/scsi/scsicam.c    | 1 +
 include/linux/blkdev.h    | 1 -
 include/linux/swap.h      | 1 +
 9 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 9c009090c4b5..902c40d67120 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -7,6 +7,7 @@
 #include <linux/init.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/pagemap.h>
 #include <linux/gcd.h>
 #include <linux/lcm.h>
 #include <linux/jiffies.h>
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 6e622c1327ee..7562cf30b14e 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -18,6 +18,7 @@
 #include <linux/bio.h>
 #include <linux/highmem.h>
 #include <linux/mutex.h>
+#include <linux/pagemap.h>
 #include <linux/radix-tree.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index a370cde3ddd4..d58d68f3c7cd 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -53,6 +53,7 @@
 #include <linux/moduleparam.h>
 #include <linux/sched.h>
 #include <linux/fs.h>
+#include <linux/pagemap.h>
 #include <linux/file.h>
 #include <linux/stat.h>
 #include <linux/errno.h>
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 2b6d6e9cd680..bea8c4429ae8 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -16,6 +16,7 @@
 #include "features.h"
 
 #include <linux/blkdev.h>
+#include <linux/pagemap.h>
 #include <linux/debugfs.h>
 #include <linux/genhd.h>
 #include <linux/idr.h>
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index 41aa1f01fc07..18a267d5073f 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -6,6 +6,7 @@
 #include <linux/highmem.h>
 #include <linux/debugfs.h>
 #include <linux/blkdev.h>
+#include <linux/pagemap.h>
 #include <linux/module.h>
 #include <linux/device.h>
 #include <linux/mutex.h>
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 7daac795db39..ed10a8b66068 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -8,6 +8,7 @@
  */
 
 #include <linux/blkdev.h>
+#include <linux/pagemap.h>
 #include <linux/hdreg.h>
 #include <linux/init.h>
 #include <linux/platform_device.h>
diff --git a/drivers/scsi/scsicam.c b/drivers/scsi/scsicam.c
index f1553a453616..0ffdb8f2995f 100644
--- a/drivers/scsi/scsicam.c
+++ b/drivers/scsi/scsicam.c
@@ -17,6 +17,7 @@
 #include <linux/genhd.h>
 #include <linux/kernel.h>
 #include <linux/blkdev.h>
+#include <linux/pagemap.h>
 #include <linux/msdos_partition.h>
 #include <asm/unaligned.h>
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b91ba6207365..1255823b2bc0 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -11,7 +11,6 @@
 #include <linux/minmax.h>
 #include <linux/timer.h>
 #include <linux/workqueue.h>
-#include <linux/pagemap.h>
 #include <linux/backing-dev-defs.h>
 #include <linux/wait.h>
 #include <linux/mempool.h>
diff --git a/include/linux/swap.h b/include/linux/swap.h
index f69e0f67651d..144727041e78 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -10,6 +10,7 @@
 #include <linux/sched.h>
 #include <linux/node.h>
 #include <linux/fs.h>
+#include <linux/pagemap.h>
 #include <linux/atomic.h>
 #include <linux/page-flags.h>
 #include <uapi/linux/mempolicy.h>
-- 
cgit v1.2.3


From 08c5188ef40ff82aed559123dc0ab2d2254b1b1c Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 6 May 2021 18:02:30 -0700
Subject: kernel.h: drop inclusion in bitmap.h

The bitmap.h header is used in a lot of code around the kernel.  Besides
that it includes kernel.h which sometimes makes a loop.

The problem here is many unneeded loops that make header hell
dependencies.  For example, how may you move bitmap_zalloc() from C-file
to the header?  Currently it's impossible.  And bitmap.h here is only the
tip of an iceberg.

kernel.h is a dump of everything that even has nothing in common at all.
We may still have it, but in my new code I prefer to include only the
headers that I want to use, without the bulk of unneeded kernel code.

Break the loop by introducing align.h, including it in kernel.h and
bitmap.h followed by replacing kernel.h with limits.h.

Link: https://lkml.kernel.org/r/20210326170347.37441-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Yury Norov <yury.norov@gmail.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/align.h  | 15 +++++++++++++++
 include/linux/bitmap.h |  3 ++-
 include/linux/kernel.h |  9 +--------
 3 files changed, 18 insertions(+), 9 deletions(-)
 create mode 100644 include/linux/align.h

(limited to 'include/linux')

diff --git a/include/linux/align.h b/include/linux/align.h
new file mode 100644
index 000000000000..2b4acec7b95a
--- /dev/null
+++ b/include/linux/align.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_ALIGN_H
+#define _LINUX_ALIGN_H
+
+#include <linux/const.h>
+
+/* @a is a power of 2 value */
+#define ALIGN(x, a)		__ALIGN_KERNEL((x), (a))
+#define ALIGN_DOWN(x, a)	__ALIGN_KERNEL((x) - ((a) - 1), (a))
+#define __ALIGN_MASK(x, mask)	__ALIGN_KERNEL_MASK((x), (mask))
+#define PTR_ALIGN(p, a)		((typeof(p))ALIGN((unsigned long)(p), (a)))
+#define PTR_ALIGN_DOWN(p, a)	((typeof(p))ALIGN_DOWN((unsigned long)(p), (a)))
+#define IS_ALIGNED(x, a)		(((x) & ((typeof(x))(a) - 1)) == 0)
+
+#endif	/* _LINUX_ALIGN_H */
diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 70a932470b2d..6cbcd9d9edd2 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -4,10 +4,11 @@
 
 #ifndef __ASSEMBLY__
 
+#include <linux/align.h>
 #include <linux/types.h>
 #include <linux/bitops.h>
+#include <linux/limits.h>
 #include <linux/string.h>
-#include <linux/kernel.h>
 
 /*
  * bitmaps provide bit arrays that consume one or more unsigned
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 5b7ed6dc99ac..09035ac67d4b 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -3,6 +3,7 @@
 #define _LINUX_KERNEL_H
 
 #include <stdarg.h>
+#include <linux/align.h>
 #include <linux/limits.h>
 #include <linux/linkage.h>
 #include <linux/stddef.h>
@@ -30,14 +31,6 @@
  */
 #define REPEAT_BYTE(x)	((~0ul / 0xff) * (x))
 
-/* @a is a power of 2 value */
-#define ALIGN(x, a)		__ALIGN_KERNEL((x), (a))
-#define ALIGN_DOWN(x, a)	__ALIGN_KERNEL((x) - ((a) - 1), (a))
-#define __ALIGN_MASK(x, mask)	__ALIGN_KERNEL_MASK((x), (mask))
-#define PTR_ALIGN(p, a)		((typeof(p))ALIGN((unsigned long)(p), (a)))
-#define PTR_ALIGN_DOWN(p, a)	((typeof(p))ALIGN_DOWN((unsigned long)(p), (a)))
-#define IS_ALIGNED(x, a)		(((x) & ((typeof(x))(a) - 1)) == 0)
-
 /* generic data direction definitions */
 #define READ			0
 #define WRITE			1
-- 
cgit v1.2.3


From 112dfce8f29798192eb0be8066b54f4a68f4eb36 Mon Sep 17 00:00:00 2001
From: Wan Jiabing <wanjiabing@vivo.com>
Date: Thu, 6 May 2021 18:02:33 -0700
Subject: linux/profile.h: remove unnecessary declaration

Declaring struct pt_regs is unnecessary.  On the one hand, there is no
function using it; on the other hand, struct pt_regs has been declared in
linux/kernel.h.  Remove them.

Link: https://lkml.kernel.org/r/20210401104834.1009157-1-wanjiabing@vivo.com
Signed-off-by: Wan Jiabing <wanjiabing@vivo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/profile.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/profile.h b/include/linux/profile.h
index bad18ca43150..fd18ca96f557 100644
--- a/include/linux/profile.h
+++ b/include/linux/profile.h
@@ -15,7 +15,6 @@
 #define KVM_PROFILING	4
 
 struct proc_dir_entry;
-struct pt_regs;
 struct notifier_block;
 
 #if defined(CONFIG_PROFILING) && defined(CONFIG_PROC_FS)
@@ -84,8 +83,6 @@ int task_handoff_unregister(struct notifier_block * n);
 int profile_event_register(enum profile_type, struct notifier_block * n);
 int profile_event_unregister(enum profile_type, struct notifier_block * n);
 
-struct pt_regs;
-
 #else
 
 #define prof_on 0
-- 
cgit v1.2.3


From 32c93976ac2ee7ecb4b09cc032efe1445d37bd7e Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Thu, 6 May 2021 18:02:39 -0700
Subject: kernel/cred.c: make init_groups static

init_groups is declared in both cred.h and init_task.h, but it is not
actually referenced anywhere outside of cred.c where it is defined.  So
make it static and remove the declarations.

Link: https://lkml.kernel.org/r/20210310220102.2484201-1-linux@rasmusvillemoes.dk
Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cred.h      | 1 -
 include/linux/init_task.h | 1 -
 kernel/cred.c             | 2 +-
 3 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cred.h b/include/linux/cred.h
index ac0e5f97d7d8..14971322e1a0 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -53,7 +53,6 @@ do {							\
 		groups_free(group_info);		\
 } while (0)
 
-extern struct group_info init_groups;
 #ifdef CONFIG_MULTIUSER
 extern struct group_info *groups_alloc(int);
 extern void groups_free(struct group_info *);
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index b2412b4d4c20..40fc5813cf93 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -25,7 +25,6 @@
 extern struct files_struct init_files;
 extern struct fs_struct init_fs;
 extern struct nsproxy init_nsproxy;
-extern struct group_info init_groups;
 extern struct cred init_cred;
 
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
diff --git a/kernel/cred.c b/kernel/cred.c
index 421b1149c651..e1d274cd741b 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -33,7 +33,7 @@ do {									\
 static struct kmem_cache *cred_jar;
 
 /* init to 2 - one for init_task, one to ensure it is never freed */
-struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
+static struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
 
 /*
  * The initial credentials for the initial task
-- 
cgit v1.2.3


From 586eaebea5988302c5a8b018096dd6c6f4564940 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Thu, 6 May 2021 18:02:56 -0700
Subject: lib: extend the scope of small_const_nbits() macro

find_bit would also benefit from small_const_nbits() optimizations.  The
detailed comment is provided by Rasmus Villemoes.

Link: https://lkml.kernel.org/r/20210401003153.97325-6-yury.norov@gmail.com
Suggested-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
Acked-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Acked-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Cc: Alexey Klimov <aklimov@redhat.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: David Sterba <dsterba@suse.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Jianpeng Ma <jianpeng.ma@intel.com>
Cc: Joe Perches <joe@perches.com>
Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Rich Felker <dalias@libc.org>
Cc: Stefano Brivio <sbrivio@redhat.com>
Cc: Wei Yang <richard.weiyang@linux.alibaba.com>
Cc: Wolfram Sang <wsa+renesas@sang-engineering.com>
Cc: Yoshinori Sato <ysato@users.osdn.me>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/asm-generic/bitsperlong.h | 12 ++++++++++++
 include/linux/bitmap.h            |  8 --------
 2 files changed, 12 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-generic/bitsperlong.h b/include/asm-generic/bitsperlong.h
index 3905c1c93dc2..1023e2a4bd37 100644
--- a/include/asm-generic/bitsperlong.h
+++ b/include/asm-generic/bitsperlong.h
@@ -23,4 +23,16 @@
 #define BITS_PER_LONG_LONG 64
 #endif
 
+/*
+ * small_const_nbits(n) is true precisely when it is known at compile-time
+ * that BITMAP_SIZE(n) is 1, i.e. 1 <= n <= BITS_PER_LONG. This allows
+ * various bit/bitmap APIs to provide a fast inline implementation. Bitmaps
+ * of size 0 are very rare, and a compile-time-known-size 0 is most likely
+ * a sign of error. They will be handled correctly by the bit/bitmap APIs,
+ * but using the out-of-line functions, so that the inline implementations
+ * can unconditionally dereference the pointer(s).
+ */
+#define small_const_nbits(nbits) \
+	(__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG && (nbits) > 0)
+
 #endif /* __ASM_GENERIC_BITS_PER_LONG */
diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 6cbcd9d9edd2..f57f4473bbe4 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -223,14 +223,6 @@ extern int bitmap_print_to_pagebuf(bool list, char *buf,
 #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1)))
 #define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))
 
-/*
- * The static inlines below do not handle constant nbits==0 correctly,
- * so make such users (should any ever turn up) call the out-of-line
- * versions.
- */
-#define small_const_nbits(nbits) \
-	(__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG && (nbits) > 0)
-
 static inline void bitmap_zero(unsigned long *dst, unsigned int nbits)
 {
 	unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
-- 
cgit v1.2.3


From 2cc7b6a44ac21d31b398b03f4845c53152070416 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Thu, 6 May 2021 18:03:14 -0700
Subject: lib: add fast path for find_first_*_bit() and find_last_bit()

Similarly to bitmap functions, users would benefit if we handle the case
of small-size bitmaps that fit into a single word.

While here, move the find_last_bit() declaration to bitops/find.h where
other find_*_bit() functions sit.

Link: https://lkml.kernel.org/r/20210401003153.97325-11-yury.norov@gmail.com
Signed-off-by: Yury Norov <yury.norov@gmail.com>
Acked-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Acked-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Cc: Alexey Klimov <aklimov@redhat.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: David Sterba <dsterba@suse.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Jianpeng Ma <jianpeng.ma@intel.com>
Cc: Joe Perches <joe@perches.com>
Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Rich Felker <dalias@libc.org>
Cc: Stefano Brivio <sbrivio@redhat.com>
Cc: Wei Yang <richard.weiyang@linux.alibaba.com>
Cc: Wolfram Sang <wsa+renesas@sang-engineering.com>
Cc: Yoshinori Sato <ysato@users.osdn.me>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/asm-generic/bitops/find.h | 50 +++++++++++++++++++++++++++++++++++----
 include/linux/bitops.h            | 12 ----------
 lib/find_bit.c                    | 12 +++++-----
 3 files changed, 52 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-generic/bitops/find.h b/include/asm-generic/bitops/find.h
index 4148c74a1e4d..0d132ee2a291 100644
--- a/include/asm-generic/bitops/find.h
+++ b/include/asm-generic/bitops/find.h
@@ -5,6 +5,9 @@
 extern unsigned long _find_next_bit(const unsigned long *addr1,
 		const unsigned long *addr2, unsigned long nbits,
 		unsigned long start, unsigned long invert, unsigned long le);
+extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size);
+extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size);
+extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size);
 
 #ifndef find_next_bit
 /**
@@ -102,8 +105,17 @@ unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
  * Returns the bit number of the first set bit.
  * If no bits are set, returns @size.
  */
-extern unsigned long find_first_bit(const unsigned long *addr,
-				    unsigned long size);
+static inline
+unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
+{
+	if (small_const_nbits(size)) {
+		unsigned long val = *addr & GENMASK(size - 1, 0);
+
+		return val ? __ffs(val) : size;
+	}
+
+	return _find_first_bit(addr, size);
+}
 
 /**
  * find_first_zero_bit - find the first cleared bit in a memory region
@@ -113,8 +125,17 @@ extern unsigned long find_first_bit(const unsigned long *addr,
  * Returns the bit number of the first cleared bit.
  * If no bits are zero, returns @size.
  */
-extern unsigned long find_first_zero_bit(const unsigned long *addr,
-					 unsigned long size);
+static inline
+unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
+{
+	if (small_const_nbits(size)) {
+		unsigned long val = *addr | ~GENMASK(size - 1, 0);
+
+		return val == ~0UL ? size : ffz(val);
+	}
+
+	return _find_first_zero_bit(addr, size);
+}
 #else /* CONFIG_GENERIC_FIND_FIRST_BIT */
 
 #ifndef find_first_bit
@@ -126,6 +147,27 @@ extern unsigned long find_first_zero_bit(const unsigned long *addr,
 
 #endif /* CONFIG_GENERIC_FIND_FIRST_BIT */
 
+#ifndef find_last_bit
+/**
+ * find_last_bit - find the last set bit in a memory region
+ * @addr: The address to start the search at
+ * @size: The number of bits to search
+ *
+ * Returns the bit number of the last set bit, or size.
+ */
+static inline
+unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
+{
+	if (small_const_nbits(size)) {
+		unsigned long val = *addr & GENMASK(size - 1, 0);
+
+		return val ? __fls(val) : size;
+	}
+
+	return _find_last_bit(addr, size);
+}
+#endif
+
 /**
  * find_next_clump8 - find next 8-bit clump with set bits in a memory region
  * @clump: location to store copy of found clump
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index a5a48303b0f1..26bf15e6cd35 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -286,17 +286,5 @@ static __always_inline void __assign_bit(long nr, volatile unsigned long *addr,
 })
 #endif
 
-#ifndef find_last_bit
-/**
- * find_last_bit - find the last set bit in a memory region
- * @addr: The address to start the search at
- * @size: The number of bits to search
- *
- * Returns the bit number of the last set bit, or size.
- */
-extern unsigned long find_last_bit(const unsigned long *addr,
-				   unsigned long size);
-#endif
-
 #endif /* __KERNEL__ */
 #endif
diff --git a/lib/find_bit.c b/lib/find_bit.c
index b03a101367f8..0f8e2e369b1d 100644
--- a/lib/find_bit.c
+++ b/lib/find_bit.c
@@ -75,7 +75,7 @@ EXPORT_SYMBOL(_find_next_bit);
 /*
  * Find the first set bit in a memory region.
  */
-unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
+unsigned long _find_first_bit(const unsigned long *addr, unsigned long size)
 {
 	unsigned long idx;
 
@@ -86,14 +86,14 @@ unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
 
 	return size;
 }
-EXPORT_SYMBOL(find_first_bit);
+EXPORT_SYMBOL(_find_first_bit);
 #endif
 
 #ifndef find_first_zero_bit
 /*
  * Find the first cleared bit in a memory region.
  */
-unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
+unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size)
 {
 	unsigned long idx;
 
@@ -104,11 +104,11 @@ unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
 
 	return size;
 }
-EXPORT_SYMBOL(find_first_zero_bit);
+EXPORT_SYMBOL(_find_first_zero_bit);
 #endif
 
 #ifndef find_last_bit
-unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
+unsigned long _find_last_bit(const unsigned long *addr, unsigned long size)
 {
 	if (size) {
 		unsigned long val = BITMAP_LAST_WORD_MASK(size);
@@ -124,7 +124,7 @@ unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
 	}
 	return size;
 }
-EXPORT_SYMBOL(find_last_bit);
+EXPORT_SYMBOL(_find_last_bit);
 #endif
 
 unsigned long find_next_clump8(unsigned long *clump, const unsigned long *addr,
-- 
cgit v1.2.3


From e18baa7cc3598999317d6c2fe255756f6b3b7562 Mon Sep 17 00:00:00 2001
From: Richard Fitzgerald <rf@opensource.cirrus.com>
Date: Thu, 6 May 2021 18:03:37 -0700
Subject: lib: crc8: pointer to data block should be const

crc8() does not change the data passed to it, so the pointer argument
should be declared const.  This avoids callers that receive const data
having to cast it to a non-const pointer to call crc8().

Link: https://lkml.kernel.org/r/20210329122409.3291-1-rf@opensource.cirrus.com
Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/crc8.h | 2 +-
 lib/crc8.c           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/crc8.h b/include/linux/crc8.h
index 13c8dabb0441..674045c59a04 100644
--- a/include/linux/crc8.h
+++ b/include/linux/crc8.h
@@ -96,6 +96,6 @@ void crc8_populate_msb(u8 table[CRC8_TABLE_SIZE], u8 polynomial);
  * Williams, Ross N., ross<at>ross.net
  * (see URL http://www.ross.net/crc/download/crc_v3.txt).
  */
-u8 crc8(const u8 table[CRC8_TABLE_SIZE], u8 *pdata, size_t nbytes, u8 crc);
+u8 crc8(const u8 table[CRC8_TABLE_SIZE], const u8 *pdata, size_t nbytes, u8 crc);
 
 #endif /* __CRC8_H_ */
diff --git a/lib/crc8.c b/lib/crc8.c
index 595a5a75e3cd..1ad8e501d9b6 100644
--- a/lib/crc8.c
+++ b/lib/crc8.c
@@ -71,7 +71,7 @@ EXPORT_SYMBOL(crc8_populate_lsb);
  * @nbytes: number of bytes in data buffer.
  * @crc: previous returned crc8 value.
  */
-u8 crc8(const u8 table[CRC8_TABLE_SIZE], u8 *pdata, size_t nbytes, u8 crc)
+u8 crc8(const u8 table[CRC8_TABLE_SIZE], const u8 *pdata, size_t nbytes, u8 crc)
 {
 	/* loop over the buffer data */
 	while (nbytes-- > 0)
-- 
cgit v1.2.3


From e13d04ec45b07388d3c38c0e18a4d0aa4841b0c3 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Thu, 6 May 2021 18:03:52 -0700
Subject: include/linux/compat.h: remove unneeded declaration from
 COMPAT_SYSCALL_DEFINEx()

compat_sys##name is declared twice, just one line below.

With this removal SYSCALL_DEFINEx() (defined in <linux/syscalls.h>)
and COMPAT_SYSCALL_DEFINEx() look symmetrical.

Link: https://lkml.kernel.org/r/20210223114924.854794-1-masahiroy@kernel.org
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compat.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/compat.h b/include/linux/compat.h
index f0d2dd35d408..cfac4ec9a7df 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -75,7 +75,6 @@
 	__diag_push();								\
 	__diag_ignore(GCC, 8, "-Wattribute-alias",				\
 		      "Type aliasing is used to sanitize syscall arguments");\
-	asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));	\
 	asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))	\
 		__attribute__((alias(__stringify(__se_compat_sys##name))));	\
 	ALLOW_ERROR_INJECTION(compat_sys##name, ERRNO);				\
-- 
cgit v1.2.3


From 6f1f942cd5fbbe308f912fc84e3f10fbc8113a68 Mon Sep 17 00:00:00 2001
From: He Ying <heying24@huawei.com>
Date: Thu, 6 May 2021 18:04:57 -0700
Subject: smp: kernel/panic.c - silence warnings

We found these warnings in kernel/panic.c by using sparse tool:
warning: symbol 'panic_smp_self_stop' was not declared.
warning: symbol 'nmi_panic_self_stop' was not declared.
warning: symbol 'crash_smp_send_stop' was not declared.

To avoid them, add declarations for these three functions in
include/linux/smp.h.

Link: https://lkml.kernel.org/r/20210316084150.75201-1-heying24@huawei.com
Signed-off-by: He Ying <heying24@huawei.com>
Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/smp.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/smp.h b/include/linux/smp.h
index 84a0b4828f66..669e35c03be2 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -55,6 +55,14 @@ void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
 
 int smp_call_function_single_async(int cpu, call_single_data_t *csd);
 
+/*
+ * Cpus stopping functions in panic. All have default weak definitions.
+ * Architecture-dependent code may override them.
+ */
+void panic_smp_self_stop(void);
+void nmi_panic_self_stop(struct pt_regs *regs);
+void crash_smp_send_stop(void);
+
 /*
  * Call a function on all processors
  */
-- 
cgit v1.2.3


From 3d1c7fd97e4c5e54034231cd11319079dfaed60e Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Thu, 6 May 2021 18:05:00 -0700
Subject: delayacct: clear right task's flag after blkio completes

When I was implementing a latency analyzer tool by using task->delays
and other things, I found an issue in delayacct.  The issue is it should
clear the target's flag instead of current's in delayacct_blkio_end().

When I ran git blame on delayacct, I found that some similar issues had
already been fixed in delayacct_blkio_end().

 - Commit c96f5471ce7d ("delayacct: Account blkio completion on the
   correct task") fixed the issue that it should account blkio
   completion on the target task instead of current.

 - Commit b512719f771a ("delayacct: fix crash in delayacct_blkio_end()
   after delayacct init failure") fixed the issue that it should check
   target task's delays instead of the current task's.

It seems that delayacct_blkio_{begin, end} are error prone.

So I introduce a new parameter - the target task 'p' - to these
helpers.  After that change, the callsite will specifically set the right
task, which should make it less error prone.

Link: https://lkml.kernel.org/r/20210414083720.24083-1-laoar.shao@gmail.com
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Josh Snyder <joshs@netflix.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/delayacct.h | 20 ++++++++++----------
 mm/memory.c               |  8 ++++----
 2 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index 2d3bdcccf5eb..21651f946751 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -82,16 +82,16 @@ static inline int delayacct_is_task_waiting_on_io(struct task_struct *p)
 		return 0;
 }
 
-static inline void delayacct_set_flag(int flag)
+static inline void delayacct_set_flag(struct task_struct *p, int flag)
 {
-	if (current->delays)
-		current->delays->flags |= flag;
+	if (p->delays)
+		p->delays->flags |= flag;
 }
 
-static inline void delayacct_clear_flag(int flag)
+static inline void delayacct_clear_flag(struct task_struct *p, int flag)
 {
-	if (current->delays)
-		current->delays->flags &= ~flag;
+	if (p->delays)
+		p->delays->flags &= ~flag;
 }
 
 static inline void delayacct_tsk_init(struct task_struct *tsk)
@@ -114,7 +114,7 @@ static inline void delayacct_tsk_free(struct task_struct *tsk)
 
 static inline void delayacct_blkio_start(void)
 {
-	delayacct_set_flag(DELAYACCT_PF_BLKIO);
+	delayacct_set_flag(current, DELAYACCT_PF_BLKIO);
 	if (current->delays)
 		__delayacct_blkio_start();
 }
@@ -123,7 +123,7 @@ static inline void delayacct_blkio_end(struct task_struct *p)
 {
 	if (p->delays)
 		__delayacct_blkio_end(p);
-	delayacct_clear_flag(DELAYACCT_PF_BLKIO);
+	delayacct_clear_flag(p, DELAYACCT_PF_BLKIO);
 }
 
 static inline int delayacct_add_tsk(struct taskstats *d,
@@ -166,9 +166,9 @@ static inline void delayacct_thrashing_end(void)
 }
 
 #else
-static inline void delayacct_set_flag(int flag)
+static inline void delayacct_set_flag(struct task_struct *p, int flag)
 {}
-static inline void delayacct_clear_flag(int flag)
+static inline void delayacct_clear_flag(struct task_struct *p, int flag)
 {}
 static inline void delayacct_init(void)
 {}
diff --git a/mm/memory.c b/mm/memory.c
index cbdc2cd9cedb..8c491f813687 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3339,7 +3339,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	}
 
 
-	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
+	delayacct_set_flag(current, DELAYACCT_PF_SWAPIN);
 	page = lookup_swap_cache(entry, vma, vmf->address);
 	swapcache = page;
 
@@ -3388,7 +3388,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 					vmf->address, &vmf->ptl);
 			if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
 				ret = VM_FAULT_OOM;
-			delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+			delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
 			goto unlock;
 		}
 
@@ -3402,13 +3402,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 		 * owner processes (which may be unknown at hwpoison time)
 		 */
 		ret = VM_FAULT_HWPOISON;
-		delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+		delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
 		goto out_release;
 	}
 
 	locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
 
-	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+	delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
 	if (!locked) {
 		ret |= VM_FAULT_RETRY;
 		goto out_release;
-- 
cgit v1.2.3


From a065c0faacb1e472cd4e048986407d1b177373a2 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Thu, 6 May 2021 18:05:39 -0700
Subject: kernel/async.c: remove async_unregister_domain()

No callers in the tree.

Link: https://lkml.kernel.org/r/20210309151723.1907838-2-linux@rasmusvillemoes.dk
Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/async.h |  1 -
 kernel/async.c        | 18 ------------------
 2 files changed, 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/async.h b/include/linux/async.h
index 0a17cd27f348..cce4ad31e8fc 100644
--- a/include/linux/async.h
+++ b/include/linux/async.h
@@ -112,7 +112,6 @@ async_schedule_dev_domain(async_func_t func, struct device *dev,
 	return async_schedule_node_domain(func, dev, dev_to_node(dev), domain);
 }
 
-void async_unregister_domain(struct async_domain *domain);
 extern void async_synchronize_full(void);
 extern void async_synchronize_full_domain(struct async_domain *domain);
 extern void async_synchronize_cookie(async_cookie_t cookie);
diff --git a/kernel/async.c b/kernel/async.c
index 4b5971142922..b8d7a663497f 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -245,24 +245,6 @@ void async_synchronize_full(void)
 }
 EXPORT_SYMBOL_GPL(async_synchronize_full);
 
-/**
- * async_unregister_domain - ensure no more anonymous waiters on this domain
- * @domain: idle domain to flush out of any async_synchronize_full instances
- *
- * async_synchronize_{cookie|full}_domain() are not flushed since callers
- * of these routines should know the lifetime of @domain
- *
- * Prefer ASYNC_DOMAIN_EXCLUSIVE() declarations over flushing
- */
-void async_unregister_domain(struct async_domain *domain)
-{
-	spin_lock_irq(&async_lock);
-	WARN_ON(!domain->registered || !list_empty(&domain->pending));
-	domain->registered = 0;
-	spin_unlock_irq(&async_lock);
-}
-EXPORT_SYMBOL_GPL(async_unregister_domain);
-
 /**
  * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
  * @domain: the domain to synchronize
-- 
cgit v1.2.3


From e7cb072eb988e46295512617c39d004f9e1c26f8 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Thu, 6 May 2021 18:05:42 -0700
Subject: init/initramfs.c: do unpacking asynchronously

Patch series "background initramfs unpacking, and CONFIG_MODPROBE_PATH", v3.

These two patches are independent, but better-together.

The second is a rather trivial patch that simply allows the developer to
change "/sbin/modprobe" to something else - e.g.  the empty string, so
that all request_module() during early boot return -ENOENT early, without
even spawning a usermode helper, needlessly synchronizing with the
initramfs unpacking.

The first patch delegates decompressing the initramfs to a worker thread,
allowing do_initcalls() in main.c to proceed to the device_ and late_
initcalls without waiting for that decompression (and populating of
rootfs) to finish.  Obviously, some of those later calls may rely on the
initramfs being available, so I've added synchronization points in the
firmware loader and usermodehelper paths - there might be other places
that would need this, but so far no one has been able to think of any
places I have missed.

There's not much to win if most of the functionality needed during boot is
only available as modules.  But systems with a custom-made .config and
initramfs can boot faster, partly due to utilizing more than one cpu
earlier, partly by avoiding known-futile modprobe calls (which would still
trigger synchronization with the initramfs unpacking, thus eliminating
most of the first benefit).

This patch (of 2):

Most of the boot process doesn't actually need anything from the
initramfs, until of course PID1 is to be executed.  So instead of doing
the decompressing and populating of the initramfs synchronously in
populate_rootfs() itself, push that off to a worker thread.

This is primarily motivated by an embedded ppc target, where unpacking
even the rather modest sized initramfs takes 0.6 seconds, which is long
enough that the external watchdog becomes unhappy that it doesn't get
attention soon enough.  By doing the initramfs decompression in a worker
thread, we get to do the device_initcalls and hence start petting the
watchdog much sooner.

Normal desktops might benefit as well.  On my mostly stock Ubuntu kernel,
my initramfs is a 26M xz-compressed blob, decompressing to around 126M.
That takes almost two seconds:

[    0.201454] Trying to unpack rootfs image as initramfs...
[    1.976633] Freeing initrd memory: 29416K

Before this patch, these lines occur consecutively in dmesg.  With this
patch, the timestamps on these two lines are roughly the same as above, but
with 172 lines in between - so more than one cpu has been kept busy doing
work that would otherwise only happen after the populate_rootfs()
finished.

Should one of the initcalls done after rootfs_initcall time (i.e., device_
and late_ initcalls) need something from the initramfs (say, a kernel
module or a firmware blob), it will simply wait for the initramfs
unpacking to be done before proceeding, which should in theory make this
completely safe.

But if some driver pokes around in the filesystem directly and not via one
of the official kernel interfaces (i.e.  request_firmware*(),
call_usermodehelper*) that theory may not hold - also, I certainly might
have missed a spot when sprinkling wait_for_initramfs().  So there is an
escape hatch in the form of an initramfs_async= command line parameter.

Link: https://lkml.kernel.org/r/20210313212528.2956377-1-linux@rasmusvillemoes.dk
Link: https://lkml.kernel.org/r/20210313212528.2956377-2-linux@rasmusvillemoes.dk
Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Reviewed-by: Luis Chamberlain <mcgrof@kernel.org>
Cc: Jessica Yu <jeyu@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/admin-guide/kernel-parameters.txt | 12 ++++++++
 drivers/base/firmware_loader/main.c             |  2 ++
 include/linux/initrd.h                          |  2 ++
 init/initramfs.c                                | 38 ++++++++++++++++++++++++-
 init/main.c                                     |  1 +
 kernel/umh.c                                    |  2 ++
 6 files changed, 56 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index d93fbc1c1917..7866cc1bd4a9 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1833,6 +1833,18 @@
 			initcall functions.  Useful for debugging built-in
 			modules and initcalls.
 
+	initramfs_async= [KNL]
+			Format: <bool>
+			Default: 1
+			This parameter controls whether the initramfs
+			image is unpacked asynchronously, concurrently
+			with devices being probed and
+			initialized. This should normally just work,
+			but as a debugging aid, one can get the
+			historical behaviour of the initramfs
+			unpacking being completed before device_ and
+			late_ initcalls.
+
 	initrd=		[BOOT] Specify the location of the initial ramdisk
 
 	initrdmem=	[KNL] Specify a physical address and size from which to
diff --git a/drivers/base/firmware_loader/main.c b/drivers/base/firmware_loader/main.c
index 78355095e00d..4fdb8219cd08 100644
--- a/drivers/base/firmware_loader/main.c
+++ b/drivers/base/firmware_loader/main.c
@@ -15,6 +15,7 @@
 #include <linux/kernel_read_file.h>
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/initrd.h>
 #include <linux/timer.h>
 #include <linux/vmalloc.h>
 #include <linux/interrupt.h>
@@ -504,6 +505,7 @@ fw_get_filesystem_firmware(struct device *device, struct fw_priv *fw_priv,
 	if (!path)
 		return -ENOMEM;
 
+	wait_for_initramfs();
 	for (i = 0; i < ARRAY_SIZE(fw_path); i++) {
 		size_t file_size = 0;
 		size_t *file_size_ptr = NULL;
diff --git a/include/linux/initrd.h b/include/linux/initrd.h
index 85c15717af34..1bbe9af48dc3 100644
--- a/include/linux/initrd.h
+++ b/include/linux/initrd.h
@@ -20,8 +20,10 @@ extern void free_initrd_mem(unsigned long, unsigned long);
 
 #ifdef CONFIG_BLK_DEV_INITRD
 extern void __init reserve_initrd_mem(void);
+extern void wait_for_initramfs(void);
 #else
 static inline void __init reserve_initrd_mem(void) {}
+static inline void wait_for_initramfs(void) {}
 #endif
 
 extern phys_addr_t phys_initrd_start;
diff --git a/init/initramfs.c b/init/initramfs.c
index d677e8e717f1..af27abc59643 100644
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/init.h>
+#include <linux/async.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
 #include <linux/types.h>
@@ -541,6 +542,14 @@ static int __init keepinitrd_setup(char *__unused)
 __setup("keepinitrd", keepinitrd_setup);
 #endif
 
+static bool __initdata initramfs_async = true;
+static int __init initramfs_async_setup(char *str)
+{
+	strtobool(str, &initramfs_async);
+	return 1;
+}
+__setup("initramfs_async=", initramfs_async_setup);
+
 extern char __initramfs_start[];
 extern unsigned long __initramfs_size;
 #include <linux/initrd.h>
@@ -658,7 +667,7 @@ static void __init populate_initrd_image(char *err)
 }
 #endif /* CONFIG_BLK_DEV_RAM */
 
-static int __init populate_rootfs(void)
+static void __init do_populate_rootfs(void *unused, async_cookie_t cookie)
 {
 	/* Load the built in initramfs */
 	char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size);
@@ -693,6 +702,33 @@ done:
 	initrd_end = 0;
 
 	flush_delayed_fput();
+}
+
+static ASYNC_DOMAIN_EXCLUSIVE(initramfs_domain);
+static async_cookie_t initramfs_cookie;
+
+void wait_for_initramfs(void)
+{
+	if (!initramfs_cookie) {
+		/*
+		 * Something before rootfs_initcall wants to access
+		 * the filesystem/initramfs. Probably a bug. Make a
+		 * note, avoid deadlocking the machine, and let the
+		 * caller's access fail as it used to.
+		 */
+		pr_warn_once("wait_for_initramfs() called before rootfs_initcalls\n");
+		return;
+	}
+	async_synchronize_cookie_domain(initramfs_cookie + 1, &initramfs_domain);
+}
+EXPORT_SYMBOL_GPL(wait_for_initramfs);
+
+static int __init populate_rootfs(void)
+{
+	initramfs_cookie = async_schedule_domain(do_populate_rootfs, NULL,
+						 &initramfs_domain);
+	if (!initramfs_async)
+		wait_for_initramfs();
 	return 0;
 }
 rootfs_initcall(populate_rootfs);
diff --git a/init/main.c b/init/main.c
index dd11bfd10ead..11d34ccf5786 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1561,6 +1561,7 @@ static noinline void __init kernel_init_freeable(void)
 
 	kunit_run_all_tests();
 
+	wait_for_initramfs();
 	console_on_rootfs();
 
 	/*
diff --git a/kernel/umh.c b/kernel/umh.c
index 3f646613a9d3..61f6b82c354b 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -27,6 +27,7 @@
 #include <linux/ptrace.h>
 #include <linux/async.h>
 #include <linux/uaccess.h>
+#include <linux/initrd.h>
 
 #include <trace/events/module.h>
 
@@ -107,6 +108,7 @@ static int call_usermodehelper_exec_async(void *data)
 
 	commit_creds(new);
 
+	wait_for_initramfs();
 	retval = kernel_execve(sub_info->path,
 			       (const char *const *)sub_info->argv,
 			       (const char *const *)sub_info->envp);
-- 
cgit v1.2.3


From cb152a1a95606aadd81df7a537dde9ef16da4b80 Mon Sep 17 00:00:00 2001
From: Shijie Luo <luoshijie1@huawei.com>
Date: Thu, 6 May 2021 18:05:51 -0700
Subject: mm: fix some typos and code style problems

fix some typos and code style problems in mm.

gfp.h: s/MAXNODES/MAX_NUMNODES
mmzone.h: s/then/than
rmap.c: s/__vma_split()/__vma_adjust()
swap.c: s/__mod_zone_page_stat/__mod_zone_page_state, s/is is/is
swap_state.c: s/whoes/whose
z3fold.c: code style problem fix in z3fold_unregister_migration
zsmalloc.c: s/of/or, s/give/given

Link: https://lkml.kernel.org/r/20210419083057.64820-1-luoshijie1@huawei.com
Signed-off-by: Shijie Luo <luoshijie1@huawei.com>
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h    | 2 +-
 include/linux/mmzone.h | 2 +-
 mm/rmap.c              | 2 +-
 mm/swap.c              | 4 ++--
 mm/swap_state.c        | 2 +-
 mm/z3fold.c            | 2 +-
 mm/zsmalloc.c          | 4 ++--
 7 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 8a5f6c3d7dba..11da8af06704 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -490,7 +490,7 @@ static inline int gfp_zonelist(gfp_t flags)
 
 /*
  * We get the zone list from the current node and the gfp_mask.
- * This zone list contains a maximum of MAXNODES*MAX_NR_ZONES zones.
+ * This zone list contains a maximum of MAX_NUMNODES*MAX_NR_ZONES zones.
  * There are two zonelists per node, one for all zones with memory and
  * one containing just zones from the node the zonelist belongs to.
  *
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 917bd6c604d5..0d53eba1c383 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -55,7 +55,7 @@ enum migratetype {
 	 * pageblocks to MIGRATE_CMA which can be done by
 	 * __free_pageblock_cma() function.  What is important though
 	 * is that a range of pageblocks must be aligned to
-	 * MAX_ORDER_NR_PAGES should biggest page be bigger then
+	 * MAX_ORDER_NR_PAGES should biggest page be bigger than
 	 * a single pageblock.
 	 */
 	MIGRATE_CMA,
diff --git a/mm/rmap.c b/mm/rmap.c
index b0fc27e77d6d..693a610e181d 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -257,7 +257,7 @@ static inline void unlock_anon_vma_root(struct anon_vma *root)
  * Attach the anon_vmas from src to dst.
  * Returns 0 on success, -ENOMEM on failure.
  *
- * anon_vma_clone() is called by __vma_split(), __split_vma(), copy_vma() and
+ * anon_vma_clone() is called by __vma_adjust(), __split_vma(), copy_vma() and
  * anon_vma_fork(). The first three want an exact copy of src, while the last
  * one, anon_vma_fork(), may try to reuse an existing anon_vma to prevent
  * endless growth of anon_vma. Since dst->anon_vma is set to NULL before call,
diff --git a/mm/swap.c b/mm/swap.c
index a75a8265302b..dfb48cf9c2c9 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -496,7 +496,7 @@ void lru_cache_add_inactive_or_unevictable(struct page *page,
 	if (unlikely(unevictable) && !TestSetPageMlocked(page)) {
 		int nr_pages = thp_nr_pages(page);
 		/*
-		 * We use the irq-unsafe __mod_zone_page_stat because this
+		 * We use the irq-unsafe __mod_zone_page_state because this
 		 * counter is not modified from interrupt context, and the pte
 		 * lock is held(spinlock), which implies preemption disabled.
 		 */
@@ -808,7 +808,7 @@ inline void __lru_add_drain_all(bool force_all_cpus)
 	 * below which drains the page vectors.
 	 *
 	 * Let x, y, and z represent some system CPU numbers, where x < y < z.
-	 * Assume CPU #z is is in the middle of the for_each_online_cpu loop
+	 * Assume CPU #z is in the middle of the for_each_online_cpu loop
 	 * below and has already reached CPU #y's per-cpu data. CPU #x comes
 	 * along, adds some pages to its per-cpu vectors, then calls
 	 * lru_add_drain_all().
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 3a1259c13f3b..272ea2108c9d 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -792,7 +792,7 @@ static void swap_ra_info(struct vm_fault *vmf,
  *
  * Returns the struct page for entry and addr, after queueing swapin.
  *
- * Primitive swap readahead code. We simply read in a few pages whoes
+ * Primitive swap readahead code. We simply read in a few pages whose
  * virtual addresses are around the fault address in the same vma.
  *
  * Caller must hold read mmap_lock if vmf->vma is not NULL.
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 9d889ad2bb86..7fe7adaaad01 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -391,7 +391,7 @@ static void z3fold_unregister_migration(struct z3fold_pool *pool)
 {
 	if (pool->inode)
 		iput(pool->inode);
- }
+}
 
 /* Initializes the z3fold header of a newly allocated z3fold page */
 static struct z3fold_header *init_z3fold_page(struct page *page, bool headless,
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 58697f7a43f8..5004c176b045 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -61,7 +61,7 @@
 #define ZSPAGE_MAGIC	0x58
 
 /*
- * This must be power of 2 and greater than of equal to sizeof(link_free).
+ * This must be power of 2 and greater than or equal to sizeof(link_free).
  * These two conditions ensure that any 'struct link_free' itself doesn't
  * span more than 1 page which avoids complex case of mapping 2 pages simply
  * to restore link_free pointer values.
@@ -530,7 +530,7 @@ static void set_zspage_mapping(struct zspage *zspage,
  * class maintains a list of zspages where each zspage is divided
  * into equal sized chunks. Each allocation falls into one of these
  * classes depending on its size. This function returns index of the
- * size class which has chunk size big enough to hold the give size.
+ * size class which has chunk size big enough to hold the given size.
  */
 static int get_size_class_index(int size)
 {
-- 
cgit v1.2.3


From bbcd53c960713507ae764bf81970651b5577b95a Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 6 May 2021 18:05:55 -0700
Subject: drivers/char: remove /dev/kmem for good

Patch series "drivers/char: remove /dev/kmem for good".

Exploring /dev/kmem and /dev/mem in the context of memory hot(un)plug and
memory ballooning, I started questioning the existence of /dev/kmem.

Comparing it with the /proc/kcore implementation, it does not seem to be
able to deal with things like

a) Pages unmapped from the direct mapping (e.g., to be used by secretmem)
  -> kern_addr_valid(). virt_addr_valid() is not sufficient.

b) Special cases like gart aperture memory that is not to be touched
  -> mem_pfn_is_ram()

Unless I am missing something, it's at least broken in some cases and might
fault/crash the machine.

Looks like its existence has been questioned before in 2005 and 2010 [1],
after ~11 additional years, it might make sense to revive the discussion.

CONFIG_DEVKMEM is only enabled in a single defconfig (on purpose or by
mistake?).  All distributions disable it: in Ubuntu it has been disabled
for more than 10 years, in Debian since 2.6.31, in Fedora at least
starting with FC3, in RHEL starting with RHEL4, in SUSE starting from
15sp2, and OpenSUSE has it disabled as well.

1) /dev/kmem was popular for rootkits [2] before it got disabled
   basically everywhere. Ubuntu documents [3] "There is no modern user of
   /dev/kmem any more beyond attackers using it to load kernel rootkits.".
   RHEL documents in a BZ [5] "it served no practical purpose other than to
   serve as a potential security problem or to enable binary module drivers
   to access structures/functions they shouldn't be touching"

2) /proc/kcore is a decent interface to have a controlled way to read
   kernel memory for debugging purposes. (will need some extensions to
   deal with memory offlining/unplug, memory ballooning, and poisoned
   pages, though)

3) It might be useful for corner case debugging [1]. KDB/KGDB might be a
   better fit, especially, to write random memory; harder to shoot
   yourself in the foot.

4) "Kernel Memory Editor" [4] hasn't seen any updates since 2000 and seems
   to be incompatible with 64bit [1]. For educational purposes,
   /proc/kcore might be used to monitor value updates -- or older
   kernels can be used.

5) It's broken on arm64, and therefore, completely disabled there.

Looks like it's essentially unused and has been replaced by better
suited interfaces for individual tasks (/proc/kcore, KDB/KGDB). Let's
just remove it.

[1] https://lwn.net/Articles/147901/
[2] https://www.linuxjournal.com/article/10505
[3] https://wiki.ubuntu.com/Security/Features#A.2Fdev.2Fkmem_disabled
[4] https://sourceforge.net/projects/kme/
[5] https://bugzilla.redhat.com/show_bug.cgi?id=154796

Link: https://lkml.kernel.org/r/20210324102351.6932-1-david@redhat.com
Link: https://lkml.kernel.org/r/20210324102351.6932-2-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Alexander A. Klimov" <grandmaster@al2klimov.de>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alexandre Belloni <alexandre.belloni@bootlin.com>
Cc: Andrew Lunn <andrew@lunn.ch>
Cc: Andrey Zhizhikin <andrey.zhizhikin@leica-geosystems.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Brian Cain <bcain@codeaurora.org>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Chris Zankel <chris@zankel.net>
Cc: Corentin Labbe <clabbe@baylibre.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Greentime Hu <green.hu@gmail.com>
Cc: Gregory Clement <gregory.clement@bootlin.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Helge Deller <deller@gmx.de>
Cc: Hillf Danton <hdanton@sina.com>
Cc: huang ying <huang.ying.caritas@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com>
Cc: James Troup <james.troup@canonical.com>
Cc: Jiaxun Yang <jiaxun.yang@flygoat.com>
Cc: Jonas Bonn <jonas@southpole.se>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kairui Song <kasong@redhat.com>
Cc: Krzysztof Kozlowski <krzk@kernel.org>
Cc: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Cc: Liviu Dudau <liviu.dudau@arm.com>
Cc: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Cc: Luc Van Oostenryck <luc.vanoostenryck@gmail.com>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Mikulas Patocka <mpatocka@redhat.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Niklas Schnelle <schnelle@linux.ibm.com>
Cc: Oleksiy Avramchenko <oleksiy.avramchenko@sonymobile.com>
Cc: openrisc@lists.librecores.org
Cc: Palmer Dabbelt <palmerdabbelt@google.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Pavel Machek (CIP)" <pavel@denx.de>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Cc: Pierre Morel <pmorel@linux.ibm.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Rich Felker <dalias@libc.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Rob Herring <robh@kernel.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
Cc: sparclinux@vger.kernel.org
Cc: Stafford Horne <shorne@gmail.com>
Cc: Stefan Kristiansson <stefan.kristiansson@saunalahti.fi>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Cc: Theodore Dubois <tblodt@icloud.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: William Cohen <wcohen@redhat.com>
Cc: Xiaoming Ni <nixiaoming@huawei.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/admin-guide/devices.txt     |   2 +-
 arch/arm/configs/dove_defconfig           |   1 -
 arch/arm/configs/magician_defconfig       |   1 -
 arch/arm/configs/moxart_defconfig         |   1 -
 arch/arm/configs/mps2_defconfig           |   1 -
 arch/arm/configs/mvebu_v5_defconfig       |   1 -
 arch/arm/configs/xcep_defconfig           |   1 -
 arch/hexagon/configs/comet_defconfig      |   1 -
 arch/m68k/configs/amcore_defconfig        |   1 -
 arch/openrisc/configs/or1ksim_defconfig   |   1 -
 arch/sh/configs/edosk7705_defconfig       |   1 -
 arch/sh/configs/se7206_defconfig          |   1 -
 arch/sh/configs/sh2007_defconfig          |   1 -
 arch/sh/configs/sh7724_generic_defconfig  |   1 -
 arch/sh/configs/sh7770_generic_defconfig  |   1 -
 arch/sh/configs/sh7785lcr_32bit_defconfig |   1 -
 arch/sparc/configs/sparc64_defconfig      |   1 -
 arch/xtensa/configs/xip_kc705_defconfig   |   1 -
 drivers/char/Kconfig                      |  10 --
 drivers/char/mem.c                        | 231 ------------------------------
 include/linux/fs.h                        |   2 +-
 include/linux/vmalloc.h                   |   2 +-
 kernel/configs/android-base.config        |   1 -
 mm/ksm.c                                  |   2 +-
 mm/vmalloc.c                              |   2 +-
 25 files changed, 5 insertions(+), 264 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/devices.txt b/Documentation/admin-guide/devices.txt
index ef41f77cb979..9c2be821c225 100644
--- a/Documentation/admin-guide/devices.txt
+++ b/Documentation/admin-guide/devices.txt
@@ -4,7 +4,7 @@
 
    1 char	Memory devices
 		  1 = /dev/mem		Physical memory access
-		  2 = /dev/kmem		Kernel virtual memory access
+		  2 = /dev/kmem		OBSOLETE - replaced by /proc/kcore
 		  3 = /dev/null		Null device
 		  4 = /dev/port		I/O port access
 		  5 = /dev/zero		Null byte source
diff --git a/arch/arm/configs/dove_defconfig b/arch/arm/configs/dove_defconfig
index e70c997d5f4c..b935162a8bba 100644
--- a/arch/arm/configs/dove_defconfig
+++ b/arch/arm/configs/dove_defconfig
@@ -63,7 +63,6 @@ CONFIG_INPUT_EVDEV=y
 # CONFIG_MOUSE_PS2 is not set
 # CONFIG_SERIO is not set
 CONFIG_LEGACY_PTY_COUNT=16
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_8250=y
 CONFIG_SERIAL_8250_CONSOLE=y
 CONFIG_SERIAL_8250_RUNTIME_UARTS=2
diff --git a/arch/arm/configs/magician_defconfig b/arch/arm/configs/magician_defconfig
index b4670d42f378..abde1fb23b20 100644
--- a/arch/arm/configs/magician_defconfig
+++ b/arch/arm/configs/magician_defconfig
@@ -72,7 +72,6 @@ CONFIG_INPUT_TOUCHSCREEN=y
 CONFIG_INPUT_MISC=y
 CONFIG_INPUT_UINPUT=m
 # CONFIG_SERIO is not set
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_PXA=y
 # CONFIG_LEGACY_PTYS is not set
 # CONFIG_HW_RANDOM is not set
diff --git a/arch/arm/configs/moxart_defconfig b/arch/arm/configs/moxart_defconfig
index 6834e97af348..eacc089d86c5 100644
--- a/arch/arm/configs/moxart_defconfig
+++ b/arch/arm/configs/moxart_defconfig
@@ -79,7 +79,6 @@ CONFIG_INPUT_EVBUG=y
 # CONFIG_SERIO is not set
 # CONFIG_VT is not set
 # CONFIG_LEGACY_PTYS is not set
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_8250=y
 CONFIG_SERIAL_8250_CONSOLE=y
 CONFIG_SERIAL_8250_NR_UARTS=1
diff --git a/arch/arm/configs/mps2_defconfig b/arch/arm/configs/mps2_defconfig
index 1d923dbb9928..89f4a6ff30bd 100644
--- a/arch/arm/configs/mps2_defconfig
+++ b/arch/arm/configs/mps2_defconfig
@@ -69,7 +69,6 @@ CONFIG_SMSC911X=y
 # CONFIG_VT is not set
 # CONFIG_LEGACY_PTYS is not set
 CONFIG_SERIAL_NONSTANDARD=y
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_MPS2_UART_CONSOLE=y
 CONFIG_SERIAL_MPS2_UART=y
 # CONFIG_HW_RANDOM is not set
diff --git a/arch/arm/configs/mvebu_v5_defconfig b/arch/arm/configs/mvebu_v5_defconfig
index 4f16716bfc32..d57ff30dabff 100644
--- a/arch/arm/configs/mvebu_v5_defconfig
+++ b/arch/arm/configs/mvebu_v5_defconfig
@@ -100,7 +100,6 @@ CONFIG_INPUT_EVDEV=y
 CONFIG_KEYBOARD_GPIO=y
 # CONFIG_INPUT_MOUSE is not set
 CONFIG_LEGACY_PTY_COUNT=16
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_8250=y
 CONFIG_SERIAL_8250_CONSOLE=y
 CONFIG_SERIAL_8250_RUNTIME_UARTS=2
diff --git a/arch/arm/configs/xcep_defconfig b/arch/arm/configs/xcep_defconfig
index f1fbdfc5c8c6..4d8e7f2eaef7 100644
--- a/arch/arm/configs/xcep_defconfig
+++ b/arch/arm/configs/xcep_defconfig
@@ -53,7 +53,6 @@ CONFIG_NET_ETHERNET=y
 # CONFIG_INPUT_KEYBOARD is not set
 # CONFIG_INPUT_MOUSE is not set
 # CONFIG_SERIO is not set
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_PXA=y
 CONFIG_SERIAL_PXA_CONSOLE=y
 # CONFIG_LEGACY_PTYS is not set
diff --git a/arch/hexagon/configs/comet_defconfig b/arch/hexagon/configs/comet_defconfig
index f19ae2ab0aaa..c5a214716a38 100644
--- a/arch/hexagon/configs/comet_defconfig
+++ b/arch/hexagon/configs/comet_defconfig
@@ -34,7 +34,6 @@ CONFIG_NET_ETHERNET=y
 # CONFIG_SERIO is not set
 # CONFIG_CONSOLE_TRANSLATIONS is not set
 CONFIG_LEGACY_PTY_COUNT=64
-# CONFIG_DEVKMEM is not set
 # CONFIG_HW_RANDOM is not set
 CONFIG_SPI=y
 CONFIG_SPI_DEBUG=y
diff --git a/arch/m68k/configs/amcore_defconfig b/arch/m68k/configs/amcore_defconfig
index 3a84f24d41c8..6d9ed2198170 100644
--- a/arch/m68k/configs/amcore_defconfig
+++ b/arch/m68k/configs/amcore_defconfig
@@ -60,7 +60,6 @@ CONFIG_DM9000=y
 # CONFIG_VT is not set
 # CONFIG_UNIX98_PTYS is not set
 # CONFIG_DEVMEM is not set
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_MCF=y
 CONFIG_SERIAL_MCF_BAUDRATE=115200
 CONFIG_SERIAL_MCF_CONSOLE=y
diff --git a/arch/openrisc/configs/or1ksim_defconfig b/arch/openrisc/configs/or1ksim_defconfig
index 75f2da324d0e..6e1e004047c7 100644
--- a/arch/openrisc/configs/or1ksim_defconfig
+++ b/arch/openrisc/configs/or1ksim_defconfig
@@ -43,7 +43,6 @@ CONFIG_MICREL_PHY=y
 # CONFIG_SERIO is not set
 # CONFIG_VT is not set
 # CONFIG_LEGACY_PTYS is not set
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_8250=y
 CONFIG_SERIAL_8250_CONSOLE=y
 CONFIG_SERIAL_OF_PLATFORM=y
diff --git a/arch/sh/configs/edosk7705_defconfig b/arch/sh/configs/edosk7705_defconfig
index ef7cc31997b1..9ee35269bee2 100644
--- a/arch/sh/configs/edosk7705_defconfig
+++ b/arch/sh/configs/edosk7705_defconfig
@@ -23,7 +23,6 @@ CONFIG_SH_PCLK_FREQ=31250000
 # CONFIG_INPUT is not set
 # CONFIG_SERIO is not set
 # CONFIG_VT is not set
-# CONFIG_DEVKMEM is not set
 # CONFIG_UNIX98_PTYS is not set
 # CONFIG_LEGACY_PTYS is not set
 # CONFIG_HW_RANDOM is not set
diff --git a/arch/sh/configs/se7206_defconfig b/arch/sh/configs/se7206_defconfig
index 315b04a8dd2f..601d062250d1 100644
--- a/arch/sh/configs/se7206_defconfig
+++ b/arch/sh/configs/se7206_defconfig
@@ -71,7 +71,6 @@ CONFIG_SMC91X=y
 # CONFIG_INPUT is not set
 # CONFIG_SERIO is not set
 # CONFIG_VT is not set
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_SH_SCI=y
 CONFIG_SERIAL_SH_SCI_NR_UARTS=4
 CONFIG_SERIAL_SH_SCI_CONSOLE=y
diff --git a/arch/sh/configs/sh2007_defconfig b/arch/sh/configs/sh2007_defconfig
index 99975db461d8..79f02f1c0dc8 100644
--- a/arch/sh/configs/sh2007_defconfig
+++ b/arch/sh/configs/sh2007_defconfig
@@ -75,7 +75,6 @@ CONFIG_INPUT_FF_MEMLESS=y
 # CONFIG_INPUT_MOUSE is not set
 # CONFIG_SERIO is not set
 CONFIG_VT_HW_CONSOLE_BINDING=y
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_SH_SCI=y
 CONFIG_SERIAL_SH_SCI_CONSOLE=y
 # CONFIG_LEGACY_PTYS is not set
diff --git a/arch/sh/configs/sh7724_generic_defconfig b/arch/sh/configs/sh7724_generic_defconfig
index 2c46c0004780..cbc9389a89a8 100644
--- a/arch/sh/configs/sh7724_generic_defconfig
+++ b/arch/sh/configs/sh7724_generic_defconfig
@@ -18,7 +18,6 @@ CONFIG_CPU_IDLE=y
 # CONFIG_INPUT is not set
 # CONFIG_SERIO is not set
 # CONFIG_VT is not set
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_SH_SCI=y
 CONFIG_SERIAL_SH_SCI_NR_UARTS=6
 CONFIG_SERIAL_SH_SCI_CONSOLE=y
diff --git a/arch/sh/configs/sh7770_generic_defconfig b/arch/sh/configs/sh7770_generic_defconfig
index 88193153e51b..ee2357deba0f 100644
--- a/arch/sh/configs/sh7770_generic_defconfig
+++ b/arch/sh/configs/sh7770_generic_defconfig
@@ -20,7 +20,6 @@ CONFIG_CPU_IDLE=y
 # CONFIG_INPUT is not set
 # CONFIG_SERIO is not set
 # CONFIG_VT is not set
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_SH_SCI=y
 CONFIG_SERIAL_SH_SCI_NR_UARTS=6
 CONFIG_SERIAL_SH_SCI_CONSOLE=y
diff --git a/arch/sh/configs/sh7785lcr_32bit_defconfig b/arch/sh/configs/sh7785lcr_32bit_defconfig
index 9b885c14c400..5c725c75fcef 100644
--- a/arch/sh/configs/sh7785lcr_32bit_defconfig
+++ b/arch/sh/configs/sh7785lcr_32bit_defconfig
@@ -66,7 +66,6 @@ CONFIG_INPUT_FF_MEMLESS=m
 CONFIG_INPUT_EVDEV=y
 CONFIG_INPUT_EVBUG=m
 CONFIG_VT_HW_CONSOLE_BINDING=y
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_SH_SCI=y
 CONFIG_SERIAL_SH_SCI_NR_UARTS=6
 CONFIG_SERIAL_SH_SCI_CONSOLE=y
diff --git a/arch/sparc/configs/sparc64_defconfig b/arch/sparc/configs/sparc64_defconfig
index 12a4fb0bd52a..18099099583e 100644
--- a/arch/sparc/configs/sparc64_defconfig
+++ b/arch/sparc/configs/sparc64_defconfig
@@ -122,7 +122,6 @@ CONFIG_INPUT_SPARCSPKR=y
 # CONFIG_SERIO_SERPORT is not set
 CONFIG_SERIO_PCIPS2=m
 CONFIG_SERIO_RAW=m
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_SUNSU=y
 CONFIG_SERIAL_SUNSU_CONSOLE=y
 CONFIG_SERIAL_SUNSAB=y
diff --git a/arch/xtensa/configs/xip_kc705_defconfig b/arch/xtensa/configs/xip_kc705_defconfig
index 4f1ff9531f6a..062148e17135 100644
--- a/arch/xtensa/configs/xip_kc705_defconfig
+++ b/arch/xtensa/configs/xip_kc705_defconfig
@@ -72,7 +72,6 @@ CONFIG_MARVELL_PHY=y
 # CONFIG_INPUT_KEYBOARD is not set
 # CONFIG_INPUT_MOUSE is not set
 # CONFIG_SERIO is not set
-CONFIG_DEVKMEM=y
 CONFIG_SERIAL_8250=y
 # CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set
 CONFIG_SERIAL_8250_CONSOLE=y
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index d229a2d0c017..b151e0fcdeb5 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -334,16 +334,6 @@ config DEVMEM
 	  memory.
 	  When in doubt, say "Y".
 
-config DEVKMEM
-	bool "/dev/kmem virtual device support"
-	# On arm64, VMALLOC_START < PAGE_OFFSET, which confuses kmem read/write
-	depends on !ARM64
-	help
-	  Say Y here if you want to support the /dev/kmem device. The
-	  /dev/kmem device is rarely used, but can be used for certain
-	  kind of kernel debugging operations.
-	  When in doubt, say "N".
-
 config NVRAM
 	tristate "/dev/nvram support"
 	depends on X86 || HAVE_ARCH_NVRAM_OPS
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 869b9f5e8e03..15dc54fa1d47 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -403,221 +403,6 @@ static int mmap_mem(struct file *file, struct vm_area_struct *vma)
 	return 0;
 }
 
-static int mmap_kmem(struct file *file, struct vm_area_struct *vma)
-{
-	unsigned long pfn;
-
-	/* Turn a kernel-virtual address into a physical page frame */
-	pfn = __pa((u64)vma->vm_pgoff << PAGE_SHIFT) >> PAGE_SHIFT;
-
-	/*
-	 * RED-PEN: on some architectures there is more mapped memory than
-	 * available in mem_map which pfn_valid checks for. Perhaps should add a
-	 * new macro here.
-	 *
-	 * RED-PEN: vmalloc is not supported right now.
-	 */
-	if (!pfn_valid(pfn))
-		return -EIO;
-
-	vma->vm_pgoff = pfn;
-	return mmap_mem(file, vma);
-}
-
-/*
- * This function reads the *virtual* memory as seen by the kernel.
- */
-static ssize_t read_kmem(struct file *file, char __user *buf,
-			 size_t count, loff_t *ppos)
-{
-	unsigned long p = *ppos;
-	ssize_t low_count, read, sz;
-	char *kbuf; /* k-addr because vread() takes vmlist_lock rwlock */
-	int err = 0;
-
-	read = 0;
-	if (p < (unsigned long) high_memory) {
-		low_count = count;
-		if (count > (unsigned long)high_memory - p)
-			low_count = (unsigned long)high_memory - p;
-
-#ifdef __ARCH_HAS_NO_PAGE_ZERO_MAPPED
-		/* we don't have page 0 mapped on sparc and m68k.. */
-		if (p < PAGE_SIZE && low_count > 0) {
-			sz = size_inside_page(p, low_count);
-			if (clear_user(buf, sz))
-				return -EFAULT;
-			buf += sz;
-			p += sz;
-			read += sz;
-			low_count -= sz;
-			count -= sz;
-		}
-#endif
-		while (low_count > 0) {
-			sz = size_inside_page(p, low_count);
-
-			/*
-			 * On ia64 if a page has been mapped somewhere as
-			 * uncached, then it must also be accessed uncached
-			 * by the kernel or data corruption may occur
-			 */
-			kbuf = xlate_dev_kmem_ptr((void *)p);
-			if (!virt_addr_valid(kbuf))
-				return -ENXIO;
-
-			if (copy_to_user(buf, kbuf, sz))
-				return -EFAULT;
-			buf += sz;
-			p += sz;
-			read += sz;
-			low_count -= sz;
-			count -= sz;
-			if (should_stop_iteration()) {
-				count = 0;
-				break;
-			}
-		}
-	}
-
-	if (count > 0) {
-		kbuf = (char *)__get_free_page(GFP_KERNEL);
-		if (!kbuf)
-			return -ENOMEM;
-		while (count > 0) {
-			sz = size_inside_page(p, count);
-			if (!is_vmalloc_or_module_addr((void *)p)) {
-				err = -ENXIO;
-				break;
-			}
-			sz = vread(kbuf, (char *)p, sz);
-			if (!sz)
-				break;
-			if (copy_to_user(buf, kbuf, sz)) {
-				err = -EFAULT;
-				break;
-			}
-			count -= sz;
-			buf += sz;
-			read += sz;
-			p += sz;
-			if (should_stop_iteration())
-				break;
-		}
-		free_page((unsigned long)kbuf);
-	}
-	*ppos = p;
-	return read ? read : err;
-}
-
-
-static ssize_t do_write_kmem(unsigned long p, const char __user *buf,
-				size_t count, loff_t *ppos)
-{
-	ssize_t written, sz;
-	unsigned long copied;
-
-	written = 0;
-#ifdef __ARCH_HAS_NO_PAGE_ZERO_MAPPED
-	/* we don't have page 0 mapped on sparc and m68k.. */
-	if (p < PAGE_SIZE) {
-		sz = size_inside_page(p, count);
-		/* Hmm. Do something? */
-		buf += sz;
-		p += sz;
-		count -= sz;
-		written += sz;
-	}
-#endif
-
-	while (count > 0) {
-		void *ptr;
-
-		sz = size_inside_page(p, count);
-
-		/*
-		 * On ia64 if a page has been mapped somewhere as uncached, then
-		 * it must also be accessed uncached by the kernel or data
-		 * corruption may occur.
-		 */
-		ptr = xlate_dev_kmem_ptr((void *)p);
-		if (!virt_addr_valid(ptr))
-			return -ENXIO;
-
-		copied = copy_from_user(ptr, buf, sz);
-		if (copied) {
-			written += sz - copied;
-			if (written)
-				break;
-			return -EFAULT;
-		}
-		buf += sz;
-		p += sz;
-		count -= sz;
-		written += sz;
-		if (should_stop_iteration())
-			break;
-	}
-
-	*ppos += written;
-	return written;
-}
-
-/*
- * This function writes to the *virtual* memory as seen by the kernel.
- */
-static ssize_t write_kmem(struct file *file, const char __user *buf,
-			  size_t count, loff_t *ppos)
-{
-	unsigned long p = *ppos;
-	ssize_t wrote = 0;
-	ssize_t virtr = 0;
-	char *kbuf; /* k-addr because vwrite() takes vmlist_lock rwlock */
-	int err = 0;
-
-	if (p < (unsigned long) high_memory) {
-		unsigned long to_write = min_t(unsigned long, count,
-					       (unsigned long)high_memory - p);
-		wrote = do_write_kmem(p, buf, to_write, ppos);
-		if (wrote != to_write)
-			return wrote;
-		p += wrote;
-		buf += wrote;
-		count -= wrote;
-	}
-
-	if (count > 0) {
-		kbuf = (char *)__get_free_page(GFP_KERNEL);
-		if (!kbuf)
-			return wrote ? wrote : -ENOMEM;
-		while (count > 0) {
-			unsigned long sz = size_inside_page(p, count);
-			unsigned long n;
-
-			if (!is_vmalloc_or_module_addr((void *)p)) {
-				err = -ENXIO;
-				break;
-			}
-			n = copy_from_user(kbuf, buf, sz);
-			if (n) {
-				err = -EFAULT;
-				break;
-			}
-			vwrite(kbuf, (char *)p, sz);
-			count -= sz;
-			buf += sz;
-			virtr += sz;
-			p += sz;
-			if (should_stop_iteration())
-				break;
-		}
-		free_page((unsigned long)kbuf);
-	}
-
-	*ppos = p;
-	return virtr + wrote ? : err;
-}
-
 static ssize_t read_port(struct file *file, char __user *buf,
 			 size_t count, loff_t *ppos)
 {
@@ -855,7 +640,6 @@ static int open_port(struct inode *inode, struct file *filp)
 #define write_zero	write_null
 #define write_iter_zero	write_iter_null
 #define open_mem	open_port
-#define open_kmem	open_mem
 
 static const struct file_operations __maybe_unused mem_fops = {
 	.llseek		= memory_lseek,
@@ -869,18 +653,6 @@ static const struct file_operations __maybe_unused mem_fops = {
 #endif
 };
 
-static const struct file_operations __maybe_unused kmem_fops = {
-	.llseek		= memory_lseek,
-	.read		= read_kmem,
-	.write		= write_kmem,
-	.mmap		= mmap_kmem,
-	.open		= open_kmem,
-#ifndef CONFIG_MMU
-	.get_unmapped_area = get_unmapped_area_mem,
-	.mmap_capabilities = memory_mmap_capabilities,
-#endif
-};
-
 static const struct file_operations null_fops = {
 	.llseek		= null_lseek,
 	.read		= read_null,
@@ -924,9 +696,6 @@ static const struct memdev {
 } devlist[] = {
 #ifdef CONFIG_DEVMEM
 	 [DEVMEM_MINOR] = { "mem", 0, &mem_fops, FMODE_UNSIGNED_OFFSET },
-#endif
-#ifdef CONFIG_DEVKMEM
-	 [2] = { "kmem", 0, &kmem_fops, FMODE_UNSIGNED_OFFSET },
 #endif
 	 [3] = { "null", 0666, &null_fops, 0 },
 #ifdef CONFIG_DEVPORT
diff --git a/include/linux/fs.h b/include/linux/fs.h
index acef282b97c6..c3c88fdb9b2a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -145,7 +145,7 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 /* Expect random access pattern */
 #define FMODE_RANDOM		((__force fmode_t)0x1000)
 
-/* File is huge (eg. /dev/kmem): treat loff_t as unsigned */
+/* File is huge (eg. /dev/mem): treat loff_t as unsigned */
 #define FMODE_UNSIGNED_OFFSET	((__force fmode_t)0x2000)
 
 /* File is opened with O_PATH; almost nothing can be done with it */
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 394d03cc0e92..f31ba59fb1ef 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -227,7 +227,7 @@ static inline void set_vm_flush_reset_perms(void *addr)
 }
 #endif
 
-/* for /dev/kmem */
+/* for /proc/kcore */
 extern long vread(char *buf, char *addr, unsigned long count);
 extern long vwrite(char *buf, char *addr, unsigned long count);
 
diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config
index d3fd428f4b92..eb701b2ac72f 100644
--- a/kernel/configs/android-base.config
+++ b/kernel/configs/android-base.config
@@ -1,5 +1,4 @@
 #  KEEP ALPHABETICALLY SORTED
-# CONFIG_DEVKMEM is not set
 # CONFIG_DEVMEM is not set
 # CONFIG_FHANDLE is not set
 # CONFIG_INET_LRO is not set
diff --git a/mm/ksm.c b/mm/ksm.c
index b321a67ebaa9..b7cbcc7d4977 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -459,7 +459,7 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
  * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
  * in case the application has unmapped and remapped mm,addr meanwhile.
  * Could a ksm page appear anywhere else?  Actually yes, in a VM_PFNMAP
- * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
+ * mmap of /dev/mem, where we would not want to touch it.
  *
  * FAULT_FLAG/FOLL_REMOTE are because we do this outside the context
  * of the process that owns 'vma'.  We also do not want to enforce
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 9c539f0730a5..2868692c6807 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3219,7 +3219,7 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count)
  * Note: In usual ops, vread() is never necessary because the caller
  * should know vmalloc() area is valid and can use memcpy().
  * This is for routines which have to access vmalloc area without
- * any information, as /dev/kmem.
+ * any information, as /proc/kcore.
  *
  * Return: number of bytes for which addr and buf should be increased
  * (same number as @count) or %0 if [addr...addr+count) doesn't
-- 
cgit v1.2.3


From f7c8ce44ebb113b83135ada6e496db33d8a535e3 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 6 May 2021 18:06:06 -0700
Subject: mm/vmalloc: remove vwrite()

The last user (/dev/kmem) is gone. Let's drop it.

Link: https://lkml.kernel.org/r/20210324102351.6932-4-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Oleksiy Avramchenko <oleksiy.avramchenko@sonymobile.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: huang ying <huang.ying.caritas@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/vmalloc.h |   1 -
 mm/nommu.c              |  10 -----
 mm/vmalloc.c            | 116 +-----------------------------------------------
 3 files changed, 1 insertion(+), 126 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index f31ba59fb1ef..b6ff16393bf6 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -229,7 +229,6 @@ static inline void set_vm_flush_reset_perms(void *addr)
 
 /* for /proc/kcore */
 extern long vread(char *buf, char *addr, unsigned long count);
-extern long vwrite(char *buf, char *addr, unsigned long count);
 
 /*
  *	Internals.  Dont't use..
diff --git a/mm/nommu.c b/mm/nommu.c
index 5c9ab799c0e6..85a3a68dffb6 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -210,16 +210,6 @@ long vread(char *buf, char *addr, unsigned long count)
 	return count;
 }
 
-long vwrite(char *buf, char *addr, unsigned long count)
-{
-	/* Don't allow overflow */
-	if ((unsigned long) addr + count < count)
-		count = -(unsigned long) addr;
-
-	memcpy(addr, buf, count);
-	return count;
-}
-
 /*
  *	vmalloc  -  allocate virtually contiguous memory
  *
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 2868692c6807..a7f318c9e426 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3146,10 +3146,7 @@ static int aligned_vread(char *buf, char *addr, unsigned long count)
 		 * kmap() and get small overhead in this access function.
 		 */
 		if (p) {
-			/*
-			 * we can expect USER0 is not used (see vread/vwrite's
-			 * function description)
-			 */
+			/* We can expect USER0 is not used -- see vread() */
 			void *map = kmap_atomic(p);
 			memcpy(buf, map + offset, length);
 			kunmap_atomic(map);
@@ -3164,43 +3161,6 @@ static int aligned_vread(char *buf, char *addr, unsigned long count)
 	return copied;
 }
 
-static int aligned_vwrite(char *buf, char *addr, unsigned long count)
-{
-	struct page *p;
-	int copied = 0;
-
-	while (count) {
-		unsigned long offset, length;
-
-		offset = offset_in_page(addr);
-		length = PAGE_SIZE - offset;
-		if (length > count)
-			length = count;
-		p = vmalloc_to_page(addr);
-		/*
-		 * To do safe access to this _mapped_ area, we need
-		 * lock. But adding lock here means that we need to add
-		 * overhead of vmalloc()/vfree() calles for this _debug_
-		 * interface, rarely used. Instead of that, we'll use
-		 * kmap() and get small overhead in this access function.
-		 */
-		if (p) {
-			/*
-			 * we can expect USER0 is not used (see vread/vwrite's
-			 * function description)
-			 */
-			void *map = kmap_atomic(p);
-			memcpy(map + offset, buf, length);
-			kunmap_atomic(map);
-		}
-		addr += length;
-		buf += length;
-		copied += length;
-		count -= length;
-	}
-	return copied;
-}
-
 /**
  * vread() - read vmalloc area in a safe way.
  * @buf:     buffer for reading data
@@ -3283,80 +3243,6 @@ finished:
 	return buflen;
 }
 
-/**
- * vwrite() - write vmalloc area in a safe way.
- * @buf:      buffer for source data
- * @addr:     vm address.
- * @count:    number of bytes to be read.
- *
- * This function checks that addr is a valid vmalloc'ed area, and
- * copy data from a buffer to the given addr. If specified range of
- * [addr...addr+count) includes some valid address, data is copied from
- * proper area of @buf. If there are memory holes, no copy to hole.
- * IOREMAP area is treated as memory hole and no copy is done.
- *
- * If [addr...addr+count) doesn't includes any intersects with alive
- * vm_struct area, returns 0. @buf should be kernel's buffer.
- *
- * Note: In usual ops, vwrite() is never necessary because the caller
- * should know vmalloc() area is valid and can use memcpy().
- * This is for routines which have to access vmalloc area without
- * any information, as /dev/kmem.
- *
- * Return: number of bytes for which addr and buf should be
- * increased (same number as @count) or %0 if [addr...addr+count)
- * doesn't include any intersection with valid vmalloc area
- */
-long vwrite(char *buf, char *addr, unsigned long count)
-{
-	struct vmap_area *va;
-	struct vm_struct *vm;
-	char *vaddr;
-	unsigned long n, buflen;
-	int copied = 0;
-
-	/* Don't allow overflow */
-	if ((unsigned long) addr + count < count)
-		count = -(unsigned long) addr;
-	buflen = count;
-
-	spin_lock(&vmap_area_lock);
-	list_for_each_entry(va, &vmap_area_list, list) {
-		if (!count)
-			break;
-
-		if (!va->vm)
-			continue;
-
-		vm = va->vm;
-		vaddr = (char *) vm->addr;
-		if (addr >= vaddr + get_vm_area_size(vm))
-			continue;
-		while (addr < vaddr) {
-			if (count == 0)
-				goto finished;
-			buf++;
-			addr++;
-			count--;
-		}
-		n = vaddr + get_vm_area_size(vm) - addr;
-		if (n > count)
-			n = count;
-		if (!(vm->flags & VM_IOREMAP)) {
-			aligned_vwrite(buf, addr, n);
-			copied++;
-		}
-		buf += n;
-		addr += n;
-		count -= n;
-	}
-finished:
-	spin_unlock(&vmap_area_lock);
-	if (!copied)
-		return 0;
-	return buflen;
-}
-
 /**
  * remap_vmalloc_range_partial - map vmalloc pages to userspace
  * @vma:		vma to cover
-- 
cgit v1.2.3


From 2eb70aab25dd9b0013a0035b416dbe0e81e6ad48 Mon Sep 17 00:00:00 2001
From: Bhaskar Chowdhury <unixbhaskar@gmail.com>
Date: Thu, 6 May 2021 18:06:24 -0700
Subject: include/linux/pgtable.h: few spelling fixes

Few spelling fixes throughout the file.

Link: https://lkml.kernel.org/r/20210318201404.6380-1-unixbhaskar@gmail.com
Signed-off-by: Bhaskar Chowdhury <unixbhaskar@gmail.com>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pgtable.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 2194a9cd885c..46b13780c2c8 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -426,7 +426,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres
 
 /*
  * On some architectures hardware does not set page access bit when accessing
- * memory page, it is responsibilty of software setting this bit. It brings
+ * memory page, it is responsibility of software setting this bit. It brings
  * out extra page fault penalty to track page access bit. For optimization page
  * access bit can be set during all page fault flow on these arches.
  * To be differentiate with macro pte_mkyoung, this macro is used on platforms
@@ -519,7 +519,7 @@ extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
 /*
  * This is an implementation of pmdp_establish() that is only suitable for an
  * architecture that doesn't have hardware dirty/accessed bits. In this case we
- * can't race with CPU which sets these bits and non-atomic aproach is fine.
+ * can't race with CPU which sets these bits and non-atomic approach is fine.
  */
 static inline pmd_t generic_pmdp_establish(struct vm_area_struct *vma,
 		unsigned long address, pmd_t *pmdp, pmd_t pmd)
@@ -852,7 +852,7 @@ static inline void __ptep_modify_prot_commit(struct vm_area_struct *vma,
  * updates, but to prevent any updates it may make from being lost.
  *
  * This does not protect against other software modifications of the
- * pte; the appropriate pte lock must be held over the transation.
+ * pte; the appropriate pte lock must be held over the transaction.
  *
  * Note that this interface is intended to be batchable, meaning that
  * ptep_modify_prot_commit may not actually update the pte, but merely
@@ -1281,13 +1281,13 @@ static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd)
 	 *
 	 * The complete check uses is_pmd_migration_entry() in linux/swapops.h
 	 * But using that requires moving current function and pmd_trans_unstable()
-	 * to linux/swapops.h to resovle dependency, which is too much code move.
+	 * to linux/swapops.h to resolve dependency, which is too much code move.
 	 *
 	 * !pmd_present() is equivalent to is_pmd_migration_entry() currently,
 	 * because !pmd_present() pages can only be under migration not swapped
 	 * out.
 	 *
-	 * pmd_none() is preseved for future condition checks on pmd migration
+	 * pmd_none() is preserved for future condition checks on pmd migration
 	 * entries and not confusing with this function name, although it is
 	 * redundant with !pmd_present().
 	 */
-- 
cgit v1.2.3


From fa60ce2cb4506701c43bd4cf3ca23d970daf1b9c Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Thu, 6 May 2021 18:06:44 -0700
Subject: treewide: remove editor modelines and cruft

The section "19) Editor modelines and other cruft" in
Documentation/process/coding-style.rst clearly says, "Do not include any
of these in source files."

I recently received a patch to explicitly add a new one.

Let's do treewide cleanups, otherwise some people follow the existing code
and attempt to upstream their favorite editor setups.

It is even nicer if scripts/checkpatch.pl can check it.

If we like to impose coding style in an editor-independent manner, I think
editorconfig (patch [1]) is a saner solution.

[1] https://lore.kernel.org/lkml/20200703073143.423557-1-danny@kdrag0n.dev/

Link: https://lkml.kernel.org/r/20210324054457.1477489-1-masahiroy@kernel.org
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Reviewed-by: Miguel Ojeda <ojeda@kernel.org>	[auxdisplay]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/m68k/atari/time.c                               |  7 -------
 arch/parisc/include/asm/pdc_chassis.h                |  1 -
 arch/um/drivers/cow.h                                |  7 -------
 drivers/auxdisplay/panel.c                           |  7 -------
 drivers/gpu/drm/qxl/qxl_drv.c                        |  1 -
 drivers/media/usb/pwc/pwc-uncompress.c               |  3 ---
 drivers/net/ethernet/adaptec/starfire.c              |  8 --------
 drivers/net/ethernet/amd/atarilance.c                |  8 --------
 drivers/net/ethernet/amd/pcnet32.c                   |  7 -------
 .../net/wireless/intersil/orinoco/orinoco_nortel.c   |  8 --------
 drivers/net/wireless/intersil/orinoco/orinoco_pci.c  |  8 --------
 drivers/net/wireless/intersil/orinoco/orinoco_plx.c  |  8 --------
 drivers/net/wireless/intersil/orinoco/orinoco_tmd.c  |  8 --------
 drivers/parport/parport_ip32.c                       | 12 ------------
 drivers/platform/x86/dell/dell_rbu.c                 |  3 ---
 drivers/scsi/53c700.c                                |  1 -
 drivers/scsi/53c700.h                                |  1 -
 drivers/scsi/ch.c                                    |  6 ------
 drivers/scsi/ips.c                                   | 20 --------------------
 drivers/scsi/ips.h                                   | 20 --------------------
 drivers/scsi/lasi700.c                               |  1 -
 drivers/scsi/megaraid/mbox_defs.h                    |  2 --
 drivers/scsi/megaraid/mega_common.h                  |  2 --
 drivers/scsi/megaraid/megaraid_mbox.c                |  2 --
 drivers/scsi/megaraid/megaraid_mbox.h                |  2 --
 drivers/scsi/qla1280.c                               | 12 ------------
 drivers/scsi/sni_53c710.c                            |  1 -
 drivers/video/fbdev/matrox/matroxfb_base.c           |  9 ---------
 drivers/video/fbdev/vga16fb.c                        | 10 ----------
 fs/configfs/configfs_internal.h                      |  4 +---
 fs/configfs/dir.c                                    |  4 +---
 fs/configfs/file.c                                   |  4 +---
 fs/configfs/inode.c                                  |  4 +---
 fs/configfs/item.c                                   |  4 +---
 fs/configfs/mount.c                                  |  4 +---
 fs/configfs/symlink.c                                |  4 +---
 fs/nfs/dir.c                                         |  7 -------
 fs/nfs/nfs4proc.c                                    |  6 ------
 fs/nfs/nfs4renewd.c                                  |  6 ------
 fs/nfs/nfs4state.c                                   |  6 ------
 fs/nfs/nfs4xdr.c                                     |  6 ------
 fs/nfsd/nfs4proc.c                                   |  6 ------
 fs/nfsd/nfs4xdr.c                                    |  6 ------
 fs/nfsd/xdr4.h                                       |  6 ------
 fs/ocfs2/acl.c                                       |  4 +---
 fs/ocfs2/acl.h                                       |  4 +---
 fs/ocfs2/alloc.c                                     |  4 +---
 fs/ocfs2/alloc.h                                     |  4 +---
 fs/ocfs2/aops.c                                      |  4 +---
 fs/ocfs2/aops.h                                      |  4 +---
 fs/ocfs2/blockcheck.c                                |  4 +---
 fs/ocfs2/blockcheck.h                                |  4 +---
 fs/ocfs2/buffer_head_io.c                            |  4 +---
 fs/ocfs2/buffer_head_io.h                            |  4 +---
 fs/ocfs2/cluster/heartbeat.c                         |  4 +---
 fs/ocfs2/cluster/heartbeat.h                         |  4 +---
 fs/ocfs2/cluster/masklog.c                           |  4 +---
 fs/ocfs2/cluster/masklog.h                           |  4 +---
 fs/ocfs2/cluster/netdebug.c                          |  4 +---
 fs/ocfs2/cluster/nodemanager.c                       |  4 +---
 fs/ocfs2/cluster/nodemanager.h                       |  4 +---
 fs/ocfs2/cluster/ocfs2_heartbeat.h                   |  4 +---
 fs/ocfs2/cluster/ocfs2_nodemanager.h                 |  4 +---
 fs/ocfs2/cluster/quorum.c                            |  4 +---
 fs/ocfs2/cluster/quorum.h                            |  4 +---
 fs/ocfs2/cluster/sys.c                               |  4 +---
 fs/ocfs2/cluster/sys.h                               |  4 +---
 fs/ocfs2/cluster/tcp.c                               |  4 +---
 fs/ocfs2/cluster/tcp.h                               |  4 +---
 fs/ocfs2/cluster/tcp_internal.h                      |  4 +---
 fs/ocfs2/dcache.c                                    |  4 +---
 fs/ocfs2/dcache.h                                    |  4 +---
 fs/ocfs2/dir.c                                       |  4 +---
 fs/ocfs2/dir.h                                       |  4 +---
 fs/ocfs2/dlm/dlmapi.h                                |  4 +---
 fs/ocfs2/dlm/dlmast.c                                |  4 +---
 fs/ocfs2/dlm/dlmcommon.h                             |  4 +---
 fs/ocfs2/dlm/dlmconvert.c                            |  4 +---
 fs/ocfs2/dlm/dlmconvert.h                            |  4 +---
 fs/ocfs2/dlm/dlmdebug.c                              |  4 +---
 fs/ocfs2/dlm/dlmdebug.h                              |  4 +---
 fs/ocfs2/dlm/dlmdomain.c                             |  4 +---
 fs/ocfs2/dlm/dlmdomain.h                             |  4 +---
 fs/ocfs2/dlm/dlmlock.c                               |  4 +---
 fs/ocfs2/dlm/dlmmaster.c                             |  4 +---
 fs/ocfs2/dlm/dlmrecovery.c                           |  4 +---
 fs/ocfs2/dlm/dlmthread.c                             |  4 +---
 fs/ocfs2/dlm/dlmunlock.c                             |  4 +---
 fs/ocfs2/dlmfs/dlmfs.c                               |  4 +---
 fs/ocfs2/dlmfs/userdlm.c                             |  4 +---
 fs/ocfs2/dlmfs/userdlm.h                             |  4 +---
 fs/ocfs2/dlmglue.c                                   |  4 +---
 fs/ocfs2/dlmglue.h                                   |  4 +---
 fs/ocfs2/export.c                                    |  4 +---
 fs/ocfs2/export.h                                    |  4 +---
 fs/ocfs2/extent_map.c                                |  4 +---
 fs/ocfs2/extent_map.h                                |  4 +---
 fs/ocfs2/file.c                                      |  4 +---
 fs/ocfs2/file.h                                      |  4 +---
 fs/ocfs2/filecheck.c                                 |  4 +---
 fs/ocfs2/filecheck.h                                 |  4 +---
 fs/ocfs2/heartbeat.c                                 |  4 +---
 fs/ocfs2/heartbeat.h                                 |  4 +---
 fs/ocfs2/inode.c                                     |  4 +---
 fs/ocfs2/inode.h                                     |  4 +---
 fs/ocfs2/journal.c                                   |  4 +---
 fs/ocfs2/journal.h                                   |  4 +---
 fs/ocfs2/localalloc.c                                |  4 +---
 fs/ocfs2/localalloc.h                                |  4 +---
 fs/ocfs2/locks.c                                     |  4 +---
 fs/ocfs2/locks.h                                     |  4 +---
 fs/ocfs2/mmap.c                                      |  4 +---
 fs/ocfs2/move_extents.c                              |  4 +---
 fs/ocfs2/move_extents.h                              |  4 +---
 fs/ocfs2/namei.c                                     |  4 +---
 fs/ocfs2/namei.h                                     |  4 +---
 fs/ocfs2/ocfs1_fs_compat.h                           |  4 +---
 fs/ocfs2/ocfs2.h                                     |  4 +---
 fs/ocfs2/ocfs2_fs.h                                  |  4 +---
 fs/ocfs2/ocfs2_ioctl.h                               |  4 +---
 fs/ocfs2/ocfs2_lockid.h                              |  4 +---
 fs/ocfs2/ocfs2_lockingver.h                          |  4 +---
 fs/ocfs2/refcounttree.c                              |  4 +---
 fs/ocfs2/refcounttree.h                              |  4 +---
 fs/ocfs2/reservations.c                              |  4 +---
 fs/ocfs2/reservations.h                              |  4 +---
 fs/ocfs2/resize.c                                    |  4 +---
 fs/ocfs2/resize.h                                    |  4 +---
 fs/ocfs2/slot_map.c                                  |  4 +---
 fs/ocfs2/slot_map.h                                  |  4 +---
 fs/ocfs2/stack_o2cb.c                                |  4 +---
 fs/ocfs2/stack_user.c                                |  4 +---
 fs/ocfs2/stackglue.c                                 |  4 +---
 fs/ocfs2/stackglue.h                                 |  4 +---
 fs/ocfs2/suballoc.c                                  |  4 +---
 fs/ocfs2/suballoc.h                                  |  4 +---
 fs/ocfs2/super.c                                     |  4 +---
 fs/ocfs2/super.h                                     |  4 +---
 fs/ocfs2/symlink.c                                   |  4 +---
 fs/ocfs2/symlink.h                                   |  4 +---
 fs/ocfs2/sysfile.c                                   |  4 +---
 fs/ocfs2/sysfile.h                                   |  4 +---
 fs/ocfs2/uptodate.c                                  |  4 +---
 fs/ocfs2/uptodate.h                                  |  4 +---
 fs/ocfs2/xattr.c                                     |  4 +---
 fs/ocfs2/xattr.h                                     |  4 +---
 fs/reiserfs/procfs.c                                 | 10 ----------
 include/linux/configfs.h                             |  4 +---
 include/linux/genl_magic_func.h                      |  1 -
 include/linux/genl_magic_struct.h                    |  1 -
 include/uapi/linux/if_bonding.h                      | 11 -----------
 include/uapi/linux/nfs4.h                            |  6 ------
 include/xen/interface/elfnote.h                      | 10 ----------
 include/xen/interface/hvm/hvm_vcpu.h                 | 10 ----------
 include/xen/interface/io/xenbus.h                    | 10 ----------
 samples/configfs/configfs_sample.c                   |  2 --
 tools/usb/hcd-tests.sh                               |  2 --
 157 files changed, 110 insertions(+), 627 deletions(-)

(limited to 'include/linux')

diff --git a/arch/m68k/atari/time.c b/arch/m68k/atari/time.c
index 1068670cb741..7e44d0e9d0f8 100644
--- a/arch/m68k/atari/time.c
+++ b/arch/m68k/atari/time.c
@@ -317,10 +317,3 @@ int atari_tt_hwclk( int op, struct rtc_time *t )
 
     return( 0 );
 }
-
-/*
- * Local variables:
- *  c-indent-level: 4
- *  tab-width: 8
- * End:
- */
diff --git a/arch/parisc/include/asm/pdc_chassis.h b/arch/parisc/include/asm/pdc_chassis.h
index ae3e108d22ad..d6d82f53d3d0 100644
--- a/arch/parisc/include/asm/pdc_chassis.h
+++ b/arch/parisc/include/asm/pdc_chassis.h
@@ -365,4 +365,3 @@ void parisc_pdc_chassis_init(void);
 					 PDC_CHASSIS_EOM_SET		)
 
 #endif /* _PARISC_PDC_CHASSIS_H */
-/* vim: set ts=8 */
diff --git a/arch/um/drivers/cow.h b/arch/um/drivers/cow.h
index 103adac691ed..9a67c017000f 100644
--- a/arch/um/drivers/cow.h
+++ b/arch/um/drivers/cow.h
@@ -24,10 +24,3 @@ extern void cow_sizes(int version, __u64 size, int sectorsize, int align,
 		      int *data_offset_out);
 
 #endif
-
-/*
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/drivers/auxdisplay/panel.c b/drivers/auxdisplay/panel.c
index ff5755ee5694..eba04c0de7eb 100644
--- a/drivers/auxdisplay/panel.c
+++ b/drivers/auxdisplay/panel.c
@@ -1737,10 +1737,3 @@ module_init(panel_init_module);
 module_exit(panel_cleanup_module);
 MODULE_AUTHOR("Willy Tarreau");
 MODULE_LICENSE("GPL");
-
-/*
- * Local variables:
- *  c-indent-level: 4
- *  tab-width: 8
- * End:
- */
diff --git a/drivers/gpu/drm/qxl/qxl_drv.c b/drivers/gpu/drm/qxl/qxl_drv.c
index 1864467f1063..6754f578fed2 100644
--- a/drivers/gpu/drm/qxl/qxl_drv.c
+++ b/drivers/gpu/drm/qxl/qxl_drv.c
@@ -1,4 +1,3 @@
-/* vim: set ts=8 sw=8 tw=78 ai noexpandtab */
 /* qxl_drv.c -- QXL driver -*- linux-c -*-
  *
  * Copyright 2011 Red Hat, Inc.
diff --git a/drivers/media/usb/pwc/pwc-uncompress.c b/drivers/media/usb/pwc/pwc-uncompress.c
index abfc88391036..68bc3829c6b3 100644
--- a/drivers/media/usb/pwc/pwc-uncompress.c
+++ b/drivers/media/usb/pwc/pwc-uncompress.c
@@ -9,9 +9,6 @@
    Please send bug reports and support requests to <luc@saillard.org>.
    The decompression routines have been implemented by reverse-engineering the
    Nemosoft binary pwcx module. Caveat emptor.
-
-
-   vim: set ts=8:
 */
 
 #include <asm/current.h>
diff --git a/drivers/net/ethernet/adaptec/starfire.c b/drivers/net/ethernet/adaptec/starfire.c
index 555299737b51..7965e5e3c985 100644
--- a/drivers/net/ethernet/adaptec/starfire.c
+++ b/drivers/net/ethernet/adaptec/starfire.c
@@ -2070,11 +2070,3 @@ static void __exit starfire_cleanup (void)
 
 module_init(starfire_init);
 module_exit(starfire_cleanup);
-
-
-/*
- * Local variables:
- *  c-basic-offset: 8
- *  tab-width: 8
- * End:
- */
diff --git a/drivers/net/ethernet/amd/atarilance.c b/drivers/net/ethernet/amd/atarilance.c
index 961796abab35..c1eab916438f 100644
--- a/drivers/net/ethernet/amd/atarilance.c
+++ b/drivers/net/ethernet/amd/atarilance.c
@@ -1156,11 +1156,3 @@ static void __exit atarilance_module_exit(void)
 module_init(atarilance_module_init);
 module_exit(atarilance_module_exit);
 #endif /* MODULE */
-
-
-/*
- * Local variables:
- *  c-indent-level: 4
- *  tab-width: 4
- * End:
- */
diff --git a/drivers/net/ethernet/amd/pcnet32.c b/drivers/net/ethernet/amd/pcnet32.c
index aa412506832d..4100ab07e6b7 100644
--- a/drivers/net/ethernet/amd/pcnet32.c
+++ b/drivers/net/ethernet/amd/pcnet32.c
@@ -3029,10 +3029,3 @@ static void __exit pcnet32_cleanup_module(void)
 
 module_init(pcnet32_init_module);
 module_exit(pcnet32_cleanup_module);
-
-/*
- * Local variables:
- *  c-indent-level: 4
- *  tab-width: 8
- * End:
- */
diff --git a/drivers/net/wireless/intersil/orinoco/orinoco_nortel.c b/drivers/net/wireless/intersil/orinoco/orinoco_nortel.c
index 96a03d10a080..18bd0d9876c2 100644
--- a/drivers/net/wireless/intersil/orinoco/orinoco_nortel.c
+++ b/drivers/net/wireless/intersil/orinoco/orinoco_nortel.c
@@ -312,11 +312,3 @@ static void __exit orinoco_nortel_exit(void)
 
 module_init(orinoco_nortel_init);
 module_exit(orinoco_nortel_exit);
-
-/*
- * Local variables:
- *  c-indent-level: 8
- *  c-basic-offset: 8
- *  tab-width: 8
- * End:
- */
diff --git a/drivers/net/wireless/intersil/orinoco/orinoco_pci.c b/drivers/net/wireless/intersil/orinoco/orinoco_pci.c
index f3c86b07b1b9..7e3a6dd60c15 100644
--- a/drivers/net/wireless/intersil/orinoco/orinoco_pci.c
+++ b/drivers/net/wireless/intersil/orinoco/orinoco_pci.c
@@ -255,11 +255,3 @@ static void __exit orinoco_pci_exit(void)
 
 module_init(orinoco_pci_init);
 module_exit(orinoco_pci_exit);
-
-/*
- * Local variables:
- *  c-indent-level: 8
- *  c-basic-offset: 8
- *  tab-width: 8
- * End:
- */
diff --git a/drivers/net/wireless/intersil/orinoco/orinoco_plx.c b/drivers/net/wireless/intersil/orinoco/orinoco_plx.c
index 16dada94c774..73e6ae124013 100644
--- a/drivers/net/wireless/intersil/orinoco/orinoco_plx.c
+++ b/drivers/net/wireless/intersil/orinoco/orinoco_plx.c
@@ -360,11 +360,3 @@ static void __exit orinoco_plx_exit(void)
 
 module_init(orinoco_plx_init);
 module_exit(orinoco_plx_exit);
-
-/*
- * Local variables:
- *  c-indent-level: 8
- *  c-basic-offset: 8
- *  tab-width: 8
- * End:
- */
diff --git a/drivers/net/wireless/intersil/orinoco/orinoco_tmd.c b/drivers/net/wireless/intersil/orinoco/orinoco_tmd.c
index 9a9d335611ac..939d5a1dce97 100644
--- a/drivers/net/wireless/intersil/orinoco/orinoco_tmd.c
+++ b/drivers/net/wireless/intersil/orinoco/orinoco_tmd.c
@@ -235,11 +235,3 @@ static void __exit orinoco_tmd_exit(void)
 
 module_init(orinoco_tmd_init);
 module_exit(orinoco_tmd_exit);
-
-/*
- * Local variables:
- *  c-indent-level: 8
- *  c-basic-offset: 8
- *  tab-width: 8
- * End:
- */
diff --git a/drivers/parport/parport_ip32.c b/drivers/parport/parport_ip32.c
index 48b084e86dc6..0919ed99ba94 100644
--- a/drivers/parport/parport_ip32.c
+++ b/drivers/parport/parport_ip32.c
@@ -2224,15 +2224,3 @@ MODULE_PARM_DESC(features,
 		 ", bit 2: hardware SPP mode"
 		 ", bit 3: hardware EPP mode"
 		 ", bit 4: hardware ECP mode");
-
-/*--- Inform (X)Emacs about preferred coding style ---------------------*/
-/*
- * Local Variables:
- * mode: c
- * c-file-style: "linux"
- * indent-tabs-mode: t
- * tab-width: 8
- * fill-column: 78
- * ispell-local-dictionary: "american"
- * End:
- */
diff --git a/drivers/platform/x86/dell/dell_rbu.c b/drivers/platform/x86/dell/dell_rbu.c
index 03c3ff34bcf5..085ad0a0d22e 100644
--- a/drivers/platform/x86/dell/dell_rbu.c
+++ b/drivers/platform/x86/dell/dell_rbu.c
@@ -675,6 +675,3 @@ static __exit void dcdrbu_exit(void)
 
 module_exit(dcdrbu_exit);
 module_init(dcdrbu_init);
-
-/* vim:noet:ts=8:sw=8
-*/
diff --git a/drivers/scsi/53c700.c b/drivers/scsi/53c700.c
index ab42feab233f..77ccb96e5ed4 100644
--- a/drivers/scsi/53c700.c
+++ b/drivers/scsi/53c700.c
@@ -1,5 +1,4 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8 -*- */
 
 /* NCR (or Symbios) 53c700 and 53c700-66 Driver
  *
diff --git a/drivers/scsi/53c700.h b/drivers/scsi/53c700.h
index c9f8c497babb..2df347ca91af 100644
--- a/drivers/scsi/53c700.h
+++ b/drivers/scsi/53c700.h
@@ -1,5 +1,4 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* -*- mode: c; c-basic-offset: 8 -*- */
 
 /* Driver for 53c700 and 53c700-66 chips from NCR and Symbios
  *
diff --git a/drivers/scsi/ch.c b/drivers/scsi/ch.c
index cb74ab1ae5a4..9b89c26ccfdb 100644
--- a/drivers/scsi/ch.c
+++ b/drivers/scsi/ch.c
@@ -1058,9 +1058,3 @@ static void __exit exit_ch_module(void)
 
 module_init(init_ch_module);
 module_exit(exit_ch_module);
-
-/*
- * Local variables:
- * c-basic-offset: 8
- * End:
- */
diff --git a/drivers/scsi/ips.c b/drivers/scsi/ips.c
index 1a3c534826ba..bc33d54a4011 100644
--- a/drivers/scsi/ips.c
+++ b/drivers/scsi/ips.c
@@ -7099,23 +7099,3 @@ ips_init_phase2(int index)
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("IBM ServeRAID Adapter Driver " IPS_VER_STRING);
 MODULE_VERSION(IPS_VER_STRING);
-
-
-/*
- * Overrides for Emacs so that we almost follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-indent-level: 2
- * c-brace-imaginary-offset: 0
- * c-brace-offset: -2
- * c-argdecl-indent: 2
- * c-label-offset: -2
- * c-continued-statement-offset: 2
- * c-continued-brace-offset: 0
- * indent-tabs-mode: nil
- * tab-width: 8
- * End:
- */
diff --git a/drivers/scsi/ips.h b/drivers/scsi/ips.h
index 6c0678fb9a67..65edf000e447 100644
--- a/drivers/scsi/ips.h
+++ b/drivers/scsi/ips.h
@@ -1211,23 +1211,3 @@ typedef struct {
       IPS_COMPAT_TAMPA, \
       IPS_COMPAT_KEYWEST \
    }
-
-
-/*
- * Overrides for Emacs so that we almost follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-indent-level: 2
- * c-brace-imaginary-offset: 0
- * c-brace-offset: -2
- * c-argdecl-indent: 2
- * c-label-offset: -2
- * c-continued-statement-offset: 2
- * c-continued-brace-offset: 0
- * indent-tabs-mode: nil
- * tab-width: 8
- * End:
- */
diff --git a/drivers/scsi/lasi700.c b/drivers/scsi/lasi700.c
index de71d240a56f..6d14a7a94d0b 100644
--- a/drivers/scsi/lasi700.c
+++ b/drivers/scsi/lasi700.c
@@ -1,5 +1,4 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8 -*- */
 
 /* PARISC LASI driver for the 53c700 chip
  *
diff --git a/drivers/scsi/megaraid/mbox_defs.h b/drivers/scsi/megaraid/mbox_defs.h
index 01a1bfb8ea2a..f0ef8f7f82c1 100644
--- a/drivers/scsi/megaraid/mbox_defs.h
+++ b/drivers/scsi/megaraid/mbox_defs.h
@@ -781,5 +781,3 @@ typedef struct {
 } __attribute__ ((packed)) mbox_sgl32;
 
 #endif		// _MRAID_MBOX_DEFS_H_
-
-/* vim: set ts=8 sw=8 tw=78: */
diff --git a/drivers/scsi/megaraid/mega_common.h b/drivers/scsi/megaraid/mega_common.h
index 3a7596e47a88..2ad0aa2f837d 100644
--- a/drivers/scsi/megaraid/mega_common.h
+++ b/drivers/scsi/megaraid/mega_common.h
@@ -282,5 +282,3 @@ struct mraid_pci_blk {
 };
 
 #endif // _MEGA_COMMON_H_
-
-// vim: set ts=8 sw=8 tw=78:
diff --git a/drivers/scsi/megaraid/megaraid_mbox.c b/drivers/scsi/megaraid/megaraid_mbox.c
index b1a2d3536add..145fde302d7d 100644
--- a/drivers/scsi/megaraid/megaraid_mbox.c
+++ b/drivers/scsi/megaraid/megaraid_mbox.c
@@ -4068,5 +4068,3 @@ megaraid_sysfs_show_ldnum(struct device *dev, struct device_attribute *attr, cha
  */
 module_init(megaraid_init);
 module_exit(megaraid_exit);
-
-/* vim: set ts=8 sw=8 tw=78 ai si: */
diff --git a/drivers/scsi/megaraid/megaraid_mbox.h b/drivers/scsi/megaraid/megaraid_mbox.h
index 3e4347c6dab1..d2fe7f69cd5d 100644
--- a/drivers/scsi/megaraid/megaraid_mbox.h
+++ b/drivers/scsi/megaraid/megaraid_mbox.h
@@ -230,5 +230,3 @@ typedef struct {
 #define WROUTDOOR(rdev, value)	writel(value, (rdev)->baseaddr + 0x2C)
 
 #endif // _MEGARAID_H_
-
-// vim: set ts=8 sw=8 tw=78:
diff --git a/drivers/scsi/qla1280.c b/drivers/scsi/qla1280.c
index 8f35174a1f9a..928da90b79be 100644
--- a/drivers/scsi/qla1280.c
+++ b/drivers/scsi/qla1280.c
@@ -4403,15 +4403,3 @@ MODULE_FIRMWARE("qlogic/1040.bin");
 MODULE_FIRMWARE("qlogic/1280.bin");
 MODULE_FIRMWARE("qlogic/12160.bin");
 MODULE_VERSION(QLA1280_VERSION);
-
-/*
- * Overrides for Emacs so that we almost follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-basic-offset: 8
- * tab-width: 8
- * End:
- */
diff --git a/drivers/scsi/sni_53c710.c b/drivers/scsi/sni_53c710.c
index 97c6f81b1d2a..678651b9b4dd 100644
--- a/drivers/scsi/sni_53c710.c
+++ b/drivers/scsi/sni_53c710.c
@@ -1,5 +1,4 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8 -*- */
 
 /* SNI RM driver
  *
diff --git a/drivers/video/fbdev/matrox/matroxfb_base.c b/drivers/video/fbdev/matrox/matroxfb_base.c
index a3853421b263..4325bf7f388c 100644
--- a/drivers/video/fbdev/matrox/matroxfb_base.c
+++ b/drivers/video/fbdev/matrox/matroxfb_base.c
@@ -2608,12 +2608,3 @@ EXPORT_SYMBOL(matroxfb_register_driver);
 EXPORT_SYMBOL(matroxfb_unregister_driver);
 EXPORT_SYMBOL(matroxfb_wait_for_sync);
 EXPORT_SYMBOL(matroxfb_enable_irq);
-
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-basic-offset: 8
- * End:
- */
-
diff --git a/drivers/video/fbdev/vga16fb.c b/drivers/video/fbdev/vga16fb.c
index 1e8a38a7967d..e2757ff1c23d 100644
--- a/drivers/video/fbdev/vga16fb.c
+++ b/drivers/video/fbdev/vga16fb.c
@@ -1451,13 +1451,3 @@ MODULE_DESCRIPTION("Legacy VGA framebuffer device driver");
 MODULE_LICENSE("GPL");
 module_init(vga16fb_init);
 module_exit(vga16fb_exit);
-
-
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-basic-offset: 8
- * End:
- */
-
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index 9a3aed249692..c0395363eab9 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset:8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * configfs_internal.h - Internal stuff for configfs
  *
  * Based on sysfs:
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index b6098e02e20b..ac5e0c0e9181 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dir.c - Operations for configfs directories.
  *
  * Based on sysfs:
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index da8351d1e455..e26060dae70a 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * file.c - operations for regular (text) files.
  *
  * Based on sysfs:
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 42c348bb2903..eb5ec3e46283 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * inode.c - basic inode and dentry operations.
  *
  * Based on sysfs:
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index 704a4356f137..254170a82aa3 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * item.c - library routines for handling generic config items
  *
  * Based on kobject:
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 0c6e8cf61953..c2d820063ec4 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * mount.c - operations for initializing and mounting configfs.
  *
  * Based on sysfs:
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index 77c854364e60..0623c3edcfb9 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * symlink.c - operations for configfs symlinks.
  *
  * Based on sysfs:
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index fc4f490f2d78..3d8e3698d3df 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -3004,10 +3004,3 @@ out_notsup:
 	goto out;
 }
 EXPORT_SYMBOL_GPL(nfs_permission);
-
-/*
- * Local variables:
- *  version-control: t
- *  kept-new-versions: 5
- * End:
- */
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index c65c4b41e2c1..545010d6cbf3 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -10427,9 +10427,3 @@ const struct xattr_handler *nfs4_xattr_handlers[] = {
 #endif
 	NULL
 };
-
-/*
- * Local variables:
- *  c-basic-offset: 8
- * End:
- */
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index ff876dda7f06..db3811af0796 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -149,9 +149,3 @@ void nfs4_set_lease_period(struct nfs_client *clp,
 	/* Cap maximum reconnect timeout at 1/2 lease period */
 	rpc_set_connect_timeout(clp->cl_rpcclient, lease, lease >> 1);
 }
-
-/*
- * Local variables:
- *   c-basic-offset: 8
- * End:
- */
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 3a51351bdc6a..2eec5bbb55c8 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -2695,9 +2695,3 @@ static int nfs4_run_state_manager(void *ptr)
 	module_put_and_exit(0);
 	return 0;
 }
-
-/*
- * Local variables:
- *  c-basic-offset: 8
- * End:
- */
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index ac6b79ee9355..d4fd3be0e8ca 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -7629,9 +7629,3 @@ const struct rpc_version nfs_version4 = {
 	.procs			= nfs4_procedures,
 	.counts			= nfs_version4_counts,
 };
-
-/*
- * Local variables:
- *  c-basic-offset: 8
- * End:
- */
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index daf43b980d4b..f4ce93d7f26e 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -3317,9 +3317,3 @@ const struct svc_version nfsd_version4 = {
 	.vs_rpcb_optnl		= true,
 	.vs_need_cong_ctrl	= true,
 };
-
-/*
- * Local variables:
- *  c-basic-offset: 8
- * End:
- */
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index e0f06d3cbd44..7abeccb975b2 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -5448,9 +5448,3 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p)
 	nfsd4_sequence_done(resp);
 	return 1;
 }
-
-/*
- * Local variables:
- *  c-basic-offset: 8
- * End:
- */
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index fe540a3415c6..a7c425254fee 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -866,9 +866,3 @@ struct nfsd4_operation {
 
 
 #endif
-
-/*
- * Local variables:
- *  c-basic-offset: 8
- * End:
- */
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 5259badabb56..5c72a7e6d6c5 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * acl.c
  *
  * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index 4e86450917b2..f59d8d0a61fa 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * acl.h
  *
  * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 78710788c237..e032f2e2c2c5 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * alloc.c
  *
  * Extent allocs and frees
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 7f973dd76dbc..4af7abaa6e40 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * alloc.h
  *
  * Function prototypes
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index ad20403b383f..1294925ac94a 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
  */
 
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 70ed4382750d..3a520117fa59 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * Copyright (C) 2002, 2004, 2005 Oracle.  All rights reserved.
  */
 
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index dabfef9c2bc0..863a5316030b 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * blockcheck.c
  *
  * Checksum and ECC codes for the OCFS2 userspace library.
diff --git a/fs/ocfs2/blockcheck.h b/fs/ocfs2/blockcheck.h
index 8f17d2c85f40..d0578e98ee8d 100644
--- a/fs/ocfs2/blockcheck.h
+++ b/fs/ocfs2/blockcheck.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * blockcheck.h
  *
  * Checksum and ECC codes for the OCFS2 userspace library.
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index f0b104e483d8..e7758778abef 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * io.c
  *
  * Buffer cache handling
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index 1c5e533fba04..2d51649fc090 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * ocfs2_buffer_head.h
  *
  * Buffer cache handling functions defined
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 12a7590601dd..e829c2595543 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * Copyright (C) 2004, 2005 Oracle.  All rights reserved.
  */
 
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index beed31ea86cf..1d4100abf6f8 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * heartbeat.h
  *
  * Function prototypes
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 1d696c96b8b2..810d32815593 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * Copyright (C) 2004, 2005 Oracle.  All rights reserved.
  */
 
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 446e452ac7a6..b73fc42e46ff 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * Copyright (C) 2005 Oracle.  All rights reserved.
  */
 
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index 667a5c5e1f66..7524994e3199 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * netdebug.c
  *
  * debug functionality for o2net
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 7a7640c59f3c..bb82e6b1ff4e 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * Copyright (C) 2004, 2005 Oracle.  All rights reserved.
  */
 
diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h
index 3e0006631cc4..3490e77a952d 100644
--- a/fs/ocfs2/cluster/nodemanager.h
+++ b/fs/ocfs2/cluster/nodemanager.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * nodemanager.h
  *
  * Function prototypes
diff --git a/fs/ocfs2/cluster/ocfs2_heartbeat.h b/fs/ocfs2/cluster/ocfs2_heartbeat.h
index 760d850be11e..6088c9f974dd 100644
--- a/fs/ocfs2/cluster/ocfs2_heartbeat.h
+++ b/fs/ocfs2/cluster/ocfs2_heartbeat.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * ocfs2_heartbeat.h
  *
  * On-disk structures for ocfs2_heartbeat
diff --git a/fs/ocfs2/cluster/ocfs2_nodemanager.h b/fs/ocfs2/cluster/ocfs2_nodemanager.h
index 21ad307419a8..c9a0b77443e7 100644
--- a/fs/ocfs2/cluster/ocfs2_nodemanager.h
+++ b/fs/ocfs2/cluster/ocfs2_nodemanager.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * ocfs2_nodemanager.h
  *
  * Header describing the interface between userspace and the kernel
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index cea739be77c4..189c111bc371 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- *
- * vim: noexpandtab sw=8 ts=8 sts=0:
+/*
  *
  * Copyright (C) 2005 Oracle.  All rights reserved.
  */
diff --git a/fs/ocfs2/cluster/quorum.h b/fs/ocfs2/cluster/quorum.h
index 6d45ce8b18a1..d64bf4482a4a 100644
--- a/fs/ocfs2/cluster/quorum.h
+++ b/fs/ocfs2/cluster/quorum.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * Copyright (C) 2005 Oracle.  All rights reserved.
  */
 
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
index d6067c3d84c1..022f716c74ff 100644
--- a/fs/ocfs2/cluster/sys.c
+++ b/fs/ocfs2/cluster/sys.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * sys.c
  *
  * OCFS2 cluster sysfs interface
diff --git a/fs/ocfs2/cluster/sys.h b/fs/ocfs2/cluster/sys.h
index ce380517cf17..70aaba65317e 100644
--- a/fs/ocfs2/cluster/sys.h
+++ b/fs/ocfs2/cluster/sys.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * sys.h
  *
  * Function prototypes for o2cb sysfs interface
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 3bd8119bed5e..f660c0dbdb63 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- *
- * vim: noexpandtab sw=8 ts=8 sts=0:
+/*
  *
  * Copyright (C) 2004 Oracle.  All rights reserved.
  *
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index 736338f45c59..a75b551d31c7 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * tcp.h
  *
  * Function prototypes
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index e6a2b9dfcd16..601c99bd2611 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * Copyright (C) 2005 Oracle.  All rights reserved.
  */
 
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 42a61eecdacd..04fc8344063a 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dcache.c
  *
  * dentry cache handling code
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index 3686a52ba143..7f246c5692d8 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dcache.h
  *
  * Function prototypes
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index bdfba9db558a..bd8d534f11cb 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dir.c
  *
  * Creates, reads, walks and deletes directory-nodes
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index e3e7d5dd29e8..4b9f5a12c7d2 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dir.h
  *
  * Function prototypes
diff --git a/fs/ocfs2/dlm/dlmapi.h b/fs/ocfs2/dlm/dlmapi.h
index 6456c0fbcbb2..bae60ca2672a 100644
--- a/fs/ocfs2/dlm/dlmapi.h
+++ b/fs/ocfs2/dlm/dlmapi.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmapi.h
  *
  * externally exported dlm interfaces
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 70a10764f249..c681ba957932 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmast.c
  *
  * AST and BAST functionality for local and remote nodes
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 58d57e25d384..fd2022712167 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmcommon.h
  *
  * Copyright (C) 2004 Oracle.  All rights reserved.
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index 6051edc33aef..450d46eefab3 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmconvert.c
  *
  * underlying calls for lock conversion
diff --git a/fs/ocfs2/dlm/dlmconvert.h b/fs/ocfs2/dlm/dlmconvert.h
index 12d9c28bc52f..1f371716513b 100644
--- a/fs/ocfs2/dlm/dlmconvert.h
+++ b/fs/ocfs2/dlm/dlmconvert.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmconvert.h
  *
  * Copyright (C) 2004 Oracle.  All rights reserved.
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 4b8b41d23e91..d442cf5dda8a 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmdebug.c
  *
  * debug functionality for the dlm
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
index f8fd8680a4b6..e08f7357e7ec 100644
--- a/fs/ocfs2/dlm/dlmdebug.h
+++ b/fs/ocfs2/dlm/dlmdebug.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmdebug.h
  *
  * Copyright (C) 2008 Oracle.  All rights reserved.
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 357cfc702ce3..9f90fc9551e1 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmdomain.c
  *
  * defines domain join / leave apis
diff --git a/fs/ocfs2/dlm/dlmdomain.h b/fs/ocfs2/dlm/dlmdomain.h
index 7c21664d23d0..815abe30ad09 100644
--- a/fs/ocfs2/dlm/dlmdomain.h
+++ b/fs/ocfs2/dlm/dlmdomain.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmdomain.h
  *
  * Copyright (C) 2004 Oracle.  All rights reserved.
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 83f0760e4fba..041fd1791ae7 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmlock.c
  *
  * underlying calls for lock creation
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index f105746063ed..4960a6de768d 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmmod.c
  *
  * standalone DLM module
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index afc51736686c..0e7aad1b11cc 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmrecovery.c
  *
  * recovery stuff
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 5ccc4ff0b82a..c350bd4df770 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmthread.c
  *
  * standalone DLM module
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index dcb17ca8ae74..61103b2d69fb 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmunlock.c
  *
  * underlying calls for unlocking locks
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index b2870f1a31df..fa0a14f199eb 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmfs.c
  *
  * Code which implements the kernel side of a minimal userspace
diff --git a/fs/ocfs2/dlmfs/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c
index 339f098d9592..29f183a15798 100644
--- a/fs/ocfs2/dlmfs/userdlm.c
+++ b/fs/ocfs2/dlmfs/userdlm.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * userdlm.c
  *
  * Code which implements the kernel side of a minimal userspace
diff --git a/fs/ocfs2/dlmfs/userdlm.h b/fs/ocfs2/dlmfs/userdlm.h
index 0558ae768200..47ba18eac423 100644
--- a/fs/ocfs2/dlmfs/userdlm.h
+++ b/fs/ocfs2/dlmfs/userdlm.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * userdlm.h
  *
  * Userspace dlm defines
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 0fbe8bf7190f..48fd369c29a4 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmglue.c
  *
  * Code which implements an OCFS2 specific interface to our DLM.
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index b8fbed25df89..e5da5809ed95 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * dlmglue.h
  *
  * description here
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 69ed278dd84d..eaa8c80ace3c 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * export.c
  *
  * Functions to facilitate NFS exporting
diff --git a/fs/ocfs2/export.h b/fs/ocfs2/export.h
index d485da0c3439..636357400505 100644
--- a/fs/ocfs2/export.h
+++ b/fs/ocfs2/export.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * export.h
  *
  * Function prototypes
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 7b93e9c766f6..70a768b623cf 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * extent_map.c
  *
  * Block/Cluster mapping functions
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index e5464f6cee8a..bc4ed59fb925 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * extent_map.h
  *
  * In-memory file extent mappings for OCFS2.
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index db8a6265b749..f17c3d33fb18 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * file.c
  *
  * File open, close, extend, truncate
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 8536cec5f122..71db8f3aa027 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * file.h
  *
  * Function prototypes
diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c
index 50f11bfdc8c2..90b8d300c1ee 100644
--- a/fs/ocfs2/filecheck.c
+++ b/fs/ocfs2/filecheck.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * filecheck.c
  *
  * Code which implements online file check.
diff --git a/fs/ocfs2/filecheck.h b/fs/ocfs2/filecheck.h
index 4d006777ac54..d3bcb8bcfeb0 100644
--- a/fs/ocfs2/filecheck.h
+++ b/fs/ocfs2/filecheck.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * filecheck.h
  *
  * Online file check.
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index 60c5f995d30c..9099d8fc7599 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * heartbeat.c
  *
  * Register ourselves with the heartbaet service, keep our node maps
diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h
index 5fedb2d35dc0..f1f8b1802fe4 100644
--- a/fs/ocfs2/heartbeat.h
+++ b/fs/ocfs2/heartbeat.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * heartbeat.h
  *
  * Function prototypes
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 7c9dfd50c1c1..bc8f32fab964 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * inode.c
  *
  * vfs' aops, fops, dops and iops
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 51a4f7197987..82b28fdacc7e 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * inode.h
  *
  * Function prototypes
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index db52e843002a..4e589ce2fce6 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * journal.c
  *
  * Defines functions of journalling api
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index bfe611ed1b1d..d158acb8b38a 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * journal.h
  *
  * Defines journalling api and structures.
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index fc8252a28cb1..5f6bacbeef6b 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * localalloc.c
  *
  * Node local data allocation
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h
index e8a5cea48639..08f925b7ec6d 100644
--- a/fs/ocfs2/localalloc.h
+++ b/fs/ocfs2/localalloc.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * localalloc.h
  *
  * Function prototypes
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index 7edc4e5c7c2c..fab7c6a4a7d0 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * locks.c
  *
  * Userspace file locking support
diff --git a/fs/ocfs2/locks.h b/fs/ocfs2/locks.h
index 389fe1fce3a5..b52de3947d5f 100644
--- a/fs/ocfs2/locks.h
+++ b/fs/ocfs2/locks.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * locks.h
  *
  * Function prototypes for Userspace file locking support
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 25cabbfe87fc..1834f26522ed 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * mmap.c
  *
  * Code to deal with the mess that is clustered mmap.
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 758d9661ef1e..192cad0662d8 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * move_extents.c
  *
  * Copyright (C) 2011 Oracle.  All rights reserved.
diff --git a/fs/ocfs2/move_extents.h b/fs/ocfs2/move_extents.h
index 28cac43892c5..987f9e559f30 100644
--- a/fs/ocfs2/move_extents.h
+++ b/fs/ocfs2/move_extents.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * move_extents.h
  *
  * Copyright (C) 2011 Oracle.  All rights reserved.
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 05ced86580d1..2c46ff6ba4ea 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * namei.c
  *
  * Create and rename file, directory, symlinks
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
index cc091ed02b4a..9cc891eb874e 100644
--- a/fs/ocfs2/namei.h
+++ b/fs/ocfs2/namei.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * namei.h
  *
  * Function prototypes
diff --git a/fs/ocfs2/ocfs1_fs_compat.h b/fs/ocfs2/ocfs1_fs_compat.h
index 01ae48c4834d..6dbcf3d467fb 100644
--- a/fs/ocfs2/ocfs1_fs_compat.h
+++ b/fs/ocfs2/ocfs1_fs_compat.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * ocfs1_fs_compat.h
  *
  * OCFS1 volume header definitions.  OCFS2 creates valid but unmountable
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 7993d527edae..bb62cc2e0211 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * ocfs2.h
  *
  * Defines macros and structures used in OCFS2
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 19137c6d087b..638d875eccc7 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * ocfs2_fs.h
  *
  * On-disk structures for OCFS2.
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h
index 273616bd4f19..9680797bc531 100644
--- a/fs/ocfs2/ocfs2_ioctl.h
+++ b/fs/ocfs2/ocfs2_ioctl.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * ocfs2_ioctl.h
  *
  * Defines OCFS2 ioctls.
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index b4be84956bc1..8ac357ce6a30 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * ocfs2_lockid.h
  *
  * Defines OCFS2 lockid bits.
diff --git a/fs/ocfs2/ocfs2_lockingver.h b/fs/ocfs2/ocfs2_lockingver.h
index 5c9c105b33ee..31a5e1619e7f 100644
--- a/fs/ocfs2/ocfs2_lockingver.h
+++ b/fs/ocfs2/ocfs2_lockingver.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * ocfs2_lockingver.h
  *
  * Defines OCFS2 Locking version values.
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index c19a463fac55..7f6355cbb587 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * refcounttree.c
  *
  * Copyright (C) 2009 Oracle.  All rights reserved.
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 0b9014495726..8197a94feec0 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * refcounttree.h
  *
  * Copyright (C) 2009 Oracle.  All rights reserved.
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
index bf3842e34fb9..769e466887b0 100644
--- a/fs/ocfs2/reservations.c
+++ b/fs/ocfs2/reservations.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * reservations.c
  *
  * Allocation reservations implementation
diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h
index 6ac88122896d..677c50663595 100644
--- a/fs/ocfs2/reservations.h
+++ b/fs/ocfs2/reservations.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * reservations.h
  *
  * Allocation reservations function prototypes and structures.
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 24eb52f9059c..d65d43c61857 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * resize.c
  *
  * volume resize.
diff --git a/fs/ocfs2/resize.h b/fs/ocfs2/resize.h
index 0af0c023042c..4990637219ef 100644
--- a/fs/ocfs2/resize.h
+++ b/fs/ocfs2/resize.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * resize.h
  *
  * Function prototypes
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 4da0e4b1e79b..0b0ae3ebb0cf 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * slot_map.c
  *
  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h
index 93b53e73f0f7..a43644570b53 100644
--- a/fs/ocfs2/slot_map.h
+++ b/fs/ocfs2/slot_map.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * slotmap.h
  *
  * description here
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index f70012038383..88f75f7f02d7 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * stack_o2cb.c
  *
  * Code which interfaces ocfs2 with the o2cb stack.
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 7397064c3f35..85a47621e0c0 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * stack_user.c
  *
  * Code which interfaces ocfs2 with fs/dlm and a userspace stack.
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 8d33ebc6b6fc..d50e8b8dfea4 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * stackglue.c
  *
  * Code which implements an OCFS2 specific interface to underlying
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index e9d26cbeb3b8..3636847fae19 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * stackglue.h
  *
  * Glue to the underlying cluster stack.
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 8c8cf7f4eb34..8521942f5af2 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * suballoc.c
  *
  * metadata alloc and free
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 50b36250beb6..5805a03d100b 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * suballoc.h
  *
  * Defines sub allocator api
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 079f8826993e..c86bd4e60e20 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * super.c
  *
  * load/unload driver, mount/dismount volumes
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index 76facaf63336..8312651135b9 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * super.h
  *
  * Function prototypes
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 94cfacc9bad7..f755a4985821 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -1,6 +1,4 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  *  linux/cluster/ssi/cfs/symlink.c
  *
  *	This program is free software; you can redistribute it and/or
diff --git a/fs/ocfs2/symlink.h b/fs/ocfs2/symlink.h
index 167094d1e5aa..ffcf0210545c 100644
--- a/fs/ocfs2/symlink.h
+++ b/fs/ocfs2/symlink.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * symlink.h
  *
  * Function prototypes
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index bb701c4e449f..53a945da873b 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * sysfile.c
  *
  * Initialize, read, write, etc. system files.
diff --git a/fs/ocfs2/sysfile.h b/fs/ocfs2/sysfile.h
index a83dd962fccb..2b38c75990fd 100644
--- a/fs/ocfs2/sysfile.h
+++ b/fs/ocfs2/sysfile.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * sysfile.h
  *
  * Function prototypes
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index 580852ba05c4..09854925fa5c 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * uptodate.c
  *
  * Tracking the up-to-date-ness of a local buffer_head with respect to
diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h
index 77a30cae4879..85d94134001b 100644
--- a/fs/ocfs2/uptodate.h
+++ b/fs/ocfs2/uptodate.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * uptodate.h
  *
  * Cluster uptodate tracking
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 36ae47a4aef6..dd784eb0cd7c 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * xattr.c
  *
  * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 9c80382da1f5..00308b57f64f 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * xattr.h
  *
  * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index 155b82870333..4a7cb16e9345 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -488,13 +488,3 @@ int reiserfs_proc_info_global_done(void)
  * (available at http://www.namesys.com/legalese.html)
  *
  */
-
-/*
- * Make Linus happy.
- * Local variables:
- * c-indentation-style: "K&R"
- * mode-name: "LC"
- * c-basic-offset: 8
- * tab-width: 8
- * End:
- */
diff --git a/include/linux/configfs.h b/include/linux/configfs.h
index 2e8c69b43c64..97cfd13bae51 100644
--- a/include/linux/configfs.h
+++ b/include/linux/configfs.h
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
+/*
  * configfs.h - definitions for the device driver filesystem
  *
  * Based on sysfs:
diff --git a/include/linux/genl_magic_func.h b/include/linux/genl_magic_func.h
index 6cb82301d8e9..939b1a8f571b 100644
--- a/include/linux/genl_magic_func.h
+++ b/include/linux/genl_magic_func.h
@@ -404,4 +404,3 @@ s_fields								\
 
 /* }}}1 */
 #endif /* GENL_MAGIC_FUNC_H */
-/* vim: set foldmethod=marker foldlevel=1 nofoldenable : */
diff --git a/include/linux/genl_magic_struct.h b/include/linux/genl_magic_struct.h
index 35d21fddaf2d..f81d48987528 100644
--- a/include/linux/genl_magic_struct.h
+++ b/include/linux/genl_magic_struct.h
@@ -283,4 +283,3 @@ enum {									\
 
 /* }}}1 */
 #endif /* GENL_MAGIC_STRUCT_H */
-/* vim: set foldmethod=marker nofoldenable : */
diff --git a/include/uapi/linux/if_bonding.h b/include/uapi/linux/if_bonding.h
index e8eb4ad03cf1..d174914a837d 100644
--- a/include/uapi/linux/if_bonding.h
+++ b/include/uapi/linux/if_bonding.h
@@ -153,14 +153,3 @@ enum {
 #define BOND_3AD_STAT_MAX (__BOND_3AD_STAT_MAX - 1)
 
 #endif /* _LINUX_IF_BONDING_H */
-
-/*
- * Local variables:
- *  version-control: t
- *  kept-new-versions: 5
- *  c-indent-level: 8
- *  c-basic-offset: 8
- *  tab-width: 8
- * End:
- */
-
diff --git a/include/uapi/linux/nfs4.h b/include/uapi/linux/nfs4.h
index ed5415e0f1c1..800bb0ffa6e6 100644
--- a/include/uapi/linux/nfs4.h
+++ b/include/uapi/linux/nfs4.h
@@ -178,9 +178,3 @@
 #define NFS4_MAX_BACK_CHANNEL_OPS 2
 
 #endif /* _UAPI_LINUX_NFS4_H */
-
-/*
- * Local variables:
- *  c-basic-offset: 8
- * End:
- */
diff --git a/include/xen/interface/elfnote.h b/include/xen/interface/elfnote.h
index 9e9f9bf7c66d..449bd383cb76 100644
--- a/include/xen/interface/elfnote.h
+++ b/include/xen/interface/elfnote.h
@@ -208,13 +208,3 @@
 #define XEN_ELFNOTE_MAX XEN_ELFNOTE_PHYS32_ENTRY
 
 #endif /* __XEN_PUBLIC_ELFNOTE_H__ */
-
-/*
- * Local variables:
- * mode: C
- * c-set-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
diff --git a/include/xen/interface/hvm/hvm_vcpu.h b/include/xen/interface/hvm/hvm_vcpu.h
index 32ca83edd44d..bfc2138e0bf5 100644
--- a/include/xen/interface/hvm/hvm_vcpu.h
+++ b/include/xen/interface/hvm/hvm_vcpu.h
@@ -131,13 +131,3 @@ struct vcpu_hvm_context {
 typedef struct vcpu_hvm_context vcpu_hvm_context_t;
 
 #endif /* __XEN_PUBLIC_HVM_HVM_VCPU_H__ */
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
diff --git a/include/xen/interface/io/xenbus.h b/include/xen/interface/io/xenbus.h
index aaf2951b1cce..fb8716112251 100644
--- a/include/xen/interface/io/xenbus.h
+++ b/include/xen/interface/io/xenbus.h
@@ -39,13 +39,3 @@ enum xenbus_state
 };
 
 #endif /* _XEN_PUBLIC_IO_XENBUS_H */
-
-/*
- * Local variables:
- *  c-file-style: "linux"
- *  indent-tabs-mode: t
- *  c-indent-level: 8
- *  c-basic-offset: 8
- *  tab-width: 8
- * End:
- */
diff --git a/samples/configfs/configfs_sample.c b/samples/configfs/configfs_sample.c
index f9008be7a8a1..37a657b25d58 100644
--- a/samples/configfs/configfs_sample.c
+++ b/samples/configfs/configfs_sample.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * vim: noexpandtab ts=8 sts=0 sw=8:
- *
  * configfs_example_macros.c - This file is a demonstration module
  *      containing a number of configfs subsystems.  It uses the helper
  *      macros defined by configfs.h
diff --git a/tools/usb/hcd-tests.sh b/tools/usb/hcd-tests.sh
index e8cad6a4f9c9..73f914d13f5c 100644
--- a/tools/usb/hcd-tests.sh
+++ b/tools/usb/hcd-tests.sh
@@ -272,5 +272,3 @@ do
 	echo ''
     done
 done
-
-# vim: sw=4
-- 
cgit v1.2.3


From f0953a1bbaca71e1ebbcb9864eb1b273156157ed Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 6 May 2021 18:06:47 -0700
Subject: mm: fix typos in comments

Fix ~94 single-word typos in mm code comments, plus a few
very obvious grammar mistakes.

Link: https://lkml.kernel.org/r/20210322212624.GA1963421@gmail.com
Link: https://lore.kernel.org/r/20210322205203.GB1959563@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Bhaskar Chowdhury <unixbhaskar@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h      |  2 +-
 include/linux/vmalloc.h |  4 ++--
 mm/balloon_compaction.c |  4 ++--
 mm/compaction.c         |  4 ++--
 mm/filemap.c            |  2 +-
 mm/gup.c                |  2 +-
 mm/highmem.c            |  2 +-
 mm/huge_memory.c        |  6 +++---
 mm/hugetlb.c            |  6 +++---
 mm/internal.h           |  2 +-
 mm/kasan/kasan.h        |  8 ++++----
 mm/kasan/quarantine.c   |  4 ++--
 mm/kasan/shadow.c       |  4 ++--
 mm/kfence/report.c      |  2 +-
 mm/khugepaged.c         |  2 +-
 mm/ksm.c                |  4 ++--
 mm/madvise.c            |  4 ++--
 mm/memcontrol.c         | 18 +++++++++---------
 mm/memory-failure.c     |  2 +-
 mm/memory.c             | 10 +++++-----
 mm/mempolicy.c          |  4 ++--
 mm/migrate.c            |  8 ++++----
 mm/mmap.c               |  4 ++--
 mm/mprotect.c           |  2 +-
 mm/mremap.c             |  2 +-
 mm/oom_kill.c           |  2 +-
 mm/page-writeback.c     |  4 ++--
 mm/page_alloc.c         | 14 +++++++-------
 mm/page_owner.c         |  2 +-
 mm/percpu-internal.h    |  2 +-
 mm/percpu.c             |  2 +-
 mm/pgalloc-track.h      |  6 +++---
 mm/slab.c               |  6 +++---
 mm/slub.c               |  2 +-
 mm/swap_slots.c         |  2 +-
 mm/vmalloc.c            |  6 +++---
 mm/vmstat.c             |  2 +-
 mm/zpool.c              |  2 +-
 mm/zsmalloc.c           |  2 +-
 39 files changed, 83 insertions(+), 83 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 76e27ebb28a3..322ec61d0da7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -106,7 +106,7 @@ extern int mmap_rnd_compat_bits __read_mostly;
  * embedding these tags into addresses that point to these memory regions, and
  * checking that the memory and the pointer tags match on memory accesses)
  * redefine this macro to strip tags from pointers.
- * It's defined as noop for arcitectures that don't support memory tagging.
+ * It's defined as noop for architectures that don't support memory tagging.
  */
 #ifndef untagged_addr
 #define untagged_addr(addr) (addr)
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index b6ff16393bf6..4d668abb6391 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -33,7 +33,7 @@ struct notifier_block;		/* in notifier.h */
  *
  * If IS_ENABLED(CONFIG_KASAN_VMALLOC), VM_KASAN is set on a vm_struct after
  * shadow memory has been mapped. It's used to handle allocation errors so that
- * we don't try to poision shadow on free if it was never allocated.
+ * we don't try to poison shadow on free if it was never allocated.
  *
  * Otherwise, VM_KASAN is set for kasan_module_alloc() allocations and used to
  * determine which allocations need the module shadow freed.
@@ -43,7 +43,7 @@ struct notifier_block;		/* in notifier.h */
 
 /*
  * Maximum alignment for ioremap() regions.
- * Can be overriden by arch-specific value.
+ * Can be overridden by arch-specific value.
  */
 #ifndef IOREMAP_MAX_ORDER
 #define IOREMAP_MAX_ORDER	(7 + PAGE_SHIFT)	/* 128 pages */
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index 26de020aae7b..907fefde2572 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -58,7 +58,7 @@ EXPORT_SYMBOL_GPL(balloon_page_list_enqueue);
 /**
  * balloon_page_list_dequeue() - removes pages from balloon's page list and
  *				 returns a list of the pages.
- * @b_dev_info: balloon device decriptor where we will grab a page from.
+ * @b_dev_info: balloon device descriptor where we will grab a page from.
  * @pages: pointer to the list of pages that would be returned to the caller.
  * @n_req_pages: number of requested pages.
  *
@@ -157,7 +157,7 @@ EXPORT_SYMBOL_GPL(balloon_page_enqueue);
 /*
  * balloon_page_dequeue - removes a page from balloon's page list and returns
  *			  its address to allow the driver to release the page.
- * @b_dev_info: balloon device decriptor where we will grab a page from.
+ * @b_dev_info: balloon device descriptor where we will grab a page from.
  *
  * Driver must call this function to properly dequeue a previously enqueued page
  * before definitively releasing it back to the guest system.
diff --git a/mm/compaction.c b/mm/compaction.c
index 3a6c6b821f80..84fde270ae74 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -2012,8 +2012,8 @@ static unsigned int fragmentation_score_wmark(pg_data_t *pgdat, bool low)
 	unsigned int wmark_low;
 
 	/*
-	 * Cap the low watermak to avoid excessive compaction
-	 * activity in case a user sets the proactivess tunable
+	 * Cap the low watermark to avoid excessive compaction
+	 * activity in case a user sets the proactiveness tunable
 	 * close to 100 (maximum).
 	 */
 	wmark_low = max(100U - sysctl_compaction_proactiveness, 5U);
diff --git a/mm/filemap.c b/mm/filemap.c
index 7fadf211643c..66f7e9fdfbc4 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2755,7 +2755,7 @@ unsigned int seek_page_size(struct xa_state *xas, struct page *page)
  * entirely memory-based such as tmpfs, and filesystems which support
  * unwritten extents.
  *
- * Return: The requested offset on successs, or -ENXIO if @whence specifies
+ * Return: The requested offset on success, or -ENXIO if @whence specifies
  * SEEK_DATA and there is no data after @start.  There is an implicit hole
  * after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start
  * and @end contain data.
diff --git a/mm/gup.c b/mm/gup.c
index aa09535cf4d4..0697134b6a12 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1575,7 +1575,7 @@ finish_or_fault:
  * Returns NULL on any kind of failure - a hole must then be inserted into
  * the corefile, to preserve alignment with its headers; and also returns
  * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
- * allowing a hole to be left in the corefile to save diskspace.
+ * allowing a hole to be left in the corefile to save disk space.
  *
  * Called without mmap_lock (takes and releases the mmap_lock by itself).
  */
diff --git a/mm/highmem.c b/mm/highmem.c
index e389337e00b4..4fb51d735aa6 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -519,7 +519,7 @@ void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot)
 
 	/*
 	 * Disable migration so resulting virtual address is stable
-	 * accross preemption.
+	 * across preemption.
 	 */
 	migrate_disable();
 	preempt_disable();
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 98456017744d..63ed6b25deaa 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1792,8 +1792,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 /*
  * Returns
  *  - 0 if PMD could not be locked
- *  - 1 if PMD was locked but protections unchange and TLB flush unnecessary
- *  - HPAGE_PMD_NR is protections changed and TLB flush necessary
+ *  - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
+ *  - HPAGE_PMD_NR if protections changed and TLB flush necessary
  */
 int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long addr, pgprot_t newprot, unsigned long cp_flags)
@@ -2469,7 +2469,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
 		xa_lock(&swap_cache->i_pages);
 	}
 
-	/* lock lru list/PageCompound, ref freezed by page_ref_freeze */
+	/* lock lru list/PageCompound, ref frozen by page_ref_freeze */
 	lruvec = lock_page_lruvec(head);
 
 	for (i = nr - 1; i >= 1; i--) {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 629aa4c2259c..3db405dea3dc 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -466,7 +466,7 @@ static int allocate_file_region_entries(struct resv_map *resv,
 			      resv->region_cache_count;
 
 		/* At this point, we should have enough entries in the cache
-		 * for all the existings adds_in_progress. We should only be
+		 * for all the existing adds_in_progress. We should only be
 		 * needing to allocate for regions_needed.
 		 */
 		VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress);
@@ -5536,8 +5536,8 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
 		v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
 
 	/*
-	 * vma need span at least one aligned PUD size and the start,end range
-	 * must at least partialy within it.
+	 * vma needs to span at least one aligned PUD size, and the range
+	 * must be at least partially within it.
 	 */
 	if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) ||
 		(*end <= v_start) || (*start >= v_end))
diff --git a/mm/internal.h b/mm/internal.h
index feeaaf06705d..54bd0dc2c23c 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -334,7 +334,7 @@ static inline bool is_exec_mapping(vm_flags_t flags)
 }
 
 /*
- * Stack area - atomatically grows in one direction
+ * Stack area - automatically grows in one direction
  *
  * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous:
  * do_mmap() forbids all other combinations.
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 3820ca54743b..8f450bc28045 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -55,9 +55,9 @@ extern bool kasan_flag_async __ro_after_init;
 #define KASAN_TAG_MAX		0xFD /* maximum value for random tags */
 
 #ifdef CONFIG_KASAN_HW_TAGS
-#define KASAN_TAG_MIN		0xF0 /* mimimum value for random tags */
+#define KASAN_TAG_MIN		0xF0 /* minimum value for random tags */
 #else
-#define KASAN_TAG_MIN		0x00 /* mimimum value for random tags */
+#define KASAN_TAG_MIN		0x00 /* minimum value for random tags */
 #endif
 
 #ifdef CONFIG_KASAN_GENERIC
@@ -403,7 +403,7 @@ static inline bool kasan_byte_accessible(const void *addr)
 #else /* CONFIG_KASAN_HW_TAGS */
 
 /**
- * kasan_poison - mark the memory range as unaccessible
+ * kasan_poison - mark the memory range as inaccessible
  * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE
  * @size - range size, must be aligned to KASAN_GRANULE_SIZE
  * @value - value that's written to metadata for the range
@@ -434,7 +434,7 @@ bool kasan_byte_accessible(const void *addr);
 
 /**
  * kasan_poison_last_granule - mark the last granule of the memory range as
- * unaccessible
+ * inaccessible
  * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE
  * @size - range size
  *
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
index 728fb24c5683..d8ccff4c1275 100644
--- a/mm/kasan/quarantine.c
+++ b/mm/kasan/quarantine.c
@@ -27,7 +27,7 @@
 /* Data structure and operations for quarantine queues. */
 
 /*
- * Each queue is a signle-linked list, which also stores the total size of
+ * Each queue is a single-linked list, which also stores the total size of
  * objects inside of it.
  */
 struct qlist_head {
@@ -138,7 +138,7 @@ static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache)
 		local_irq_save(flags);
 
 	/*
-	 * As the object now gets freed from the quaratine, assume that its
+	 * As the object now gets freed from the quarantine, assume that its
 	 * free track is no longer valid.
 	 */
 	*(u8 *)kasan_mem_to_shadow(object) = KASAN_KMALLOC_FREE;
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index 727ad4629173..082ee5b6d9a1 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -316,7 +316,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
 	 * // rest of vmalloc process		<data dependency>
 	 * STORE p, a				LOAD shadow(x+99)
 	 *
-	 * If there is no barrier between the end of unpoisioning the shadow
+	 * If there is no barrier between the end of unpoisoning the shadow
 	 * and the store of the result to p, the stores could be committed
 	 * in a different order by CPU#0, and CPU#1 could erroneously observe
 	 * poison in the shadow.
@@ -384,7 +384,7 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
  * How does this work?
  * -------------------
  *
- * We have a region that is page aligned, labelled as A.
+ * We have a region that is page aligned, labeled as A.
  * That might not map onto the shadow in a way that is page-aligned:
  *
  *                    start                     end
diff --git a/mm/kfence/report.c b/mm/kfence/report.c
index e3f71451ad9e..2a319c21c939 100644
--- a/mm/kfence/report.c
+++ b/mm/kfence/report.c
@@ -263,6 +263,6 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r
 	if (panic_on_warn)
 		panic("panic_on_warn set ...\n");
 
-	/* We encountered a memory unsafety error, taint the kernel! */
+	/* We encountered a memory safety error, taint the kernel! */
 	add_taint(TAINT_BAD_PAGE, LOCKDEP_STILL_OK);
 }
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index ea74da3232ab..6c0185fdd815 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -667,7 +667,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		 *
 		 * The page table that maps the page has been already unlinked
 		 * from the page table tree and this process cannot get
-		 * an additinal pin on the page.
+		 * an additional pin on the page.
 		 *
 		 * New pins can come later if the page is shared across fork,
 		 * but not from this process. The other process cannot write to
diff --git a/mm/ksm.c b/mm/ksm.c
index b7cbcc7d4977..6bbe314c5260 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1065,7 +1065,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
 		/*
 		 * Ok this is tricky, when get_user_pages_fast() run it doesn't
 		 * take any lock, therefore the check that we are going to make
-		 * with the pagecount against the mapcount is racey and
+		 * with the pagecount against the mapcount is racy and
 		 * O_DIRECT can happen right after the check.
 		 * So we clear the pte and flush the tlb before the check
 		 * this assure us that no O_DIRECT can happen after the check
@@ -1435,7 +1435,7 @@ static struct page *stable_node_dup(struct stable_node **_stable_node_dup,
 			 */
 			*_stable_node = found;
 			/*
-			 * Just for robustneess as stable_node is
+			 * Just for robustness, as stable_node is
 			 * otherwise left as a stable pointer, the
 			 * compiler shall optimize it away at build
 			 * time.
diff --git a/mm/madvise.c b/mm/madvise.c
index 01fef79ac761..63e489e5bfdb 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -799,7 +799,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
 		if (end > vma->vm_end) {
 			/*
 			 * Don't fail if end > vma->vm_end. If the old
-			 * vma was splitted while the mmap_lock was
+			 * vma was split while the mmap_lock was
 			 * released the effect of the concurrent
 			 * operation may not cause madvise() to
 			 * have an undefined result. There may be an
@@ -1039,7 +1039,7 @@ process_madvise_behavior_valid(int behavior)
  *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
  *  MADV_COLD - the application is not expected to use this memory soon,
  *		deactivate pages in this range so that they can be reclaimed
- *		easily if memory pressure hanppens.
+ *		easily if memory pressure happens.
  *  MADV_PAGEOUT - the application is not expected to use this memory soon,
  *		page out the pages in this range immediately.
  *
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3004afb6d090..64ada9e650a5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -215,7 +215,7 @@ enum res_type {
 #define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
 #define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
 #define MEMFILE_ATTR(val)	((val) & 0xffff)
-/* Used for OOM nofiier */
+/* Used for OOM notifier */
 #define OOM_CONTROL		(0)
 
 /*
@@ -786,7 +786,7 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
  * __count_memcg_events - account VM events in a cgroup
  * @memcg: the memory cgroup
  * @idx: the event item
- * @count: the number of events that occured
+ * @count: the number of events that occurred
  */
 void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
 			  unsigned long count)
@@ -904,7 +904,7 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 	rcu_read_lock();
 	do {
 		/*
-		 * Page cache insertions can happen withou an
+		 * Page cache insertions can happen without an
 		 * actual mm context, e.g. during disk probing
 		 * on boot, loopback IO, acct() writes etc.
 		 */
@@ -1712,7 +1712,7 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
 	struct mem_cgroup *iter;
 
 	/*
-	 * Be careful about under_oom underflows becase a child memcg
+	 * Be careful about under_oom underflows because a child memcg
 	 * could have been added after mem_cgroup_mark_under_oom.
 	 */
 	spin_lock(&memcg_oom_lock);
@@ -1884,7 +1884,7 @@ bool mem_cgroup_oom_synchronize(bool handle)
 		/*
 		 * There is no guarantee that an OOM-lock contender
 		 * sees the wakeups triggered by the OOM kill
-		 * uncharges.  Wake any sleepers explicitely.
+		 * uncharges.  Wake any sleepers explicitly.
 		 */
 		memcg_oom_recover(memcg);
 	}
@@ -4364,7 +4364,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
  * Foreign dirty flushing
  *
  * There's an inherent mismatch between memcg and writeback.  The former
- * trackes ownership per-page while the latter per-inode.  This was a
+ * tracks ownership per-page while the latter per-inode.  This was a
  * deliberate design decision because honoring per-page ownership in the
  * writeback path is complicated, may lead to higher CPU and IO overheads
  * and deemed unnecessary given that write-sharing an inode across
@@ -4379,9 +4379,9 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
  * triggering background writeback.  A will be slowed down without a way to
  * make writeback of the dirty pages happen.
  *
- * Conditions like the above can lead to a cgroup getting repatedly and
+ * Conditions like the above can lead to a cgroup getting repeatedly and
  * severely throttled after making some progress after each
- * dirty_expire_interval while the underyling IO device is almost
+ * dirty_expire_interval while the underlying IO device is almost
  * completely idle.
  *
  * Solving this problem completely requires matching the ownership tracking
@@ -5774,7 +5774,7 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
 		return 0;
 
 	/*
-	 * We are now commited to this value whatever it is. Changes in this
+	 * We are now committed to this value whatever it is. Changes in this
 	 * tunable will only affect upcoming migrations, not the current one.
 	 * So we need to save it, and keep it going.
 	 */
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index bd3945446d47..85ad98c00fd9 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -75,7 +75,7 @@ static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, boo
 		if (dissolve_free_huge_page(page) || !take_page_off_buddy(page))
 			/*
 			 * We could fail to take off the target page from buddy
-			 * for example due to racy page allocaiton, but that's
+			 * for example due to racy page allocation, but that's
 			 * acceptable because soft-offlined page is not broken
 			 * and if someone really want to use it, they should
 			 * take it.
diff --git a/mm/memory.c b/mm/memory.c
index 8c491f813687..730daa00952b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3727,7 +3727,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
 		return ret;
 
 	/*
-	 * Archs like ppc64 need additonal space to store information
+	 * Archs like ppc64 need additional space to store information
 	 * related to pte entry. Use the preallocated table for that.
 	 */
 	if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
@@ -4503,7 +4503,7 @@ retry_pud:
 }
 
 /**
- * mm_account_fault - Do page fault accountings
+ * mm_account_fault - Do page fault accounting
  *
  * @regs: the pt_regs struct pointer.  When set to NULL, will skip accounting
  *        of perf event counters, but we'll still do the per-task accounting to
@@ -4512,9 +4512,9 @@ retry_pud:
  * @flags: the fault flags.
  * @ret: the fault retcode.
  *
- * This will take care of most of the page fault accountings.  Meanwhile, it
+ * This will take care of most of the page fault accounting.  Meanwhile, it
  * will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter
- * updates.  However note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
+ * updates.  However, note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
  * still be in per-arch page fault handlers at the entry of page fault.
  */
 static inline void mm_account_fault(struct pt_regs *regs,
@@ -4848,7 +4848,7 @@ out:
 /**
  * generic_access_phys - generic implementation for iomem mmap access
  * @vma: the vma to access
- * @addr: userspace addres, not relative offset within @vma
+ * @addr: userspace address, not relative offset within @vma
  * @buf: buffer to read/write
  * @len: length of transfer
  * @write: set to FOLL_WRITE when writing, otherwise reading
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3ebe2cfc64af..5690513c5668 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1867,7 +1867,7 @@ static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
 	 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
 	 *
 	 * policy->v.nodes is intersect with node_states[N_MEMORY].
-	 * so if the following test faile, it implies
+	 * so if the following test fails, it implies
 	 * policy->v.nodes has movable memory only.
 	 */
 	if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
@@ -2098,7 +2098,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
  *
  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
  * policy.  Otherwise, check for intersection between mask and the policy
- * nodemask for 'bind' or 'interleave' policy.  For 'perferred' or 'local'
+ * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
  * policy, always return true since it may allocate elsewhere on fallback.
  *
  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
diff --git a/mm/migrate.c b/mm/migrate.c
index 6b37d00890ca..b234c3f3acb7 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2779,11 +2779,11 @@ restore:
  *
  * For empty entries inside CPU page table (pte_none() or pmd_none() is true) we
  * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus
- * allowing the caller to allocate device memory for those unback virtual
- * address.  For this the caller simply has to allocate device memory and
+ * allowing the caller to allocate device memory for those unbacked virtual
+ * addresses.  For this the caller simply has to allocate device memory and
  * properly set the destination entry like for regular migration.  Note that
- * this can still fails and thus inside the device driver must check if the
- * migration was successful for those entries after calling migrate_vma_pages()
+ * this can still fail, and thus inside the device driver you must check if the
+ * migration was successful for those entries after calling migrate_vma_pages(),
  * just like for regular migration.
  *
  * After that, the callers must call migrate_vma_pages() to go over each entry
diff --git a/mm/mmap.c b/mm/mmap.c
index c1b848fa7da6..0584e540246e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -612,7 +612,7 @@ static unsigned long count_vma_pages_range(struct mm_struct *mm,
 	unsigned long nr_pages = 0;
 	struct vm_area_struct *vma;
 
-	/* Find first overlaping mapping */
+	/* Find first overlapping mapping */
 	vma = find_vma_intersection(mm, addr, end);
 	if (!vma)
 		return 0;
@@ -2875,7 +2875,7 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
 	if (unlikely(uf)) {
 		/*
 		 * If userfaultfd_unmap_prep returns an error the vmas
-		 * will remain splitted, but userland will get a
+		 * will remain split, but userland will get a
 		 * highly unexpected error anyway. This is no
 		 * different than the case where the first of the two
 		 * __split_vma fails, but we don't undo the first
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 94188df1ee55..e7a443157988 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -699,7 +699,7 @@ SYSCALL_DEFINE1(pkey_free, int, pkey)
 	mmap_write_unlock(current->mm);
 
 	/*
-	 * We could provie warnings or errors if any VMA still
+	 * We could provide warnings or errors if any VMA still
 	 * has the pkey set here.
 	 */
 	return ret;
diff --git a/mm/mremap.c b/mm/mremap.c
index d22629ff8f3c..47c255b60150 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -730,7 +730,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
 	 * So, to avoid such scenario we can pre-compute if the whole
 	 * operation has high chances to success map-wise.
 	 * Worst-scenario case is when both vma's (new_addr and old_addr) get
-	 * split in 3 before unmaping it.
+	 * split in 3 before unmapping it.
 	 * That means 2 more maps (1 for each) to the ones we already hold.
 	 * Check whether current map count plus 2 still leads us to 4 maps below
 	 * the threshold, otherwise return -ENOMEM here to be more safe.
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 3df2ac6b8686..eefd3f5fde46 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -74,7 +74,7 @@ static inline bool is_memcg_oom(struct oom_control *oc)
 
 #ifdef CONFIG_NUMA
 /**
- * oom_cpuset_eligible() - check task eligiblity for kill
+ * oom_cpuset_eligible() - check task eligibility for kill
  * @start: task struct of which task to consider
  * @oc: pointer to struct oom_control
  *
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 5e761fb62800..0062d5c57d41 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1806,7 +1806,7 @@ pause:
 			break;
 
 		/*
-		 * In the case of an unresponding NFS server and the NFS dirty
+		 * In the case of an unresponsive NFS server and the NFS dirty
 		 * pages exceeds dirty_thresh, give the other good wb's a pipe
 		 * to go through, so that tasks on them still remain responsive.
 		 *
@@ -2216,7 +2216,7 @@ int write_cache_pages(struct address_space *mapping,
 			 * Page truncated or invalidated. We can freely skip it
 			 * then, even for data integrity operations: the page
 			 * has disappeared concurrently, so there could be no
-			 * real expectation of this data interity operation
+			 * real expectation of this data integrity operation
 			 * even if there is now a new, dirty page at the same
 			 * pagecache address.
 			 */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bcdc0c6f21f1..0582c85da08c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -893,7 +893,7 @@ compaction_capture(struct capture_control *capc, struct page *page,
 		return false;
 
 	/*
-	 * Do not let lower order allocations polluate a movable pageblock.
+	 * Do not let lower order allocations pollute a movable pageblock.
 	 * This might let an unmovable request use a reclaimable pageblock
 	 * and vice-versa but no more than normal fallback logic which can
 	 * have trouble finding a high-order free page.
@@ -2776,7 +2776,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
 			/*
 			 * In page freeing path, migratetype change is racy so
 			 * we can counter several free pages in a pageblock
-			 * in this loop althoug we changed the pageblock type
+			 * in this loop although we changed the pageblock type
 			 * from highatomic to ac->migratetype. So we should
 			 * adjust the count once.
 			 */
@@ -3080,7 +3080,7 @@ static void drain_local_pages_wq(struct work_struct *work)
 	 * drain_all_pages doesn't use proper cpu hotplug protection so
 	 * we can race with cpu offline when the WQ can move this from
 	 * a cpu pinned worker to an unbound one. We can operate on a different
-	 * cpu which is allright but we also have to make sure to not move to
+	 * cpu which is alright but we also have to make sure to not move to
 	 * a different one.
 	 */
 	preempt_disable();
@@ -5929,7 +5929,7 @@ static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
 static int __parse_numa_zonelist_order(char *s)
 {
 	/*
-	 * We used to support different zonlists modes but they turned
+	 * We used to support different zonelists modes but they turned
 	 * out to be just not useful. Let's keep the warning in place
 	 * if somebody still use the cmd line parameter so that we do
 	 * not fail it silently
@@ -7670,7 +7670,7 @@ static void check_for_memory(pg_data_t *pgdat, int nid)
 }
 
 /*
- * Some architecturs, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For
+ * Some architectures, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For
  * such cases we allow max_zone_pfn sorted in the descending order
  */
 bool __weak arch_has_descending_max_zone_pfns(void)
@@ -8728,7 +8728,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
  * alloc_contig_range() -- tries to allocate given range of pages
  * @start:	start PFN to allocate
  * @end:	one-past-the-last PFN to allocate
- * @migratetype:	migratetype of the underlaying pageblocks (either
+ * @migratetype:	migratetype of the underlying pageblocks (either
  *			#MIGRATE_MOVABLE or #MIGRATE_CMA).  All pageblocks
  *			in range must have the same migratetype and it must
  *			be either of the two.
@@ -8988,7 +8988,7 @@ EXPORT_SYMBOL(free_contig_range);
 
 /*
  * The zone indicated has a new number of managed_pages; batch sizes and percpu
- * page high values need to be recalulated.
+ * page high values need to be recalculated.
  */
 void __meminit zone_pcp_update(struct zone *zone)
 {
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 9661d5320a07..adfabb560eb9 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -233,7 +233,7 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage)
 	/*
 	 * We don't clear the bit on the oldpage as it's going to be freed
 	 * after migration. Until then, the info can be useful in case of
-	 * a bug, and the overal stats will be off a bit only temporarily.
+	 * a bug, and the overall stats will be off a bit only temporarily.
 	 * Also, migrate_misplaced_transhuge_page() can still fail the
 	 * migration and then we want the oldpage to retain the info. But
 	 * in that case we also don't need to explicitly clear the info from
diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
index 095d7eaa0db4..ae26b118e246 100644
--- a/mm/percpu-internal.h
+++ b/mm/percpu-internal.h
@@ -170,7 +170,7 @@ struct percpu_stats {
 	u64 nr_max_alloc;	/* max # of live allocations */
 	u32 nr_chunks;		/* current # of live chunks */
 	u32 nr_max_chunks;	/* max # of live chunks */
-	size_t min_alloc_size;	/* min allocaiton size */
+	size_t min_alloc_size;	/* min allocation size */
 	size_t max_alloc_size;	/* max allocation size */
 };
 
diff --git a/mm/percpu.c b/mm/percpu.c
index 23308113a5ff..f99e9306b939 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1862,7 +1862,7 @@ fail:
 			pr_info("limit reached, disable warning\n");
 	}
 	if (is_atomic) {
-		/* see the flag handling in pcpu_blance_workfn() */
+		/* see the flag handling in pcpu_balance_workfn() */
 		pcpu_atomic_alloc_failed = true;
 		pcpu_schedule_balance_work();
 	} else {
diff --git a/mm/pgalloc-track.h b/mm/pgalloc-track.h
index 1dcc865029a2..e9e879de8649 100644
--- a/mm/pgalloc-track.h
+++ b/mm/pgalloc-track.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_PGALLLC_TRACK_H
-#define _LINUX_PGALLLC_TRACK_H
+#ifndef _LINUX_PGALLOC_TRACK_H
+#define _LINUX_PGALLOC_TRACK_H
 
 #if defined(CONFIG_MMU)
 static inline p4d_t *p4d_alloc_track(struct mm_struct *mm, pgd_t *pgd,
@@ -48,4 +48,4 @@ static inline pmd_t *pmd_alloc_track(struct mm_struct *mm, pud_t *pud,
 	  (__pte_alloc_kernel(pmd) || ({*(mask)|=PGTBL_PMD_MODIFIED;0;})))?\
 		NULL: pte_offset_kernel(pmd, address))
 
-#endif /* _LINUX_PGALLLC_TRACK_H */
+#endif /* _LINUX_PGALLOC_TRACK_H */
diff --git a/mm/slab.c b/mm/slab.c
index d56607a80fa6..d0f725637663 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -259,7 +259,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
 
 #define BATCHREFILL_LIMIT	16
 /*
- * Optimization question: fewer reaps means less probability for unnessary
+ * Optimization question: fewer reaps means less probability for unnecessary
  * cpucache drain/refill cycles.
  *
  * OTOH the cpuarrays can contain lots of objects,
@@ -2381,8 +2381,8 @@ union freelist_init_state {
 };
 
 /*
- * Initialize the state based on the randomization methode available.
- * return true if the pre-computed list is available, false otherwize.
+ * Initialize the state based on the randomization method available.
+ * return true if the pre-computed list is available, false otherwise.
  */
 static bool freelist_state_initialize(union freelist_init_state *state,
 				struct kmem_cache *cachep,
diff --git a/mm/slub.c b/mm/slub.c
index 68123b21e65f..feda53ae62ba 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3391,7 +3391,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_bulk);
  */
 
 /*
- * Mininum / Maximum order of slab pages. This influences locking overhead
+ * Minimum / Maximum order of slab pages. This influences locking overhead
  * and slab fragmentation. A higher order reduces the number of partial slabs
  * and increases the number of allocations possible without having to
  * take the list_lock.
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index be9de6d5b516..6248d1030a9b 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -16,7 +16,7 @@
  * to local caches without needing to acquire swap_info
  * lock.  We do not reuse the returned slots directly but
  * move them back to the global pool in a batch.  This
- * allows the slots to coaellesce and reduce fragmentation.
+ * allows the slots to coalesce and reduce fragmentation.
  *
  * The swap entry allocated is marked with SWAP_HAS_CACHE
  * flag in map_count that prevents it from being allocated
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a7f318c9e426..a13ac524f6ff 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1583,7 +1583,7 @@ static unsigned long lazy_max_pages(void)
 static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);
 
 /*
- * Serialize vmap purging.  There is no actual criticial section protected
+ * Serialize vmap purging.  There is no actual critical section protected
  * by this look, but we want to avoid concurrent calls for performance
  * reasons and to make the pcpu_get_vm_areas more deterministic.
  */
@@ -2628,7 +2628,7 @@ static void __vfree(const void *addr)
  * May sleep if called *not* from interrupt context.
  * Must not be called in NMI context (strictly speaking, it could be
  * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
- * conventions for vfree() arch-depenedent would be a really bad idea).
+ * conventions for vfree() arch-dependent would be a really bad idea).
  */
 void vfree(const void *addr)
 {
@@ -3141,7 +3141,7 @@ static int aligned_vread(char *buf, char *addr, unsigned long count)
 		/*
 		 * To do safe access to this _mapped_ area, we need
 		 * lock. But adding lock here means that we need to add
-		 * overhead of vmalloc()/vfree() calles for this _debug_
+		 * overhead of vmalloc()/vfree() calls for this _debug_
 		 * interface, rarely used. Instead of that, we'll use
 		 * kmap() and get small overhead in this access function.
 		 */
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 5ba118521ded..cccee36b289c 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -934,7 +934,7 @@ void cpu_vm_stats_fold(int cpu)
 
 /*
  * this is only called if !populated_zone(zone), which implies no other users of
- * pset->vm_stat_diff[] exsist.
+ * pset->vm_stat_diff[] exist.
  */
 void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
 {
diff --git a/mm/zpool.c b/mm/zpool.c
index 5ed71207ced7..6d9ed48141e5 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -336,7 +336,7 @@ int zpool_shrink(struct zpool *zpool, unsigned int pages,
  * This may hold locks, disable interrupts, and/or preemption,
  * and the zpool_unmap_handle() must be called to undo those
  * actions.  The code that uses the mapped handle should complete
- * its operatons on the mapped handle memory quickly and unmap
+ * its operations on the mapped handle memory quickly and unmap
  * as soon as possible.  As the implementation may use per-cpu
  * data, multiple handles should not be mapped concurrently on
  * any cpu.
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 5004c176b045..19b563bc6c48 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1227,7 +1227,7 @@ EXPORT_SYMBOL_GPL(zs_get_total_pages);
  * zs_map_object - get address of allocated object from handle.
  * @pool: pool from which the object was allocated
  * @handle: handle returned from zs_malloc
- * @mm: maping mode to use
+ * @mm: mapping mode to use
  *
  * Before using an object allocated from zs_malloc, it must be mapped using
  * this function. When done with the object, it must be unmapped using
-- 
cgit v1.2.3


From 0ab1438bad43d95877f848b7df551bd431680270 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Thu, 6 May 2021 02:45:15 +0900
Subject: linux/kconfig.h: replace IF_ENABLED() with PTR_IF() in
 <linux/kernel.h>

<linux/kconfig.h> is included from all the kernel-space source files,
including C, assembly, linker scripts. It is intended to contain a
minimal set of macros to evaluate CONFIG options.

IF_ENABLED() is an intruder here because (x ? y : z) is C code, which
should not be included from assembly files or linker scripts.

Also, <linux/kconfig.h> is no longer self-contained because NULL is
defined in <linux/stddef.h>.

Move IF_ENABLED() out to <linux/kernel.h> as PTR_IF(). PTR_IF()
takes a general boolean expression instead of a CONFIG option
so that it fits better in <linux/kernel.h>.

Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
---
 drivers/pinctrl/pinctrl-ingenic.c | 3 +++
 include/linux/kconfig.h           | 6 ------
 include/linux/kernel.h            | 2 ++
 3 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pinctrl/pinctrl-ingenic.c b/drivers/pinctrl/pinctrl-ingenic.c
index 651a36b9dcc0..983ba9865f77 100644
--- a/drivers/pinctrl/pinctrl-ingenic.c
+++ b/drivers/pinctrl/pinctrl-ingenic.c
@@ -11,6 +11,7 @@
 #include <linux/gpio/driver.h>
 #include <linux/interrupt.h>
 #include <linux/io.h>
+#include <linux/kernel.h>
 #include <linux/of_device.h>
 #include <linux/of_irq.h>
 #include <linux/of_platform.h>
@@ -3854,6 +3855,8 @@ static int __init ingenic_pinctrl_probe(struct platform_device *pdev)
 	return 0;
 }
 
+#define IF_ENABLED(cfg, ptr)	PTR_IF(IS_ENABLED(cfg), (ptr))
+
 static const struct of_device_id ingenic_pinctrl_of_match[] = {
 	{
 		.compatible = "ingenic,jz4730-pinctrl",
diff --git a/include/linux/kconfig.h b/include/linux/kconfig.h
index 24a59cb06963..cc8fa109cfa3 100644
--- a/include/linux/kconfig.h
+++ b/include/linux/kconfig.h
@@ -70,10 +70,4 @@
  */
 #define IS_ENABLED(option) __or(IS_BUILTIN(option), IS_MODULE(option))
 
-/*
- * IF_ENABLED(CONFIG_FOO, ptr) evaluates to (ptr) if CONFIG_FOO is set to 'y'
- * or 'm', NULL otherwise.
- */
-#define IF_ENABLED(option, ptr) (IS_ENABLED(option) ? (ptr) : NULL)
-
 #endif /* __LINUX_KCONFIG_H */
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 5b7ed6dc99ac..2f9d15410c93 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -48,6 +48,8 @@
  */
 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr))
 
+#define PTR_IF(cond, ptr)	((cond) ? (ptr) : NULL)
+
 #define u64_to_user_ptr(x) (		\
 {					\
 	typecheck(u64, (x));		\
-- 
cgit v1.2.3


From 35c820e71565d1fa835b82499359218b219828ac Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 8 May 2021 21:49:48 -0600
Subject: Revert "bio: limit bio max size"

This reverts commit cd2c7545ae1beac3b6aae033c7f31193b3255946.

Alex reports that the commit causes corruption with LUKS on ext4. Revert
it for now so that this can be investigated properly.

Link: https://lore.kernel.org/linux-block/1620493841.bxdq8r5haw.none@localhost/
Reported-by: Alex Xu (Hello71) <alex_y_xu@yahoo.ca>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c            | 13 ++-----------
 block/blk-settings.c   |  5 -----
 include/linux/bio.h    |  4 +---
 include/linux/blkdev.h |  2 --
 4 files changed, 3 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/block/bio.c b/block/bio.c
index 221dc56ba22f..44205dfb6b60 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -255,13 +255,6 @@ void bio_init(struct bio *bio, struct bio_vec *table,
 }
 EXPORT_SYMBOL(bio_init);
 
-unsigned int bio_max_size(struct bio *bio)
-{
-	struct block_device *bdev = bio->bi_bdev;
-
-	return bdev ? bdev->bd_disk->queue->limits.bio_max_bytes : UINT_MAX;
-}
-
 /**
  * bio_reset - reinitialize a bio
  * @bio:	bio to reset
@@ -873,7 +866,7 @@ bool __bio_try_merge_page(struct bio *bio, struct page *page,
 		struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
 
 		if (page_is_mergeable(bv, page, len, off, same_page)) {
-			if (bio->bi_iter.bi_size > bio_max_size(bio) - len) {
+			if (bio->bi_iter.bi_size > UINT_MAX - len) {
 				*same_page = false;
 				return false;
 			}
@@ -1002,7 +995,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 {
 	unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
 	unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
-	unsigned int bytes_left = bio_max_size(bio) - bio->bi_iter.bi_size;
 	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
 	struct page **pages = (struct page **)bv;
 	bool same_page = false;
@@ -1018,8 +1010,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
 	pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
 
-	size = iov_iter_get_pages(iter, pages, bytes_left, nr_pages,
-				  &offset);
+	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
 	if (unlikely(size <= 0))
 		return size ? size : -EFAULT;
 
diff --git a/block/blk-settings.c b/block/blk-settings.c
index c646503e55d2..9c009090c4b5 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -31,7 +31,6 @@ EXPORT_SYMBOL_GPL(blk_queue_rq_timeout);
  */
 void blk_set_default_limits(struct queue_limits *lim)
 {
-	lim->bio_max_bytes = UINT_MAX;
 	lim->max_segments = BLK_MAX_SEGMENTS;
 	lim->max_discard_segments = 1;
 	lim->max_integrity_segments = 0;
@@ -140,10 +139,6 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto
 				 limits->logical_block_size >> SECTOR_SHIFT);
 	limits->max_sectors = max_sectors;
 
-	if (check_shl_overflow(max_sectors, SECTOR_SHIFT,
-				&limits->bio_max_bytes))
-		limits->bio_max_bytes = UINT_MAX;
-
 	q->backing_dev_info->io_pages = max_sectors >> (PAGE_SHIFT - 9);
 }
 EXPORT_SYMBOL(blk_queue_max_hw_sectors);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index f1a99f0a240c..a0b4cfdf62a4 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -106,8 +106,6 @@ static inline void *bio_data(struct bio *bio)
 	return NULL;
 }
 
-extern unsigned int bio_max_size(struct bio *bio);
-
 /**
  * bio_full - check if the bio is full
  * @bio:	bio to check
@@ -121,7 +119,7 @@ static inline bool bio_full(struct bio *bio, unsigned len)
 	if (bio->bi_vcnt >= bio->bi_max_vecs)
 		return true;
 
-	if (bio->bi_iter.bi_size > bio_max_size(bio) - len)
+	if (bio->bi_iter.bi_size > UINT_MAX - len)
 		return true;
 
 	return false;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 40c7c4d87aa1..b91ba6207365 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -327,8 +327,6 @@ enum blk_bounce {
 };
 
 struct queue_limits {
-	unsigned int		bio_max_bytes;
-
 	enum blk_bounce		bounce;
 	unsigned long		seg_boundary_mask;
 	unsigned long		virt_boundary_mask;
-- 
cgit v1.2.3


From 28ec344bb8911bb0d4910456b22ba0dd4f662521 Mon Sep 17 00:00:00 2001
From: Saravana Kannan <saravanak@google.com>
Date: Wed, 5 May 2021 17:44:22 -0700
Subject: usb: typec: tcpm: Don't block probing of consumers of "connector"
 nodes

fw_devlink expects DT device nodes with "compatible" property to have
struct devices created for them. Since the connector node might not be
populated as a device, mark it as such so that fw_devlink knows not to
wait on this fwnode being populated as a struct device.

Without this patch, USB functionality can be broken on some boards.

Fixes: f7514a663016 ("of: property: fw_devlink: Add support for remote-endpoint")
Reported-by: John Stultz <john.stultz@linaro.org>
Tested-by: John Stultz <john.stultz@linaro.org>
Signed-off-by: Saravana Kannan <saravanak@google.com>
Link: https://lore.kernel.org/r/20210506004423.345199-1-saravanak@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/core.c           | 3 ++-
 drivers/usb/typec/tcpm/tcpm.c | 9 +++++++++
 include/linux/fwnode.h        | 1 +
 3 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/base/core.c b/drivers/base/core.c
index 4a8bf8cda52b..628e33939aca 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -150,7 +150,7 @@ void fwnode_links_purge(struct fwnode_handle *fwnode)
 	fwnode_links_purge_consumers(fwnode);
 }
 
-static void fw_devlink_purge_absent_suppliers(struct fwnode_handle *fwnode)
+void fw_devlink_purge_absent_suppliers(struct fwnode_handle *fwnode)
 {
 	struct fwnode_handle *child;
 
@@ -164,6 +164,7 @@ static void fw_devlink_purge_absent_suppliers(struct fwnode_handle *fwnode)
 	fwnode_for_each_available_child_node(fwnode, child)
 		fw_devlink_purge_absent_suppliers(child);
 }
+EXPORT_SYMBOL_GPL(fw_devlink_purge_absent_suppliers);
 
 #ifdef CONFIG_SRCU
 static DEFINE_MUTEX(device_links_lock);
diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c
index c4fdc00a3bc8..bffa342d4e38 100644
--- a/drivers/usb/typec/tcpm/tcpm.c
+++ b/drivers/usb/typec/tcpm/tcpm.c
@@ -5754,6 +5754,15 @@ static int tcpm_fw_get_caps(struct tcpm_port *port,
 	if (!fwnode)
 		return -EINVAL;
 
+	/*
+	 * This fwnode has a "compatible" property, but is never populated as a
+	 * struct device. Instead we simply parse it to read the properties.
+	 * This it breaks fw_devlink=on. To maintain backward compatibility
+	 * with existing DT files, we work around this by deleting any
+	 * fwnode_links to/from this fwnode.
+	 */
+	fw_devlink_purge_absent_suppliers(fwnode);
+
 	/* USB data support is optional */
 	ret = fwnode_property_read_string(fwnode, "data-role", &cap_str);
 	if (ret == 0) {
diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h
index ed4e67a7ff1c..59828516ebaf 100644
--- a/include/linux/fwnode.h
+++ b/include/linux/fwnode.h
@@ -187,5 +187,6 @@ extern u32 fw_devlink_get_flags(void);
 extern bool fw_devlink_is_strict(void);
 int fwnode_link_add(struct fwnode_handle *con, struct fwnode_handle *sup);
 void fwnode_links_purge(struct fwnode_handle *fwnode);
+void fw_devlink_purge_absent_suppliers(struct fwnode_handle *fwnode);
 
 #endif
-- 
cgit v1.2.3


From c745253e2a691a40c66790defe85c104a887e14a Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Wed, 5 May 2021 14:09:15 +0300
Subject: PM: runtime: Fix unpaired parent child_count for force_resume

As pm_runtime_need_not_resume() relies also on usage_count, it can return
a different value in pm_runtime_force_suspend() compared to when called in
pm_runtime_force_resume(). Different return values can happen if anything
calls PM runtime functions in between, and causes the parent child_count
to increase on every resume.

So far I've seen the issue only for omapdrm that does complicated things
with PM runtime calls during system suspend for legacy reasons:

omap_atomic_commit_tail() for omapdrm.0
 dispc_runtime_get()
  wakes up 58000000.dss as it's the dispc parent
   dispc_runtime_resume()
    rpm_resume() increases parent child_count
 dispc_runtime_put() won't idle, PM runtime suspend blocked
pm_runtime_force_suspend() for 58000000.dss, !pm_runtime_need_not_resume()
 __update_runtime_status()
system suspended
pm_runtime_force_resume() for 58000000.dss, pm_runtime_need_not_resume()
 pm_runtime_enable() only called because of pm_runtime_need_not_resume()
omap_atomic_commit_tail() for omapdrm.0
 dispc_runtime_get()
  wakes up 58000000.dss as it's the dispc parent
   dispc_runtime_resume()
    rpm_resume() increases parent child_count
 dispc_runtime_put() won't idle, PM runtime suspend blocked
...
rpm_suspend for 58000000.dss but parent child_count is now unbalanced

Let's fix the issue by adding a flag for needs_force_resume and use it in
pm_runtime_force_resume() instead of pm_runtime_need_not_resume().

Additionally omapdrm system suspend could be simplified later on to avoid
lots of unnecessary PM runtime calls and the complexity it adds. The
driver can just use internal functions that are shared between the PM
runtime and system suspend related functions.

Fixes: 4918e1f87c5f ("PM / runtime: Rework pm_runtime_force_suspend/resume()")
Signed-off-by: Tony Lindgren <tony@atomide.com>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Tested-by: Tomi Valkeinen <tomi.valkeinen@ideasonboard.com>
Cc: 4.16+ <stable@vger.kernel.org> # 4.16+
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/runtime.c | 10 +++++++---
 include/linux/pm.h           |  1 +
 2 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 1fc1a992f90c..b570848d23e0 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -1637,6 +1637,7 @@ void pm_runtime_init(struct device *dev)
 	dev->power.request_pending = false;
 	dev->power.request = RPM_REQ_NONE;
 	dev->power.deferred_resume = false;
+	dev->power.needs_force_resume = 0;
 	INIT_WORK(&dev->power.work, pm_runtime_work);
 
 	dev->power.timer_expires = 0;
@@ -1804,10 +1805,12 @@ int pm_runtime_force_suspend(struct device *dev)
 	 * its parent, but set its status to RPM_SUSPENDED anyway in case this
 	 * function will be called again for it in the meantime.
 	 */
-	if (pm_runtime_need_not_resume(dev))
+	if (pm_runtime_need_not_resume(dev)) {
 		pm_runtime_set_suspended(dev);
-	else
+	} else {
 		__update_runtime_status(dev, RPM_SUSPENDED);
+		dev->power.needs_force_resume = 1;
+	}
 
 	return 0;
 
@@ -1834,7 +1837,7 @@ int pm_runtime_force_resume(struct device *dev)
 	int (*callback)(struct device *);
 	int ret = 0;
 
-	if (!pm_runtime_status_suspended(dev) || pm_runtime_need_not_resume(dev))
+	if (!pm_runtime_status_suspended(dev) || !dev->power.needs_force_resume)
 		goto out;
 
 	/*
@@ -1853,6 +1856,7 @@ int pm_runtime_force_resume(struct device *dev)
 
 	pm_runtime_mark_last_busy(dev);
 out:
+	dev->power.needs_force_resume = 0;
 	pm_runtime_enable(dev);
 	return ret;
 }
diff --git a/include/linux/pm.h b/include/linux/pm.h
index c9657408fee1..1d8209c09686 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -601,6 +601,7 @@ struct dev_pm_info {
 	unsigned int		idle_notification:1;
 	unsigned int		request_pending:1;
 	unsigned int		deferred_resume:1;
+	unsigned int		needs_force_resume:1;
 	unsigned int		runtime_auto:1;
 	bool			ignore_children:1;
 	unsigned int		no_callbacks:1;
-- 
cgit v1.2.3


From 2515dd6ce8e545b0b2eece84920048ef9ed846c4 Mon Sep 17 00:00:00 2001
From: Nick Desaulniers <ndesaulniers@google.com>
Date: Mon, 19 Apr 2021 16:17:41 -0700
Subject: stack: Replace "o" output with "r" input constraint

"o" isn't a common asm() constraint to use; it triggers an assertion in
assert-enabled builds of LLVM that it's not recognized when targeting
aarch64 (though it appears to fall back to "m"). It's fixed in LLVM 13 now,
but there isn't really a good reason to use "o" in particular here. To
avoid causing build issues for those using assert-enabled builds of earlier
LLVM versions, the constraint needs changing.

Instead, if the point is to retain the __builtin_alloca(), make ptr appear
to "escape" via being an input to an empty inline asm block. This is
preferable anyways, since otherwise this looks like a dead store.

While the use of "r" was considered in

  https://lore.kernel.org/lkml/202104011447.2E7F543@keescook/

it was only tested as an output (which looks like a dead store, and wasn't
sufficient).

Use "r" as an input constraint instead, which behaves correctly across
compilers and architectures.

Fixes: 39218ff4c625 ("stack: Optionally randomize kernel stack offset each syscall")
Signed-off-by: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Kees Cook <keescook@chromium.org>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Link: https://reviews.llvm.org/D100412
Link: https://bugs.llvm.org/show_bug.cgi?id=49956
Link: https://lore.kernel.org/r/20210419231741.4084415-1-keescook@chromium.org
---
 include/linux/randomize_kstack.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/randomize_kstack.h b/include/linux/randomize_kstack.h
index fd80fab663a9..bebc911161b6 100644
--- a/include/linux/randomize_kstack.h
+++ b/include/linux/randomize_kstack.h
@@ -38,7 +38,7 @@ void *__builtin_alloca(size_t size);
 		u32 offset = raw_cpu_read(kstack_offset);		\
 		u8 *ptr = __builtin_alloca(KSTACK_OFFSET_MAX(offset));	\
 		/* Keep allocation even after "ptr" loses scope. */	\
-		asm volatile("" : "=o"(*ptr) :: "memory");		\
+		asm volatile("" :: "r"(ptr) : "memory");		\
 	}								\
 } while (0)
 
-- 
cgit v1.2.3


From efed9a3337e341bd0989161b97453b52567bc59d Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Mon, 10 May 2021 17:05:35 -0700
Subject: kyber: fix out of bounds access when preempted

__blk_mq_sched_bio_merge() gets the ctx and hctx for the current CPU and
passes the hctx to ->bio_merge(). kyber_bio_merge() then gets the ctx
for the current CPU again and uses that to get the corresponding Kyber
context in the passed hctx. However, the thread may be preempted between
the two calls to blk_mq_get_ctx(), and the ctx returned the second time
may no longer correspond to the passed hctx. This "works" accidentally
most of the time, but it can cause us to read garbage if the second ctx
came from an hctx with more ctx's than the first one (i.e., if
ctx->index_hw[hctx->type] > hctx->nr_ctx).

This manifested as this UBSAN array index out of bounds error reported
by Jakub:

UBSAN: array-index-out-of-bounds in ../kernel/locking/qspinlock.c:130:9
index 13106 is out of range for type 'long unsigned int [128]'
Call Trace:
 dump_stack+0xa4/0xe5
 ubsan_epilogue+0x5/0x40
 __ubsan_handle_out_of_bounds.cold.13+0x2a/0x34
 queued_spin_lock_slowpath+0x476/0x480
 do_raw_spin_lock+0x1c2/0x1d0
 kyber_bio_merge+0x112/0x180
 blk_mq_submit_bio+0x1f5/0x1100
 submit_bio_noacct+0x7b0/0x870
 submit_bio+0xc2/0x3a0
 btrfs_map_bio+0x4f0/0x9d0
 btrfs_submit_data_bio+0x24e/0x310
 submit_one_bio+0x7f/0xb0
 submit_extent_page+0xc4/0x440
 __extent_writepage_io+0x2b8/0x5e0
 __extent_writepage+0x28d/0x6e0
 extent_write_cache_pages+0x4d7/0x7a0
 extent_writepages+0xa2/0x110
 do_writepages+0x8f/0x180
 __writeback_single_inode+0x99/0x7f0
 writeback_sb_inodes+0x34e/0x790
 __writeback_inodes_wb+0x9e/0x120
 wb_writeback+0x4d2/0x660
 wb_workfn+0x64d/0xa10
 process_one_work+0x53a/0xa80
 worker_thread+0x69/0x5b0
 kthread+0x20b/0x240
 ret_from_fork+0x1f/0x30

Only Kyber uses the hctx, so fix it by passing the request_queue to
->bio_merge() instead. BFQ and mq-deadline just use that, and Kyber can
map the queues itself to avoid the mismatch.

Fixes: a6088845c2bf ("block: kyber: make kyber more friendly with merging")
Reported-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Link: https://lore.kernel.org/r/c7598605401a48d5cfeadebb678abd10af22b83f.1620691329.git.osandov@fb.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bfq-iosched.c      | 3 +--
 block/blk-mq-sched.c     | 8 +++++---
 block/kyber-iosched.c    | 5 +++--
 block/mq-deadline.c      | 3 +--
 include/linux/elevator.h | 2 +-
 5 files changed, 11 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 0270cd7ca165..59b2499d3f8b 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -2263,10 +2263,9 @@ static void bfq_remove_request(struct request_queue *q,
 
 }
 
-static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio,
+static bool bfq_bio_merge(struct request_queue *q, struct bio *bio,
 		unsigned int nr_segs)
 {
-	struct request_queue *q = hctx->queue;
 	struct bfq_data *bfqd = q->elevator->elevator_data;
 	struct request *free = NULL;
 	/*
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 42a365b1b9c0..996a4b2f73aa 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -358,14 +358,16 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
 		unsigned int nr_segs)
 {
 	struct elevator_queue *e = q->elevator;
-	struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
-	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
+	struct blk_mq_ctx *ctx;
+	struct blk_mq_hw_ctx *hctx;
 	bool ret = false;
 	enum hctx_type type;
 
 	if (e && e->type->ops.bio_merge)
-		return e->type->ops.bio_merge(hctx, bio, nr_segs);
+		return e->type->ops.bio_merge(q, bio, nr_segs);
 
+	ctx = blk_mq_get_ctx(q);
+	hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
 	type = hctx->type;
 	if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE) ||
 	    list_empty_careful(&ctx->rq_lists[type]))
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index 8969e122f081..81e3279ecd57 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -561,11 +561,12 @@ static void kyber_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
 	}
 }
 
-static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio,
+static bool kyber_bio_merge(struct request_queue *q, struct bio *bio,
 		unsigned int nr_segs)
 {
+	struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
+	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
 	struct kyber_hctx_data *khd = hctx->sched_data;
-	struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue);
 	struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw[hctx->type]];
 	unsigned int sched_domain = kyber_sched_domain(bio->bi_opf);
 	struct list_head *rq_list = &kcq->rq_list[sched_domain];
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 04aded71ead2..8eea2cbf2bf4 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -461,10 +461,9 @@ static int dd_request_merge(struct request_queue *q, struct request **rq,
 	return ELEVATOR_NO_MERGE;
 }
 
-static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio,
+static bool dd_bio_merge(struct request_queue *q, struct bio *bio,
 		unsigned int nr_segs)
 {
-	struct request_queue *q = hctx->queue;
 	struct deadline_data *dd = q->elevator->elevator_data;
 	struct request *free = NULL;
 	bool ret;
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 1fe8e105b83b..dcb2f9022c1d 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -34,7 +34,7 @@ struct elevator_mq_ops {
 	void (*depth_updated)(struct blk_mq_hw_ctx *);
 
 	bool (*allow_merge)(struct request_queue *, struct request *, struct bio *);
-	bool (*bio_merge)(struct blk_mq_hw_ctx *, struct bio *, unsigned int);
+	bool (*bio_merge)(struct request_queue *, struct bio *, unsigned int);
 	int (*request_merge)(struct request_queue *q, struct request **, struct bio *);
 	void (*request_merged)(struct request_queue *, struct request *, enum elv_merge);
 	void (*requests_merged)(struct request_queue *, struct request *, struct request *);
-- 
cgit v1.2.3


From 190515f610946db025cdedebde93958b725fb583 Mon Sep 17 00:00:00 2001
From: Lin Feng <linf@wangsu.com>
Date: Wed, 12 May 2021 18:01:24 +0800
Subject: blkdev.h: remove unused codes blk_account_rq

The last users of blk_account_rq went away with commit a1ce35fa49852db
("block: remove dead elevator code"). It now has no callers, so it can
be safely removed.

Signed-off-by: Lin Feng <linf@wangsu.com>
Link: https://lore.kernel.org/r/20210512100124.173769-1-linf@wangsu.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b91ba6207365..26c3e368656f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -677,11 +677,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
 extern void blk_set_pm_only(struct request_queue *q);
 extern void blk_clear_pm_only(struct request_queue *q);
 
-static inline bool blk_account_rq(struct request *rq)
-{
-	return (rq->rq_flags & RQF_STARTED) && !blk_rq_is_passthrough(rq);
-}
-
 #define list_entry_rq(ptr)	list_entry((ptr), struct request, queuelist)
 
 #define rq_data_dir(rq)		(op_is_write(req_op(rq)) ? WRITE : READ)
-- 
cgit v1.2.3


From 681865a03d3ec6ac3dda147044ed2a1a0f49f7bf Mon Sep 17 00:00:00 2001
From: Wan Jiabing <wanjiabing@vivo.com>
Date: Mon, 19 Apr 2021 19:27:25 +0800
Subject: libnvdimm: Remove duplicate struct declaration

struct device is already declared at line 133. The second declaration
is unnecessary, so remove it.

Signed-off-by: Wan Jiabing <wanjiabing@vivo.com>
Link: https://lore.kernel.org/r/20210419112725.42145-1-wanjiabing@vivo.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/libnvdimm.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 01f251b6e36c..89b69e645ac7 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -141,7 +141,6 @@ static inline void __iomem *devm_nvdimm_ioremap(struct device *dev,
 
 struct nvdimm_bus;
 struct module;
-struct device;
 struct nd_blk_region;
 struct nd_blk_region_desc {
 	int (*enable)(struct nvdimm_bus *nvdimm_bus, struct device *dev);
-- 
cgit v1.2.3


From 860dafa902595fb5f1d23bbcce1215188c3341e6 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@orcam.me.uk>
Date: Thu, 13 May 2021 11:51:50 +0200
Subject: vt: Fix character height handling with VT_RESIZEX

Restore the original intent of the VT_RESIZEX ioctl's `v_clin' parameter
which is the number of pixel rows per character (cell) rather than the
height of the font used.

For framebuffer devices the two values are always the same, because the
former is inferred from the latter one.  For VGA used as a true text
mode device these two parameters are independent from each other: the
number of pixel rows per character is set in the CRT controller, while
font height is in fact hardwired to 32 pixel rows and fonts of heights
below that value are handled by padding their data with blanks when
loaded to hardware for use by the character generator.  One can change
the setting in the CRT controller and it will update the screen contents
accordingly regardless of the font loaded.

The `v_clin' parameter is used by the `vgacon' driver to set the height
of the character cell and then the cursor position within.  Make the
parameter explicit then, by defining a new `vc_cell_height' struct
member of `vc_data', set it instead of `vc_font.height' from `v_clin' in
the VT_RESIZEX ioctl, and then use it throughout the `vgacon' driver
except where actual font data is accessed which as noted above is
independent from the CRTC setting.

This way the framebuffer console driver is free to ignore the `v_clin'
parameter as irrelevant, as it always should have, avoiding any issues
that attempts to give the parameter a meaning there could have caused,
such as the one that has led to commit 988d0763361b ("vt_ioctl: make
VT_RESIZEX behave like VT_RESIZE"):

 "syzbot is reporting UAF/OOB read at bit_putcs()/soft_cursor() [1][2],
  for vt_resizex() from ioctl(VT_RESIZEX) allows setting font height
  larger than actual font height calculated by con_font_set() from
  ioctl(PIO_FONT). Since fbcon_set_font() from con_font_set() allocates
  minimal amount of memory based on actual font height calculated by
  con_font_set(), use of vt_resizex() can cause UAF/OOB read for font
  data."

The problem first appeared around Linux 2.5.66 which predates our repo
history, but the origin could be identified with the old MIPS/Linux repo
also at: <git://git.kernel.org/pub/scm/linux/kernel/git/ralf/linux.git>
as commit 9736a3546de7 ("Merge with Linux 2.5.66."), where VT_RESIZEX
code in `vt_ioctl' was updated as follows:

 		if (clin)
-			video_font_height = clin;
+			vc->vc_font.height = clin;

making the parameter apply to framebuffer devices as well, perhaps due
to the use of "font" in the name of the original `video_font_height'
variable.  Use "cell" in the new struct member then to avoid ambiguity.

References:

[1] https://syzkaller.appspot.com/bug?id=32577e96d88447ded2d3b76d71254fb855245837
[2] https://syzkaller.appspot.com/bug?id=6b8355d27b2b94fb5cedf4655e3a59162d9e48e3

Signed-off-by: Maciej W. Rozycki <macro@orcam.me.uk>
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Cc: stable@vger.kernel.org # v2.6.12+
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/tty/vt/vt_ioctl.c      |  6 +++---
 drivers/video/console/vgacon.c | 44 +++++++++++++++++++++---------------------
 include/linux/console_struct.h |  1 +
 3 files changed, 26 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/vt/vt_ioctl.c b/drivers/tty/vt/vt_ioctl.c
index 95d10197566b..0e0cd9e9e589 100644
--- a/drivers/tty/vt/vt_ioctl.c
+++ b/drivers/tty/vt/vt_ioctl.c
@@ -706,17 +706,17 @@ static int vt_resizex(struct vc_data *vc, struct vt_consize __user *cs)
 		if (vcp) {
 			int ret;
 			int save_scan_lines = vcp->vc_scan_lines;
-			int save_font_height = vcp->vc_font.height;
+			int save_cell_height = vcp->vc_cell_height;
 
 			if (v.v_vlin)
 				vcp->vc_scan_lines = v.v_vlin;
 			if (v.v_clin)
-				vcp->vc_font.height = v.v_clin;
+				vcp->vc_cell_height = v.v_clin;
 			vcp->vc_resize_user = 1;
 			ret = vc_resize(vcp, v.v_cols, v.v_rows);
 			if (ret) {
 				vcp->vc_scan_lines = save_scan_lines;
-				vcp->vc_font.height = save_font_height;
+				vcp->vc_cell_height = save_cell_height;
 				console_unlock();
 				return ret;
 			}
diff --git a/drivers/video/console/vgacon.c b/drivers/video/console/vgacon.c
index 511e7d06b148..631eb918f8e1 100644
--- a/drivers/video/console/vgacon.c
+++ b/drivers/video/console/vgacon.c
@@ -383,7 +383,7 @@ static void vgacon_init(struct vc_data *c, int init)
 		vc_resize(c, vga_video_num_columns, vga_video_num_lines);
 
 	c->vc_scan_lines = vga_scan_lines;
-	c->vc_font.height = vga_video_font_height;
+	c->vc_font.height = c->vc_cell_height = vga_video_font_height;
 	c->vc_complement_mask = 0x7700;
 	if (vga_512_chars)
 		c->vc_hi_font_mask = 0x0800;
@@ -518,32 +518,32 @@ static void vgacon_cursor(struct vc_data *c, int mode)
 		switch (CUR_SIZE(c->vc_cursor_type)) {
 		case CUR_UNDERLINE:
 			vgacon_set_cursor_size(c->state.x,
-					       c->vc_font.height -
-					       (c->vc_font.height <
+					       c->vc_cell_height -
+					       (c->vc_cell_height <
 						10 ? 2 : 3),
-					       c->vc_font.height -
-					       (c->vc_font.height <
+					       c->vc_cell_height -
+					       (c->vc_cell_height <
 						10 ? 1 : 2));
 			break;
 		case CUR_TWO_THIRDS:
 			vgacon_set_cursor_size(c->state.x,
-					       c->vc_font.height / 3,
-					       c->vc_font.height -
-					       (c->vc_font.height <
+					       c->vc_cell_height / 3,
+					       c->vc_cell_height -
+					       (c->vc_cell_height <
 						10 ? 1 : 2));
 			break;
 		case CUR_LOWER_THIRD:
 			vgacon_set_cursor_size(c->state.x,
-					       (c->vc_font.height * 2) / 3,
-					       c->vc_font.height -
-					       (c->vc_font.height <
+					       (c->vc_cell_height * 2) / 3,
+					       c->vc_cell_height -
+					       (c->vc_cell_height <
 						10 ? 1 : 2));
 			break;
 		case CUR_LOWER_HALF:
 			vgacon_set_cursor_size(c->state.x,
-					       c->vc_font.height / 2,
-					       c->vc_font.height -
-					       (c->vc_font.height <
+					       c->vc_cell_height / 2,
+					       c->vc_cell_height -
+					       (c->vc_cell_height <
 						10 ? 1 : 2));
 			break;
 		case CUR_NONE:
@@ -554,7 +554,7 @@ static void vgacon_cursor(struct vc_data *c, int mode)
 			break;
 		default:
 			vgacon_set_cursor_size(c->state.x, 1,
-					       c->vc_font.height);
+					       c->vc_cell_height);
 			break;
 		}
 		break;
@@ -565,13 +565,13 @@ static int vgacon_doresize(struct vc_data *c,
 		unsigned int width, unsigned int height)
 {
 	unsigned long flags;
-	unsigned int scanlines = height * c->vc_font.height;
+	unsigned int scanlines = height * c->vc_cell_height;
 	u8 scanlines_lo = 0, r7 = 0, vsync_end = 0, mode, max_scan;
 
 	raw_spin_lock_irqsave(&vga_lock, flags);
 
 	vgacon_xres = width * VGA_FONTWIDTH;
-	vgacon_yres = height * c->vc_font.height;
+	vgacon_yres = height * c->vc_cell_height;
 	if (vga_video_type >= VIDEO_TYPE_VGAC) {
 		outb_p(VGA_CRTC_MAX_SCAN, vga_video_port_reg);
 		max_scan = inb_p(vga_video_port_val);
@@ -626,9 +626,9 @@ static int vgacon_doresize(struct vc_data *c,
 static int vgacon_switch(struct vc_data *c)
 {
 	int x = c->vc_cols * VGA_FONTWIDTH;
-	int y = c->vc_rows * c->vc_font.height;
+	int y = c->vc_rows * c->vc_cell_height;
 	int rows = screen_info.orig_video_lines * vga_default_font_height/
-		c->vc_font.height;
+		c->vc_cell_height;
 	/*
 	 * We need to save screen size here as it's the only way
 	 * we can spot the screen has been resized and we need to
@@ -1041,7 +1041,7 @@ static int vgacon_adjust_height(struct vc_data *vc, unsigned fontheight)
 				cursor_size_lastto = 0;
 				c->vc_sw->con_cursor(c, CM_DRAW);
 			}
-			c->vc_font.height = fontheight;
+			c->vc_font.height = c->vc_cell_height = fontheight;
 			vc_resize(c, 0, rows);	/* Adjust console size */
 		}
 	}
@@ -1096,12 +1096,12 @@ static int vgacon_resize(struct vc_data *c, unsigned int width,
 		 */
 		screen_info.orig_video_cols = width;
 		screen_info.orig_video_lines = height;
-		vga_default_font_height = c->vc_font.height;
+		vga_default_font_height = c->vc_cell_height;
 		return 0;
 	}
 	if (width % 2 || width > screen_info.orig_video_cols ||
 	    height > (screen_info.orig_video_lines * vga_default_font_height)/
-	    c->vc_font.height)
+	    c->vc_cell_height)
 		return -EINVAL;
 
 	if (con_is_visible(c) && !vga_is_gfx) /* who knows */
diff --git a/include/linux/console_struct.h b/include/linux/console_struct.h
index 153734816b49..d5b9c8d40c18 100644
--- a/include/linux/console_struct.h
+++ b/include/linux/console_struct.h
@@ -101,6 +101,7 @@ struct vc_data {
 	unsigned int	vc_rows;
 	unsigned int	vc_size_row;		/* Bytes per row */
 	unsigned int	vc_scan_lines;		/* # of scan lines */
+	unsigned int	vc_cell_height;		/* CRTC character cell height */
 	unsigned long	vc_origin;		/* [!] Start of real screen */
 	unsigned long	vc_scr_end;		/* [!] End of real screen */
 	unsigned long	vc_visible_origin;	/* [!] Top of visible window */
-- 
cgit v1.2.3


From 22247efd822e6d263f3c8bd327f3f769aea9b1d9 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Fri, 14 May 2021 17:27:04 -0700
Subject: mm/hugetlb: fix F_SEAL_FUTURE_WRITE

Patch series "mm/hugetlb: Fix issues on file sealing and fork", v2.

Hugh reported an issue with F_SEAL_FUTURE_WRITE not being applied
correctly to hugetlbfs, which I can easily verify using the memfd_test
program.  It seems that the program is hardly ever run with hugetlbfs
pages (as it uses shmem by default).

Meanwhile I found another, probably even more severe, issue: hugetlb
fork won't wr-protect child cow pages, so the child can potentially write
to parent private pages.  Patch 2 addresses that.

After this series applied, "memfd_test hugetlbfs" should start to pass.

This patch (of 2):

F_SEAL_FUTURE_WRITE has been missing for hugetlb since the first day.
There is a test program for that and it fails constantly.

$ ./memfd_test hugetlbfs
memfd-hugetlb: CREATE
memfd-hugetlb: BASIC
memfd-hugetlb: SEAL-WRITE
memfd-hugetlb: SEAL-FUTURE-WRITE
mmap() didn't fail as expected
Aborted (core dumped)

I think it's probably because no one is really running the hugetlbfs test.

Fix it by also checking FUTURE_WRITE in hugetlbfs_file_mmap(), as we
do in shmem_mmap().  Factor out a common helper for that.

Link: https://lkml.kernel.org/r/20210503234356.9097-1-peterx@redhat.com
Link: https://lkml.kernel.org/r/20210503234356.9097-2-peterx@redhat.com
Fixes: ab3948f58ff84 ("mm/memfd: add an F_SEAL_FUTURE_WRITE seal to memfd")
Signed-off-by: Peter Xu <peterx@redhat.com>
Reported-by: Hugh Dickins <hughd@google.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hugetlbfs/inode.c |  5 +++++
 include/linux/mm.h   | 32 ++++++++++++++++++++++++++++++++
 mm/shmem.c           | 22 ++++------------------
 3 files changed, 41 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a2a42335e8fd..9d9e0097c1d3 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -131,6 +131,7 @@ static void huge_pagevec_release(struct pagevec *pvec)
 static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct inode *inode = file_inode(file);
+	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
 	loff_t len, vma_len;
 	int ret;
 	struct hstate *h = hstate_file(file);
@@ -146,6 +147,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND;
 	vma->vm_ops = &hugetlb_vm_ops;
 
+	ret = seal_check_future_write(info->seals, vma);
+	if (ret)
+		return ret;
+
 	/*
 	 * page based offset in vm_pgoff could be sufficiently large to
 	 * overflow a loff_t when converted to byte offset.  This can
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 322ec61d0da7..c274f75efcf9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3216,5 +3216,37 @@ void mem_dump_obj(void *object);
 static inline void mem_dump_obj(void *object) {}
 #endif
 
+/**
+ * seal_check_future_write - Check for F_SEAL_FUTURE_WRITE flag and handle it
+ * @seals: the seals to check
+ * @vma: the vma to operate on
+ *
+ * Check whether F_SEAL_FUTURE_WRITE is set; if so, do proper check/handling on
+ * the vma flags.  Return 0 if check pass, or <0 for errors.
+ */
+static inline int seal_check_future_write(int seals, struct vm_area_struct *vma)
+{
+	if (seals & F_SEAL_FUTURE_WRITE) {
+		/*
+		 * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
+		 * "future write" seal active.
+		 */
+		if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
+			return -EPERM;
+
+		/*
+		 * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as
+		 * MAP_SHARED and read-only, take care to not allow mprotect to
+		 * revert protections on such mappings. Do this only for shared
+		 * mappings. For private mappings, don't need to mask
+		 * VM_MAYWRITE as we still want them to be COW-writable.
+		 */
+		if (vma->vm_flags & VM_SHARED)
+			vma->vm_flags &= ~(VM_MAYWRITE);
+	}
+
+	return 0;
+}
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
diff --git a/mm/shmem.c b/mm/shmem.c
index a08cedefbfaa..eb131b9fb190 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2258,25 +2258,11 @@ out_nomem:
 static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct shmem_inode_info *info = SHMEM_I(file_inode(file));
+	int ret;
 
-	if (info->seals & F_SEAL_FUTURE_WRITE) {
-		/*
-		 * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
-		 * "future write" seal active.
-		 */
-		if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
-			return -EPERM;
-
-		/*
-		 * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as
-		 * MAP_SHARED and read-only, take care to not allow mprotect to
-		 * revert protections on such mappings. Do this only for shared
-		 * mappings. For private mappings, don't need to mask
-		 * VM_MAYWRITE as we still want them to be COW-writable.
-		 */
-		if (vma->vm_flags & VM_SHARED)
-			vma->vm_flags &= ~(VM_MAYWRITE);
-	}
+	ret = seal_check_future_write(info->seals, vma);
+	if (ret)
+		return ret;
 
 	/* arm64 - allow memory tagging on RAM-based files */
 	vma->vm_flags |= VM_MTE_ALLOWED;
-- 
cgit v1.2.3


From 9ddb3c14afba8bc5950ed297f02d4ae05ff35cd1 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 14 May 2021 17:27:24 -0700
Subject: mm: fix struct page layout on 32-bit systems

32-bit architectures which expect 8-byte alignment for 8-byte integers and
need 64-bit DMA addresses (arm, mips, ppc) had their struct page
inadvertently expanded in 2019.  When the dma_addr_t was added, it forced
the alignment of the union to 8 bytes, which inserted a 4 byte gap between
'flags' and the union.

Fix this by storing the dma_addr_t in one or two adjacent unsigned longs.
This restores the alignment to that of an unsigned long.  We always
store the low bits in the first word to prevent the PageTail bit from
being inadvertently set on a big endian platform.  If that happened,
get_user_pages_fast() racing against a page which was freed and
reallocated to the page_pool could dereference a bogus compound_head(),
which would be hard to trace back to this cause.

Link: https://lkml.kernel.org/r/20210510153211.1504886-1-willy@infradead.org
Fixes: c25fff7171be ("mm: add dma_addr_t to struct page")
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Tested-by: Matteo Croce <mcroce@linux.microsoft.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_types.h |  4 ++--
 include/net/page_pool.h  | 12 +++++++++++-
 net/core/page_pool.c     | 12 +++++++-----
 3 files changed, 20 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 6613b26a8894..5aacc1c10a45 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -97,10 +97,10 @@ struct page {
 		};
 		struct {	/* page_pool used by netstack */
 			/**
-			 * @dma_addr: might require a 64-bit value even on
+			 * @dma_addr: might require a 64-bit value on
 			 * 32-bit architectures.
 			 */
-			dma_addr_t dma_addr;
+			unsigned long dma_addr[2];
 		};
 		struct {	/* slab, slob and slub */
 			union {
diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index 6d517a37c18b..b4b6de909c93 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -198,7 +198,17 @@ static inline void page_pool_recycle_direct(struct page_pool *pool,
 
 static inline dma_addr_t page_pool_get_dma_addr(struct page *page)
 {
-	return page->dma_addr;
+	dma_addr_t ret = page->dma_addr[0];
+	if (sizeof(dma_addr_t) > sizeof(unsigned long))
+		ret |= (dma_addr_t)page->dma_addr[1] << 16 << 16;
+	return ret;
+}
+
+static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr)
+{
+	page->dma_addr[0] = addr;
+	if (sizeof(dma_addr_t) > sizeof(unsigned long))
+		page->dma_addr[1] = upper_32_bits(addr);
 }
 
 static inline bool is_page_pool_compiled_in(void)
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 9ec1aa9640ad..3c4c4c7a0402 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -174,8 +174,10 @@ static void page_pool_dma_sync_for_device(struct page_pool *pool,
 					  struct page *page,
 					  unsigned int dma_sync_size)
 {
+	dma_addr_t dma_addr = page_pool_get_dma_addr(page);
+
 	dma_sync_size = min(dma_sync_size, pool->p.max_len);
-	dma_sync_single_range_for_device(pool->p.dev, page->dma_addr,
+	dma_sync_single_range_for_device(pool->p.dev, dma_addr,
 					 pool->p.offset, dma_sync_size,
 					 pool->p.dma_dir);
 }
@@ -195,7 +197,7 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
 	if (dma_mapping_error(pool->p.dev, dma))
 		return false;
 
-	page->dma_addr = dma;
+	page_pool_set_dma_addr(page, dma);
 
 	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
 		page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
@@ -331,13 +333,13 @@ void page_pool_release_page(struct page_pool *pool, struct page *page)
 		 */
 		goto skip_dma_unmap;
 
-	dma = page->dma_addr;
+	dma = page_pool_get_dma_addr(page);
 
-	/* When page is unmapped, it cannot be returned our pool */
+	/* When page is unmapped, it cannot be returned to our pool */
 	dma_unmap_page_attrs(pool->p.dev, dma,
 			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
 			     DMA_ATTR_SKIP_CPU_SYNC);
-	page->dma_addr = 0;
+	page_pool_set_dma_addr(page, 0);
 skip_dma_unmap:
 	/* This may be the last page returned, releasing the pool, so
 	 * it is not safe to reference pool afterwards.
-- 
cgit v1.2.3


From 076171a67789ad0107de44c2964f2e46a7d0d7b8 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 14 May 2021 17:27:30 -0700
Subject: mm/filemap: fix readahead return types

A readahead request will not allocate more memory than can be represented
by a size_t, even on systems that have HIGHMEM available.  Change the
length functions from returning an loff_t to a size_t.

Link: https://lkml.kernel.org/r/20210510201201.1558972-1-willy@infradead.org
Fixes: 32c0a6bcaa1f57 ("btrfs: add and use readahead_batch_length")
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/iomap/buffered-io.c  | 4 ++--
 include/linux/pagemap.h | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index f2cd2034a87b..9023717c5188 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -394,7 +394,7 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
 {
 	struct inode *inode = rac->mapping->host;
 	loff_t pos = readahead_pos(rac);
-	loff_t length = readahead_length(rac);
+	size_t length = readahead_length(rac);
 	struct iomap_readpage_ctx ctx = {
 		.rac	= rac,
 	};
@@ -402,7 +402,7 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
 	trace_iomap_readahead(inode, readahead_count(rac));
 
 	while (length > 0) {
-		loff_t ret = iomap_apply(inode, pos, length, 0, ops,
+		ssize_t ret = iomap_apply(inode, pos, length, 0, ops,
 				&ctx, iomap_readahead_actor);
 		if (ret <= 0) {
 			WARN_ON_ONCE(ret == 0);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index a4bd41128bf3..e89df447fae3 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -997,9 +997,9 @@ static inline loff_t readahead_pos(struct readahead_control *rac)
  * readahead_length - The number of bytes in this readahead request.
  * @rac: The readahead request.
  */
-static inline loff_t readahead_length(struct readahead_control *rac)
+static inline size_t readahead_length(struct readahead_control *rac)
 {
-	return (loff_t)rac->_nr_pages * PAGE_SIZE;
+	return rac->_nr_pages * PAGE_SIZE;
 }
 
 /**
@@ -1024,7 +1024,7 @@ static inline unsigned int readahead_count(struct readahead_control *rac)
  * readahead_batch_length - The number of bytes in the current batch.
  * @rac: The readahead request.
  */
-static inline loff_t readahead_batch_length(struct readahead_control *rac)
+static inline size_t readahead_batch_length(struct readahead_control *rac)
 {
 	return rac->_batch_count * PAGE_SIZE;
 }
-- 
cgit v1.2.3